In [None]:
import os
import json
import cv2
from datetime import datetime
from typing import List

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm


# Run on the first CUDA GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")
if device.type == "cuda":
    # Report which physical card index 0 refers to.
    print(f"GPU: {torch.cuda.get_device_name(0)}")



class ResNetFeatureExtractor(nn.Module):
    """ResNet-34 up to (and including) global pooling; one flat vector per frame.

    The final fully-connected layer of the torchvision ResNet-34 is stripped,
    so the module maps a batch of images to the pooled feature vector
    (512-D for ResNet-34).

    Parameters
    ----------
    fine_tune : bool
        If **False** (default) the backbone is frozen: its parameters do not
        require gradients, the forward pass runs under ``torch.no_grad()``,
        and the backbone is kept in eval mode even when the parent model is
        switched to train mode. If **True** the backbone trains normally.
    """

    def __init__(self, fine_tune: bool = False):
        super().__init__()
        backbone = models.resnet34(weights=models.ResNet34_Weights.DEFAULT)
        self.backbone = nn.Sequential(*list(backbone.children())[:-1])  # strip FC
        self.fine_tune = fine_tune
        if not fine_tune:
            self.backbone.requires_grad_(False)
            self.backbone.eval()

    def train(self, mode: bool = True):
        """Set train/eval mode, keeping a frozen backbone in eval mode.

        BatchNorm layers update their running statistics whenever they are in
        train mode, regardless of ``requires_grad`` or ``torch.no_grad()``.
        Without this override, ``model.train()`` would let the supposedly
        frozen backbone drift away from its pretrained statistics.
        """
        super().train(mode)
        if not self.fine_tune:
            self.backbone.eval()
        return self

    def forward(self, x):  # (N, 3, H, W)
        """Return pooled backbone features flattened to ``(N, features)``."""
        if self.fine_tune:
            feats = self.backbone(x)
        else:
            # No autograd graph is needed for a frozen backbone; saves memory.
            with torch.no_grad():
                feats = self.backbone(x)
        return torch.flatten(feats, 1)  # (N, 512) for ResNet-34


class GRUWithResNet(nn.Module):
    """Clip classifier: per-frame ResNet-34 features -> GRU -> MLP head.

    Parameters
    ----------
    feature_size : int
        Width of the per-frame feature vector fed to the GRU.
    hidden_size : int
        GRU hidden width; the head expands it to ``2 * hidden_size``.
    output_size : int
        Number of logits produced per clip.
    num_layers : int
        Number of stacked GRU layers.
    dropout : float
        Dropout used between GRU layers (only when ``num_layers > 1``) and
        inside the classifier head.
    fine_tune_cnn : bool
        Forwarded to ``ResNetFeatureExtractor(fine_tune=...)``.
    """

    def __init__(self, feature_size: int, hidden_size: int, output_size: int,
                 num_layers: int = 2, dropout: float = 0.3, fine_tune_cnn: bool = False):
        super().__init__()
        self.feature_extractor = ResNetFeatureExtractor(fine_tune=fine_tune_cnn)

        # A single-layer GRU has no "between layers" position for dropout.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0
        self.gru = nn.GRU(
            input_size=feature_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )

        head_width = hidden_size * 2
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, head_width),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(head_width, output_size),
        )

    def forward(self, x):  # (B, T, C, H, W)
        """Return class logits of shape ``(B, output_size)`` for a batch of clips."""
        batch, steps, channels, height, width = x.size()

        # Fold time into the batch axis so the CNN sees ordinary image batches.
        flat_frames = x.view(-1, channels, height, width)   # (B*T, C, H, W)
        frame_feats = self.feature_extractor(flat_frames)   # (B*T, F)
        sequence = frame_feats.view(batch, steps, -1)       # (B, T, F)

        hidden_states, _ = self.gru(sequence)                # (B, T, hidden)
        final_state = hidden_states[:, -1]                   # output at the last time step
        return self.classifier(final_state)


class VideoDataset(Dataset):
    """Dataset that turns each video file into a stack of RGB frames plus a label.

    Every item is ``(frames_tensor, label_idx)``. Without a ``transform``,
    ``frames_tensor`` is float32 with shape ``(T, 3, 224, 224)`` and values in
    ``[0, 1]``, where ``T <= max_frames``.

    Clips with fewer than ``max_frames`` frames are returned as-is (no
    padding), so batching clips of different lengths needs a custom
    ``collate_fn``.

    Parameters
    ----------
    video_paths : list of str
        Paths handed to ``cv2.VideoCapture``.
    labels : list of str
        Label name for each path (same order as ``video_paths``).
    label_to_index : dict
        Maps a label name to its integer class index.
    max_frames : int
        Upper bound on frames kept per clip; longer clips are sampled
        uniformly over their full length.
    transform : callable, optional
        Applied to each kept frame, an ``(H, W, 3)`` RGB array scaled to
        ``[0, 1]``. Its outputs are stacked with ``np.stack``, so it is
        presumably expected to return an array of the same layout.
    """

    def __init__(self,
                 video_paths: List[str],
                 labels: List[str],
                 label_to_index: dict,
                 max_frames: int = 64,
                 transform=None):
        self.video_paths = video_paths
        self.labels = labels
        self.label_to_index = label_to_index
        self.max_frames = max_frames
        self.transform = transform

    def __len__(self):
        return len(self.video_paths)

    def _sample_frames(self, frames: List[np.ndarray]) -> List[np.ndarray]:
        """Return at most ``max_frames`` frames, uniformly spaced over the clip."""
        if len(frames) <= self.max_frames:
            return frames
        # Evenly spaced indices that always include the first and last frame.
        idxs = np.linspace(0, len(frames) - 1, self.max_frames, dtype=int)
        return [frames[i] for i in idxs]

    def _decode_frames(self, video_path: str) -> List[np.ndarray]:
        """Decode every frame of ``video_path``, resized to 224x224 (still BGR)."""
        frames = []
        cap = cv2.VideoCapture(video_path)
        try:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break
                # Frames are stored exactly as OpenCV returns them; the float
                # conversion happens later, only for frames that survive sampling.
                frames.append(cv2.resize(frame, (224, 224)))
        finally:
            # Release the decoder even if reading or resizing raised.
            cap.release()
        return frames

    def __getitem__(self, idx):
        """Load, sample and normalise one clip.

        Raises
        ------
        RuntimeError
            If no frame could be read from the video file.
        """
        video_path = self.video_paths[idx]
        label = self.labels[idx]

        raw_frames = self._decode_frames(video_path)
        if not raw_frames:
            raise RuntimeError(f"Could not read frames from {video_path}")

        # Sample first, then do the costly per-frame work on the kept frames only.
        processed = []
        for frame in self._sample_frames(raw_frames):
            frame = frame[:, :, ::-1] / 255.0  # BGR→RGB, scale to [0, 1]
            if self.transform:
                frame = self.transform(frame)
            processed.append(frame)

        frames = np.stack(processed)                   # (T, H, W, C)
        frames_tensor = torch.from_numpy(frames).permute(0, 3, 1, 2).float()
        label_idx = self.label_to_index[label]
        return frames_tensor, label_idx


def load_data(root_directory):
    """Collect video paths and class labels from a one-folder-per-class tree.

    Each immediate sub-directory of ``root_directory`` is treated as a class;
    its name is the label for every video file directly inside it. Files are
    matched case-insensitively on the ``.mp4``, ``.avi`` and ``.mov``
    extensions. Entries in ``root_directory`` that are not directories are
    ignored, and sub-directories are not searched recursively.

    Directory listings are sorted so the result order is deterministic.
    ``os.listdir`` returns entries in arbitrary order, which would otherwise
    make a seeded train/validation split differ between machines.

    Parameters
    ----------
    root_directory : str
        Folder containing one sub-folder per class.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(video_paths, labels)``, index-aligned.
    """
    video_extensions = (".mp4", ".avi", ".mov")
    video_paths, labels = [], []
    for folder_name in sorted(os.listdir(root_directory)):
        folder_path = os.path.join(root_directory, folder_name)
        if not os.path.isdir(folder_path):
            continue
        for fname in sorted(os.listdir(folder_path)):
            if fname.lower().endswith(video_extensions):
                video_paths.append(os.path.join(folder_path, fname))
                labels.append(folder_name)
    return video_paths, labels



# Dataset root: each sub-folder name is used as the class label of the clips in it.
root_directory = r"/kaggle/input/highlight-label-extracted/extracted"
video_paths, labels = load_data(root_directory)

# Class-name <-> index maps built from the sorted set of label names.
unique_labels = sorted(set(labels))
label_to_index = dict(zip(unique_labels, range(len(unique_labels))))
index_to_label = dict(enumerate(unique_labels))

# 80/20 train/validation split with a fixed seed, stratified by label.
train_paths, val_paths, train_labels, val_labels = train_test_split(
    video_paths,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)



# --- model dimensions ---
feature_size = 512                  # per-frame feature width fed to the GRU
hidden_size = 512                   # GRU hidden width
output_size = len(unique_labels)    # one logit per class
num_layers = 2                      # stacked GRU layers

# --- optimization settings ---
num_epochs = 20
batch_size = 2
learning_rate = 1e-4

# --- checkpoint output ---
checkpoint_dir = "checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)  # no error if the folder already exists



# Datasets for the two splits; both share the same label -> index mapping.
train_dataset = VideoDataset(
    video_paths=train_paths,
    labels=train_labels,
    label_to_index=label_to_index,
)
val_dataset = VideoDataset(
    video_paths=val_paths,
    labels=val_labels,
    label_to_index=label_to_index,
)

# Only the training loader shuffles; validation keeps a fixed order.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    pin_memory=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    pin_memory=True,
)


# Model, loss and optimizer. Only parameters that still require gradients are
# handed to Adam, so frozen backbone weights carry no optimizer state.
model = GRUWithResNet(feature_size, hidden_size, output_size, num_layers).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam((p for p in model.parameters() if p.requires_grad), lr=learning_rate)

# Gradient scaler for mixed-precision training; a no-op when CUDA is not used.
# `torch.cuda.amp.GradScaler` is deprecated in favor of `torch.amp.GradScaler`;
# fall back to the old spelling on PyTorch versions that lack the new one.
if hasattr(torch.amp, "GradScaler"):
    scaler = torch.amp.GradScaler("cuda", enabled=device.type == "cuda")
else:
    scaler = torch.cuda.amp.GradScaler(enabled=device.type == "cuda")



# Training loop: one pass over train_loader per epoch with mixed precision on
# CUDA, followed by a checkpoint. Validation is not run per epoch here; the
# notebook evaluates on val_loader once after training.
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for vids, lbls in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        vids, lbls = vids.to(device, non_blocking=True), lbls.to(device, non_blocking=True)
        optimizer.zero_grad(set_to_none=True)
        # `torch.autocast` replaces the deprecated `torch.cuda.amp.autocast`.
        with torch.autocast(device_type=device.type, enabled=device.type == "cuda"):
            outputs = model(vids)
            loss = criterion(outputs, lbls)
        # Scale the loss before backward; the scaler unscales gradients before
        # the optimizer step.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        running_loss += loss.item()

    avg_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch+1:02d} | train loss: {avg_loss:.4f}")

    # -------------------- checkpoint -------------------- #
    ckpt_path = os.path.join(checkpoint_dir, f"checkpoint_epoch_{epoch+1:02d}.pth")
    torch.save({
        "epoch": epoch + 1,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        # Saved so a resumed mixed-precision run keeps its current loss scale.
        "scaler_state_dict": scaler.state_dict(),
        "train_loss": avg_loss,
        "timestamp": datetime.utcnow().isoformat()
    }, ckpt_path)
    print(f"✔ Saved checkpoint → {ckpt_path}\n")

# Weights-only export of the final model (no optimizer or scaler state).
final_model_path = "resnet_gru_highlight_model.pth"
torch.save(model.state_dict(), final_model_path)
print(f"Training complete – final model saved as '{final_model_path}'.")



# Final evaluation: top-1 accuracy of the trained model over val_loader.
model.eval()
correct = total = 0
with torch.no_grad():
    for vids, lbls in val_loader:
        vids, lbls = vids.to(device, non_blocking=True), lbls.to(device, non_blocking=True)
        # `torch.autocast` replaces the deprecated `torch.cuda.amp.autocast`.
        with torch.autocast(device_type=device.type, enabled=device.type == "cuda"):
            outputs = model(vids)
        preds = outputs.argmax(dim=1)
        correct += (preds == lbls).sum().item()
        total   += lbls.size(0)
# Report 0.0 instead of dividing by zero when the validation set is empty.
val_acc = correct / total if total else 0.0
print(f"Validation accuracy : {val_acc:.4f}")


Using device: cuda
GPU: Tesla P100-PCIE-16GB


Downloading: "https://download.pytorch.org/models/resnet34-b627a593.pth" to /root/.cache/torch/hub/checkpoints/resnet34-b627a593.pth
100%|██████████| 83.3M/83.3M [00:00<00:00, 187MB/s]
  scaler = torch.cuda.amp.GradScaler(enabled=device.type == "cuda")
  with torch.cuda.amp.autocast(enabled=device.type == "cuda"):
Epoch 1/20: 100%|██████████| 562/562 [20:42<00:00,  2.21s/it]


Epoch 01 | train loss: 2.4744
✔ Saved checkpoint → checkpoints/checkpoint_epoch_01.pth



Epoch 2/20: 100%|██████████| 562/562 [20:26<00:00,  2.18s/it]


Epoch 02 | train loss: 2.1943
✔ Saved checkpoint → checkpoints/checkpoint_epoch_02.pth



Epoch 3/20: 100%|██████████| 562/562 [20:33<00:00,  2.19s/it]


Epoch 03 | train loss: 1.9119
✔ Saved checkpoint → checkpoints/checkpoint_epoch_03.pth



Epoch 4/20: 100%|██████████| 562/562 [20:26<00:00,  2.18s/it]


Epoch 04 | train loss: 1.7438
✔ Saved checkpoint → checkpoints/checkpoint_epoch_04.pth



Epoch 5/20: 100%|██████████| 562/562 [20:29<00:00,  2.19s/it]


Epoch 05 | train loss: 1.5724
✔ Saved checkpoint → checkpoints/checkpoint_epoch_05.pth



Epoch 6/20: 100%|██████████| 562/562 [20:26<00:00,  2.18s/it]


Epoch 06 | train loss: 1.3705
✔ Saved checkpoint → checkpoints/checkpoint_epoch_06.pth



Epoch 7/20: 100%|██████████| 562/562 [20:12<00:00,  2.16s/it]


Epoch 07 | train loss: 1.2788
✔ Saved checkpoint → checkpoints/checkpoint_epoch_07.pth



Epoch 8/20: 100%|██████████| 562/562 [20:20<00:00,  2.17s/it]


Epoch 08 | train loss: 1.1250
✔ Saved checkpoint → checkpoints/checkpoint_epoch_08.pth



Epoch 9/20: 100%|██████████| 562/562 [20:35<00:00,  2.20s/it]


Epoch 09 | train loss: 1.0401
✔ Saved checkpoint → checkpoints/checkpoint_epoch_09.pth



Epoch 10/20: 100%|██████████| 562/562 [20:30<00:00,  2.19s/it]


Epoch 10 | train loss: 0.9306
✔ Saved checkpoint → checkpoints/checkpoint_epoch_10.pth



Epoch 11/20: 100%|██████████| 562/562 [20:36<00:00,  2.20s/it]


Epoch 11 | train loss: 0.8243
✔ Saved checkpoint → checkpoints/checkpoint_epoch_11.pth



Epoch 12/20: 100%|██████████| 562/562 [20:36<00:00,  2.20s/it]


Epoch 12 | train loss: 0.7230
✔ Saved checkpoint → checkpoints/checkpoint_epoch_12.pth



Epoch 13/20: 100%|██████████| 562/562 [20:36<00:00,  2.20s/it]


Epoch 13 | train loss: 0.6800
✔ Saved checkpoint → checkpoints/checkpoint_epoch_13.pth



Epoch 14/20: 100%|██████████| 562/562 [20:15<00:00,  2.16s/it]


Epoch 14 | train loss: 0.6103
✔ Saved checkpoint → checkpoints/checkpoint_epoch_14.pth



Epoch 15/20: 100%|██████████| 562/562 [20:17<00:00,  2.17s/it]


Epoch 15 | train loss: 0.5787
✔ Saved checkpoint → checkpoints/checkpoint_epoch_15.pth



Epoch 16/20: 100%|██████████| 562/562 [20:15<00:00,  2.16s/it]


Epoch 16 | train loss: 0.5053
✔ Saved checkpoint → checkpoints/checkpoint_epoch_16.pth



Epoch 17/20: 100%|██████████| 562/562 [20:20<00:00,  2.17s/it]


Epoch 17 | train loss: 0.3974
✔ Saved checkpoint → checkpoints/checkpoint_epoch_17.pth



Epoch 18/20: 100%|██████████| 562/562 [20:23<00:00,  2.18s/it]


Epoch 18 | train loss: 0.3931
✔ Saved checkpoint → checkpoints/checkpoint_epoch_18.pth



Epoch 19/20: 100%|██████████| 562/562 [20:13<00:00,  2.16s/it]


Epoch 19 | train loss: 0.3557
✔ Saved checkpoint → checkpoints/checkpoint_epoch_19.pth



Epoch 20/20: 100%|██████████| 562/562 [20:21<00:00,  2.17s/it]


Epoch 20 | train loss: 0.3234
✔ Saved checkpoint → checkpoints/checkpoint_epoch_20.pth

Training complete – final model saved as 'resnet_gru_highlight_model.pth'.


  with torch.cuda.amp.autocast(enabled=device.type == "cuda"):


Validation accuracy : 0.5355
