In [2]:
import os
import cv2
import torch
import random
import numpy as np
import torch.nn as nn
from pathlib import Path
from tqdm import tqdm
import albumentations as A
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import torchvision.models.video as video_models
from torch.utils.data import Dataset, DataLoader


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Pin the Python, NumPy and PyTorch random number generators to one seed so
# that shuffling and weight initialisation repeat from run to run.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Central experiment settings; every later cell reads its knobs from here.
CONFIG = dict(
    data_root='../data/temp',      # folder that is scanned for class sub-folders
    frame_height=224,              # frames are resized to this height ...
    frame_width=224,               # ... and this width
    num_frames=32,                 # frames sampled per video clip
    batch_size=4,
    epochs=10,
    learning_rate=1e-4,
    # Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
)


In [4]:
def load_video_paths(data_root):
    """Collect ``.mp4`` files from a ``<data_root>/<class_name>/`` folder layout.

    Only real sub-directories of ``data_root`` are treated as classes. Stray
    files (e.g. ``.DS_Store``) and hidden folders (e.g. ``.ipynb_checkpoints``)
    are ignored so they cannot create phantom classes that shift the label
    indices and inflate the classifier head.

    Class names and the files inside each class are both sorted, so the
    returned lists are identical on every machine; the seeded train/val split
    downstream therefore stays reproducible regardless of filesystem order.

    Args:
        data_root: Path (``str`` or ``Path``) of the dataset root directory.

    Returns:
        A ``(video_paths, labels, label_to_idx)`` tuple where ``video_paths``
        is a list of file paths as strings, ``labels`` is the parallel list of
        integer class indices, and ``label_to_idx`` maps class name -> index.

    Raises:
        OSError: If ``data_root`` does not exist or cannot be listed.
    """
    root = Path(data_root)

    # A class is a visible directory directly under the root.
    class_names = sorted(
        entry.name
        for entry in root.iterdir()
        if entry.is_dir() and not entry.name.startswith('.')
    )
    label_to_idx = {name: i for i, name in enumerate(class_names)}

    video_paths = []
    labels = []
    for class_name in class_names:
        # glob() yields files in arbitrary order; sort for determinism.
        for video_file in sorted((root / class_name).glob("*.mp4")):
            video_paths.append(str(video_file))
            labels.append(label_to_idx[class_name])
    return video_paths, labels, label_to_idx

# Scan the configured dataset root once; the three results feed the split cell
# and the size of the classifier head.
video_paths, labels, label_to_idx = load_video_paths(
    data_root=CONFIG['data_root'],
)


In [5]:
# Hold out 20% of the videos for validation. Stratifying on the labels keeps
# the class proportions the same in both splits, and the fixed random_state
# makes the partition repeatable.
train_paths, val_paths, train_labels, val_labels = train_test_split(
    video_paths,
    labels,
    random_state=42,
    stratify=labels,
    test_size=0.2,
)


In [6]:
class SoccerDataset(Dataset):
    """Map-style dataset that turns video files into fixed-length frame clips.

    Each item is a ``(clip, label)`` pair. ``clip`` is a float tensor laid out
    as ``(channels, num_frames, height, width)``: the frames are stacked along
    a new leading axis and the trailing channel axis is then moved to the
    front. Every clip has exactly ``config['num_frames']`` frames, so samples
    can be stacked into a batch.

    Args:
        video_paths: Sequence of video file paths.
        labels: Sequence of integer class indices, parallel to ``video_paths``.
        config: Mapping providing ``'frame_height'``, ``'frame_width'`` and
            ``'num_frames'``.
    """

    def __init__(self, video_paths, labels, config):
        self.video_paths = video_paths
        self.labels = labels
        self.config = config
        # Per-frame preprocessing: resize, then channel-wise normalisation.
        # NOTE(review): the mean/std look like the Kinetics statistics that
        # torchvision's video models expect - confirm against the weights used.
        self.transform = A.Compose([
            A.Resize(height=config['frame_height'], width=config['frame_width']),
            A.Normalize(mean=[0.43216, 0.394666, 0.37645], std=[0.22803, 0.22145, 0.216989])
        ])

    def __len__(self):
        """Return the number of videos in the dataset."""
        return len(self.video_paths)

    def __getitem__(self, idx):
        """Return ``(clip_tensor, label)`` for the video at position ``idx``."""
        path = self.video_paths[idx]
        label = self.labels[idx]
        frames = self._load_video(path)
        return frames, label

    def _load_video(self, path):
        """Decode ``num_frames`` evenly spaced frames from the video at ``path``.

        Frames that fail to decode are replaced by the nearest earlier frame
        that did decode (or by the first decodable frame when the failures are
        at the start). This keeps the clip length constant; dropping the frame
        instead would yield clips of different lengths that cannot be batched.

        Returns:
            Float tensor of shape ``(channels, num_frames, height, width)``.

        Raises:
            ValueError: If the video cannot be opened, reports no frames, or
                none of the sampled frames can be decoded.
        """
        num_frames = self.config['num_frames']
        cap = cv2.VideoCapture(path)
        try:
            if not cap.isOpened():
                raise ValueError(f"Could not open video: {path}")

            frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            if frame_count <= 0:
                raise ValueError(f"Video reports no frames: {path}")

            # Evenly spaced frame positions covering the whole video.
            indices = np.linspace(0, frame_count - 1, num_frames, dtype=int)

            frames = []
            missing_at_start = 0
            for i in indices:
                cap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
                ret, frame = cap.read()
                if ret:
                    # OpenCV decodes to BGR; convert before normalising.
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    frames.append(self.transform(image=frame)['image'])
                elif frames:
                    # Hold the previous good frame to keep temporal alignment.
                    frames.append(frames[-1])
                else:
                    missing_at_start += 1
        finally:
            # Always free the decoder handle, even when decoding fails.
            cap.release()

        if not frames:
            raise ValueError(f"No decodable frames in video: {path}")
        if missing_at_start:
            frames = [frames[0]] * missing_at_start + frames

        clip = np.stack(frames)                   # (T, H, W, C)
        clip = np.transpose(clip, (3, 0, 1, 2))   # (C, T, H, W)
        return torch.from_numpy(clip).float()


In [7]:
class R3DClassifier(nn.Module):
    """torchvision ``r3d_18`` video network with a task-specific output layer.

    The backbone is created with ``pretrained=True`` and its final ``fc``
    layer is swapped for a fresh ``nn.Linear`` that emits ``num_classes``
    logits.

    Args:
        num_classes: Number of output classes for the replacement head.
    """

    def __init__(self, num_classes):
        super().__init__()
        backbone = video_models.r3d_18(pretrained=True)
        # Keep the backbone's feature width; only the output size changes.
        head_inputs = backbone.fc.in_features
        backbone.fc = nn.Linear(head_inputs, num_classes)
        self.model = backbone

    def forward(self, x):
        """Run the wrapped network on ``x`` and return its output logits."""
        logits = self.model(x)
        return logits

# Size the classifier head from the number of classes found on disk, then move
# the whole network onto the configured device.
model = R3DClassifier(num_classes=len(label_to_idx))
model = model.to(CONFIG['device'])




In [8]:
# Training batches are reshuffled every epoch; validation keeps a fixed order.
train_loader = DataLoader(
    SoccerDataset(train_paths, train_labels, CONFIG),
    batch_size=CONFIG['batch_size'],
    shuffle=True,
)
val_loader = DataLoader(
    SoccerDataset(val_paths, val_labels, CONFIG),
    batch_size=CONFIG['batch_size'],
)

# Adam over all model parameters, paired with a cross-entropy objective.
optimizer = torch.optim.Adam(model.parameters(), lr=CONFIG['learning_rate'])
criterion = nn.CrossEntropyLoss()


In [9]:
# Per-epoch accuracy history, kept for later inspection or plotting.
train_acc_list, val_acc_list = [], []

for epoch in range(CONFIG['epochs']):
    # ---- training pass ----
    model.train()
    train_preds, train_targets = [], []

    # The batch variables are deliberately NOT named `labels`: that name holds
    # the full per-video label list built when the dataset was scanned, and
    # overwriting it with a batch tensor would break re-running the split cell.
    for batch_videos, batch_labels in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        batch_videos = batch_videos.to(CONFIG['device'])
        batch_labels = batch_labels.to(CONFIG['device'])

        outputs = model(batch_videos)
        loss = criterion(outputs, batch_labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Predicted class = index of the largest logit along dim 1.
        train_preds.extend(torch.argmax(outputs, 1).cpu().numpy())
        train_targets.extend(batch_labels.cpu().numpy())

    train_acc = accuracy_score(train_targets, train_preds)
    train_acc_list.append(train_acc)

    # ---- validation pass (eval mode, no gradient tracking) ----
    model.eval()
    val_preds, val_targets = [], []

    with torch.no_grad():
        for batch_videos, batch_labels in val_loader:
            batch_videos = batch_videos.to(CONFIG['device'])
            batch_labels = batch_labels.to(CONFIG['device'])

            outputs = model(batch_videos)
            val_preds.extend(torch.argmax(outputs, 1).cpu().numpy())
            val_targets.extend(batch_labels.cpu().numpy())

    val_acc = accuracy_score(val_targets, val_preds)
    val_acc_list.append(val_acc)

    print(f"✅ Epoch {epoch+1}: Train Acc = {train_acc:.4f}, Val Acc = {val_acc:.4f}")


Epoch 1:   2%|▏         | 1/41 [02:19<1:33:07, 139.68s/it]


KeyboardInterrupt: 