In [10]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision
from torchvision.io import read_video
import torchvision.transforms as transforms
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time
import copy
from tqdm import tqdm
import warnings


In [11]:
# Suppress torchvision video deprecation warning
warnings.filterwarnings("ignore", category=UserWarning, module="torchvision.io")

In [12]:
# Define parameters
BATCH_SIZE = 8
NUM_EPOCHS = 10
LEARNING_RATE = 1e-4
NUM_CLASSES = 3
FRAMES_PER_CLIP = 16
IMG_SIZE = 112  # R(2+1)D expects 112x112


In [13]:
class VideoDataset(Dataset):
    def __init__(self, df, root_dir, frames_per_clip=16, transform=None):
        self.df = df
        self.root_dir = root_dir
        self.frames_per_clip = frames_per_clip
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Retrieve relative path from dataframe
        # Assuming clip_path is like './dataset/Diving/v_Diving_g03_c01.avi'
        # We need to construct the full absolute path or correct relative path.
        # Since the script is running from the project root, './dataset/...' is correct.

        video_path = self.df.iloc[idx]['clip_path']
        label = int(self.df.iloc[idx]['encoded_label'])

        # Determine actual file path
        # If running from /home/yogendra/workspace/python/university/Video-Analytics-Assignment1/
        # and clip_path is ./dataset/..., it works locally.

        try:
            # read_video returns (T, H, W, C) in [0, 255]
            video, _, info = read_video(video_path, pts_unit='sec')
            # Note: using default THWC format
        except Exception as e:
            # Handle read errors (e.g. corrupt video) by returning a zero tensor or skipping
            print(f"Error reading {video_path}: {e}")
            return torch.zeros((3, self.frames_per_clip, IMG_SIZE, IMG_SIZE)), label

        # video is (T, H, W, C)
        total_frames = video.shape[0]

        # Temporal resampling
        if total_frames >= self.frames_per_clip:
            # Uniformly sample frames
            indices = np.linspace(0, total_frames - 1, self.frames_per_clip).astype(int)
            video = video[indices]
        else:
            # Loop video to fill frames
            needed = self.frames_per_clip - total_frames
            # Simple padding: repeat last frame or cycle
            # Let's cycle
            indices = np.resize(np.arange(total_frames), self.frames_per_clip)
            video = video[indices]

        # Current shape: (F, H, W, C)
        # Permute to (C, F, H, W) for transforms/model
        video = video.permute(3, 0, 1, 2)

        # Convert to float and normalize to [0, 1]
        video = video.float() / 255.0

        # Apply transforms
        if self.transform:
            video = self.transform(video)

        return video, label


In [14]:
def get_train_transform():
    # Helper to apply spatial transforms to (C, F, H, W)
    # Note: torchvision.transforms usually expects (C, H, W).
    # We can treat (F*C, H, W) or create custom wrapper.
    # For simplicity and R(2+1)D:
    # Resize and CenterCrop are standard.

    return transforms.Compose([
        transforms.Resize((128, 171)), # Standard kinetic resize
        transforms.RandomCrop(IMG_SIZE),
        transforms.Normalize(mean=[0.43216, 0.394666, 0.37645], std=[0.22803, 0.22145, 0.216989])
    ])

def get_val_transform():
    return transforms.Compose([
        transforms.Resize((128, 171)),
        transforms.CenterCrop(IMG_SIZE),
        transforms.Normalize(mean=[0.43216, 0.394666, 0.37645], std=[0.22803, 0.22145, 0.216989])
    ])


In [15]:
# Custom Transform wrapper to handle video tensor (C, F, H, W)
# Standard transforms work on (C, H, W). We can reshape to (C*F, H, W) -> transform -> (C, F, H, W)
# OR manually apply to each frame.
# Reshape approach is cleaner for spatial transforms that don't depend on frame idx (like Resize, Crop).
# BUT RandomCrop needs to be same for all frames.
# Using ReplayTransform or specialized video transforms is best.
# For this assignment, let's implement a simple wrapper class.

class VideoTransform:
    def __init__(self, transform, is_train=True):
        self.transform = transform
        self.is_train = is_train

    def __call__(self, x):
        # x is (C, F, H, W)
        C, F, H, W = x.shape
        # Permute to (F, C, H, W) to apply transform frame by frame?
        # No, that's slow and random transforms would be different per frame.
        # We assume x is a tensor.

        # If we use pytorch transforms that support batch, we can pass (F, C, H, W)
        # But Resize/Crop usually work on (..., H, W).

        # Let's reshape to (F, C, H, W) first
        x = x.permute(1, 0, 2, 3) # (F, C, H, W)

        # Standardize RandomCrop: use torch.seed or apply same params.
        # However, simplistic approach for 'RandomCrop':
        # Just use CenterCrop for validation, and for training if we accept slight jitter per frame (bad)
        # OR use functional API.

        # Better approach for this script:
        # Resize generally.
        x = transforms.functional.resize(x, (128, 171))

        if self.is_train:
            i, j, h, w = transforms.RandomCrop.get_params(x, output_size=(IMG_SIZE, IMG_SIZE))
            x = transforms.functional.crop(x, i, j, h, w)
            # Maybe Horizontal Flip
            if torch.rand(1) < 0.5:
                x = transforms.functional.hflip(x)
        else:
            x = transforms.functional.center_crop(x, (IMG_SIZE, IMG_SIZE))

        # Normalize
        # Normalize expect (C, H, W). We have (F, C, H, W).
        # We can iterate or transpose.
        mean = torch.tensor([0.43216, 0.394666, 0.37645]).view(1, 3, 1, 1)
        std = torch.tensor([0.22803, 0.22145, 0.216989]).view(1, 3, 1, 1)

        # Normalize
        # Normalize expect (C, H, W) or (B, C, H, W). We have (F, C, H, W).
        mean = torch.tensor([0.43216, 0.394666, 0.37645]).view(1, 3, 1, 1)
        std = torch.tensor([0.22803, 0.22145, 0.216989]).view(1, 3, 1, 1)

        x = (x - mean) / std

        # x is (F, C, H, W). Permute back to (C, F, H, W)
        x = x.permute(1, 0, 2, 3) # (C, F, H, W)

        return x


In [16]:
def train_one_epoch(model, loader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    all_preds = []
    all_labels = []

    for inputs, labels in tqdm(loader, desc="Training"):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        running_loss += loss.item() * inputs.size(0)

        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    epoch_loss = running_loss / len(loader.dataset)
    epoch_acc = accuracy_score(all_labels, all_preds)

    return epoch_loss, epoch_acc


In [17]:
def validate(model, loader, criterion, device):
    model.eval()
    running_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in tqdm(loader, desc="Validation"):
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            running_loss += loss.item() * inputs.size(0)

            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    epoch_loss = running_loss / len(loader.dataset)
    epoch_acc = accuracy_score(all_labels, all_preds)

    return epoch_loss, epoch_acc, all_labels, all_preds


In [18]:
def main(dry_run=False):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load Dataframes
    train_df = pd.read_csv("./dataset/splits/train.csv", index_col='index')
    val_df = pd.read_csv("./dataset/splits/validation.csv", index_col='index')
    test_df = pd.read_csv("./dataset/splits/test.csv", index_col='index')

    if dry_run:
        print("DRY RUN MODE: limiting dataset size")
        train_df = train_df.iloc[:20]
        val_df = val_df.iloc[:10]
        test_df = test_df.iloc[:10]
        global NUM_EPOCHS
        NUM_EPOCHS = 2

    # Datasets
    train_dataset = VideoDataset(train_df, root_dir='./', transform=VideoTransform(None, is_train=True))
    val_dataset = VideoDataset(val_df, root_dir='./', transform=VideoTransform(None, is_train=False))
    test_dataset = VideoDataset(test_df, root_dir='./', transform=VideoTransform(None, is_train=False))

    # DataLoaders
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

    # Model Setup
    print("Initializing R(2+1)D model...")
    weights = torchvision.models.video.R2Plus1D_18_Weights.DEFAULT
    model = torchvision.models.video.r2plus1d_18(weights=weights)

    # Modify final layer
    in_features = model.fc.in_features
    model.fc = nn.Linear(in_features, NUM_CLASSES)

    model = model.to(device)

    # Optimization
    criterion = nn.CrossEntropyLoss()
    # SGD with momentum as suggested, or Adam
    optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=0.9, weight_decay=5e-4)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

    best_acc = 0.0
    best_model_wts = copy.deepcopy(model.state_dict())

    # Training Loop
    for epoch in range(NUM_EPOCHS):
        print(f"Epoch {epoch+1}/{NUM_EPOCHS}")

        train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer, device)
        val_loss, val_acc, _, _ = validate(model, val_loader, criterion, device)

        scheduler.step()

        print(f"Train Loss: {train_loss:.4f} Acc: {train_acc:.4f}")
        print(f"Val Loss: {val_loss:.4f} Acc: {val_acc:.4f}")

        if val_acc > best_acc:
            best_acc = val_acc
            best_model_wts = copy.deepcopy(model.state_dict())
            torch.save(model.state_dict(), "best_model_r2plus1d.pth")
            print("Saved best model.")

    print(f"Best Validation Accuracy: {best_acc:.4f}")

    # Load best model for testing
    model.load_state_dict(best_model_wts)

    # Test Evaluation
    print("\nEvaluating on Test Set...")
    test_loss, test_acc, true_labels, pred_labels = validate(model, test_loader, criterion, device)

    precision = precision_score(true_labels, pred_labels, average='weighted', zero_division=0)
    recall = recall_score(true_labels, pred_labels, average='weighted', zero_division=0)
    f1 = f1_score(true_labels, pred_labels, average='weighted', zero_division=0)

    print(f"Test Accuracy: {test_acc:.4f}")
    print(f"Test Precision: {precision:.4f}")
    print(f"Test Recall: {recall:.4f}")
    print(f"Test F1 Score: {f1:.4f}")


In [19]:
main()

Using device: cuda
Initializing R(2+1)D model...
Epoch 1/10


Training: 100%|██████████| 30/30 [00:10<00:00,  2.77it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00,  4.58it/s]


Train Loss: 1.0144 Acc: 0.5125
Val Loss: 0.7812 Acc: 0.8667
Saved best model.
Epoch 2/10


Training: 100%|██████████| 30/30 [00:10<00:00,  2.79it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00,  4.44it/s]


Train Loss: 0.7828 Acc: 0.8167
Val Loss: 0.5593 Acc: 1.0000
Saved best model.
Epoch 3/10


Training: 100%|██████████| 30/30 [00:12<00:00,  2.48it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00,  4.71it/s]


Train Loss: 0.6009 Acc: 0.9167
Val Loss: 0.4153 Acc: 1.0000
Epoch 4/10


Training: 100%|██████████| 30/30 [00:11<00:00,  2.65it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00,  4.71it/s]


Train Loss: 0.4525 Acc: 0.9792
Val Loss: 0.3079 Acc: 1.0000
Epoch 5/10


Training: 100%|██████████| 30/30 [00:10<00:00,  2.81it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00,  4.70it/s]


Train Loss: 0.3721 Acc: 0.9875
Val Loss: 0.2374 Acc: 1.0000
Epoch 6/10


Training: 100%|██████████| 30/30 [00:10<00:00,  2.78it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00,  4.53it/s]


Train Loss: 0.3586 Acc: 0.9625
Val Loss: 0.1918 Acc: 1.0000
Epoch 7/10


Training: 100%|██████████| 30/30 [00:10<00:00,  2.81it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00,  4.70it/s]


Train Loss: 0.3081 Acc: 0.9667
Val Loss: 0.1535 Acc: 1.0000
Epoch 8/10


Training: 100%|██████████| 30/30 [00:10<00:00,  2.82it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00,  4.72it/s]


Train Loss: 0.2664 Acc: 0.9792
Val Loss: 0.1534 Acc: 1.0000
Epoch 9/10


Training: 100%|██████████| 30/30 [00:10<00:00,  2.86it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00,  4.53it/s]


Train Loss: 0.3195 Acc: 0.9667
Val Loss: 0.1520 Acc: 1.0000
Epoch 10/10


Training: 100%|██████████| 30/30 [00:10<00:00,  2.79it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00,  4.55it/s]


Train Loss: 0.2956 Acc: 0.9792
Val Loss: 0.1460 Acc: 1.0000
Best Validation Accuracy: 1.0000

Evaluating on Test Set...


Validation: 100%|██████████| 4/4 [00:00<00:00,  4.67it/s]

Test Accuracy: 0.9333
Test Precision: 0.9444
Test Recall: 0.9333
Test F1 Score: 0.9327



