<a href="https://colab.research.google.com/github/kelvin17/ml-project-notebook/blob/main/video_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 0. Dataloader

In [None]:
import os

from google.colab import drive

# Mount Google Drive so the dataset and the saved checkpoints live outside the
# ephemeral Colab VM.
drive.mount('/content/drive')

# Dataset root: the dataset classes read `frames/`, `videos/` and `metadata/`
# underneath this directory.
root_dir = '/content/drive/MyDrive/project/ufc10/'

# Directory that receives the checkpoints written by the training cells.
model_output = '/content/drive/MyDrive/model_output/video_classification/'

# torch.save does not create missing parent directories, so make sure the
# checkpoint directory exists before any training cell writes into it.
os.makedirs(model_output, exist_ok=True)

Mounted at /content/drive


In [None]:
from glob import glob
import os
import pandas as pd
from PIL import Image
import torch
from torchvision import transforms as T

class FrameImageDataset(torch.utils.data.Dataset):
    """Dataset of single video frames, each labelled with the class of its video.

    Frames are discovered with the glob ``{root_dir}/frames/{split}/*/*/*.jpg``.
    The name of the directory that directly contains a frame is used as the
    video name, and the label is looked up in ``{root_dir}/metadata/{split}.csv``
    through its ``video_name`` and ``label`` columns.

    Args:
        root_dir: Dataset root directory.
        split: Split name, used in both the frame glob and the metadata file name.
        transform: Optional callable applied to every PIL image. When it is
            falsy, frames are converted with ``T.ToTensor()`` instead.
    """

    def __init__(self, root_dir='/work3/ppar/data/ucf101', split='train', transform=None):
        self.frame_paths = sorted(glob(f'{root_dir}/frames/{split}/*/*/*.jpg'))
        self.df = pd.read_csv(f'{root_dir}/metadata/{split}.csv')
        self.split = split
        self.transform = transform

    def __len__(self):
        """Return the number of frame images found for this split."""
        return len(self.frame_paths)

    def _get_meta(self, attr, value):
        """Return the metadata rows whose column ``attr`` equals ``value``."""
        return self.df.loc[self.df[attr] == value]

    def __getitem__(self, idx):
        """Return ``(frame, label)`` for the frame at position ``idx``.

        Raises:
            ValueError: If the metadata does not contain exactly one row for
                the video this frame belongs to.
        """
        frame_path = self.frame_paths[idx]
        # The parent directory of a frame is its video; os.path copes with the
        # platform's path separator, unlike splitting on '/'.
        video_name = os.path.basename(os.path.dirname(frame_path))
        video_meta = self._get_meta('video_name', video_name)
        if len(video_meta) != 1:
            # Series.item() would raise ValueError here as well, but without
            # saying which video is missing or duplicated.
            raise ValueError(
                f"Expected exactly one metadata row for video '{video_name}' "
                f"in split '{self.split}', found {len(video_meta)}"
            )
        label = video_meta['label'].item()

        # convert() returns a fully loaded copy, so the file can be closed
        # as soon as the with-block ends.
        with Image.open(frame_path) as image:
            frame = image.convert("RGB")

        if self.transform:
            frame = self.transform(frame)
        else:
            frame = T.ToTensor()(frame)

        return frame, label


class FrameVideoDataset(torch.utils.data.Dataset):
    """Dataset that yields the sampled frames of one video per item.

    Videos are discovered with the glob ``{root_dir}/videos/{split}/*/*.avi``.
    The frames of a video are read from
    ``{root_dir}/frames/{split}/<class dir>/<video name>/frame_<i>.jpg`` for
    ``i = 1 .. n_sampled_frames``, and the label is looked up in
    ``{root_dir}/metadata/{split}.csv`` (columns ``video_name`` and ``label``).

    Args:
        root_dir: Dataset root directory.
        split: Split name, used in the video/frame paths and the metadata file name.
        transform: Optional callable applied to every PIL frame. When it is
            falsy, frames are converted with ``T.ToTensor()`` instead.
        stack_frames: If True, the per-frame tensors are stacked into a single
            tensor; otherwise a list with one tensor per frame is returned.
        n_sampled_frames: Number of frame files loaded per video.
    """

    def __init__(self, root_dir = '/work3/ppar/data/ucf101', split = 'train', transform = None, stack_frames = True, n_sampled_frames = 10):
        self.root_dir = root_dir
        self.video_paths = sorted(glob(f'{root_dir}/videos/{split}/*/*.avi'))
        self.df = pd.read_csv(f'{root_dir}/metadata/{split}.csv')
        self.split = split
        self.transform = transform
        self.stack_frames = stack_frames

        self.video_names = self.df['video_name'].tolist()

        self.n_sampled_frames = n_sampled_frames

    def __len__(self):
        """Return the number of videos found for this split."""
        return len(self.video_paths)

    def _get_meta(self, attr, value):
        """Return the metadata rows whose column ``attr`` equals ``value``."""
        return self.df.loc[self.df[attr] == value]

    def _frames_dir_for(self, video_path):
        """Return the directory holding the extracted frames of ``video_path``.

        The path is rebuilt from ``root_dir`` and ``split`` rather than by
        replacing the text 'videos' in the video path: a plain string replace
        would also rewrite any 'videos' that appears inside ``root_dir``.
        """
        class_dir = os.path.basename(os.path.dirname(video_path))
        video_name = os.path.splitext(os.path.basename(video_path))[0]
        return os.path.join(self.root_dir, 'frames', self.split, class_dir, video_name)

    def __getitem__(self, idx):
        """Return ``(frames, label, video_name)`` for the video at position ``idx``.

        Raises:
            ValueError: If the metadata does not contain exactly one row for
                this video.
        """
        video_path = self.video_paths[idx]
        video_name = os.path.splitext(os.path.basename(video_path))[0]
        video_meta = self._get_meta('video_name', video_name)
        if len(video_meta) != 1:
            # Series.item() would raise ValueError here as well, but without
            # saying which video is missing or duplicated.
            raise ValueError(
                f"Expected exactly one metadata row for video '{video_name}' "
                f"in split '{self.split}', found {len(video_meta)}"
            )
        label = video_meta['label'].item()

        video_frames = self.load_frames(self._frames_dir_for(video_path))

        to_tensor = self.transform if self.transform else T.ToTensor()
        frames = [to_tensor(frame) for frame in video_frames]

        if self.stack_frames:
            # `frames` is a list with one tensor per frame; stack() adds a
            # leading time axis and permute() then swaps the first two axes.
            # Assuming each frame tensor is [C, H, W], the result is [C, T, H, W].
            frames = torch.stack(frames).permute(1, 0, 2, 3)

        return frames, label, video_name

    def load_frames(self, frames_dir):
        """Load ``frame_1.jpg`` .. ``frame_<n_sampled_frames>.jpg`` from ``frames_dir`` as RGB images."""
        frames = []
        for i in range(1, self.n_sampled_frames + 1):
            frame_file = os.path.join(frames_dir, f"frame_{i}.jpg")
            # convert() returns a fully loaded copy, so the file can be closed
            # as soon as the with-block ends.
            with Image.open(frame_file) as image:
                frames.append(image.convert("RGB"))

        return frames

In [None]:
if __name__ == '__main__':
    from torch.utils.data import DataLoader

    # Smoke test: build the three dataset flavours on the validation split,
    # pull a single batch from each and print the resulting shapes.
    transform = T.Compose([T.Resize((64, 64)),T.ToTensor()])

    frameimage_dataset = FrameImageDataset(root_dir=root_dir, split='val', transform=transform)
    framevideostack_dataset = FrameVideoDataset(root_dir=root_dir, split='val', transform=transform, stack_frames = True)
    framevideolist_dataset = FrameVideoDataset(root_dir=root_dir, split='val', transform=transform, stack_frames = False)

    frameimage_loader = DataLoader(frameimage_dataset,  batch_size=8, shuffle=False)
    framevideostack_loader = DataLoader(framevideostack_dataset,  batch_size=2, shuffle=False)
    framevideolist_loader = DataLoader(framevideolist_dataset,  batch_size=2, shuffle=False)

    # Individual frames: no temporal dimension is available in this view.
    print( "Pure frame"+ "---" * 45)
    frames, labels = next(iter(frameimage_loader))
    # frames: [batch, channels, height, width]
    print(frames.shape, labels.shape)

    # Unstacked videos: the batch is a list with one entry per sampled frame,
    # in temporal order.
    print( "List frames in a video" + "---" * 45)
    video_frames, labels, video_names = next(iter(framevideolist_loader))
    for frame in video_frames:
        # frame: [batch, channels, height, width]
        print(frame.shape, labels.shape, video_names)

    # Stacked videos: one tensor per batch holding every sampled frame.
    print( "Stack frames in a video" + "---" * 45)
    video_frames, labels, video_names = next(iter(framevideostack_loader))
    # video_frames: [batch, channels, number of frames, height, width]
    print(video_frames.shape, labels.shape, video_names)


Pure frame---------------------------------------------------------------------------------------------------------------------------------------
torch.Size([8, 3, 64, 64]) torch.Size([8])
List frames in a video---------------------------------------------------------------------------------------------------------------------------------------
torch.Size([2, 3, 64, 64]) torch.Size([2]) ('v_BodyWeightSquats_g03_c03', 'v_BodyWeightSquats_g05_c04')
torch.Size([2, 3, 64, 64]) torch.Size([2]) ('v_BodyWeightSquats_g03_c03', 'v_BodyWeightSquats_g05_c04')
torch.Size([2, 3, 64, 64]) torch.Size([2]) ('v_BodyWeightSquats_g03_c03', 'v_BodyWeightSquats_g05_c04')
torch.Size([2, 3, 64, 64]) torch.Size([2]) ('v_BodyWeightSquats_g03_c03', 'v_BodyWeightSquats_g05_c04')
torch.Size([2, 3, 64, 64]) torch.Size([2]) ('v_BodyWeightSquats_g03_c03', 'v_BodyWeightSquats_g05_c04')
torch.Size([2, 3, 64, 64]) torch.Size([2]) ('v_BodyWeightSquats_g03_c03', 'v_BodyWeightSquats_g05_c04')
torch.Size([2, 3, 64, 64]) to

## 1. Classification based on Per-frame models/late fusion/early fusion

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader
import torchvision.transforms as T
import torchvision.models as models
import cv2
import os
import random
from tqdm import tqdm

In [None]:
# Choose the compute device once, then report which one was picked.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print("The code will run on GPU.")
else:
    print("The code will run on CPU. Go to Edit->Notebook Settings and choose GPU as the hardware accelerator")

The code will run on GPU.


In [None]:
class FrameClassifier(nn.Module):
  """Image classifier that scores one frame at a time.

  A torchvision model named ``backbone_name`` is created with ``weights=None``
  and its ``fc`` layer is replaced by a new linear layer that outputs
  ``num_classes`` logits.
  """

  def __init__(self, num_classes=10, backbone_name="resnet18"):
    super().__init__()
    build_backbone = getattr(models, backbone_name)
    net = build_backbone(weights=None)
    # Swap the original head for one sized to this task; the input width is
    # taken from the layer being replaced.
    head_in = net.fc.in_features
    net.fc = nn.Linear(head_in, num_classes)
    self.backbone = net

  def forward(self, frames):
    """Return the backbone's class logits for a batch of frames."""
    logits = self.backbone(frames)
    return logits

In [None]:
from logging import log
def train(model, dataloader, optimizer, criterion, device):
    """Run one training pass over ``dataloader``.

    Each batch must unpack into ``(frames, labels)``. Returns a tuple
    ``(mean_loss, accuracy)`` where both values are averaged over the number
    of samples seen.
    """
    model.train()
    loss_sum = 0
    n_correct = 0
    n_seen = 0
    for frames, labels in tqdm(dataloader, desc="Training", leave=False):
        optimizer.zero_grad()
        frames = frames.to(device)
        labels = labels.to(device)

        outputs = model(frames)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_size = frames.size(0)
        # Weight the batch loss by the batch size so the final division gives
        # a per-sample average.
        loss_sum += loss.item() * batch_size
        n_correct += (outputs.argmax(dim=1) == labels).sum().item()
        n_seen += batch_size
    return loss_sum / n_seen, n_correct / n_seen


def evaluate(model, dataloader, device, desc="Evaluating"):
    """Video-level accuracy of a per-frame model, obtained by averaging frame logits.

    Each batch must unpack into ``(frames, labels, video_names)`` with
    ``frames`` being a 5-D tensor that is unpacked as (B, C, T, H, W).
    Returns the fraction of videos whose averaged logits select the right class.

    NOTE: a later cell of this notebook defines another ``evaluate`` with a
    different signature; once that cell has run it shadows this function.
    """
    model.eval()
    n_correct, n_videos = 0, 0
    with torch.no_grad():
        for clips, labels, _video_names in tqdm(dataloader, desc=desc, leave=False):
            clips, labels = clips.to(device), labels.to(device)
            batch, channels, n_frames, height, width = clips.shape

            # Fold the time axis into the batch axis so every frame is scored
            # in a single forward pass.
            per_frame = clips.permute(0, 2, 1, 3, 4).reshape(batch * n_frames, channels, height, width)
            # Restore the video axis: one row of frame logits per video.
            frame_logits = model(per_frame).view(batch, n_frames, -1)

            # Average the logits over the frames of each video, then pick the
            # best-scoring class.
            preds = frame_logits.mean(dim=1).argmax(dim=1)
            n_correct += (preds == labels).sum().item()
            n_videos += labels.size(0)
    return n_correct / n_videos

In [None]:
# Per-frame baseline: train on individual frames, then validate/test on whole
# videos by averaging the per-frame logits.
transform = T.Compose([
    T.Resize((64, 64)),
    T.ToTensor(),
    # Standard ImageNet channel mean/std.
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

train_dataset = FrameImageDataset(root_dir=root_dir, split='train', transform=transform)
val_dataset = FrameVideoDataset(root_dir=root_dir, split='val', transform=transform, stack_frames=True)
test_dataset = FrameVideoDataset(root_dir=root_dir, split='test', transform=transform, stack_frames=True)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=2)

model = FrameClassifier(num_classes=10).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

# Start below any reachable accuracy so the first epoch always writes a
# checkpoint; with 0 and a strict '>' a run that never beats 0 would leave
# nothing to load afterwards.
best_acc = float('-inf')
epoch_num = 30

# torch.save does not create missing parent directories.
os.makedirs(model_output, exist_ok=True)
# os.path.join yields the same path whether or not model_output ends with '/'.
output_path = os.path.join(model_output, 'best_model.pth')

for epoch in range(epoch_num):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
    val_acc = evaluate(model, val_loader, device, desc="Evaluating")
    print(f"Epoch {epoch+1}/{epoch_num}: Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}")

    # Keep only the weights of the best validation epoch seen so far.
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(model.state_dict(), output_path)

# Reload the best checkpoint into a fresh model for the final test run.
# map_location lets a checkpoint saved on GPU be loaded on a CPU-only runtime.
best_model = FrameClassifier(num_classes=10)
best_model.load_state_dict(torch.load(output_path, map_location=device))
best_model.to(device)

test_acc = evaluate(best_model, test_loader, device, desc="Testing")
print(f"Test Accuracy: {test_acc:.4f}")



Epoch 1/30: Train Loss: 0.6846, Train Acc: 0.8042, Val Acc: 0.8750




Epoch 2/30: Train Loss: 0.1052, Train Acc: 0.9743, Val Acc: 0.8667




Epoch 3/30: Train Loss: 0.0835, Train Acc: 0.9749, Val Acc: 0.8917




Epoch 4/30: Train Loss: 0.0588, Train Acc: 0.9858, Val Acc: 0.8500




Epoch 5/30: Train Loss: 0.0651, Train Acc: 0.9787, Val Acc: 0.8750




Epoch 6/30: Train Loss: 0.0444, Train Acc: 0.9870, Val Acc: 0.9000




Epoch 7/30: Train Loss: 0.0408, Train Acc: 0.9888, Val Acc: 0.8917




Epoch 8/30: Train Loss: 0.0297, Train Acc: 0.9918, Val Acc: 0.8583




Epoch 9/30: Train Loss: 0.0453, Train Acc: 0.9870, Val Acc: 0.8667




Epoch 10/30: Train Loss: 0.0302, Train Acc: 0.9908, Val Acc: 0.8833




Epoch 11/30: Train Loss: 0.0370, Train Acc: 0.9892, Val Acc: 0.8833




Epoch 12/30: Train Loss: 0.0344, Train Acc: 0.9906, Val Acc: 0.8750




Epoch 13/30: Train Loss: 0.0197, Train Acc: 0.9944, Val Acc: 0.8917




Epoch 14/30: Train Loss: 0.0046, Train Acc: 0.9996, Val Acc: 0.9083




Epoch 15/30: Train Loss: 0.0496, Train Acc: 0.9829, Val Acc: 0.8667




Epoch 16/30: Train Loss: 0.0233, Train Acc: 0.9938, Val Acc: 0.8917




Epoch 17/30: Train Loss: 0.0099, Train Acc: 0.9978, Val Acc: 0.8750




Epoch 18/30: Train Loss: 0.0314, Train Acc: 0.9910, Val Acc: 0.9083




Epoch 19/30: Train Loss: 0.0104, Train Acc: 0.9978, Val Acc: 0.9000




Epoch 20/30: Train Loss: 0.0060, Train Acc: 0.9980, Val Acc: 0.8667




Epoch 21/30: Train Loss: 0.0187, Train Acc: 0.9946, Val Acc: 0.8750




Epoch 22/30: Train Loss: 0.0282, Train Acc: 0.9906, Val Acc: 0.8750




Epoch 23/30: Train Loss: 0.0064, Train Acc: 0.9980, Val Acc: 0.8833




Epoch 24/30: Train Loss: 0.0150, Train Acc: 0.9968, Val Acc: 0.9083




Epoch 25/30: Train Loss: 0.0079, Train Acc: 0.9978, Val Acc: 0.9167




Epoch 26/30: Train Loss: 0.0375, Train Acc: 0.9890, Val Acc: 0.9000




Epoch 27/30: Train Loss: 0.0083, Train Acc: 0.9976, Val Acc: 0.9000




Epoch 28/30: Train Loss: 0.0031, Train Acc: 0.9996, Val Acc: 0.9000




Epoch 29/30: Train Loss: 0.0008, Train Acc: 1.0000, Val Acc: 0.9083




Epoch 30/30: Train Loss: 0.0311, Train Acc: 0.9902, Val Acc: 0.8833


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/model_output/video_classification/best_model.pth'

## Late & Early Fusion
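Late fusion passes each frame's backbone features through a shared MLP and averages them over time before the final classifier.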

In [None]:
class VideoClassifier(nn.Module):
  """Clip-level classifier built on a 2-D image backbone.

  The torchvision model named ``backbone_name`` is created with
  ``weights=None`` and its ``fc`` head is replaced by ``nn.Identity`` so it
  returns feature vectors. With ``fusion='late'`` every frame is encoded
  independently, passed through a shared two-layer MLP, averaged over time and
  classified by a linear layer.

  Args:
    num_classes: Number of output logits.
    backbone_name: Name of a ``torchvision.models`` constructor whose model
      exposes an ``fc`` layer.
    fusion: ``'late'`` or ``'early'``. Only ``'late'`` has a forward pass;
      ``'early'`` builds the classifier head but ``forward`` raises.
  """

  def __init__(self, num_classes=10, backbone_name="resnet18", fusion='late'):
    super().__init__()
    assert fusion in ['late', 'early'], "Fusion method should be 'late' or 'early'"
    self.fusion = fusion
    backbone = getattr(models, backbone_name)(weights=None)
    # Read the feature width before dropping the head, so backbones whose
    # features are not 512-wide work as well.
    feat_dim = backbone.fc.in_features
    backbone.fc = nn.Identity()
    self.backbone = backbone

    if self.fusion == 'late':
      # Shared MLP applied to every frame's feature vector.
      self.temporal_mlp = nn.Sequential(
          nn.Linear(feat_dim, feat_dim),
          nn.ReLU(inplace=True),
          nn.Linear(feat_dim, feat_dim),
          nn.ReLU(inplace=True),
      )
    self.classifier = nn.Linear(feat_dim, num_classes)

  def forward(self, x):
    """Return class logits for a batch of clips shaped (B, C, T, H, W).

    Raises:
      ValueError: If ``x`` is not 5-dimensional, or if the configured fusion
        method has no forward implementation (currently ``'early'``).
    """
    if x.dim() != 5:
      raise ValueError(f"Expected input dim=5(B,C,T,H,W), got:{x.dim()}")
    if self.fusion != 'late':
      # TODO: early fusion is accepted by the constructor but not implemented.
      raise ValueError(
          f"Fusion method '{self.fusion}' is not implemented in forward(); only 'late' is supported"
      )

    batch, channels, num_frames, height, width = x.shape
    # Fold time into the batch axis so the backbone sees ordinary images.
    frames_all = x.permute(0, 2, 1, 3, 4).reshape(batch * num_frames, channels, height, width)
    feats_all = self.backbone(frames_all)      # [B*T, D]
    fused_all = self.temporal_mlp(feats_all)   # [B*T, D]
    # Restore the time axis and average over it: one feature vector per clip.
    video_feat = fused_all.view(batch, num_frames, -1).mean(dim=1)
    return self.classifier(video_feat)

In [None]:
def train_one_epoch(model, dataloader, optimizer, criterion, device):
    """Run one training pass over ``dataloader``.

    Each batch must start with ``(frames, labels)``; any further elements are
    ignored. Returns ``(mean_loss, accuracy)``, both averaged over the number
    of samples seen.
    """
    model.train()
    loss_sum = 0
    n_correct = 0
    n_seen = 0
    for frames, labels, *_ in tqdm(dataloader, desc="Training", leave=False):
        optimizer.zero_grad()
        frames = frames.to(device)
        labels = labels.to(device)

        logits = model(frames)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        batch = frames.size(0)
        # Weight the batch loss by the batch size so the final division gives
        # a per-sample average.
        loss_sum += loss.item() * batch
        n_correct += (logits.argmax(dim=1) == labels).sum().item()
        n_seen += batch
    return loss_sum / n_seen, n_correct / n_seen


def evaluate(model, dataloader, criterion, device, desc="Evaluating"):
    """Compute the loss and accuracy of a clip-level model without tracking gradients.

    Each batch must start with ``(frames, labels)``; any further elements are
    ignored. Returns ``(mean_loss, accuracy)``, both averaged over the number
    of samples seen.

    NOTE: this redefines the ``evaluate`` declared in an earlier cell of this
    notebook, which takes no ``criterion`` and returns only the accuracy.
    """
    model.eval()
    loss_sum = 0
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for frames, labels, *_ in tqdm(dataloader, desc=desc, leave=False):
            frames = frames.to(device)
            labels = labels.to(device)

            logits = model(frames)
            loss = criterion(logits, labels)

            batch = frames.size(0)
            # Weight the batch loss by the batch size so the final division
            # gives a per-sample average.
            loss_sum += loss.item() * batch
            n_correct += (logits.argmax(dim=1) == labels).sum().item()
            n_seen += batch
    return loss_sum / n_seen, n_correct / n_seen

In [None]:
# Late-fusion model: train, validate and test on whole clips.
transform = T.Compose([
    T.Resize((64, 64)),
    T.ToTensor(),
    # Standard ImageNet channel mean/std.
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

train_dataset = FrameVideoDataset(root_dir=root_dir, split='train', transform=transform, stack_frames=True)
val_dataset = FrameVideoDataset(root_dir=root_dir, split='val', transform=transform, stack_frames=True)
test_dataset = FrameVideoDataset(root_dir=root_dir, split='test', transform=transform, stack_frames=True)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, num_workers=2)

model = VideoClassifier(num_classes=10, fusion='late').to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

# Start below any reachable accuracy so the first epoch always writes a
# checkpoint; with 0 and a strict '>' a run that never beats 0 would leave
# nothing to load afterwards.
best_acc = float('-inf')
epoch_num = 30

# torch.save does not create missing parent directories.
os.makedirs(model_output, exist_ok=True)
# os.path.join yields the same path whether or not model_output ends with '/'.
output_path = os.path.join(model_output, 'best_late_fusion_model.pth')

for epoch in range(epoch_num):
    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion, device)
    val_loss, val_acc = evaluate(model, val_loader, criterion, device, desc="Evaluating")
    print(f"Epoch {epoch+1}/{epoch_num}: Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}")

    # Keep only the weights of the best validation epoch seen so far.
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(model.state_dict(), output_path)

# Reload the best checkpoint into a fresh model for the final test run.
# map_location lets a checkpoint saved on GPU be loaded on a CPU-only runtime.
best_model = VideoClassifier(num_classes=10, fusion='late')
best_model.load_state_dict(torch.load(output_path, map_location=device))
best_model.to(device)

_, test_acc = evaluate(best_model, test_loader, criterion, device, desc="Testing")
print(f"Test Accuracy: {test_acc:.4f}")

Training:  59%|█████▊    | 37/63 [23:43<16:22, 37.79s/it]