In [None]:
# Mount Google Drive at /content/drive so the clips and the annotation CSV referenced
# by the path constants in the following cells are reachable from the Colab filesystem.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
pip install pytorchvideo

In [None]:
# Directory holding the clips (relative to the Colab working directory) and the label
# CSV that lives inside that same directory.
videos_path = "drive/MyDrive/clips/"
annotations_path = videos_path + "action_labels.csv"

In [None]:
import pandas as pd
import os

In [None]:
# Load the annotation table.
df = pd.read_csv(annotations_path)
print(df.head())

# Prefix every clip name with the clips directory so later cells can locate the files.
df["filename"] = df["filename"].apply(lambda x: os.path.join(videos_path, x))
fixed_csv_path = "/content/fixed_actions_labels_absolute.csv"
df.to_csv(fixed_csv_path, index=False)

# 80/20 split with a fixed seed; the test split is every row not sampled for training.
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(train_df.index)
assert len(train_df) + len(test_df) == len(df), "train/test split does not partition the data"

train_csv = "/content/train_annotations.csv"
test_csv = "/content/test_annotations.csv"
# index=False (as for the CSV above) keeps the DataFrame index from being written as a
# stray unnamed column that would be read back by the later cells.
train_df.to_csv(train_csv, index=False)
test_df.to_csv(test_csv, index=False)



In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
train_label_counts = train_df["action_label"].value_counts()
test_label_counts = test_df["action_label"].value_counts()

# value_counts() orders each split by its own frequencies, so the two series can list
# the labels in different orders. Put both on one shared label order (train order first,
# then any label that only occurs in the test split) so that overlaid bars and tick
# labels refer to the same class.
label_order = list(train_label_counts.index) + [
    label for label in test_label_counts.index if label not in train_label_counts.index
]
train_label_counts = train_label_counts.reindex(label_order, fill_value=0)
test_label_counts = test_label_counts.reindex(label_order, fill_value=0)

plt.figure(figsize=(12, 6))
sns.barplot(x=train_label_counts.index, y=train_label_counts.values, order=label_order,
            color="blue", alpha=0.7, label="Train")
sns.barplot(x=test_label_counts.index, y=test_label_counts.values, order=label_order,
            color="red", alpha=0.7, label="Test")
plt.xticks(rotation=45, ha="right")
plt.ylabel("Count")
plt.title("Label Distribution in Train and Test Splits")
plt.legend()
plt.show()


I tested balancing the dataset by limiting the number of samples in each set. The results on the validation set were worse and overfitting set in almost immediately, mainly because there was much less data to train on, so I kept the plain 80/20 train/test split.

In [None]:
pip install torchvision

## NOTE ABOUT PYTORCHVIDEO TRANSFORMS: pytorchvideo's `augmentations.py` needs its torchvision import changed from `functional_tensor` to `functional`

In [None]:
#This code was obtained from https://pytorch.org/hub/facebookresearch_pytorchvideo_slowfast/ , specifically the packpathway
import torch
from pytorchvideo.transforms import ApplyTransformToKey, UniformTemporalSubsample, RandomShortSideScale, \
    ShortSideScale, Normalize
from torch import nn
from torchvision.transforms import Compose, Lambda, RandomCrop, RandomHorizontalFlip, CenterCrop

# Spatial settings used by the transforms defined below in this cell:
# side_size is the short-side target for ShortSideScale and the lower bound for
# RandomShortSideScale; max_size is the upper bound for RandomShortSideScale.
side_size = 256
max_size = 320
# Per-channel statistics passed to Normalize (applied after pixel values are divided by 255).
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
# Side length handed to RandomCrop / CenterCrop.
crop_size = 256
# Number of frames kept by UniformTemporalSubsample.
num_frames = 32
# sampling_rate and frames_per_second only feed clip_duration below.
sampling_rate = 2
frames_per_second = 30
# (32 * 2) / 30 ≈ 2.13. NOTE(review): clip_duration is not referenced anywhere else in
# the visible notebook — the clip samplers are built with a literal 2.0; confirm intent.
clip_duration = (num_frames * sampling_rate) / frames_per_second


class PackPathway(nn.Module):
    """
    Split a frame tensor into the ``[slow, fast]`` pathway pair.

    The fast pathway is the input tensor itself. The slow pathway keeps
    ``frames.shape[1] // alpha`` positions along dimension 1, spread evenly
    between the first and the last index.
    """

    def __init__(self, alpha=4):
        super().__init__()
        # Ratio between the fast-pathway and slow-pathway lengths along dimension 1.
        self.alpha = alpha

    def forward(self, frames):
        length = frames.shape[1]
        # Evenly spaced positions, truncated to integer indices.
        keep = torch.linspace(0, length - 1, length // self.alpha).long()
        slow = frames.index_select(1, keep)
        return [slow, frames]


# Training pipeline for the "video" entry of each sample: temporal subsampling, scaling
# to [0, 1], normalisation, random scale / crop / flip augmentation, then pathway packing.
train_transform = ApplyTransformToKey(
    key="video",
    transform=Compose([
        UniformTemporalSubsample(num_frames),
        Lambda(lambda x: x / 255.0),
        Normalize(mean, std),
        RandomShortSideScale(min_size=side_size, max_size=max_size),
        RandomCrop(crop_size),
        RandomHorizontalFlip(),
        PackPathway(),
    ]),
)

# Evaluation pipeline: same preprocessing, with a fixed scale and a centre crop in place
# of the random augmentations.
test_transform = ApplyTransformToKey(
    key="video",
    transform=Compose([
        UniformTemporalSubsample(num_frames),
        Lambda(lambda x: x / 255.0),
        Normalize(mean, std),
        ShortSideScale(size=side_size),
        CenterCrop(crop_size),
        PackPathway(),
    ]),
)

## Train Split


In [None]:
# Re-read the training split and rewrite every clip path so that it uses forward
# slashes and is joined under /content, then store the result as a new CSV.
csv_path = "/content/train_annotations.csv"
df = pd.read_csv(csv_path)

df["filename"] = [
    os.path.join("/content", name.replace("\\", "/")).replace("\\", "/")
    for name in df["filename"]
]

fixed_csv_path = "/content/fixed_actions_labels_absolute_train1.csv"
df.to_csv(fixed_csv_path, index=False)


In [None]:
# Re-read the test split and rewrite every clip path so that it uses forward
# slashes and is joined under /content, then store the result as a new CSV.
csv_path_test = '/content/test_annotations.csv'
df = pd.read_csv(csv_path_test)

df["filename"] = [
    os.path.join("/content", name.replace("\\", "/")).replace("\\", "/")
    for name in df["filename"]
]

fixed_csv_path_test = "/content/fixed_actions_labels_absolute_test.csv"
df.to_csv(fixed_csv_path_test, index=False)


In [None]:
df = pd.read_csv(fixed_csv_path)

# Build the (path, label-dict) pairs for the training split, with forward-slash paths.
labeled_video_paths_train = [
    (path.replace('\\', '/'), {"action_number": number})
    for path, number in zip(df['filename'], df['action_number'])
]

print(labeled_video_paths_train[:10])


In [None]:
df = pd.read_csv(fixed_csv_path_test)

# Build the (path, label-dict) pairs for the test split, with forward-slash paths.
labeled_video_paths_test = [
    (path.replace('\\', '/'), {"action_number": number})
    for path, number in zip(df['filename'], df['action_number'])
]

print(labeled_video_paths_test[:10])


## Load dataset

In [None]:
# Report each split under its own label (both lines previously printed the same text).
print(f"Total videos in train split: {len(labeled_video_paths_train)}")
print(f"Total videos in test split: {len(labeled_video_paths_test)}")

In [None]:
from torch.utils.data import DataLoader
from pytorchvideo.data import make_clip_sampler, LabeledVideoDataset
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    UniformTemporalSubsample,
)
#dataset load
# Training dataset: clips are drawn with the "random" clip sampler (second argument 2.0)
# and passed through train_transform; audio is not decoded.
train_data = LabeledVideoDataset(
  labeled_video_paths=labeled_video_paths_train,
  clip_sampler=make_clip_sampler("random", 2.0),
  transform=train_transform,
  decode_audio=False
)
# Test dataset: same construction with the test paths and test_transform.
# NOTE(review): this also uses the "random" clip sampler, so evaluation presumably sees
# different clips on every pass — confirm whether a deterministic sampler is wanted.
test_data = LabeledVideoDataset(
  labeled_video_paths=labeled_video_paths_test,
  clip_sampler=make_clip_sampler("random", 2.0),
  transform=test_transform,
  decode_audio=False
)

#get the length of an iterable dataset
class LabeledVideoDatasetWrapper(torch.utils.data.IterableDataset):
    """Iterable dataset that delegates iteration to ``dataset`` and reports a fixed length.

    ``length_estimate`` is returned verbatim by ``len()``; it is supplied by the caller
    and is not checked against what ``dataset`` actually yields.
    """

    def __init__(self, dataset, length_estimate):
        self.length_estimate = length_estimate
        self.dataset = dataset

    def __iter__(self):
        # Hand out a fresh iterator over the wrapped dataset on every call.
        return iter(self.dataset)

    def __len__(self):
        return self.length_estimate

# Take the reported lengths from the path lists instead of hard-coding them (previously
# 4915 / 1229), so the notebook stays correct when the dataset or the split changes.
# NOTE(review): this assumes one clip per video per pass, as the original constants did.
dataset_length = len(labeled_video_paths_train)
wrapped_train_data = LabeledVideoDatasetWrapper(train_data, dataset_length)
dataset_test_length = len(labeled_video_paths_test)
wrapped_test_data = LabeledVideoDatasetWrapper(test_data, dataset_test_length)


#dataloading, using A100 16 batch size works, 8 for T4.
train_loader = DataLoader(wrapped_train_data, batch_size=16, num_workers=4, persistent_workers=True)
test_loader = DataLoader(wrapped_test_data, batch_size=16, num_workers=4, persistent_workers=True)


## Load SlowFast

In [None]:
#load slowfast model
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model_name = "slowfast_r50"
# Fetch the pretrained network from the pytorchvideo hub (pretrained on Kinetics-400;
# it is fine-tuned on multisports-football further down).
model = torch.hub.load("facebookresearch/pytorchvideo", model=model_name, pretrained=True)
model = model.to(device)
model = model.eval()


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "slowfast_r50"

checkpoint_path = "/content/drive/MyDrive/trained_model_epoch_new_bs_16_lr_0.0001_new1.pth"
# map_location lets a checkpoint written on a GPU runtime load on a CPU-only one.
state_dict = torch.load(checkpoint_path, map_location=device)

# The checkpoints are written after the classification head has been replaced, whereas
# the freshly loaded hub model still has its pretrained head at this point. Resize the
# head to the checkpoint's shape first; otherwise load_state_dict fails on a size mismatch.
head = model.blocks[-1].proj
head_weight_key = f"blocks.{len(model.blocks) - 1}.proj.weight"
if head_weight_key in state_dict:
    checkpoint_classes = state_dict[head_weight_key].shape[0]
    if checkpoint_classes != head.out_features:
        model.blocks[-1].proj = torch.nn.Linear(head.in_features, checkpoint_classes)

model.load_state_dict(state_dict)
# A replaced head is created on the CPU, so move the whole model to the target device.
model = model.to(device)

In [None]:
# Collect every dropout layer, print its current probability, then set it to 0.5
# (can be changed if needed to counter overfitting).
dropout_layers = [layer for layer in model.modules() if isinstance(layer, torch.nn.Dropout)]
for layer in dropout_layers:
  print(layer.p)
  layer.p = 0.5

In [None]:
# Count the classes from the labels of BOTH splits. Reading them from `df` depended on
# whichever frame that name was last bound to (the test split), which can under-count
# when a class is missing from that split.
all_action_numbers = {
    meta["action_number"]
    for _, meta in labeled_video_paths_train + labeled_video_paths_test
}
num_classes = len(all_action_numbers)

# Final layer changed to `num_classes` outputs (15 for this dataset). It is only
# replaced when its size differs, so re-running this cell, or running it after a
# checkpoint has been loaded, does not throw away the existing head weights.
head = model.blocks[-1].proj
if head.out_features != num_classes:
    model.blocks[-1].proj = nn.Linear(head.in_features, num_classes)
model = model.to(device)
print(num_classes)


Both Adam and SGD were tried: Adam began overfitting after ~3-4 epochs, whereas SGD is more stable but takes longer to train and usually ends up giving better results.

In [None]:
#lr = 0.0001, wd= 0.0001, bs = 8, epochs = 3, val acc = 0.539
#lr = 0.0001, wd= 0.0001, bs = 16, epochs = 3, val acc = 0.545

In [None]:
from torch.optim import Adam,SGD
from torch.nn import CrossEntropyLoss

# Cross-entropy loss, and Adam over all model parameters with lr = weight decay = 1e-4.
loss_fn = CrossEntropyLoss()
optimiser = Adam(params=model.parameters(), lr=1e-4, weight_decay=1e-4)



In [None]:
from tqdm import tqdm

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.optim.lr_scheduler import StepLR

def pad_videos(videos, device):
  """Zero-pad every tensor in `videos` to a common length, stack them and move to `device`.

  The target length is the largest `shape[2]` among the inputs; the padding is appended
  at the end of the third-from-last dimension.
  NOTE(review): those two axes only coincide for 5-D tensors — confirm the expected
  input rank before relying on this helper (it is not called in the visible notebook).
  """
  target = max(clip.shape[2] for clip in videos)
  padded = []
  for clip in videos:
    missing = target - clip.shape[2]
    padded.append(torch.nn.functional.pad(clip, (0, 0, 0, 0, 0, missing)))
  return torch.stack(padded).to(device)

#minimum label value; subtracted from the raw labels so class indices start at 0
LABEL_OFFSET = 33

def train(model, dataloader, optimiser, loss_fn, device):
  """Run one training epoch and return ``(mean loss per sample, accuracy)``.

  Args:
    model: network called with the list of tensors found under ``batch["video"]``.
    dataloader: iterable of batches, each a dict with ``"video"`` and ``"action_number"``.
    optimiser: optimiser that is stepped once per batch.
    loss_fn: loss taking ``(logits, labels)``; assumed to return the batch mean, as the
      default ``CrossEntropyLoss()`` does.
    device: device the inputs and labels are moved to.

  Both results are normalised by the number of samples actually seen. A DataLoader over
  an iterable dataset with several workers can yield a different number of batches than
  ``len(dataloader)`` and a different number of samples than the dataset's estimated
  length, so neither of those is a reliable denominator.
  """
  model.train()
  total_loss, total_correct, total_samples = 0.0, 0, 0

  for batch in tqdm(dataloader, desc="Training"):
    videos = [v.to(device) for v in batch["video"]]
    labels = (batch["action_number"].to(device) - LABEL_OFFSET).long()
    batch_size = labels.shape[0]

    optimiser.zero_grad()
    preds = model(videos)
    loss = loss_fn(preds, labels)
    loss.backward()
    optimiser.step()

    # Weight the batch-mean loss by the batch size so partial batches count correctly.
    total_loss += loss.item() * batch_size
    total_correct += (preds.argmax(dim=-1) == labels).sum().item()
    total_samples += batch_size

  if total_samples == 0:
    # Nothing was yielded; report zeros rather than dividing by zero.
    return 0.0, 0.0
  return total_loss / total_samples, total_correct / total_samples

def evaluate(model, dataloader, loss_fn, device):
  """Run one evaluation pass (no gradients) and return ``(mean loss per sample, accuracy)``.

  Args:
    model: network called with the list of tensors found under ``batch["video"]``.
    dataloader: iterable of batches, each a dict with ``"video"`` and ``"action_number"``.
    loss_fn: loss taking ``(logits, labels)``; assumed to return the batch mean, as the
      default ``CrossEntropyLoss()`` does.
    device: device the inputs and labels are moved to.

  Both results are normalised by the number of samples actually seen. The recorded runs
  show the evaluation loader yielding 80 batches where ``len(dataloader)`` is 77, so
  dividing the summed loss by ``len(dataloader)`` overstated the loss.
  """
  model.eval()
  total_loss, total_correct, total_samples = 0.0, 0, 0
  with torch.no_grad():
    for batch in tqdm(dataloader, desc="Evaluating"):
      videos = [v.to(device) for v in batch["video"]]
      labels = (batch["action_number"].to(device) - LABEL_OFFSET).long()
      batch_size = labels.shape[0]

      preds = model(videos)
      loss = loss_fn(preds, labels)

      # Weight the batch-mean loss by the batch size so partial batches count correctly.
      total_loss += loss.item() * batch_size
      total_correct += (preds.argmax(dim=-1) == labels).sum().item()
      total_samples += batch_size

  if total_samples == 0:
    # Nothing was yielded; report zeros rather than dividing by zero.
    return 0.0, 0.0
  return total_loss / total_samples, total_correct / total_samples

#train for a fixed number of epochs (8 here), validating and checkpointing after each
epochs = 8
for epoch in range(1, epochs + 1):
  #one pass over the training data
  loss, acc = train(model, train_loader, optimiser, loss_fn, device)
  print("Epoch: ", epoch,"Loss: ", loss,"Accuracy: ", acc)

  #one pass over the held-out data
  val_loss, val_acc = evaluate(model, test_loader, loss_fn, device)
  print("Epoch: ", epoch,"Validation Loss: ", val_loss,"Validation Accuracy: ", val_acc)

  #write a separate checkpoint for every epoch, with the epoch number in the file name
  model_save_path = f"/content/drive/MyDrive/trained_model_epoch_new_bs_16_lr_0.0001_new{epoch}.pth"
  torch.save(model.state_dict(), model_save_path)
  print("Model Saved")

print("Training Done")


In [None]:
# Training: 100%|██████████| 308/308 [32:35<00:00,  6.35s/it]

# Epoch:  1 Loss:  1.8155140389095654 Accuracy:  0.37029501525940994

# Evaluating: 80it [08:06,  6.08s/it]

# Epoch:  1 Validation Loss:  1.615652519387084 Validation Accuracy:  0.4646053702196908

# Training: 100%|██████████| 308/308 [25:56<00:00,  5.05s/it]

# Epoch:  1 Loss:  1.5011588400834566 Accuracy:  0.4799593082400814

# Evaluating: 80it [06:26,  4.83s/it]

# Epoch:  1 Validation Loss:  1.3868585677890035 Validation Accuracy:  0.5207485760781123
# Model Saved

# Training: 100%|██████████| 308/308 [25:58<00:00,  5.06s/it]

# Epoch:  2 Loss:  1.3306402080244832 Accuracy:  0.5340793489318413

# Evaluating: 80it [06:32,  4.91s/it]

# Epoch:  2 Validation Loss:  1.409806552645448 Validation Accuracy:  0.5305126118795769
# Model Saved

# Training: 100%|██████████| 308/308 [26:10<00:00,  5.10s/it]

# Epoch:  3 Loss:  1.2170358844004667 Accuracy:  0.5623601220752797

# Evaluating: 80it [06:31,  4.89s/it]

# Epoch:  3 Validation Loss:  1.3789514612841915 Validation Accuracy:  0.5443449959316518
# Model Saved

# Training: 100%|██████████| 308/308 [26:05<00:00,  5.08s/it]

# Epoch:  4 Loss:  1.1126136767206254 Accuracy:  0.5973550356052899

# Evaluating: 80it [06:31,  4.90s/it]

# Epoch:  4 Validation Loss:  1.3977933146736838 Validation Accuracy:  0.5305126118795769
# Model Saved

# Training: 100%|██████████| 308/308 [26:18<00:00,  5.13s/it]

# Epoch:  5 Loss:  1.0285702541277006 Accuracy:  0.6313326551373347

# Evaluating: 80it [06:31,  4.89s/it]

# Epoch:  5 Validation Loss:  1.5538488897410305 Validation Accuracy:  0.5036615134255492
# Model Saved

# Training: 100%|██████████| 308/308 [26:08<00:00,  5.09s/it]

# Epoch:  6 Loss:  0.9428888381301582 Accuracy:  0.6573753814852492

# Evaluating: 80it [06:30,  4.88s/it]

# Epoch:  6 Validation Loss:  1.55042182238071 Validation Accuracy:  0.5264442636289667
# Model Saved

# Training: 100%|██████████| 308/308 [26:05<00:00,  5.08s/it]

# Epoch:  7 Loss:  0.8717771956285874 Accuracy:  0.6826042726347915

# Evaluating: 80it [06:25,  4.82s/it]

# Epoch:  7 Validation Loss:  1.5754852310403602 Validation Accuracy:  0.5191212367778681
# Model Saved

# Training: 100%|██████████| 308/308 [26:06<00:00,  5.08s/it]

# Epoch:  8 Loss:  0.8049682320712449 Accuracy:  0.7133265513733469

# Evaluating: 80it [06:31,  4.89s/it]

# Epoch:  8 Validation Loss:  1.6961428283096909 Validation Accuracy:  0.48494711147274205
# Model Saved
# Training Done


In [None]:
# Training: 100%|██████████| 308/308 [35:21<00:00,  6.89s/it]

# Epoch:  1 Loss:  1.9765217513233035 Accuracy:  0.37741607324516785

# Evaluating: 80it [07:04,  5.31s/it]

# Epoch:  1 Validation Loss:  1.777498489076441 Validation Accuracy:  0.40602115541090317
# Model Saved

# Training: 100%|██████████| 308/308 [28:11<00:00,  5.49s/it]

# Epoch:  2 Loss:  1.678143762535863 Accuracy:  0.4233977619532045

# Evaluating: 80it [07:06,  5.33s/it]

# Epoch:  2 Validation Loss:  1.7982588526490446 Validation Accuracy:  0.4157851912123678
# Model Saved

# Training: 100%|██████████| 308/308 [28:08<00:00,  5.48s/it]

# Epoch:  3 Loss:  1.6139012399044903 Accuracy:  0.4516785350966429

# Evaluating: 80it [07:09,  5.37s/it]

# Epoch:  3 Validation Loss:  4.347201814899197 Validation Accuracy:  0.33848657445077296
# Model Saved

# Training: 100%|██████████| 308/308 [28:01<00:00,  5.46s/it]

# Epoch:  4 Loss:  1.5645483021999333 Accuracy:  0.46897253306205494

# Evaluating: 80it [07:07,  5.35s/it]

# Epoch:  4 Validation Loss:  1.6486872976476497 Validation Accuracy:  0.4499593165174939
# Model Saved
# Training Done


In [None]:
# Training: 100%|██████████| 308/308 [27:58<00:00,  5.45s/it]

# Epoch:  1 Loss:  1.782033293858751 Accuracy:  0.392675483214649

# Evaluating: 80it [06:59,  5.24s/it]

# Epoch:  1 Validation Loss:  1.557073233189521 Validation Accuracy:  0.47843775427176566
# Model Saved

# Training: 100%|██████████| 308/308 [28:11<00:00,  5.49s/it]

# Epoch:  2 Loss:  1.4655607015668572 Accuracy:  0.488911495422177

# Evaluating: 80it [07:04,  5.30s/it]

# Epoch:  2 Validation Loss:  1.3902774453163147 Validation Accuracy:  0.5451586655817738
# Model Saved

# Training: 100%|██████████| 308/308 [28:10<00:00,  5.49s/it]

# Epoch:  3 Loss:  1.322447316987174 Accuracy:  0.5310274669379451

# Evaluating: 80it [07:10,  5.38s/it]

# Epoch:  3 Validation Loss:  1.3320982061423265 Validation Accuracy:  0.5573637103336045
# Model Saved

# Training: 100%|██████████| 308/308 [28:25<00:00,  5.54s/it]

# Epoch:  4 Loss:  1.1938220848897836 Accuracy:  0.5778229908443541

# Evaluating: 80it [07:04,  5.30s/it]

# Epoch:  4 Validation Loss:  1.3893521422302568 Validation Accuracy:  0.5378356387306753
# Model Saved
# Training Done


In [None]:
# Training: 616it [34:49,  3.39s/it]

# Epoch:  1 Loss:  1.7977875458515757 Accuracy:  0.3873855544252289

# Evaluating: 156it [08:32,  3.28s/it]

# Epoch:  1 Validation Loss:  1.5691923293974492 Validation Accuracy:  0.4882017900732303
# Model Saved

# Training: 616it [28:26,  2.77s/it]

# Epoch:  2 Loss:  1.5245900104685528 Accuracy:  0.47466937945066123

# Evaluating: 156it [07:00,  2.70s/it]

# Epoch:  2 Validation Loss:  1.4821216936235304 Validation Accuracy:  0.5117982099267697
# Model Saved

# Training: 616it [28:40,  2.79s/it]

# Epoch:  3 Loss:  1.376072660306605 Accuracy:  0.5143438453713123

# Evaluating: 156it [06:56,  2.67s/it]

# Epoch:  3 Validation Loss:  1.3971749685414425 Validation Accuracy:  0.5386493083807974
# Model Saved

# Training: 616it [28:33,  2.78s/it]

# Epoch:  4 Loss:  1.2782971857039909 Accuracy:  0.5481180061037639

# Evaluating: 156it [06:59,  2.69s/it]

# Epoch:  4 Validation Loss:  1.441618792422406 Validation Accuracy:  0.5394629780309195
# Model Saved


In [None]:
# Load the epoch-5 checkpoint for the final evaluation. map_location lets a checkpoint
# written on a GPU runtime load when `device` has fallen back to the CPU.
model.load_state_dict(
    torch.load("/content/drive/MyDrive/trained_model_epoch_new_bs_16_lr_0.0001_new5.pth",
               map_location=device)
)
model.to(device)
model.eval()

In [None]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
import numpy as np
from tqdm import tqdm
LABEL_OFFSET = 33
def evaluate_with_metrics(model, dataloader, loss_fn, device, num_classes):
  """Evaluate `model` and report loss, top-k accuracies and macro classification metrics.

  Args:
    model: network called with the list of tensors found under ``batch["video"]``.
    dataloader: iterable of batches, each a dict with ``"video"`` and ``"action_number"``.
    loss_fn: loss taking ``(logits, labels)``; assumed to return the batch mean.
    device: device the inputs and labels are moved to.
    num_classes: number of classes; fixes the rows/columns of the confusion matrix.

  Returns:
    ``(avg_loss, top1_acc, top3_acc, top5_acc, conf_matrix, precision, recall, f1)``.

  Loss and accuracies are normalised by the number of samples actually seen, because a
  DataLoader over an iterable dataset with several workers can yield a different number
  of batches/samples than ``len(dataloader)`` / ``len(dataloader.dataset)`` report.
  """
  model.eval()
  total_loss = 0.0
  total_samples = 0
  top1_correct = 0
  top3_correct = 0
  top5_correct = 0

  all_preds = []
  all_labels = []

  with torch.no_grad():
    for batch in tqdm(dataloader, desc="evaluating with metrics"):
      videos = [v.to(device) for v in batch["video"]]
      labels = (batch["action_number"].to(device) - LABEL_OFFSET).long()
      batch_size = labels.shape[0]

      preds = model(videos)
      loss = loss_fn(preds, labels)
      # Weight the batch-mean loss by the batch size so partial batches count correctly.
      total_loss += loss.item() * batch_size
      total_samples += batch_size

      # topk() raises when k exceeds the number of logits, so cap k for small heads.
      k = min(5, preds.shape[1])
      topk_preds = preds.topk(k, dim=1).indices
      labels_expanded = labels.view(-1, 1)

      top1_correct += (topk_preds[:, :1] == labels_expanded).sum().item()
      top3_correct += (topk_preds[:, :3] == labels_expanded).any(dim=1).sum().item()
      top5_correct += (topk_preds == labels_expanded).any(dim=1).sum().item()

      all_preds.extend(preds.argmax(dim=1).cpu().numpy())
      all_labels.extend(labels.cpu().numpy())

  # Guard the denominators so an empty loader gives zeros instead of a division error.
  denominator = max(total_samples, 1)
  avg_loss = total_loss / denominator

  top1_acc = top1_correct / denominator
  top3_acc = top3_correct / denominator
  top5_acc = top5_correct / denominator

  conf_matrix = confusion_matrix(all_labels, all_preds, labels=list(range(num_classes)))
  precision = precision_score(all_labels, all_preds, average='macro', zero_division=0)
  recall = recall_score(all_labels, all_preds, average='macro', zero_division=0)
  f1 = f1_score(all_labels, all_preds, average='macro', zero_division=0)

  #print metrics
  print("Validation loss:", avg_loss)
  print("top-1 accuracy:", top1_acc)
  print("top-3 accuracy:", top3_acc)
  print("top-5 accuracy:", top5_acc)
  print("Precision (macro):", precision)
  print("Recall (macro):", recall)
  print("F1 score (macro):", f1)

  return avg_loss, top1_acc, top3_acc, top5_acc, conf_matrix, precision, recall, f1

In [None]:
# Only the confusion matrix is kept. Pass the class count computed for the model head
# instead of a hard-coded 15, so the matrix always matches the number of outputs.
_, _, _, _, conf_matrix, _, _, _ = evaluate_with_metrics(model, test_loader, loss_fn, device, num_classes)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Class index (raw label minus the label offset) -> readable action name.
football_action_labels = {
    0: "football shoot",
    1: "football long pass",
    2: "football short pass",
    3: "football through pass",
    4: "football cross",
    5: "football dribble",
    6: "football trap",
    7: "football throw",
    8: "football diving",
    9: "football tackle",
    10: "football steal",
    11: "football clearance",
    12: "football block",
    13: "football press",
    14: "football aerial duels"
}

action_labels = football_action_labels
class_names = [action_labels[i] for i in range(len(action_labels))]

# Row-normalise so each row shows the distribution of predictions for one true class.
# Rows without any sample are divided by 1 instead of 0: they stay all-zero (the value
# nan_to_num used to produce) without triggering a divide-by-zero RuntimeWarning.
row_totals = conf_matrix.sum(axis=1, keepdims=True)
conf_matrix_n = conf_matrix.astype('float') / np.maximum(row_totals, 1)

plt.figure(figsize=(12, 10))
sns.heatmap(conf_matrix_n, annot=True, fmt='.2f', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Normalised confusion matrix")
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()

#save the plot (must happen before plt.show(), while the figure is still open)
plt.savefig("confusion_matrix_normalized.png", dpi=300)
plt.show()

In [None]:
# Once the previous cell has shown its figure there is normally no open figure left, and
# plt.savefig() would then create an empty one and overwrite the saved PNG with a blank
# image. Only save when a figure is actually open.
if plt.get_fignums():
    plt.savefig("confusion_matrix_normalized.png", dpi=300)
    plt.show()
else:
    print("No open figure to save; confusion_matrix_normalized.png was left untouched.")