In [1]:
# Run once
!pip install torch torchvision pytorchvideo opencv-python scikit-learn



## Imports

In [2]:
import torch
# Initialize model
model = torch.hub.load('facebookresearch/pytorchvideo', 'slowfast_r50', pretrained=True) #else some imports won't work for some reason
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import cv2
import numpy as np
import os
from torchvision.transforms import Compose, Resize, Normalize

from typing import Dict
import json
import urllib
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample,
    UniformCropVideo
)

Using cache found in C:\Users\kaiav/.cache\torch\hub\facebookresearch_pytorchvideo_main


## CSV/DF

In [3]:
# Import, clean up csv file, add clip paths
csv_filename = 'labels.csv'
df = pd.read_csv(csv_filename)

print( 'Unique labels: ', df['label'].unique() )                               # print label values (should only be 0, 1, 2)
print( 'Original shape: ', df.shape)                                           # print original shape
# print( 'Data preview:\n', df.head() )                                          # preview data
if df['clip_name'].duplicated().any():                                         # check for duplicates
    print("There are duplicate clip names.")
else:
    print("No duplicate clip names found.")
if not df['clip_name'].str.startswith("clip_").all():                          # validate all filenames start with 'clip_'
    print("Some filenames do not start with 'clip_':")
    print(df[~df['clip_name'].str.startswith("clip_")])                        # print invalid rows, if any
else:
    print("All filenames start with 'clip_'.")    
print()
df['label'] = df['label'].astype(str).str.strip().str.lower()                  # convert to string, strip whitespaces, convert to lowercase
df = df.drop(df[df['label'] == '2'].index)                                     # prune invalid/void pass/dribble scenarios
print( 'Unique labels: ', df['label'].unique() )                               # print label values (should only be 0/1)
print( 'New shape: ', df.shape )                                               # print pruned df shape

df['clip_path'] = df['clip_name'].apply(lambda x: os.path.join('raw_clips', x))# add column for full clip path
# print( df.head() )                                                             # preview dataframe

Unique labels:  [2 0 1]
Original shape:  (2395, 2)
No duplicate clip names found.
All filenames start with 'clip_'.

Unique labels:  ['0' '1']
New shape:  (1154, 2)


## Input Transform for SlowFast

In [4]:
# Code from: https://colab.research.google.com/github/pytorch/pytorch.github.io/blob/master/assets/hub/facebookresearch_pytorchvideo_slowfast.ipynb 
side_size = 256
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
crop_size = 256
num_frames = 32
slowfast_alpha = 4

class PackPathway(torch.nn.Module):
    """
    Transform for converting video frames as a list of tensors. 
    """
    def __init__(self):
        super().__init__()
        
    def forward(self, frames: torch.Tensor):
        fast_pathway = frames
        # Perform temporal sampling from the fast pathway.
        slow_pathway = torch.index_select(
            frames,
            1,
            torch.linspace(
                0, frames.shape[1] - 1, frames.shape[1] // slowfast_alpha
            ).long(),
        )
        frame_list = [slow_pathway, fast_pathway]
        return frame_list

transform =  ApplyTransformToKey(
    key="video",
    transform=Compose(
        [
            UniformTemporalSubsample(num_frames),
            Lambda(lambda x: x/255.0),
            NormalizeVideo(mean, std),
            ShortSideScale(
                size=side_size
            ),
            CenterCropVideo(crop_size),
            PackPathway()
        ]
    ),
)

## VideoDataSet 

In [5]:
# Create custom DataSet object
class VideoDataset(Dataset):
    def __init__(self, data_frame, transform=transform):
        """
        Args:
            data_frame (pd.DataFrame): DataFrame with columns ['clip_name', 'label', 'clip_path']
            transform (callable, optional): Transform to be applied to video frames
        Returns: frames and label for given clip
        """
        self.data_frame = data_frame
        self.transform = transform

    def __len__(self):
        return len(self.data_frame)

    def __getitem__(self, idx):
        # Get clip path and label
        clip_path = self.data_frame.iloc[idx]['clip_path']
        label = self.data_frame.iloc[idx]['label']
        label = int(label)

        # Initialize an EncodedVideo helper class and load the video
        video = EncodedVideo.from_path(clip_path)
        video_data = video.get_clip(0, 1) # 0 to 1 sec
        
        # Apply a transform to normalize the video input
        video_data = transform(video_data)
        
        # Move the frames to the desired device
        frames = video_data["video"]
        frames = [i.to(device)[None, ...] for i in frames]
        
        # print('frames: \n', frames)
        
        return frames, label

## DataLoaders

In [6]:
# Split data and set up DataLoaders

# Split data frames into train/test/validation
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)  # 70% train
test_df, val_df = train_test_split(temp_df, test_size=0.5, random_state=42)  # 15% test 15% validation

# Create Dataset objects
train_dataset = VideoDataset(train_df, transform=transform)
test_dataset = VideoDataset(test_df, transform=transform)
val_dataset = VideoDataset(val_df, transform=transform)

# Create DataLoader objects
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)

## Sinlge Clip Inference

In [7]:
# Model setup
# (model already initialized in imports)
device = "cpu"
model = model.eval()
model = model.to(device)

In [8]:
# Kinetics labels mapping
json_url = "https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json"
json_filename = "kinetics_classnames.json"
try: urllib.URLopener().retrieve(json_url, json_filename)
except: urllib.request.urlretrieve(json_url, json_filename)
with open(json_filename, "r") as f:
    kinetics_classnames = json.load(f)
kinetics_id_to_classname = {}
for k, v in kinetics_classnames.items():
    kinetics_id_to_classname[v] = str(k).replace('"', "")

In [14]:
# Model inference
train_iter = iter(train_loader)
frames, label = next(train_iter)
# print(f"Frames shape: {frames.shape}, Label: {label}")
print( frames[0].shape )

preds = model(frames)

# Get the predicted classes
post_act = torch.nn.Softmax(dim=1)
preds = post_act(preds)
pred_classes = preds.topk(k=5).indices[0]

# Map the predicted classes to the label names
pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]
print("Top 5 predicted labels: %s" % ", ".join(pred_class_names))

AssertionError: input for MultiPathWayWithFuse needs to be a list of tensors

In [18]:
print( frames[0].shape )

torch.Size([1, 1, 3, 8, 256, 256])


In [None]:
# EVALUATE BASELINE MODEL PERFORMANCE (test set)
# model.eval()

#freeze feature extraction layers / turn off requires_grad 
#replace final layer with a binary classification layer (single neuron for pass/dribble)
#train for 3 epochs
#test accuracy

In [None]:
# FINE-TUNE MODEL

In [None]:
# EVALUATE FINE-TUNED MODEL PERFORMANCE (test set)

In [None]:
# EVALUTE FINE-TUNED MODEL GENERALIZATION PERFORMANCE (validation set)

In [None]:
# Classify a single clip