In [None]:
!pip install -qU pytorchvideo transformers evaluate torchvision==0.16.0

# Video Classification

**Video classification** is the task of assigning a label or class to an entire video. Each video is expected to belong to exactly one class.

Video classification models take a video as input and return a prediction about which class the video belongs to. One application is action/activity recognition, for example in fitness applications.

We will fine-tune `VideoMAE` on a subset of the `UCF101` dataset.

## Load UCF101 dataset

In [2]:
from huggingface_hub import hf_hub_download
import tarfile

# Download the UCF101 subset archive from the Hugging Face Hub (dataset repo).
hf_dataset_identifier = 'sayakpaul/ucf101-subset'
filename = 'UCF101_subset.tar.gz'
file_path = hf_hub_download(
    repo_id=hf_dataset_identifier,
    filename=filename,
    repo_type='dataset'
)

# Extract into the current directory. The archive comes from the network, so use
# the 'data' extraction filter (rejects absolute paths, path traversal and special
# files) whenever this Python version provides it.
with tarfile.open(file_path) as archive:
    if hasattr(tarfile, 'data_filter'):
        archive.extractall('.', filter='data')
    else:
        archive.extractall('.')

In [3]:
import pathlib

# Root folder created by extracting the archive, kept as a Path for globbing.
dataset_root_path = pathlib.Path('UCF101_subset')

In [4]:
# Count the .avi files of each split (layout: <split>/<class>/<video>.avi).
video_count_train, video_count_val, video_count_test = (
    len(list(dataset_root_path.glob(f'{split}/*/*.avi')))
    for split in ('train', 'val', 'test')
)
video_total = sum((video_count_train, video_count_val, video_count_test))
print(f'Train: {video_count_train}, Val: {video_count_val}, Test: {video_count_test}, Total: {video_total}')

Train: 300, Val: 30, Test: 75, Total: 405


In [5]:
# Every video path, gathered split by split in the order train, val, test.
all_video_file_paths = [
    video_path
    for split in ('train', 'val', 'test')
    for video_path in dataset_root_path.glob(f'{split}/*/*.avi')
]
all_video_file_paths[:5]

[PosixPath('UCF101_subset/train/BabyCrawling/v_BabyCrawling_g11_c01.avi'),
 PosixPath('UCF101_subset/train/BabyCrawling/v_BabyCrawling_g23_c01.avi'),
 PosixPath('UCF101_subset/train/BabyCrawling/v_BabyCrawling_g02_c01.avi'),
 PosixPath('UCF101_subset/train/BabyCrawling/v_BabyCrawling_g20_c05.avi'),
 PosixPath('UCF101_subset/train/BabyCrawling/v_BabyCrawling_g20_c07.avi')]

In the video file names, the `g` field (e.g. `g11`) denotes the group/scene that a video clip belongs to.

For the validation and evaluation splits, we do not want video clips from a group/scene that also appears in the training split, because that would cause data leakage.

We also need to create two dictionaries:
* `label2id` maps the class names to integers
* `id2label` maps the integers to class names

In [6]:
# The class name is the directory that directly contains each video
# (<root>/<split>/<class>/<video>.avi). Reading it from the Path itself avoids
# depending on the '/' separator or on how deep the dataset root is.
class_labels = sorted({path.parent.name for path in all_video_file_paths})

# label2id: class name -> integer id; id2label: the inverse mapping.
label2id = {label: i for i, label in enumerate(class_labels)}
id2label = {i: label for label, i in label2id.items()}
print(f'Unique classes: {list(label2id.keys())}.')

Unique classes: ['ApplyEyeMakeup', 'ApplyLipstick', 'Archery', 'BabyCrawling', 'BalanceBeam', 'BandMarching', 'BaseballPitch', 'Basketball', 'BasketballDunk', 'BenchPress'].


## Load a model to fine-tune

We will instantiate a video classification model from a pretrained checkpoint and its associated image processor.

The model's encoder comes with pretrained parameters, and the classification head is randomly initialized.

In [7]:
from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification

model_ckpt = 'MCG-NJU/videomae-base'

# Classification model built from the checkpoint, with the label maps defined above.
model = VideoMAEForVideoClassification.from_pretrained(
    model_ckpt, label2id=label2id, id2label=id2label, ignore_mismatched_sizes=True
)

# Image processor that belongs to the same checkpoint.
image_processor = VideoMAEImageProcessor.from_pretrained(model_ckpt)

Some weights of VideoMAEForVideoClassification were not initialized from the model checkpoint at MCG-NJU/videomae-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Prepare the datasets for training

For preprocessing the videos, we will leverage the `PyTorchVideo` library.

In [2]:
import pytorchvideo.data
import torch

from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    RemoveKey,
    ShortSideScale,
    UniformTemporalSubsample
)

from torchvision.transforms import (
    Compose,
    Lambda,
    RandomCrop,
    RandomHorizontalFlip,
    Resize,
)



For the training dataset transformations, we use a combination of uniform temporal subsampling, pixel normalization, random cropping, and random horizontal flipping.

For the validation and evaluation dataset transformations, we keep the same transformation chain except for random cropping and horizontal flipping.

We will use the `image_processor` associated with the pretrained model to obtain
* image mean and standard deviation with which the video frame pixels will be normalized
* spatial resolution to which the video frames will be resized

In [22]:
# Per-channel normalization statistics stored on the image processor.
mean, std = image_processor.image_mean, image_processor.image_std

# Target spatial size: either a single 'shortest_edge' value or explicit height/width.
if 'shortest_edge' not in image_processor.size:
    height, width = image_processor.size['height'], image_processor.size['width']
else:
    height = width = image_processor.size['shortest_edge']
resize_to = (height, width)

# Clip length in seconds: frames per clip times the sampling stride, divided by the frame rate.
num_frames_to_sample = model.config.num_frames
sample_rate, fps = 4, 30
clip_duration = num_frames_to_sample * sample_rate / fps

In [None]:
import os

# Training pipeline. Every transform below is applied only to the 'video' entry
# of a sample; the order of the steps matters.
train_transform = Compose([
    ApplyTransformToKey(
        key='video',
        transform=Compose([
            # Temporal subsampling down to num_frames_to_sample frames.
            UniformTemporalSubsample(num_frames_to_sample),
            # Scale pixel values by 1/255 (assumes decoded frames are in [0, 255] — TODO confirm).
            Lambda(lambda x: x / 255.0),
            # Normalize with the mean/std read from the image processor.
            Normalize(mean, std),
            # Random augmentations: short-side rescale, crop to resize_to, horizontal flip.
            RandomShortSideScale(min_size=256, max_size=320),
            RandomCrop(resize_to),
            RandomHorizontalFlip(p=0.5)
        ])
    )
])

# Training split, read with a 'random' clip sampler using clips of clip_duration seconds; audio is not decoded.
train_dataset = pytorchvideo.data.Ucf101(
    data_path=os.path.join(dataset_root_path, 'train'),
    clip_sampler=pytorchvideo.data.make_clip_sampler('random', clip_duration),
    decode_audio=False,
    transform=train_transform
)

In [None]:
# Evaluation pipeline: same subsampling and normalization as training, but with a
# plain resize to resize_to instead of random scale/crop/flip.
val_transform = Compose([
    ApplyTransformToKey(
        key='video',
        transform=Compose([
            UniformTemporalSubsample(num_frames_to_sample),
            # Scale pixel values by 1/255 (assumes decoded frames are in [0, 255] — TODO confirm).
            Lambda(lambda x: x / 255.0),
            Normalize(mean, std),
            Resize(resize_to),
        ])
    )
])

# Validation split, read with a 'uniform' clip sampler using clips of clip_duration seconds.
val_dataset = pytorchvideo.data.Ucf101(
    data_path=os.path.join(dataset_root_path, 'val'),
    clip_sampler=pytorchvideo.data.make_clip_sampler('uniform', clip_duration),
    decode_audio=False,
    transform=val_transform
)

# Test split: same sampler and transform as the validation split.
test_dataset = pytorchvideo.data.Ucf101(
    data_path=os.path.join(dataset_root_path, 'test'),
    clip_sampler=pytorchvideo.data.make_clip_sampler('uniform', clip_duration),
    decode_audio=False,
    transform=val_transform
)

We use `pytorchvideo.data.Ucf101()` because it is tailored for the UCF-101 dataset. If we want to use a custom dataset, we can extend the `LabeledVideoDataset` class.

In [None]:
# Sanity check: number of videos in the train, validation and test datasets.
print(train_dataset.num_videos, val_dataset.num_videos, test_dataset.num_videos)

## Visualize the preprocessed video for better debugging

In [None]:
import imageio
import numpy as np
from IPython.display import Image

def unnormalize_img(img):
    """Undo the mean/std normalization and return a displayable uint8 image.

    Args:
        img: NumPy array whose last axis is the channel axis, normalized with
            the module-level `mean` and `std`.

    Returns:
        A uint8 array of the same shape with values in [0, 255].
    """
    img = (img * std) + mean
    # Clip BEFORE casting: casting first lets out-of-range values wrap around,
    # after which clipping a uint8 array to [0, 255] can no longer fix them.
    return (img * 255).clip(0, 255).astype('uint8')

def create_gif(video_tensor, filename='sample.gif'):
    """Write a video tensor to disk as a GIF and return the file name.

    The video tensor is expected to have the following shape:
    (num_frames, num_channels, height, width)
    """
    # Move channels last for each frame, then convert it back to displayable pixels.
    frames = [
        unnormalize_img(frame.permute(1, 2, 0).numpy())
        for frame in video_tensor
    ]
    imageio.mimsave(filename, frames, 'GIF', duration=0.25)
    return filename

def display_gif(video_tensor, gif_name='sample.gif'):
    """Render a video tensor as a GIF file and return it as an IPython Image.

    The first two axes of the tensor are swapped before it is handed to
    `create_gif`.
    """
    frames_first = video_tensor.permute(1, 0, 2, 3)
    return Image(filename=create_gif(frames_first, gif_name))



# Pull the first sample from the training dataset and preview its 'video' entry as a GIF.
sample_video = next(iter(train_dataset))
video_tensor = sample_video['video']
display_gif(video_tensor)

## Evaluate

In [None]:
import evaluate

# Accuracy metric loaded through the `evaluate` library; compute_metrics reads this module-level object.
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    """Score a batch of predictions with the module-level `metric`.

    Args:
        eval_pred: Object exposing `predictions` (scores with the class axis
            at position 1) and `label_ids` (reference class ids).

    Returns:
        Whatever `metric.compute` returns for the argmax predictions.
    """
    predicted_ids = np.argmax(eval_pred.predictions, axis=1)
    references = eval_pred.label_ids
    return metric.compute(predictions=predicted_ids, references=references)

In the VideoMAE paper, the authors evaluate the model on several clips from test videos, apply different crops to those clips, and report the aggregate score. For simplicity, we will not consider that in this guide.

We also need to define a `collate_fn` to batch examples together:

In [None]:
def collate_fn(examples):
    """Batch dataset samples into the inputs the model expects.

    Args:
        examples: Sequence of dicts with a 'video' tensor and a 'label'.

    Returns:
        Dict with 'pixel_values' (stacked videos, first two axes of each video
        swapped) and 'labels' (tensor of the labels, in input order).
    """
    clips, targets = [], []
    for example in examples:
        # permute to (num_frames, num_channels, height, width)
        clips.append(example['video'].permute(1, 0, 2, 3))
        targets.append(example['label'])

    return {'pixel_values': torch.stack(clips), 'labels': torch.tensor(targets)}

## Train

In [None]:
from transformers import TrainingArguments, Trainer

# Output directory name derived from the checkpoint name.
model_name = model_ckpt.split('/')[-1]
new_model_name = f"{model_name}-finetuned-ucf101-subset"
num_epochs = 4
batch_size = 4

# Training length is given through max_steps, computed from the number of
# training videos, the batch size and num_epochs.
# NOTE(review): presumably needed because the pytorchvideo dataset has no length — confirm.
training_args = TrainingArguments(
    new_model_name,
    # Keep every key of the samples so collate_fn still sees 'video' and 'label'.
    remove_unused_columns=False,
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_ratio=0.1,
    # The keyword is `logging_steps`; `logging_step` raises TypeError.
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    push_to_hub=False,
    max_steps=(train_dataset.num_videos // batch_size) * num_epochs
)

In [None]:
# Wire the model, arguments, datasets, collator and metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=image_processor,
    compute_metrics=compute_metrics,
)

# Run fine-tuning and keep the returned training summary.
train_results = trainer.train()

## Inference

In [None]:
# Take the first sample yielded by the test dataset; later cells read its 'video' and 'label' entries.
sample_test_video = next(iter(test_dataset))

In [None]:
from transformers import pipeline

# Build the pipeline from the in-memory fine-tuned model. Nothing in this notebook
# saves or pushes a model named 'my_awesome_video_cls_model' (push_to_hub=False),
# so loading by that name fails on a fresh run.
video_cls = pipeline(
    'video-classification',
    model=model,
    image_processor=image_processor
)

In [None]:
# Classify a video referenced by URL (a file in the sayakpaul/ucf101-subset dataset repo).
video_cls("https://huggingface.co/datasets/sayakpaul/ucf101-subset/resolve/main/v_BasketballDunk_g14_c06.avi")

We can also manually do the inference

In [None]:
def run_inference(model, video, label=None):
    """Run a single video clip through the model and return the logits.

    Args:
        model: Video classification model; called with `pixel_values` (and
            `labels` when a label is given) and expected to return an object
            with a `logits` attribute.
        video: 4-D video tensor; its first two axes are swapped before the
            forward pass.
        label: Optional class id of the clip. It is only passed on when
            available; it is no longer read from a notebook global.

    Returns:
        The `logits` of the model output, on the device used for inference.
    """
    # (num_frames, num_channels, height, width), plus a leading batch axis
    pixel_values = video.permute(1, 0, 2, 3).unsqueeze(0)
    inputs = {'pixel_values': pixel_values}
    if label is not None:
        inputs['labels'] = torch.tensor([label])

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    inputs = {k: v.to(device) for k, v in inputs.items()}
    model = model.to(device)
    # Inference mode for layers such as dropout.
    model.eval()

    with torch.no_grad():
        outputs = model(**inputs)

    return outputs.logits


# Logits of the model for the sample test clip.
logits = run_inference(model, sample_test_video['video'])

In [None]:
# Index of the largest logit, mapped back to a class name through the model config.
predicted_cls_idx = logits.argmax(-1).item()
print('Predicted class:', model.config.id2label[predicted_cls_idx])