In [1]:
# from huggingface_hub import hf_hub_download
# hf_dataset_identifier = "sayakpaul/ucf101-subset"
# filename = "UCF101_subset.tar.gz"
# file_path = hf_hub_download(repo_id=hf_dataset_identifier, filename=filename, repo_type="dataset")

In [2]:
# import tarfile
# with tarfile.open(file_path) as t:
#      t.extractall(".")

In [1]:
import pathlib
# Root folder of the UCF101 subset, held as a Path so later cells can glob it.
dataset_root_path = pathlib.Path("UCF101_subset")

In [2]:
# Count the .avi clips of each split; files are matched two directory levels
# below the split folder (<split>/<sub-folder>/<clip>.avi).
video_count_train = sum(1 for _clip in dataset_root_path.glob("train/*/*.avi"))
video_count_val = sum(1 for _clip in dataset_root_path.glob("val/*/*.avi"))
video_count_test = sum(1 for _clip in dataset_root_path.glob("plain_test/*/*.avi"))
video_total = sum((video_count_train, video_count_val, video_count_test))
print(f"Total videos: {video_total}")

Total videos: 405


In [3]:
# Every clip path across the three splits, in train -> val -> plain_test order.
all_video_file_paths = [
    video_path
    for split_name in ("train", "val", "plain_test")
    for video_path in dataset_root_path.glob(f"{split_name}/*/*.avi")
]

In [4]:
all_video_file_paths[:5]

[PosixPath('UCF101_subset/train/BaseballPitch/v_BaseballPitch_g17_c05.avi'),
 PosixPath('UCF101_subset/train/BaseballPitch/v_BaseballPitch_g19_c05.avi'),
 PosixPath('UCF101_subset/train/BaseballPitch/v_BaseballPitch_g05_c06.avi'),
 PosixPath('UCF101_subset/train/BaseballPitch/v_BaseballPitch_g15_c06.avi'),
 PosixPath('UCF101_subset/train/BaseballPitch/v_BaseballPitch_g13_c03.avi')]

In [200]:
# The class name is the directory that directly contains each clip.
# Reading it from the Path (rather than splitting the string on "/" and taking
# a fixed index) keeps this correct on any OS path separator and for any depth
# of `dataset_root_path`.
class_labels = sorted({path.parent.name for path in all_video_file_paths})

# Bidirectional label <-> integer id mappings, ids assigned in sorted order.
label2id = {label: i for i, label in enumerate(class_labels)}
id2label = {i: label for label, i in label2id.items()}

print(f"Unique classes: {list(label2id.keys())}.")


Unique classes: ['ApplyEyeMakeup', 'ApplyLipstick', 'Archery', 'BabyCrawling', 'BalanceBeam', 'BandMarching', 'BaseballPitch', 'Basketball', 'BasketballDunk', 'BenchPress'].


In [201]:
# from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification
# 
# model_ckpt = "MCG-NJU/videomae-base"
# image_processor = VideoMAEImageProcessor.from_pretrained(model_ckpt)
# model = VideoMAEForVideoClassification.from_pretrained(
#     model_ckpt,
#     label2id=label2id,
#     id2label=id2label,
#     ignore_mismatched_sizes=True,  # provide this in case you're planning to fine-tune an already fine-tuned checkpoint
# )

In [242]:
from transformers import VivitForVideoClassification
model_ckpt = "google/vivit-b-16x2-kinetics400"

# Load the pretrained ViViT weights with a classifier sized for our label set.
# ignore_mismatched_sizes=True allows the checkpoint's classifier (a different
# number of classes) to be replaced by a newly initialised one instead of
# raising on the shape mismatch.
model = VivitForVideoClassification.from_pretrained(
    model_ckpt,
    ignore_mismatched_sizes=True,
    id2label=id2label,
    label2id=label2id,
)

# Freeze every parameter of the base model so that only the parameters outside
# it (the classification head) remain trainable.
for weight in model.base_model.parameters():
    weight.requires_grad_(False)

  return self.fget.__get__(instance, owner)()
Some weights of VivitForVideoClassification were not initialized from the model checkpoint at google/vivit-b-16x2-kinetics400 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([400, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([400]) in the checkpoint and torch.Size([10]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [243]:
from transformers import VivitImageProcessor
# Note: the image processor does not say which augmentations (e.g. horizontal flipping) were applied, so they are set up manually below.
image_processor = VivitImageProcessor.from_pretrained(model_ckpt)

# Normalisation statistics, later handed to Normalize in the video transforms.
mean, std = image_processor.image_mean, image_processor.image_std

# Target spatial size: the processor reports it either as a single
# "shortest_edge" value or as explicit "height"/"width" entries.
if "shortest_edge" not in image_processor.size:
    height = image_processor.size["height"]
    width = image_processor.size["width"]
else:
    height = width = image_processor.size["shortest_edge"]
resize_to = (height, width)

# Temporal sampling. `sample_rate` acts as a frame stride, not a per-second
# count: each clip spans num_frames_to_sample * sample_rate source frames, and
# the transforms then subsample it down to num_frames_to_sample frames.
num_frames_to_sample = model.config.num_frames
sample_rate = 4
fps = 30  # assumed frame rate of the source videos — TODO confirm
clip_duration = num_frames_to_sample * sample_rate / fps  # seconds

In [244]:
import os
import pytorchvideo.data
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    RemoveKey,
    ShortSideScale,
    UniformTemporalSubsample,
)
from torchvision.transforms import (
    Compose,
    Lambda,
    RandomCrop,
    RandomHorizontalFlip,
    Resize,
)

# Training pipeline applied to the "video" entry of each sample, in order:
# temporal subsampling, scaling by 1/255, mean/std normalisation, then random
# spatial augmentation (short-side rescale, crop to model size, horizontal flip).
_train_video_steps = [
    UniformTemporalSubsample(num_frames_to_sample),
    Lambda(lambda x: x / 255.0),  # presumably maps 0-255 pixel values to [0, 1]
    Normalize(mean, std),
    RandomShortSideScale(min_size=256, max_size=320),
    RandomCrop(resize_to),
    RandomHorizontalFlip(p=0.5),
]
train_transform = Compose(
    [ApplyTransformToKey(key="video", transform=Compose(_train_video_steps))]
)

# Evaluation pipeline applied to the "video" entry of each sample. It has the
# same subsampling and normalisation as training but no random augmentation:
# frames are resized straight to the model's input size.
_eval_video_steps = [
    UniformTemporalSubsample(num_frames_to_sample),
    Lambda(lambda x: x / 255.0),  # presumably maps 0-255 pixel values to [0, 1]
    Normalize(mean, std),
    Resize(resize_to),
]
val_transform = Compose(
    [ApplyTransformToKey(key="video", transform=Compose(_eval_video_steps))]
)

In [245]:
def _make_split_dataset(split, sampler_type, transform):
    """Build the pytorchvideo Ucf101 dataset for one split directory.

    Args:
        split: Sub-folder of ``dataset_root_path`` holding the split's videos.
        sampler_type: Clip sampler name passed to ``make_clip_sampler``.
        transform: Transform applied to every sampled clip.
    """
    return pytorchvideo.data.Ucf101(
        data_path=os.path.join(dataset_root_path, split),
        clip_sampler=pytorchvideo.data.make_clip_sampler(sampler_type, clip_duration),
        decode_audio=False,
        transform=transform,
    )


# Training uses the "random" clip sampler; validation and test use the
# "uniform" sampler together with the evaluation transform.
train_dataset = _make_split_dataset("train", "random", train_transform)
val_dataset = _make_split_dataset("val", "uniform", val_transform)
test_dataset = _make_split_dataset("plain_test", "uniform", val_transform)

In [246]:
print(train_dataset.num_videos, val_dataset.num_videos, test_dataset.num_videos)

300 30 75


In [247]:
import imageio
import numpy as np
from IPython.display import Image

def unnormalize_img(img):
    """Undo the mean/std normalisation and convert a frame to 8-bit pixels.

    Args:
        img: NumPy array of normalised pixel values whose last axis
            broadcasts against the module-level ``mean`` and ``std``.

    Returns:
        A ``uint8`` array of the same shape with values in [0, 255].
    """
    img = (img * std) + mean
    # Clip *before* casting: once the data is uint8 every value is already in
    # [0, 255], so clipping afterwards does nothing and out-of-range values
    # would wrap around (e.g. 382 -> 126) instead of saturating.
    return (img * 255).clip(0, 255).astype("uint8")

def create_gif(video_tensor, filename="sample.gif"):
    """Write a video tensor to disk as an animated GIF.

    Args:
        video_tensor: Tensor laid out as
            (num_frames, num_channels, height, width).
        filename: Path of the GIF file to write.

    Returns:
        The ``filename`` that was written.
    """
    # Each frame goes channels-last and back to displayable pixel values.
    frames = [
        unnormalize_img(video_frame.permute(1, 2, 0).numpy())
        for video_frame in video_tensor
    ]
    imageio.mimsave(filename, frames, "GIF", duration=0.25)
    return filename

def display_gif(video_tensor, gif_name="sample.gif"):
    """Render a video tensor as a GIF and return it as an IPython Image.

    The first two axes of ``video_tensor`` are swapped before the GIF is
    written, so that frames come first as ``create_gif`` expects.
    """
    frames_first = video_tensor.permute(1, 0, 2, 3)
    return Image(filename=create_gif(frames_first, gif_name))

In [248]:
# Sanity check: pull one transformed sample from the training dataset and
# print the shape of its "video" tensor.
sample_video = next(iter(train_dataset))
print(sample_video["video"].shape)
# Optional visual check (disabled): render the sampled clip as a GIF.
# video_tensor = sample_video["video"]
# display_gif(video_tensor)

torch.Size([3, 32, 224, 224])


In [249]:
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score a batch of predictions with the module-level ``metric``.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row
            per sample) and ``label_ids`` (reference labels).

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions.
    """
    scores, gold_labels = eval_pred.predictions, eval_pred.label_ids
    return metric.compute(
        references=gold_labels,
        predictions=np.argmax(scores, axis=1),
    )

In [250]:
import torch
def collate_fn(examples):
    """Batch dataset samples into the dict layout passed to the model.

    Args:
        examples: Sequence of samples, each with a "video" tensor laid out as
            (channels, frames, height, width) and an integer "label".

    Returns:
        Dict with "pixel_values" of shape
        (batch, frames, channels, height, width) and a "labels" tensor.
    """
    # Swap the first two axes of every clip: (c, t, h, w) -> (t, c, h, w).
    clips = []
    for sample in examples:
        clips.append(sample["video"].permute(1, 0, 2, 3))
    pixel_values = torch.stack(clips)

    targets = []
    for sample in examples:
        targets.append(sample["label"])
    labels = torch.tensor(targets)

    return dict(pixel_values=pixel_values, labels=labels)

In [253]:
from transformers import TrainingArguments, Trainer

# The run name (also the output directory) is built from the last path
# component of the checkpoint identifier.
model_name = model_ckpt.rsplit("/", 1)[-1]
new_model_name = f"{model_name}-finetuned-ucf101-subset—withouImgP"
num_epochs = 10
batch_size = 8

# The training length is given as an explicit step budget of
# (videos // batch_size) steps per epoch, presumably because the dataset is
# iterable and has no length — TODO confirm.
args = TrainingArguments(
    output_dir=new_model_name,
    # Keep the "video"/"label" entries that the custom collator reads.
    remove_unused_columns=False,
    # Optimisation
    learning_rate=5e-5,
    warmup_ratio=0.1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    max_steps=(train_dataset.num_videos // batch_size) * num_epochs,
    # Evaluate and checkpoint every epoch; reload the most accurate one at the end.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Logging / publishing
    logging_steps=10,
    push_to_hub=False,
)

In [254]:
# Fine-tuning run. The base model's parameters were frozen when the model was
# created, so only the classification head is updated here.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # tokenizer=image_processor,
)


max_steps is given, it will override any value given in num_train_epochs


In [255]:
# Clear PyTorch's CUDA cache before starting, then run the training loop.
# `train_results` keeps the value returned by `trainer.train()`.
torch.cuda.empty_cache()
train_results = trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
0,2.2655,2.016372,0.297297
1,1.8417,1.636138,0.594595
2,1.4663,1.347977,0.756757
3,1.2458,1.127036,0.864865
4,1.0042,0.962128,0.864865
5,1.0585,0.8472,0.891892
6,0.886,0.766741,0.945946
7,0.7971,0.714819,0.945946
8,0.7598,0.688415,0.945946
9,0.7415,0.682645,0.945946




In [271]:
# Reload a fine-tuned checkpoint from disk for evaluation on the test split.
# NOTE(review): this directory ("…—onlyChanginghead") is not the output
# directory configured for the training run in this notebook
# ("…—withouImgP") — confirm which run is meant to be evaluated; on a fresh
# kernel this path only exists if that other run was executed earlier.
# NOTE(review): rebinding `model_ckpt` means earlier cells that read it (model
# and image-processor loading, run-name construction) will see this local path
# if they are re-run.
# model_ckpt = "vivit-b-16x2-kinetics400-finetuned-ucf101-subset—withouImgP/checkpoint-370"
model_ckpt = "vivit-b-16x2-kinetics400-finetuned-ucf101-subset—onlyChanginghead/checkpoint-370"

model = VivitForVideoClassification.from_pretrained(
    model_ckpt,
    label2id=label2id,
    id2label=id2label,
    # NOTE(review): with this flag a classifier whose shape does not match the
    # label maps is re-initialised instead of raising; for an already
    # fine-tuned checkpoint that would presumably hide a wrong-checkpoint
    # mistake — confirm it is still wanted here.
    ignore_mismatched_sizes=True,  # provide this in case you're planning to fine-tune an already fine-tuned checkpoint
)

In [272]:
import evaluate
from torch.utils.data import DataLoader

# Evaluate the reloaded model on the held-out test split.
eval_dataloader = DataLoader(test_dataset, batch_size=8)
metric = evaluate.load("accuracy")

# Choose the device and move the model once, up front; neither changes between
# batches, so there is no reason to redo this inside the loop.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

torch.cuda.empty_cache()
with torch.no_grad():
    for batch in eval_dataloader:
        # The default collate keeps each clip channels-first; swap to
        # (batch, frames, channels, height, width) before the forward pass.
        pixel_values = batch["video"].permute(0, 2, 1, 3, 4).to(device)
        logits = model(pixel_values=pixel_values).logits

        predictions = torch.argmax(logits, dim=-1)
        # Bring predictions back to the CPU so they sit on the same device as
        # the reference labels when handed to the metric.
        metric.add_batch(predictions=predictions.cpu(), references=batch["label"])

metric.compute()



{'accuracy': 0.9770114942528736}