In [1]:
!pip install pytorchvideo remotezip transformers -q

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p38/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

In [1]:
import remotezip as rz


def list_files_from_zip_url(zip_url):
    """List the files in each class of the dataset given a URL with the zip file.

    Args:
      zip_url: A URL from which the files can be extracted from.

    Returns:
      List of files in each of the classes.
    """
    files = []
    with rz.RemoteZip(zip_url) as zip:
        for zip_info in zip.infolist():
            files.append(zip_info.filename)
    return files

In [2]:
url = "https://storage.googleapis.com/thumos14_files/UCF101_videos.zip"
files = list_files_from_zip_url(url)
files = [f for f in files if f.endswith(".avi")]
files[:10]

['UCF101/v_ApplyEyeMakeup_g01_c01.avi',
 'UCF101/v_ApplyEyeMakeup_g01_c02.avi',
 'UCF101/v_ApplyEyeMakeup_g01_c03.avi',
 'UCF101/v_ApplyEyeMakeup_g01_c04.avi',
 'UCF101/v_ApplyEyeMakeup_g01_c05.avi',
 'UCF101/v_ApplyEyeMakeup_g01_c06.avi',
 'UCF101/v_ApplyEyeMakeup_g02_c01.avi',
 'UCF101/v_ApplyEyeMakeup_g02_c02.avi',
 'UCF101/v_ApplyEyeMakeup_g02_c03.avi',
 'UCF101/v_ApplyEyeMakeup_g02_c04.avi']

In [3]:
def get_class(fname):
    """Retrieve the name of the class given a filename.

    Args:
      fname: Name of the file in the UCF101 dataset.

    Returns:
      Class that the file belongs to.
    """
    return fname.split("_")[-3]

In [4]:
import collections


def get_files_per_class(files):
    """Retrieve the files that belong to each class.

    Args:
      files: List of files in the dataset.

    Returns:
      Dictionary of class names (key) and files (values).
    """
    files_for_class = collections.defaultdict(list)
    for fname in files:
        class_name = get_class(fname)
        files_for_class[class_name].append(fname)
    return files_for_class

In [5]:
files_for_class = get_files_per_class(files)
classes = list(files_for_class.keys())

In [6]:
print("Num classes:", len(classes))
print("Num videos for class[0]:", len(files_for_class[classes[0]]))

Num classes: 101
Num videos for class[0]: 145


In [7]:
def select_subset_of_classes(files_for_class, classes, files_per_class):
    """Create a dictionary with the class name and a subset of the files in that class.

    Args:
      files_for_class: Dictionary of class names (key) and files (values).
      classes: List of classes.
      files_per_class: Number of files per class of interest.

    Returns:
      Dictionary with class as key and list of specified number of video files in that class.
    """
    files_subset = dict()

    for class_name in classes:
        class_files = files_for_class[class_name]
        files_subset[class_name] = class_files[:files_per_class]

    return files_subset

In [8]:
classes[0], files_for_class[classes[0]][:10]

('ApplyEyeMakeup',
 ['UCF101/v_ApplyEyeMakeup_g01_c01.avi',
  'UCF101/v_ApplyEyeMakeup_g01_c02.avi',
  'UCF101/v_ApplyEyeMakeup_g01_c03.avi',
  'UCF101/v_ApplyEyeMakeup_g01_c04.avi',
  'UCF101/v_ApplyEyeMakeup_g01_c05.avi',
  'UCF101/v_ApplyEyeMakeup_g01_c06.avi',
  'UCF101/v_ApplyEyeMakeup_g02_c01.avi',
  'UCF101/v_ApplyEyeMakeup_g02_c02.avi',
  'UCF101/v_ApplyEyeMakeup_g02_c03.avi',
  'UCF101/v_ApplyEyeMakeup_g02_c04.avi'])

In [9]:
num_classes = 10
files_per_classes = 50

files_subset = select_subset_of_classes(
    files_for_class, classes[:num_classes], files_per_classes
)
list(files_subset.keys())

['ApplyEyeMakeup',
 'ApplyLipstick',
 'Archery',
 'BabyCrawling',
 'BalanceBeam',
 'BandMarching',
 'BaseballPitch',
 'BasketballDunk',
 'Basketball',
 'BenchPress']

In [10]:
import tqdm
from urllib import request
import pathlib


def download_from_zip(zip_url, to_dir, file_names):
    """Download the contents of the zip file from the zip URL.

    Args:
      zip_url: A URL with a zip file containing data.
      to_dir: A directory to download data to.
      file_names: Names of files to download.
    """
    with rz.RemoteZip(zip_url) as zip:
        for fn in tqdm.tqdm(file_names):
            class_name = get_class(fn)
            zip.extract(fn, str(to_dir / class_name))
            unzipped_file = to_dir / class_name / fn

            fn = pathlib.Path(fn).parts[-1]
            output_file = to_dir / class_name / fn
            unzipped_file.rename(output_file)

In [11]:
def split_class_lists(files_for_class, count):
    """Returns the list of files belonging to a subset of data as well as the remainder of
    files that need to be downloaded.

    Args:
      files_for_class: Files belonging to a particular class of data.
      count: Number of files to download.

    Returns:
      Files belonging to the subset of data and dictionary of the remainder of files that need to be downloaded.
    """
    split_files = []
    remainder = {}
    for cls in files_for_class:
        split_files.extend(files_for_class[cls][:count])
        remainder[cls] = files_for_class[cls][count:]
    return split_files, remainder

In [12]:
import os
import random


def download_ufc_101_subset(zip_url, num_classes, splits, download_dir):
    """Download a subset of the UFC101 dataset and split them into various parts, such as
    training, validation, and test.

    Args:
      zip_url: A URL with a ZIP file with the data.
      num_classes: Number of labels.
      splits: Dictionary specifying the training, validation, test, etc. (key) division of data
              (value is number of files per split).
      download_dir: Directory to download data to.

    Return:
      Mapping of the directories containing the subsections of data.
    """
    files = list_files_from_zip_url(zip_url)
    for f in files:
        path = os.path.normpath(f)
        tokens = path.split(os.sep)
        if len(tokens) <= 2:
            files.remove(
                f
            )  # Remove that item from the list if it does not have a filename

    files_for_class = get_files_per_class(files)

    classes = list(files_for_class.keys())[:num_classes]

    for cls in classes:
        random.shuffle(files_for_class[cls])

    # Only use the number of classes you want in the dictionary
    files_for_class = {x: files_for_class[x] for x in classes}

    dirs = {}
    for split_name, split_count in splits.items():
        print(split_name, ":")
        split_dir = download_dir / split_name
        split_files, files_for_class = split_class_lists(files_for_class, split_count)
        download_from_zip(zip_url, split_dir, split_files)
        dirs[split_name] = split_dir

    return dirs

In [15]:
dataset_root_path = "./UCF101_subset/"
download_dir = pathlib.Path(dataset_root_path)
subset_paths = download_ufc_101_subset(
    url,
    num_classes=num_classes,
    splits={"train": 30, "val": 10, "test": 10},
    download_dir=download_dir,
)

train :


100%|██████████| 300/300 [01:38<00:00,  3.06it/s]


val :


100%|██████████| 100/100 [00:29<00:00,  3.36it/s]


test :


100%|██████████| 100/100 [00:28<00:00,  3.51it/s]


In [16]:
video_count_train = len(list(download_dir.glob("train/*/*.avi")))
video_count_val = len(list(download_dir.glob("val/*/*.avi")))
video_count_test = len(list(download_dir.glob("test/*/*.avi")))
video_total = video_count_train + video_count_val + video_count_test
print(f"Total videos: {video_total}")

Total videos: 500


In [17]:
all_video_file_paths = (
    list(download_dir.glob("train/*/*.avi"))
    + list(download_dir.glob("val/*/*.avi"))
    + list(download_dir.glob("test/*/*.avi"))
)
all_video_file_paths[:5]

[PosixPath('UCF101_subset/train/BabyCrawling/v_BabyCrawling_g25_c05.avi'),
 PosixPath('UCF101_subset/train/BabyCrawling/v_BabyCrawling_g06_c02.avi'),
 PosixPath('UCF101_subset/train/BabyCrawling/v_BabyCrawling_g02_c05.avi'),
 PosixPath('UCF101_subset/train/BabyCrawling/v_BabyCrawling_g12_c05.avi'),
 PosixPath('UCF101_subset/train/BabyCrawling/v_BabyCrawling_g22_c02.avi')]

In [18]:
class_labels = sorted({str(path).split("/")[2] for path in all_video_file_paths})
label2id = {label: i for i, label in enumerate(class_labels)}
id2label = {i: label for label, i in label2id.items()}

In [19]:
# !find ./UCF101_subset

In [21]:
from transformers import VideoMAEFeatureExtractor, VideoMAEForVideoClassification

model_ckpt = "MCG-NJU/videomae-base-finetuned-kinetics"

feature_extractor = VideoMAEFeatureExtractor.from_pretrained(model_ckpt)
model = VideoMAEForVideoClassification.from_pretrained(
    model_ckpt,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,  # provide this in case you're planning to fine-tune an already fine-tuned checkpoint
)

Some weights of VideoMAEForVideoClassification were not initialized from the model checkpoint at MCG-NJU/videomae-base-finetuned-kinetics and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([400, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([400]) in the checkpoint and torch.Size([10]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
feature_extractor

VideoMAEFeatureExtractor {
  "do_center_crop": true,
  "do_normalize": true,
  "do_resize": true,
  "feature_extractor_type": "VideoMAEFeatureExtractor",
  "image_mean": [
    0.485,
    0.456,
    0.406
  ],
  "image_std": [
    0.229,
    0.224,
    0.225
  ],
  "resample": 2,
  "size": 224
}

In [24]:
import pytorchvideo.data

from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    RemoveKey,
    ShortSideScale,
    UniformTemporalSubsample,
)

from torchvision.transforms import (
    Compose,
    Lambda,
    RandomCrop,
    RandomHorizontalFlip,
    Resize,
)

In [25]:
mean = feature_extractor.image_mean
std = feature_extractor.image_std
resize_to = feature_extractor.size
clip_duration = 2


train_transform = Compose(
    [
        ApplyTransformToKey(
            key="video",
            transform=Compose(
                [
                    UniformTemporalSubsample(8),
                    Lambda(lambda x: x / 255.0),
                    Normalize(mean, std),
                    RandomShortSideScale(min_size=256, max_size=320),
                    RandomCrop(resize_to),
                    RandomHorizontalFlip(p=0.5),
                ]
            ),
        ),
    ]
)
train_dataset = pytorchvideo.data.Ucf101(
    data_path=os.path.join(dataset_root_path, "train"),
    clip_sampler=pytorchvideo.data.make_clip_sampler("random", clip_duration),
    decode_audio=False,
    transform=train_transform,
)

val_transform = Compose(
    [
        ApplyTransformToKey(
            key="video",
            transform=Compose(
                [
                    UniformTemporalSubsample(8),
                    Lambda(lambda x: x / 255.0),
                    Normalize(mean, std),
                    Resize((resize_to, resize_to)),
                ]
            ),
        ),
    ]
)
val_dataset = pytorchvideo.data.Ucf101(
    data_path=os.path.join(dataset_root_path, "val"),
    clip_sampler=pytorchvideo.data.make_clip_sampler("uniform", clip_duration),
    decode_audio=False,
    transform=val_transform,
)

In [26]:
sample_video = next(iter(train_dataset))
sample_video.keys()

dict_keys(['video', 'video_name', 'video_index', 'clip_index', 'aug_index', 'label'])

In [27]:
for k in sample_video:
    if k == "video":
        print(k, sample_video["video"].shape)
    else:
        print(k, sample_video[k])

video torch.Size([3, 8, 224, 224])
video_name v_Basketball_g01_c01.avi
video_index 210
clip_index 0
aug_index 0
label 7


In [28]:
id2label[sample_video[k]]

'Basketball'

Code related to data preparation utlities have been borrowed and repurposed from the following two articles:

* https://www.tensorflow.org/tutorials/load_data/video
* https://pytorchvideo.org/docs/tutorial_classification

## TODOs

- [ ] Provide appropriate credits
- [ ] Add plenty of comments
- [ ] Training and inference
- [ ] Commentary
- [ ] Code formatting