In [1]:
from combo_dataloader import ComboDataLoader, ComboDLTransform, DataLoaderType
import torchvision
import time
import torch.nn as nn
import pytorch_lightning
import torch
import json
from typing import List, Tuple
import os
import torchmetrics
import random

  from .autonotebook import tqdm as notebook_tqdm


### Setting up video inputs and model

**Load in video paths and labels**

In [2]:
with open("kinetics_classnames.json", "r") as f:
    kinetics_classnames_json = json.load(f)

In [3]:
# Create a label name to id mapping
kinetics_classnames_to_id = {}
for k, v in kinetics_classnames_json.items():
    kinetics_classnames_to_id[str(k).replace('"', "")] = v

# Create an id to label name mapping
kinetics_id_to_classname = {}
for k, v in kinetics_classnames_to_id.items():
    kinetics_id_to_classname[v] = k

In [4]:
# These videos are corrupted and cannot be read
# Ignore this for running on your own system
null_videos = {
    "/home/maureen/kinetics/kinetics400_10classes/train/xxUezLcXkDs_000256_000266.mp4",
    "/home/maureen/kinetics/kinetics400_10classes/train/CUxsn4YXksI_000119_000129.mp4"
}

In [5]:
# Given an annotation file and a base path to the videos, load the video paths and labels
def load_video_paths(annotation_file_path, video_base_path, shuffle=True) -> Tuple[List[str], List[int]]:
	video_paths = []
	labels = []
	with open(annotation_file_path, 'r') as annotation_file:
		for i, line in enumerate(annotation_file):
			if i != 0: # skip column headers
				line = annotation_file.readline()
				if line:
					label, youtube_id, time_start, time_end, split, is_cc = line.strip().split(',')
					label_id = kinetics_classnames_to_id.get(label)
					vpath = f'{video_base_path}/{split}/{youtube_id}_{int(time_start):06d}_{int(time_end):06d}.mp4'

					if os.path.exists(vpath) and vpath not in null_videos:
						video_paths.append(vpath)
						labels.append(label_id)

	if shuffle:
		combined = list(zip(video_paths, labels))
		random.shuffle(combined)
		video_paths, labels = zip(*combined)

	return video_paths, labels

In [6]:
val_paths, val_labels = load_video_paths(
    '/home/maureen/kinetics/kinetics400_10classes/annotations/val.csv',
    '/home/maureen/kinetics/kinetics400_10classes'
)
train_paths, train_labels = load_video_paths(
    '/home/maureen/kinetics/kinetics400_10classes/annotations/train.csv',
    '/home/maureen/kinetics/kinetics400_10classes'
)

In [7]:
train_paths = train_paths[:100]
train_labels = train_labels[:100]

**Set up transform**

In [8]:
dali_portion = 30
pytorch_portion = 70

### Train loop

In [9]:
class VideoClassificationLightningModule(pytorch_lightning.LightningModule):
    def __init__(self, model):
        super().__init__()
        model = model.to("cuda")
        self.model = model

        self.micro_accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=400, average='micro')
        self.micro_F1 = torchmetrics.classification.MulticlassF1Score(num_classes=400, average='micro')

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        # The model expects a video tensor of shape (B, C, T, H, W), which is the
        # format provided by the dataset
        pred = self.model(batch["frames"])
        labels = batch["label"].to(torch.long)

        # Compute cross entropy loss, loss.backwards will be called behind the scenes
        # by PyTorchLightning after being returned from this method.
        loss = torch.nn.functional.cross_entropy(pred, labels)

        self.log("train_loss", loss)
        return loss

    def test_step(self, batch, batch_idx):
        pred = self.model(batch["frames"])
        labels = batch["label"].to(torch.long)

        loss = torch.nn.functional.cross_entropy(pred, labels)
        pred_labels = torch.argmax(pred, dim=1)
        micro_acc = self.micro_accuracy(pred_labels, labels)
        micro_f1 = self.micro_F1(pred_labels, labels)

        self.log("test_loss", loss)
        self.log("test_micro_accuracy", micro_acc, on_epoch=True)
        self.log("test_micro_f1", micro_f1)

        return loss

    def configure_optimizers(self):
        """
        Setup the Adam optimizer. Note, that this function also can return a lr scheduler, which is
        usually useful for training video models.
        """
        return torch.optim.Adam(self.parameters(), lr=1e-4)

### R3D_18

In [11]:
pretrained_model = torchvision.models.video.r3d_18(weights=torchvision.models.video.R3D_18_Weights.KINETICS400_V1)
transform = ComboDLTransform(
		crop=112,
		mean=[0.43216, 0.394666, 0.37645],
		std=[0.22803 , 0.22145 , 0.216989],
		short_side_scale=128
)
train_dl = ComboDataLoader(
    dataloaders=[DataLoaderType.PYTORCH, DataLoaderType.DALI],
    dataloader_portions=[pytorch_portion, dali_portion],
    video_paths=train_paths,
    labels=train_labels,
    transform=transform,
    stride=2,
    step=32,
    sequence_length=16,
    fps=32,
    batch_size=8,
    pytorch_dataloader_kwargs={"num_workers": 10},
    pytorch_dataset_kwargs=dict(decoder="decord", short_side_scale=128),
    dali_pipeline_kwargs={"num_threads": 10},
)
test_dl = ComboDataLoader(
    dataloaders=[DataLoaderType.PYTORCH, DataLoaderType.DALI],
    dataloader_portions=[pytorch_portion, dali_portion],
    video_paths=val_paths,
    labels=val_labels,
    transform=transform,
    stride=2,
    step=32,
    sequence_length=16,
    fps=32,
    batch_size=8,
    pytorch_dataloader_kwargs={"num_workers": 10},
    pytorch_dataset_kwargs=dict(decoder="decord", short_side_scale=128),
    dali_pipeline_kwargs={"num_threads": 10},
)
trainer = pytorch_lightning.Trainer(accelerator='gpu', devices=1, max_epochs=5)
model = VideoClassificationLightningModule(pretrained_model)
trainer.fit(model=model, train_dataloaders=train_dl)
trainer.test(model=model, dataloaders=test_dl)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type               | Params
------------------------------------------------------
0 | model          | VideoResNet        | 33.4 M
1 | micro_accuracy | MulticlassAccuracy | 0     
2 | micro_F1       | MulticlassF1Score  | 0     
------------------------------------------------------
33.4 M    Trainable params
0         Non-trainable params
33.4 M    Total params
133.486   Total estimated model params size (MB)


Epoch 0: : 0it [00:00, ?it/s]

[/opt/dali/dali/operators/reader/loader/video_loader.h:180] ``file_list_include_preceding_frame`` uses the default value False. In future releases, the default value will be changed to True.
  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

[/opt/dali/dali/operators/reader/loader/video_loader.h:180] ``file_list_include_preceding_frame`` uses the default value False. In future releases, the default value will be changed to True.


Testing DataLoader 0: : 2it [00:01,  1.42it/s]

  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"


Testing DataLoader 0: : 12it [00:06,  1.92it/s]

## X3D_S

In [None]:
pretrained_model = torch.hub.load('facebookresearch/pytorchvideo', "x3d_s", pretrained=True)
transform = ComboDLTransform(
    crop=182,
    mean=[0.45, 0.45, 0.45],
    std = [0.225, 0.225, 0.225],
    short_side_scale=182
)
train_dl = ComboDataLoader(
    dataloaders=[DataLoaderType.PYTORCH, DataLoaderType.DALI],
    dataloader_portions=[pytorch_portion, dali_portion],
    video_paths=train_paths,
    labels=train_labels,
    transform=transform,
    stride=6,
    step=6 * 13,
    sequence_length=13,
    fps=30,
    batch_size=8,
    pytorch_dataloader_kwargs={"num_workers": 10},
    pytorch_dataset_kwargs=dict(decoder="decord", short_side_scale=128),
    dali_pipeline_kwargs={"num_threads": 10},
)
test_dl = ComboDataLoader(
    dataloaders=[DataLoaderType.PYTORCH, DataLoaderType.DALI],
    dataloader_portions=[pytorch_portion, dali_portion],
    video_paths=train_paths,
    labels=train_labels,
    transform=transform,
    stride=6,
    step=6 * 13,
    sequence_length=13,
    fps=30,
    batch_size=8,
    pytorch_dataloader_kwargs={"num_workers": 10},
    pytorch_dataset_kwargs=dict(decoder="decord", short_side_scale=128),
    dali_pipeline_kwargs={"num_threads": 10},
)
trainer = pytorch_lightning.Trainer(accelerator='gpu', devices=1, max_epochs=5)
model = VideoClassificationLightningModule(pretrained_model)
trainer.fit(model=model, train_dataloaders=train_dl)
trainer.test(model=model, dataloaders=test_dl)

### 3D Resnet (Slow R50)

In [None]:
pretrained_model = torch.hub.load('facebookresearch/pytorchvideo', "slow_r50", pretrained=True)
transform = ComboDLTransform(
    crop=256,
    mean=[0.45, 0.45, 0.45],
    std = [0.225, 0.225, 0.225],
    short_side_scale=256
)
train_dl = ComboDataLoader(
    dataloaders=[DataLoaderType.PYTORCH, DataLoaderType.DALI],
    dataloader_portions=[pytorch_portion, dali_portion],
    video_paths=train_paths,
    labels=train_labels,
    transform=transform,
    stride=8,
    step=8 * 8,
    sequence_length=8,
    fps=30,
    batch_size=8,
    pytorch_dataloader_kwargs={"num_workers": 10},
    pytorch_dataset_kwargs=dict(decoder="decord", short_side_scale=128),
    dali_pipeline_kwargs={"num_threads": 10},
)
test_dl = ComboDataLoader(
    dataloaders=[DataLoaderType.PYTORCH, DataLoaderType.DALI],
    dataloader_portions=[pytorch_portion, dali_portion],
    video_paths=train_paths,
    labels=train_labels,
    transform=transform,
    stride=8,
    step=8 * 8,
    sequence_length=8,
    fps=30,
    batch_size=8,
    pytorch_dataloader_kwargs={"num_workers": 10},
    pytorch_dataset_kwargs=dict(decoder="decord", short_side_scale=128),
    dali_pipeline_kwargs={"num_threads": 10},
)
trainer = pytorch_lightning.Trainer(accelerator='gpu', devices=1, max_epochs=5)
model = VideoClassificationLightningModule(pretrained_model)
trainer.fit(model=model, train_dataloaders=train_dl)
trainer.test(model=model, dataloaders=test_dl)

## SlowFast

For SlowFast, we use an additional transform to sample the slow pathway. So that multiprocessing can find the transform when we run in the notebook, it must be defined in a separate file. We also redefine the `LightningModule` to 

In [10]:
# Additional transform needs to be defined in a separate file when using
# in a notebook so that multiprocessing can find it
import slowfast_transform

In [12]:
transform = ComboDLTransform(
    crop=256,
    mean=[0.45 * 255, 0.45 * 255, 0.45 * 255],
    std = [0.225 * 255, 0.225* 255, 0.225 * 255],
    short_side_scale=256
)

# For this model, we only use a PyTorch dataloader, because passed-in DALI transform 
# functions must return a tensor, rather than a list of two tensors as we would like.
# Adding support for a function that returns a list of two tensors would require
# special modifications in _dali_dataloader.py, which I decided not to do to 
# keep the implementation general, but could be done if you would like to use
# DALI in the combined dataloader with SlowFast.
test_dl = ComboDataLoader(
    dataloaders=[DataLoaderType.PYTORCH],
    dataloader_portions=[pytorch_portion],
    video_paths=val_paths,
    labels=val_labels,
    transform=transform,
    stride=2,
    step=32,
    sequence_length=16,
    fps=30,
    batch_size=8,
    pytorch_dataloader_kwargs={"num_workers": 10},
    pytorch_dataset_kwargs=dict(decoder="decord", short_side_scale=128),
    pytorch_additional_transform=slowfast_transform.PackPathway(),
)

train_dl = ComboDataLoader(
    dataloaders=[DataLoaderType.PYTORCH],
    dataloader_portions=[pytorch_portion],
    video_paths=train_paths,
    labels=train_labels,
    transform=transform,
    stride=2,
    step=32,
    sequence_length=16,
    fps=30,
    batch_size=8,
    pytorch_dataloader_kwargs={"num_workers": 10},
    pytorch_dataset_kwargs=dict(decoder="decord", short_side_scale=128),
    pytorch_additional_transform=slowfast_transform.PackPathway()
)

trainer = pytorch_lightning.Trainer(accelerator='gpu', devices=1, max_epochs=5)
pretrained_model = torch.hub.load('facebookresearch/pytorchvideo', "slowfast_r50", pretrained=True)
model = VideoClassificationLightningModule(pretrained_model)
trainer.fit(model=model, train_dataloaders=train_dl)
trainer.test(model=model, dataloaders=test_dl)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Using cache found in /root/.cache/torch/hub/facebookresearch_pytorchvideo_main
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type               | Params
------------------------------------------------------
0 | model          | Net                | 34.6 M
1 | micro_accuracy | MulticlassAccuracy | 0     
2 | micro_F1       | MulticlassF1Score  | 0     
------------------------------------------------------
34.6 M    Trainable params
0         Non-trainable params
34.6 M    Total params
138.266   Total estimated model params size (MB)


Epoch 0: : 0it [00:00, ?it/s]torch.Size([8, 3, 4, 256, 256])
torch.Size([8, 3, 16, 256, 256])


RuntimeError: input image (T: 4 H: 8 W: 8) smaller than kernel size (kT: 8 kH: 7 kW: 7)