In [1]:
from combo_dataloader import ComboDataLoader, ComboDLTransform, DataLoaderType
import torchvision
import time
import torch.nn as nn
import pytorch_lightning
import torch
import json
from typing import List, Tuple
import os
import torchmetrics
import random

  from .autonotebook import tqdm as notebook_tqdm


### Setting up video inputs and model

**Load in video paths and labels**

In [2]:
with open("kinetics_classnames.json", "r") as f:
    kinetics_classnames_json = json.load(f)

In [3]:
# Create a label name to id mapping
kinetics_classnames_to_id = {}
for k, v in kinetics_classnames_json.items():
    kinetics_classnames_to_id[str(k).replace('"', "")] = v

# Create an id to label name mapping
kinetics_id_to_classname = {}
for k, v in kinetics_classnames_to_id.items():
    kinetics_id_to_classname[v] = k

In [4]:
# These videos are somehow corrupt and can't be read
# When running on your system, you can delete this cell
null_videos = {
    "/home/maureen/kinetics/kinetics400_10classes/train/xxUezLcXkDs_000256_000266.mp4",
    "/home/maureen/kinetics/kinetics400_10classes/train/CUxsn4YXksI_000119_000129.mp4"
}

In [5]:
# Loads video paths and labels from a CSV file
def load_video_paths(annotation_file_path, video_base_path, shuffle=True) -> Tuple[List[str], List[int]]:
	video_paths = []
	labels = []
	with open(annotation_file_path, 'r') as annotation_file:
		for i, line in enumerate(annotation_file):
			if i != 0: # skip column headers
				line = annotation_file.readline()
				if line:
					label, youtube_id, time_start, time_end, split, is_cc = line.strip().split(',')
					label_id = kinetics_classnames_to_id.get(label)
					vpath = f'{video_base_path}/{split}/{youtube_id}_{int(time_start):06d}_{int(time_end):06d}.mp4'

					if os.path.exists(vpath) and vpath not in null_videos:
						video_paths.append(vpath)
						labels.append(label_id)

	if shuffle:
		combined = list(zip(video_paths, labels))
		random.shuffle(combined)
		video_paths, labels = zip(*combined)

	return video_paths, labels

In [6]:
# Load in test and training paths
# Replace these paths for your own system
val_paths, val_labels = load_video_paths(
    '/home/maureen/kinetics/kinetics400_10classes/annotations/val.csv',
    '/home/maureen/kinetics/kinetics400_10classes'
)
train_paths, train_labels = load_video_paths(
    '/home/maureen/kinetics/kinetics400_10classes/annotations/train.csv',
    '/home/maureen/kinetics/kinetics400_10classes'
)

**Set up transform**

In [7]:
# This is the specified transform for the R3D_18 model used here
transform = ComboDLTransform(
		crop=112,
		mean=[0.43216, 0.394666, 0.37645],
		std=[0.22803 , 0.22145 , 0.216989],
		short_side_scale=128
)

#### Determine the optimal video allocation to DALI/Torch dataloaders

To figure out the best allocation of videos to dataloading subprocesses, we run just dataloading on a small subset of the input videos to get an approximation of the relative efficiency of DALI and PyTorch dataloaders for this setup.

**Using only a DALI dataloader**

In [8]:
dl = ComboDataLoader(
		dataloaders=[DataLoaderType.DALI],
		dataloader_portions=[1],
		video_paths=train_paths[:50],
		transform=transform,
		stride=2,
		step=32,
		sequence_length=16,
		fps=32,
		batch_size=8,
		dali_pipeline_kwargs={"num_threads": 10}
)

In [9]:
start = time.perf_counter()
for batch in dl:
    pass
dali_time = time.perf_counter() - start
dali_time

[/opt/dali/dali/operators/reader/loader/video_loader.h:180] ``file_list_include_preceding_frame`` uses the default value False. In future releases, the default value will be changed to True.


20.474931203003507

**Using PyTorch with a Decord backend**

In [10]:
# Create the dataloader
dl = ComboDataLoader(
		dataloaders=[DataLoaderType.PYTORCH],
		dataloader_portions=[1],
		video_paths=train_paths[:50],
		transform=transform,
		stride=2,
		step=32,
		sequence_length=16,
		fps=32,
		batch_size=8,
		pytorch_dataloader_kwargs={"num_workers": 10},
		pytorch_dataset_kwargs=dict(decoder="decord", short_side_scale=128),
)

In [11]:
start = time.perf_counter()
for batch in dl:
    pass
pytorch_decord_time = time.perf_counter() - start
pytorch_decord_time

8.870198810996953

**Using the optimal combination of DALI and PyTorch with a Decord backend**

In [12]:
dali_portion = int(round(pytorch_decord_time / (pytorch_decord_time + dali_time) * 100))
pytorch_portion = int(round(dali_time / (pytorch_decord_time + dali_time) * 100))

# Expected time with these portions
dali_portion / 100 * dali_time

6.142479360901052

In [14]:
# Create the dataloader
train_dl = ComboDataLoader(
    dataloaders=[DataLoaderType.PYTORCH, DataLoaderType.DALI],
    dataloader_portions=[pytorch_portion, dali_portion],
    video_paths=train_paths,
    labels=train_labels,
    transform=transform,
    stride=2,
    step=32,
    sequence_length=16,
    fps=32,
    batch_size=8,
    pytorch_dataloader_kwargs={"num_workers": 10},
    pytorch_dataset_kwargs=dict(decoder="decord", short_side_scale=128),
    dali_pipeline_kwargs={"num_threads": 10},
)
test_dl = ComboDataLoader(
    dataloaders=[DataLoaderType.PYTORCH, DataLoaderType.DALI],
    dataloader_portions=[pytorch_portion, dali_portion],
    video_paths=val_paths,
    labels=val_labels,
    transform=transform,
    stride=2,
    step=32,
    sequence_length=16,
    fps=32,
    batch_size=8,
    pytorch_dataloader_kwargs={"num_workers": 10},
    pytorch_dataset_kwargs=dict(decoder="decord", short_side_scale=128),
    dali_pipeline_kwargs={"num_threads": 10},
)

### Train loop

To train, we use pytorchlightning to set up a training loop, as well as testing. We use the pretrained `r3d_18` model from `torchvision`, but randomly initialize the weights of the last fully-connected layer so we can demonstrate training. 

Note that the videos we used (see `train.csv` and `val.csv`) only represent 10 of the 400 Kinetics400 classes, so we must use a micro average (weighted by the occurrence of the classes in the dataset) to get meaningful metrics from testing.

In [15]:
class VideoClassificationLightningModule(pytorch_lightning.LightningModule):
    def __init__(self):
        super().__init__()
        model = torchvision.models.video.r3d_18(weights=torchvision.models.video.R3D_18_Weights.KINETICS400_V1)
        model = model.to("cuda")

        # Identify the fully connected layer whose weights you want to randomize
        fc_layer = model.fc

        # Randomly initialize the weights of the fc_layer
        nn.init.xavier_uniform_(fc_layer.weight)
        nn.init.zeros_(fc_layer.bias)

        # Freeze all but last fully-connected layer
        for name, param in model.named_parameters():
            if not name.startswith("fc"):
                param.requires_grad = False
        self.model = model

        # Since we're only using 10/400 classes, micro average makes sense
        self.micro_accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=400, average='micro')
        self.micro_F1 = torchmetrics.classification.MulticlassF1Score(num_classes=400, average='micro')

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        # The model expects a video tensor of shape (B, C, T, H, W), which is the
        # format provided by the dataset
        pred = self.model(batch["frames"])

        labels = batch["label"].to(torch.long)

        # Compute cross entropy loss, loss.backwards will be called behind the scenes
        # by PyTorchLightning after being returned from this method.
        loss = torch.nn.functional.cross_entropy(pred, labels)

        self.log("train_loss", loss)
        return loss

    def test_step(self, batch, batch_idx):
        pred = self.model(batch["frames"])
        labels = batch["label"].to(torch.long)

        loss = torch.nn.functional.cross_entropy(pred, labels)
        pred_labels = torch.argmax(pred, dim=1)
        micro_acc = self.micro_accuracy(pred_labels, labels)
        micro_f1 = self.micro_F1(pred_labels, labels)

        self.log("test_loss", loss)
        self.log("test_micro_accuracy", micro_acc, on_epoch=True)
        self.log("test_micro_f1", micro_f1)

        return loss

    def configure_optimizers(self):
        """
        Setup the Adam optimizer. Note, that this function also can return a lr scheduler, which is
        usually useful for training video models.
        """
        return torch.optim.Adam(self.parameters(), lr=1e-4)

In [16]:
trainer = pytorch_lightning.Trainer(accelerator='gpu', devices=1, max_epochs=5)
model = VideoClassificationLightningModule()
trainer.fit(model=model, train_dataloaders=train_dl)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type               | Params
------------------------------------------------------
0 | model          | VideoResNet        | 33.4 M
1 | micro_accuracy | MulticlassAccuracy | 0     
2 | micro_F1       | MulticlassF1Score  | 0     
------------------------------------------------------
205 K     Trainable params
33.2 M    Non-trainable params
33.4 M    Total params
133.486   Total estimated model params size (MB)


Epoch 0: : 0it [00:00, ?it/s]

[/opt/dali/dali/operators/reader/loader/video_loader.h:180] ``file_list_include_preceding_frame`` uses the default value False. In future releases, the default value will be changed to True.


Epoch 4: : 3325it [07:53,  7.03it/s, loss=2.09, v_num=68]

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: : 3325it [07:53,  7.02it/s, loss=2.09, v_num=68]


In [17]:
trainer.test(model=model, dataloaders=test_dl)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

[/opt/dali/dali/operators/reader/loader/video_loader.h:180] ``file_list_include_preceding_frame`` uses the default value False. In future releases, the default value will be changed to True.


Testing DataLoader 0: : 1it [00:00,  6.78it/s]

  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"


Testing DataLoader 0: : 236it [00:33,  7.12it/s]

  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"


Testing DataLoader 0: : 269it [00:37,  7.17it/s]

  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"


Testing DataLoader 0: : 279it [00:38,  7.21it/s]

  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"


Testing DataLoader 0: : 282it [00:38,  7.24it/s]

  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"


Testing DataLoader 0: : 287it [00:39,  7.29it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_loss           1.0219424962997437
   test_micro_accuracy       0.686170220375061
      test_micro_f1          0.686170220375061
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"
  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"


[{'test_loss': 1.0219424962997437,
  'test_micro_accuracy': 0.686170220375061,
  'test_micro_f1': 0.686170220375061}]