In [1]:
from combo_dataloader import ComboDataLoader, ComboDLTransform, DataLoaderType
import torchvision
import torch
import torch.nn as nn
import torchmetrics
import time
import pytorch_lightning
import json
import json
from typing import List, Tuple
import os
import random

  from .autonotebook import tqdm as notebook_tqdm


### Setting up video inputs and model

**Load in video paths and labels**

In [2]:
with open("kinetics_classnames.json", "r") as f:
    kinetics_classnames_json = json.load(f)

In [3]:
kinetics_classnames_to_id = {}
for k, v in kinetics_classnames_json.items():
    kinetics_classnames_to_id[str(k).replace('"', "")] = v
# Create an id to label name mapping
kinetics_id_to_classname = {}
for k, v in kinetics_classnames_to_id.items():
    kinetics_id_to_classname[v] = k

In [4]:
null_videos = {
    "/home/maureen/kinetics/kinetics400_10classes/train/xxUezLcXkDs_000256_000266.mp4",
    "/home/maureen/kinetics/kinetics400_10classes/train/CUxsn4YXksI_000119_000129.mp4"
}

In [5]:
def load_video_paths(annotation_file_path, video_base_path, shuffle=True) -> Tuple[List[str], List[int]]:
	video_paths = []
	labels = []
	with open(annotation_file_path, 'r') as annotation_file:
		for i, line in enumerate(annotation_file):
			if i != 0: # skip column headers
				line = annotation_file.readline()
				if line:
					label, youtube_id, time_start, time_end, split, is_cc = line.strip().split(',')
					label_id = kinetics_classnames_to_id.get(label)
					vpath = f'{video_base_path}/{split}/{youtube_id}_{int(time_start):06d}_{int(time_end):06d}.mp4'

					if os.path.exists(vpath) and vpath not in null_videos:
						video_paths.append(vpath)
						labels.append(label_id)

	if shuffle:
		combined = list(zip(video_paths, labels))
		random.shuffle(combined)
		video_paths, labels = zip(*combined)

	return video_paths, labels

In [6]:
video_paths, train_labels = load_video_paths(
    '/home/maureen/kinetics/kinetics400_10classes/annotations/train.csv',
    '/home/maureen/kinetics/kinetics400_10classes'
)

**Set up train/test pipeline**

In [7]:
class VideoClassificationLightningModule(pytorch_lightning.LightningModule):
    def __init__(self):
        super().__init__()
        model = torchvision.models.video.r3d_18(weights=torchvision.models.video.R3D_18_Weights.KINETICS400_V1)
        model = model.to("cuda")

        # Identify the fully connected layer whose weights you want to randomize
        fc_layer = model.fc

        # Randomly initialize the weights of the fc_layer
        nn.init.xavier_uniform_(fc_layer.weight)
        nn.init.zeros_(fc_layer.bias)

        # Freeze all but last fully-connected layer
        for name, param in model.named_parameters():
            if not name.startswith("fc"):
                param.requires_grad = False
        self.model = model

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        # The model expects a video tensor of shape (B, C, T, H, W), which is the
        # format provided by the dataset
        pred = self.model(batch["frames"])

        labels = batch["label"].to(torch.long)

        # Compute cross entropy loss, loss.backwards will be called behind the scenes
        # by PyTorchLightning after being returned from this method.
        loss = torch.nn.functional.cross_entropy(pred, labels)

        self.log("train_loss", loss)
        return loss

    def test_step(self, batch, batch_idx):
        pred = self.model(batch["frames"])
        labels = batch["label"].to(torch.long)

        loss = torch.nn.functional.cross_entropy(pred, labels)
        pred_labels = torch.argmax(pred, dim=1)

        self.log("test_loss", loss)

        return loss

    def configure_optimizers(self):
        """
        Setup the Adam optimizer. Note, that this function also can return a lr scheduler, which is
        usually useful for training video models.
        """
        return torch.optim.Adam(self.parameters(), lr=1e-4)

In [8]:
trainer = pytorch_lightning.Trainer(accelerator='gpu', devices=1, max_epochs=2)
model = VideoClassificationLightningModule()

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


**Set up transform**

In [9]:
transform = ComboDLTransform(
		crop=112,
		mean=[0.43216, 0.394666, 0.37645],
		std=[0.22803 , 0.22145 , 0.216989],
		short_side_scale=128
)

### Comparing dataloader configurations

**Using only a PyTorch dataloader**

This configuration creates a single subprocess for a pytorch dataloader to load video inputs. Note the `num_workers` kwarg, which will get passed to the torch DataLoader constructor.

In [10]:
# Create the dataloader
dl = ComboDataLoader(
		dataloaders=[DataLoaderType.PYTORCH],
		dataloader_portions=[1],
		video_paths=video_paths,
		labels=train_labels,
		transform=transform,
		stride=2,
		step=32,
		sequence_length=16,
		fps=32,
		batch_size=8,
		pytorch_dataloader_kwargs={"num_workers": 10},
)

Let's time how long it takes to train on all the videos.

In [11]:
start = time.perf_counter()
trainer.fit(model=model, train_dataloaders=dl)
pytorch_time = time.perf_counter() - start
dl.shutdown()
pytorch_time

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type        | Params
--------------------------------------
0 | model | VideoResNet | 33.4 M
--------------------------------------
205 K     Trainable params
33.2 M    Non-trainable params
33.4 M    Total params
133.486   Total estimated model params size (MB)


Epoch 1: : 3539it [28:00,  2.11it/s, loss=2.21, v_num=37]

`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: : 3539it [28:01,  2.11it/s, loss=2.21, v_num=37]


3358.7186316158623

**Using only a DALI dataloader**

In [12]:
dl = ComboDataLoader(
		dataloaders=[DataLoaderType.DALI],
		dataloader_portions=[1],
		video_paths=video_paths,
		labels=train_labels,
		transform=transform,
		stride=2,
		step=32,
		sequence_length=16,
		fps=32,
		batch_size=8,
		dali_pipeline_kwargs={"num_threads": 10}
)

In [13]:
start = time.perf_counter()
model = VideoClassificationLightningModule()
trainer = pytorch_lightning.Trainer(accelerator='gpu', devices=1, max_epochs=2)
trainer.fit(model=model, train_dataloaders=dl)
dali_time = time.perf_counter() - start
dl.shutdown()
dali_time

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type        | Params
--------------------------------------
0 | model | VideoResNet | 33.4 M
--------------------------------------
205 K     Trainable params
33.2 M    Non-trainable params
33.4 M    Total params
133.486   Total estimated model params size (MB)


Epoch 0: : 0it [00:00, ?it/s]

[/opt/dali/dali/operators/reader/loader/video_loader.h:180] ``file_list_include_preceding_frame`` uses the default value False. In future releases, the default value will be changed to True.


Epoch 1: : 2825it [19:09,  2.46it/s, loss=1.92, v_num=38]

`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: : 2825it [19:09,  2.46it/s, loss=1.92, v_num=38]


2314.4302513890434

**Using the optimal combination of DALI and PyTorch**

Based on the times measured above, we allocate the videos in an optimal split between DALI and PyTorch to take advantage of concurrency between the CPU and GPU.

In [14]:
dali_portion = int(round(pytorch_time / (pytorch_time + dali_time) * 100))
pytorch_portion = int(round(dali_time / (pytorch_time + dali_time) * 100))

# Expected time with these portions
# We won't get this ideal time, since there is overhead
dali_portion / 100 * dali_time

1365.5138483195356

In [15]:
# Create the dataloader
dl = ComboDataLoader(
		dataloaders=[DataLoaderType.PYTORCH, DataLoaderType.DALI],
		dataloader_portions=[pytorch_portion, dali_portion],
		video_paths=video_paths,
		labels=train_labels,
		transform=transform,
		stride=2,
		step=32,
		sequence_length=16,
		fps=32,
		batch_size=8,
		pytorch_dataloader_kwargs={"num_workers": 10},
		dali_pipeline_kwargs={"num_threads": 10}
)

In [16]:
start = time.perf_counter()
model = VideoClassificationLightningModule()
trainer = pytorch_lightning.Trainer(accelerator='gpu', devices=1, max_epochs=2)
trainer.fit(model=model, train_dataloaders=dl)
end = time.perf_counter() - start
dl.shutdown()
end

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type        | Params
--------------------------------------
0 | model | VideoResNet | 33.4 M
--------------------------------------
205 K     Trainable params
33.2 M    Non-trainable params
33.4 M    Total params
133.486   Total estimated model params size (MB)


Epoch 0: : 0it [00:00, ?it/s]

[/opt/dali/dali/operators/reader/loader/video_loader.h:180] ``file_list_include_preceding_frame`` uses the default value False. In future releases, the default value will be changed to True.


Epoch 1: : 3126it [12:00,  4.34it/s, loss=2.06, v_num=39]

`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: : 3126it [12:00,  4.34it/s, loss=2.06, v_num=39]


1451.3871485558338

**Using PyTorch with a Decord backend**

Using decord, we can push the resize down to the decoding step to get over 2x speedup.

In [17]:
# Create the dataloader
dl = ComboDataLoader(
		dataloaders=[DataLoaderType.PYTORCH],
		dataloader_portions=[1],
		video_paths=video_paths,
		labels=train_labels,
		transform=transform,
		stride=2,
		step=32,
		sequence_length=16,
		fps=32,
		batch_size=8,
		pytorch_dataloader_kwargs={"num_workers": 10},
		pytorch_dataset_kwargs=dict(decoder="decord", short_side_scale=128),
)

In [18]:
start = time.perf_counter()
model = VideoClassificationLightningModule()
trainer = pytorch_lightning.Trainer(accelerator='gpu', devices=1, max_epochs=2)
trainer.fit(model=model, train_dataloaders=dl)
pytorch_decord_time = time.perf_counter() - start
dl.shutdown()
pytorch_decord_time

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type        | Params
--------------------------------------
0 | model | VideoResNet | 33.4 M
--------------------------------------
205 K     Trainable params
33.2 M    Non-trainable params
33.4 M    Total params
133.486   Total estimated model params size (MB)


Epoch 1: : 3539it [08:17,  7.11it/s, loss=2.25, v_num=40]

`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: : 3539it [08:18,  7.10it/s, loss=2.25, v_num=40]


999.626766884001

In [19]:
pytorch_time / pytorch_decord_time

3.359972684690641

**Using the optimal combination of DALI and PyTorch with a Decord backend**

In [20]:
dali_portion = int(round(pytorch_decord_time / (pytorch_decord_time + dali_time) * 100))
pytorch_portion = int(round(dali_time / (pytorch_decord_time + dali_time) * 100))

# Expected time with these portions
dali_portion / 100 * dali_time

694.329075416713

In [21]:
# Create the dataloader
dl = ComboDataLoader(
		dataloaders=[DataLoaderType.PYTORCH, DataLoaderType.DALI],
		dataloader_portions=[pytorch_portion, dali_portion],
		video_paths=video_paths,
		labels=train_labels,
		transform=transform,
		stride=2,
		step=32,
		sequence_length=16,
		fps=32,
		batch_size=8,
		pytorch_dataloader_kwargs={"num_workers": 10},
		pytorch_dataset_kwargs=dict(decoder="decord", short_side_scale=128),
		dali_pipeline_kwargs={"num_threads": 10},
)

In [22]:
start = time.perf_counter()
model = VideoClassificationLightningModule()
trainer = pytorch_lightning.Trainer(accelerator='gpu', devices=1, max_epochs=2)
trainer.fit(model=model, train_dataloaders=dl)
end = time.perf_counter() - start
dl.shutdown()
end

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type        | Params
--------------------------------------
0 | model | VideoResNet | 33.4 M
--------------------------------------
205 K     Trainable params
33.2 M    Non-trainable params
33.4 M    Total params
133.486   Total estimated model params size (MB)


Epoch 0: : 0it [00:00, ?it/s]

[/opt/dali/dali/operators/reader/loader/video_loader.h:180] ``file_list_include_preceding_frame`` uses the default value False. In future releases, the default value will be changed to True.


Epoch 1: : 3324it [07:52,  7.03it/s, loss=2.12, v_num=41]

`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: : 3324it [07:53,  7.03it/s, loss=2.12, v_num=41]


950.7100973417982

In [23]:
# Speedup 
pytorch_time / end

3.5328525919803493