In [2]:
from combo_dataloader import ComboDataLoader, ComboDLTransform, DataLoaderType
import torchvision
import time
import pytorch_lightning
import torch
import json

  from .autonotebook import tqdm as notebook_tqdm


### Setting up video inputs and model

**Load in video paths and labels**

In [3]:
with open("kinetics_classnames.json", "r") as f:
    kinetics_classnames_to_id = json.load(f)

In [4]:

# Create an id to label name mapping
kinetics_id_to_classname = {}
for k, v in kinetics_classnames_to_id.items():
    kinetics_id_to_classname[v] = str(k).replace('"', "")

In [5]:
annotation_file_path = "/home/maureen/kinetics/kinetics400/annotations/val.csv"
video_base_path = "/home/maureen/kinetics/kinetics400"
video_paths = []
labels = []
with open(annotation_file_path, 'r') as annotation_file:
	for i, line in enumerate(annotation_file):
		if i != 0: # skip column headers
			line = annotation_file.readline()
			label, youtube_id, time_start, time_end, split, is_cc = line.strip().split(',')
			label_id = kinetics_classnames_to_id.get(label)
			vpath = f'{video_base_path}/{split}/{youtube_id}_{int(time_start):06d}_{int(time_end):06d}.mp4'
			video_paths.append(vpath)
			labels.append(label_id)

**Set up transform**

In [6]:
transform = ComboDLTransform(
		crop=112,
		mean=[0.43216, 0.394666, 0.37645],
		std=[0.22803 , 0.22145 , 0.216989],
		short_side_scale=128
)


**Using only a DALI dataloader**

In [7]:
dl = ComboDataLoader(
		dataloaders=[DataLoaderType.DALI],
		dataloader_portions=[1],
		video_paths=video_paths,
		transform=transform,
		stride=2,
		step=32,
		sequence_length=16,
		fps=32,
		batch_size=8,
		dali_pipeline_kwargs={"num_threads": 10}
)

In [8]:
start = time.perf_counter()
for batch in dl:
    pass
dali_time = time.perf_counter() - start
dali_time

38.809071857482195

**Using PyTorch with a Decord backend**

Using decord, we can push the resize down to the decoding step to get over 2x speedup.

In [9]:
# Create the dataloader
dl = ComboDataLoader(
		dataloaders=[DataLoaderType.PYTORCH],
		dataloader_portions=[1],
		video_paths=video_paths,
		transform=transform,
		stride=2,
		step=32,
		sequence_length=16,
		fps=32,
		batch_size=8,
		pytorch_dataloader_kwargs={"num_workers": 10},
		pytorch_dataset_kwargs=dict(decoder="decord", short_side_scale=128),
)

In [10]:
start = time.perf_counter()
for batch in dl:
    pass
pytorch_decord_time = time.perf_counter() - start
pytorch_decord_time

13.961845997720957

**Using the optimal combination of DALI and PyTorch with a Decord backend**

In [11]:
dali_portion = int(round(pytorch_decord_time / (pytorch_decord_time + dali_time) * 100))
pytorch_portion = int(round(dali_time / (pytorch_decord_time + dali_time) * 100))

# Expected time with these portions
dali_portion / 100 * dali_time

10.090358682945372

In [12]:
train_videos = video_paths[:5]
train_labels = labels[:5]
test_videos = video_paths[5:10]
test_labels = labels[5:10]

In [13]:
# Create the dataloader
train_dl = ComboDataLoader(
    dataloaders=[DataLoaderType.PYTORCH ],
    dataloader_portions=[pytorch_portion],
    video_paths=train_videos,
    labels=train_labels,
    transform=transform,
    stride=2,
    step=32,
    sequence_length=16,
    fps=32,
    batch_size=8,
    pytorch_dataloader_kwargs={"num_workers": 10},
    pytorch_dataset_kwargs=dict(decoder="decord", short_side_scale=128),
    dali_pipeline_kwargs={"num_threads": 10},
)
test_dl = ComboDataLoader(
    dataloaders=[DataLoaderType.PYTORCH],
    dataloader_portions=[pytorch_portion],
    video_paths=test_videos,
    labels=test_labels,
    transform=transform,
    stride=2,
    step=32,
    sequence_length=16,
    fps=32,
    batch_size=8,
    pytorch_dataloader_kwargs={"num_workers": 10},
    pytorch_dataset_kwargs=dict(decoder="decord", short_side_scale=128),
    dali_pipeline_kwargs={"num_threads": 10},
)

### Train loop

In [19]:
class VideoClassificationLightningModule(pytorch_lightning.LightningModule):
    def __init__(self):
        super().__init__()
        model = torchvision.models.video.r3d_18()
        model = model.to("cuda")

        # Freeze all but last fully-connected layer
        for name, param in model.named_parameters():
            if not name.startswith("fc"):
                param.requires_grad = False
        self.model = model

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        # The model expects a video tensor of shape (B, C, T, H, W), which is the
        # format provided by the dataset
        pred = self.model(batch["frames"])
        labels = batch["label"].to(torch.long)

        # Compute cross entropy loss, loss.backwards will be called behind the scenes
        # by PyTorchLightning after being returned from this method.
        loss = torch.nn.functional.cross_entropy(pred, labels)

        self.log("train_loss", loss)
        return loss

    def test_step(self, batch, batch_idx):
        pred = self.model(batch["frames"])
        labels = batch["label"].to(torch.long)

        loss = torch.nn.functional.cross_entropy(pred, labels)

        self.log("test_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        pred = self.model(batch["frames"])
        labels = batch["label"].to(torch.long)
        
        loss = torch.nn.functional.cross_entropy(pred, labels)

        self.log("val_loss", loss)
        return loss

    def configure_optimizers(self):
        """
        Setup the Adam optimizer. Note, that this function also can return a lr scheduler, which is
        usually useful for training video models.
        """
        return torch.optim.Adam(self.parameters(), lr=1e-4)

In [20]:
iterator = iter(train_dl)

In [21]:
trainer = pytorch_lightning.Trainer(accelerator='gpu', devices=1, max_epochs=100)
model = VideoClassificationLightningModule()
trainer.fit(model=model, train_dataloaders=train_dl)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type        | Params
--------------------------------------
0 | model | VideoResNet | 33.4 M
--------------------------------------
205 K     Trainable params
33.2 M    Non-trainable params
33.4 M    Total params
133.486   Total estimated model params size (MB)


Epoch 99: : 10it [00:03,  3.28it/s, loss=1.61, v_num=35]

`Trainer.fit` stopped: `max_epochs=100` reached.


Epoch 99: : 10it [00:03,  2.98it/s, loss=1.61, v_num=35]


In [23]:
# TODO: separate test and val
trainer.test(model=model, dataloaders=test_dl)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0: : 9it [00:00, 13.10it/s] 
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_loss           12.400954246520996
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 12.400954246520996}]