In [5]:
from combo_dataloader import ComboDataLoader, ComboDLTransform, DataLoaderType
import torchvision
import time

### Setting up video inputs and model

**Load in video paths**

In [6]:
annotation_file_path = "/home/maureen/kinetics/kinetics400_10classes/annotations/val.csv"
video_base_path = "/home/maureen/kinetics/kinetics400_10classes"
video_paths = []
with open(annotation_file_path, 'r') as annotation_file:
	for i, line in enumerate(annotation_file):
		if i != 0: # skip column headers
			line = annotation_file.readline()
			label, youtube_id, time_start, time_end, split, is_cc = line.strip().split(',')
			vpath = f'{video_base_path}/{split}/{youtube_id}_{int(time_start):06d}_{int(time_end):06d}.mp4'
			video_paths.append(vpath)

**Set up transform**

In [7]:
transform = ComboDLTransform(
		crop=112,
		mean=[0.43216, 0.394666, 0.37645],
		std=[0.22803 , 0.22145 , 0.216989],
		short_side_scale=128
)

### Comparing dataloader configurations

**Using only a PyTorch dataloader**

This configuration creates a single subprocess for a pytorch dataloader to load video inputs. Note the `num_workers` kwarg, which will get passed to the torch DataLoader constructor.

In [8]:
# Create the dataloader
dl = ComboDataLoader(
		dataloaders=[DataLoaderType.PYTORCH],
		dataloader_portions=[1],
		video_paths=video_paths,
		transform=transform,
		stride=2,
		step=32,
		sequence_length=16,
		fps=32,
		batch_size=8,
		pytorch_dataloader_kwargs={"num_workers": 10},
)

Let's time how long it takes to process all the videos.

In [9]:
start = time.perf_counter()
for batch in dl:
    pass
pytorch_time = time.perf_counter() - start
pytorch_time

142.3154000339564

**Using only a DALI dataloader**

In [10]:
dl = ComboDataLoader(
		dataloaders=[DataLoaderType.DALI],
		dataloader_portions=[1],
		video_paths=video_paths,
		transform=transform,
		stride=2,
		step=32,
		sequence_length=16,
		fps=32,
		batch_size=8,
		dali_pipeline_kwargs={"num_threads": 10}
)

In [11]:
start = time.perf_counter()
for batch in dl:
    pass
dali_time = time.perf_counter() - start
dali_time

[/opt/dali/dali/operators/reader/loader/video_loader.h:180] ``file_list_include_preceding_frame`` uses the default value False. In future releases, the default value will be changed to True.


101.49571724503767

**Using the optimal combination of DALI and PyTorch**

Based on the times measured above, we allocate the videos in an optimal split between DALI and PyTorch to take advantage of concurrency between the CPU and GPU.

In [12]:
dali_portion = int(round(pytorch_time / (pytorch_time + dali_time) * 100))
pytorch_portion = int(round(dali_time / (pytorch_time + dali_time) * 100))

# Expected time with these portions
# We won't get this ideal time, since there is overhead
dali_portion / 100 * dali_time

58.867516002121846

In [13]:
# Create the dataloader
dl = ComboDataLoader(
		dataloaders=[DataLoaderType.PYTORCH, DataLoaderType.DALI],
		dataloader_portions=[pytorch_portion, dali_portion],
		video_paths=video_paths,
		transform=transform,
		stride=2,
		step=32,
		sequence_length=16,
		fps=32,
		batch_size=8,
		pytorch_dataloader_kwargs={"num_workers": 10},
		dali_pipeline_kwargs={"num_threads": 10}
)

In [14]:
start = time.perf_counter()
for batch in dl:
    pass
end = time.perf_counter() - start
end

[/opt/dali/dali/operators/reader/loader/video_loader.h:180] ``file_list_include_preceding_frame`` uses the default value False. In future releases, the default value will be changed to True.


70.62431819702033

**Using PyTorch with a Decord backend**

Using decord, we can push the resize down to the decoding step to get over 2x speedup.

In [15]:
# Create the dataloader
dl = ComboDataLoader(
		dataloaders=[DataLoaderType.PYTORCH],
		dataloader_portions=[1],
		video_paths=video_paths,
		transform=transform,
		stride=2,
		step=32,
		sequence_length=16,
		fps=32,
		batch_size=8,
		pytorch_dataloader_kwargs={"num_workers": 10},
		pytorch_dataset_kwargs=dict(decoder="decord", short_side_scale=128),
)

In [16]:
start = time.perf_counter()
for batch in dl:
    pass
pytorch_decord_time = time.perf_counter() - start
pytorch_decord_time

37.828518219990656

In [17]:
pytorch_time / pytorch_decord_time

3.762119340924889

**Using the optimal combination of DALI and PyTorch with a Decord backend**

In [18]:
dali_portion = int(round(pytorch_decord_time / (pytorch_decord_time + dali_time) * 100))
pytorch_portion = int(round(dali_time / (pytorch_decord_time + dali_time) * 100))

# Expected time with these portions
dali_portion / 100 * dali_time

27.403843656160173

In [19]:
# Create the dataloader
dl = ComboDataLoader(
		dataloaders=[DataLoaderType.PYTORCH, DataLoaderType.DALI],
		dataloader_portions=[pytorch_portion, dali_portion],
		video_paths=video_paths,
		transform=transform,
		stride=2,
		step=32,
		sequence_length=16,
		fps=32,
		batch_size=8,
		pytorch_dataloader_kwargs={"num_workers": 10},
		pytorch_dataset_kwargs=dict(decoder="decord", short_side_scale=128),
		dali_pipeline_kwargs={"num_threads": 10},
)

In [20]:
start = time.perf_counter()
for batch in dl:
    pass
end = time.perf_counter() - start
end

[/opt/dali/dali/operators/reader/loader/video_loader.h:180] ``file_list_include_preceding_frame`` uses the default value False. In future releases, the default value will be changed to True.


33.9872514490271

In [21]:
# Speedup 
pytorch_time / end

4.187317125287289