# Profiling memory loading
Another crucial part of training a deep learning model is the dataset and, of course, loading that dataset from disk. This task can be parallelized, so we will now investigate how the number of CPU data-loader workers affects the loading time.

In [None]:
import numpy as np
import torch
from torch import nn
from torch.profiler import profile, record_function, ProfilerActivity
import torchvision
import torchvision.transforms as transforms

In [None]:
# Number of samples per batch; read by load_data when it builds its DataLoader.
batch_size = 64

# Root directory handed to the torchvision dataset.
path = "./data"

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Per-sample preprocessing: convert to a tensor, then normalize each of the
# three channels with mean 0.5 and standard deviation 0.5.
transform_train = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

# CIFAR-100 training split, downloaded into `path` if it is not already there.
trainset = torchvision.datasets.CIFAR100(
    root=path,
    train=True,
    download=True,
    transform=transform_train,
)

Files already downloaded and verified


In [None]:
def load_data(dataset, num_workers, sort_string):
    """Profile fetching one batch from ``dataset`` and copying it to ``device``.

    A shuffling ``DataLoader`` is built with the module-level ``batch_size``
    and the requested number of workers. A single batch is then pulled and
    moved to the module-level ``device`` inside a profiled ``data_load``
    region, and the first 15 rows of the profiler summary are printed.

    Args:
        dataset: Dataset passed to ``torch.utils.data.DataLoader``. Each batch
            it yields must unpack into an ``(inputs, targets)`` pair.
        num_workers: Forwarded to the ``DataLoader`` as ``num_workers``.
        sort_string: Key forwarded to ``key_averages().table(sort_by=...)``
            to order the printed rows.

    Returns:
        None. The profiler table is printed to standard output.
    """
    trainloader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)

    # Build the iterator before entering the profiler so that only fetching
    # the batch and the host-to-device copy fall inside the profiled region.
    batch_iter = iter(trainloader)

    # Only ask for CUDA traces when we actually run on a CUDA device; this
    # keeps the CPU fallback of `device` free of profiler warnings.
    activities = [ProfilerActivity.CPU]
    if device.type == "cuda":
        activities.append(ProfilerActivity.CUDA)

    with profile(activities=activities, record_shapes=True) as prof:
        with record_function("data_load"):
            inputs, targets = next(batch_iter)
            inputs, targets = inputs.to(device), targets.to(device)

    print(prof.key_averages().table(sort_by=sort_string, row_limit=15))

### 1 loader

In [None]:
# Profile loading one batch with a single DataLoader worker; rows are ordered
# by total CUDA time.
sort_string = "cuda_time_total"
workers = 1
load_data(dataset=trainset, num_workers=workers, sort_string=sort_string)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                              data_load         0.08%       4.334ms        99.97%        5.366s        5.366s       0.000us         0.00%      72.000us      72.000us             1  
                                               aten::to         0.00%      72.000us        99.46%        5.338s        1.335s       0.000us         0.00%      72.000us      18.000us             4  
         

### 2 loaders

In [None]:
# Profile loading one batch with two DataLoader workers.
# NOTE: this run orders rows by *self* CUDA time, whereas the 1-worker run
# orders by total CUDA time, so the top rows of the two tables differ.
sort_string = "self_cuda_time_total"
workers = 2
load_data(dataset=trainset, num_workers=workers, sort_string=sort_string)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            aten::copy_         0.09%      38.000us         8.80%       3.587ms       1.794ms      72.000us       100.00%      72.000us      36.000us             2  
                       Memcpy HtoD (Pageable -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      72.000us       100.00%      72.000us      36.000us             2  
         

### 4 loaders

In [None]:
# Profile loading one batch with four DataLoader workers.
# NOTE: this run orders rows by *self* CUDA time, whereas the 1-worker run
# orders by total CUDA time, so the top rows of the two tables differ.
sort_string = "self_cuda_time_total"
workers = 4
load_data(dataset=trainset, num_workers=workers, sort_string=sort_string)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            aten::copy_         0.17%      49.000us        16.54%       4.708ms       2.354ms      75.000us       100.00%      75.000us      37.500us             2  
                       Memcpy HtoD (Pageable -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      75.000us       100.00%      75.000us      37.500us             2  
         

### 8 loaders

In [None]:
# Profile loading one batch with eight DataLoader workers; rows are ordered
# by total CUDA time.
sort_string = "cuda_time_total"
workers = 8
load_data(dataset=trainset, num_workers=workers, sort_string=sort_string)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                              data_load         1.67%     253.000us        99.52%      15.049ms      15.049ms       0.000us         0.00%      71.000us      71.000us             1  
                                               aten::to        10.51%       1.589ms        35.33%       5.342ms       1.335ms       0.000us         0.00%      71.000us      17.750us             4  
         

### 16 loaders

In [None]:
# Profile loading one batch with sixteen DataLoader workers; rows are ordered
# by total CUDA time.
sort_string = "cuda_time_total"
workers = 16
load_data(dataset=trainset, num_workers=workers, sort_string=sort_string)

  cpuset_checked))


-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                              data_load        10.83%       5.393ms        99.88%      49.758ms      49.758ms       0.000us         0.00%      72.000us      72.000us             1  
                                               aten::to         3.07%       1.529ms         7.86%       3.917ms     979.250us       0.000us         0.00%      72.000us      18.000us             4  
         

# Analysis

Parallelizing data loading has the potential to give massive speedups. As can be seen above, the running time drops from over 5 s with one loader to tens of milliseconds with 2 loaders. The speedup continues as more loaders are added, up to 8 loaders. With 16 loaders, the loading time increases instead. The system emits a warning that this is above the recommended maximum, and this is the point where the communication overhead of the 16 processes outweighs the gain from additional parallelism. Overall, using 8 loaders gives a speedup of more than 300× over 1 loader, although this figure is likely inflated, since the first run may include one-time warm-up costs. Compared to 2 loaders, 8 loaders achieve a 2.6× speedup, which is worse than linear (a linear speedup would be 4×) but is still an important observation.