# Profiling memory loading
Another crucial part of training a deep learning model is the dataset, and of course loading that dataset from disk. This task can be parallelized, so we will now investigate how the number of CPU loader workers affects the loading time.

In [5]:
import numpy as np
import torch
from torch import nn
from torch.profiler import profile, record_function, ProfilerActivity
import torchvision
import torchvision.transforms as transforms

In [6]:
# Root directory used for the dataset files.
path = "./data"
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Number of samples per batch.
batch_size = 64

In [7]:
# Per-sample preprocessing: tensor conversion followed by a per-channel
# normalization with mean 0.5 and std 0.5 on each of the three channels.
transform_train = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

# CIFAR-100 training split, downloaded into `path` when requested via download=True.
trainset = torchvision.datasets.CIFAR100(
    root=path,
    train=True,
    transform=transform_train,
    download=True,
)

Downloading https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz to ./data/cifar-100-python.tar.gz


100%|██████████| 169001437/169001437 [00:03<00:00, 48075102.93it/s]


Extracting ./data/cifar-100-python.tar.gz to ./data


In [8]:
def load_data(dataset, num_workers, sort_string):
    """Profile fetching one batch from ``dataset`` and moving it to ``device``.

    Builds a shuffled ``DataLoader`` using the module-level ``batch_size``,
    pulls a single batch inside a ``data_load`` profiler region, transfers it
    to the module-level ``device`` and prints the profiler summary table.

    Args:
        dataset: Dataset handed to ``torch.utils.data.DataLoader``.
        num_workers: Forwarded to the ``DataLoader`` as ``num_workers``.
        sort_string: Column key forwarded to ``table(sort_by=...)``.

    Returns:
        None. The summary table (at most 15 rows) is printed to stdout.
    """
    trainloader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)

    # A plain iterator suffices: the batch index from enumerate() was never used.
    batch_iter = iter(trainloader)

    # Only request CUDA tracing when a GPU is present; asking for it on a
    # CPU-only machine just makes the profiler warn and disable it.
    activities = [ProfilerActivity.CPU]
    if torch.cuda.is_available():
        activities.append(ProfilerActivity.CUDA)

    with profile(activities=activities, record_shapes=True) as prof:
        with record_function("data_load"):
            inputs, targets = next(batch_iter)
            inputs, targets = inputs.to(device), targets.to(device)

    print(prof.key_averages().table(sort_by=sort_string, row_limit=15))

### 1 loader

In [9]:
# One loader worker; table sorted by the "cuda_time_total" column.
workers, sort_string = 1, "cuda_time_total"
load_data(trainset, num_workers=workers, sort_string=sort_string)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                              data_load         1.34%       1.727ms       100.00%     129.146ms     129.146ms             1  
enumerate(DataLoader)#_MultiProcessingDataLoaderIter...        98.57%     127.305ms        98.66%     127.418ms     127.418ms             1  
                                            aten::empty         0.01%      16.000us         0.01%      16.000us       8.000us             2  
                                               aten::to         0.01%       8.000us         0.01%       8.000us       2.000us             4  
      

  warn("CUDA is not available, disabling CUDA profiling")


In [9]:
# One loader worker; table sorted by the "self_cuda_time_total" column.
workers, sort_string = 1, "self_cuda_time_total"
load_data(trainset, num_workers=workers, sort_string=sort_string)

### 2 loaders

In [None]:
# Two loader workers; table sorted by the "cuda_time_total" column.
workers, sort_string = 2, "cuda_time_total"
load_data(trainset, num_workers=workers, sort_string=sort_string)

In [None]:
# Two loader workers; table sorted by the "self_cuda_time_total" column.
workers, sort_string = 2, "self_cuda_time_total"
load_data(trainset, num_workers=workers, sort_string=sort_string)

### 4 loaders

In [None]:
# Four loader workers; table sorted by the "cuda_time_total" column.
workers, sort_string = 4, "cuda_time_total"
load_data(trainset, num_workers=workers, sort_string=sort_string)

In [None]:
# Four loader workers; table sorted by the "self_cuda_time_total" column.
workers, sort_string = 4, "self_cuda_time_total"
load_data(trainset, num_workers=workers, sort_string=sort_string)

### 8 loaders

In [None]:
# Eight loader workers; table sorted by the "cuda_time_total" column.
workers, sort_string = 8, "cuda_time_total"
load_data(trainset, num_workers=workers, sort_string=sort_string)

In [None]:
# Eight loader workers; table sorted by the "self_cuda_time_total" column.
workers, sort_string = 8, "self_cuda_time_total"
load_data(trainset, num_workers=workers, sort_string=sort_string)

### 16 loaders

In [None]:
# Sixteen loader workers; table sorted by the "cuda_time_total" column.
workers, sort_string = 16, "cuda_time_total"
load_data(trainset, num_workers=workers, sort_string=sort_string)

In [None]:
# Sixteen loader workers; table sorted by the "self_cuda_time_total" column.
workers, sort_string = 16, "self_cuda_time_total"
load_data(trainset, num_workers=workers, sort_string=sort_string)