In [None]:
from resnet import resnet18, resnet34, resnet50, resnet101, resnet152
from resnet import resnet18_bottleneck, resnet34_bottleneck, resnet50_bottleneck, resnet101_bottleneck, resnet152_bottleneck
import numpy as np
import torch
from torch import nn

# Profiling resnets on GPU - Forward

Here we profile only the forward passes of various resnets executed on a GPU. The execution times are expected to be significantly lower than on CPUs.

In [None]:
# Target device handle; the cells below pass it to `.to(device)` for every
# model and input batch so that the profiled work runs on the GPU.
device = torch.device("cuda")

In [None]:
from torch.profiler import profile, record_function, ProfilerActivity

def profile_model(model, inputs, sort_by_self=True):
    """Profile one forward pass of ``model`` on ``inputs`` and print a summary table.

    A single un-profiled warm-up forward pass is executed first. The profiled
    pass is wrapped in a ``record_function`` range labelled "model_inference",
    with CPU and CUDA activities and input shapes recorded.

    Args:
        model: Callable (typically an ``nn.Module``) invoked as ``model(inputs)``.
        inputs: The argument passed to ``model``.
        sort_by_self: If True, the table is sorted by "self_cuda_time_total";
            otherwise by "cuda_time_total".

    Returns:
        None. The first 15 rows of the averaged profiler table are printed.
    """
    sort_by_string = "self_cuda_time_total" if sort_by_self else "cuda_time_total"

    model(inputs)   # Warmup
    # CUDA work is queued asynchronously, so wait for the warm-up pass to
    # finish before the profiler starts; otherwise leftover warm-up work can
    # overlap with the measured pass. Guarded so CPU-only runs do not fail.
    if torch.cuda.is_available():
        torch.cuda.synchronize()

    with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        record_shapes=True,
    ) as prof:
        with record_function("model_inference"):
            model(inputs)

    print(prof.key_averages().table(sort_by=sort_by_string, row_limit=15))

### ResNet-18

In [None]:
# Profile the forward pass of the plain ResNet-18.
model = resnet18().to(device)
# Allocate the random batch directly on the target device instead of creating
# it on the host and copying it across.
inputs = torch.rand(32, 3, 224, 224, device=device)    # Batch size 32
profile_model(model, inputs)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                aten::cudnn_convolution        12.91%       6.082ms        13.84%       6.518ms     325.900us      26.386ms        68.36%      29.420ms       1.471ms            20  
                                  volta_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us      11.150ms        28.89%      11.150ms     857.692us            13  
       cu

### ResNet-34

In [None]:
# Profile the forward pass of the plain ResNet-34.
model = resnet34().to(device)
# Allocate the random batch directly on the target device instead of creating
# it on the host and copying it across.
inputs = torch.rand(32, 3, 224, 224, device=device)    # Batch size 32
profile_model(model, inputs)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                aten::cudnn_convolution         2.52%       2.983ms         3.28%       3.886ms     107.944us      47.194ms        71.70%      47.194ms       1.311ms            36  
                                  volta_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us      25.083ms        38.11%      25.083ms     864.931us            29  
       cu

### ResNet-50

In [None]:
# Profile the forward pass of the plain ResNet-50.
model = resnet50().to(device)
# Allocate the random batch directly on the target device instead of creating
# it on the host and copying it across.
inputs = torch.rand(32, 3, 224, 224, device=device)    # Batch size 32
profile_model(model, inputs)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                aten::cudnn_convolution         5.25%       5.781ms         6.31%       6.947ms     192.972us      46.319ms        71.44%      46.319ms       1.287ms            36  
                                  volta_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us      24.462ms        37.73%      24.462ms     843.517us            29  
       cu

### ResNet-101

In [None]:
# Profile the forward pass of the plain ResNet-101.
model = resnet101().to(device)
# Allocate the random batch directly on the target device instead of creating
# it on the host and copying it across.
inputs = torch.rand(32, 3, 224, 224, device=device)    # Batch size 32
profile_model(model, inputs)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                aten::cudnn_convolution         2.69%       4.549ms         4.13%       6.998ms      99.971us      62.125ms        71.37%      62.125ms     887.500us            70  
                                  volta_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us      32.372ms        37.19%      32.372ms     513.841us            63  
void cudn

### ResNet-152

In [None]:
# Profile the forward pass of the plain ResNet-152.
model = resnet152().to(device)
# Allocate the random batch directly on the target device instead of creating
# it on the host and copying it across.
inputs = torch.rand(32, 3, 224, 224, device=device)    # Batch size 32
profile_model(model, inputs)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                aten::cudnn_convolution         3.37%       7.075ms        14.74%      30.990ms     297.981us      88.257ms        72.09%      88.257ms     848.625us           104  
                                  volta_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us      47.984ms        39.19%      47.984ms     494.680us            97  
void cudn

Initially, we observe that the execution times of all plain resnets have been significantly reduced compared to any of the CPU resnets, even when the CPU networks are implemented with bottleneck layers. As a comparison, the largest GPU network, ResNet-152 with plain layers, executes in a total of 210ms CPU time and 122ms GPU time, which is much lower than even the ResNet-18 with bottleneck layers on CPU, which executed in ~1.132 seconds (see CPU forward notebook).

# Bottleneck ResNets

### Bottleneck RN18

In [None]:
# Profile the forward pass of the bottleneck variant of ResNet-18.
model = resnet18_bottleneck().to(device)
# Allocate the random batch directly on the target device instead of creating
# it on the host and copying it across.
inputs = torch.rand(32, 3, 224, 224, device=device)    # Batch size 32
profile_model(model, inputs)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                aten::cudnn_convolution        13.87%       2.118ms        23.32%       3.560ms     127.143us       4.005ms        30.48%       4.005ms     143.036us            28  
                                 aten::cudnn_batch_norm         9.20%       1.404ms        16.03%       2.447ms      87.393us       3.730ms        28.39%       3.730ms     133.214us            28  
         

### Bottleneck RN34

In [None]:
# Profile the forward pass of the bottleneck variant of ResNet-34.
model = resnet34_bottleneck().to(device)
# Allocate the random batch directly on the target device instead of creating
# it on the host and copying it across.
inputs = torch.rand(32, 3, 224, 224, device=device)    # Batch size 32
profile_model(model, inputs)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                aten::cudnn_convolution        13.66%       3.445ms        21.82%       5.502ms     105.808us       6.270ms        31.66%       6.270ms     120.577us            52  
                                             aten::add_         7.04%       1.776ms        11.16%       2.815ms      23.458us       5.381ms        27.17%       5.381ms      44.842us           120  
         

### Bottleneck RN50

In [None]:
# Profile the forward pass of the bottleneck variant of ResNet-50.
model = resnet50_bottleneck().to(device)
# Allocate the random batch directly on the target device instead of creating
# it on the host and copying it across.
inputs = torch.rand(32, 3, 224, 224, device=device)   # Batch size 32
profile_model(model, inputs)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                aten::cudnn_convolution        13.78%       3.710ms        20.58%       5.540ms     106.538us       6.309ms        31.78%       6.309ms     121.327us            52  
                                             aten::add_         6.49%       1.748ms         9.76%       2.626ms      21.883us       5.383ms        27.11%       5.383ms      44.858us           120  
         

### Bottleneck RN101

In [None]:
# Profile the forward pass of the bottleneck variant of ResNet-101.
model = resnet101_bottleneck().to(device)
# Allocate the random batch directly on the target device instead of creating
# it on the host and copying it across.
inputs = torch.rand(32, 3, 224, 224, device=device)    # Batch size 32
profile_model(model, inputs)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                aten::cudnn_convolution        12.35%       7.389ms        18.14%      10.856ms     105.398us      12.212ms        38.14%      12.212ms     118.563us           103  
                                             aten::add_         7.35%       4.401ms        11.51%       6.890ms      28.828us       8.334ms        26.03%       8.334ms      34.870us           239  
         

### Bottleneck RN152

In [None]:
# Profile the forward pass of the bottleneck variant of ResNet-152.
model = resnet152_bottleneck().to(device)
# Allocate the random batch directly on the target device instead of creating
# it on the host and copying it across.
inputs = torch.rand(32, 3, 224, 224, device=device)    # Batch size 32
profile_model(model, inputs)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                aten::cudnn_convolution        12.74%      10.334ms        18.87%      15.308ms      99.403us      24.380ms        45.01%      24.380ms     158.312us           154  
                                             aten::add_         7.36%       5.966ms        11.87%       9.626ms      26.888us      12.776ms        23.59%      12.776ms      35.687us           358  
         

With bottleneck layers and GPU, we achieve an even greater efficiency increase, where even the largest ResNet-152 runs in 81ms of CPU time and 54ms of GPU time. Combining GPU acceleration with bottleneck layers is an excellent configuration to achieve high-performing networks. By using a smaller network, such as ResNet-50, we can trade some accuracy for tremendous speedups: the bottleneck implementation of ResNet-50 achieved 26.9ms of CPU time and 19.9ms of CUDA time.

## Large ResNets

Huge gains are observed even with large-mode resnets, where the number of internal channels has been quadrupled:


### Large ResNet-50

In [None]:
# Profile the forward pass of the bottleneck ResNet-50 built with large_mode=True.
model = resnet50_bottleneck(large_mode=True).to(device)
# Allocate the random batch directly on the target device instead of creating
# it on the host and copying it across.
inputs = torch.rand(32, 3, 224, 224, device=device)    # Batch size 32
profile_model(model, inputs)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                aten::cudnn_convolution         7.82%       3.251ms        12.49%       5.195ms      98.019us      23.002ms        58.79%      23.002ms     434.000us            53  
           cudnn_volta_scudnn_128x128_relu_medium_nn_v1         0.00%       0.000us         0.00%       0.000us       0.000us       8.930ms        22.82%       8.930ms     343.462us            26  
         

### Large ResNet-101 

In [None]:
# Profile the forward pass of the bottleneck ResNet-101 built with large_mode=True.
model = resnet101_bottleneck(large_mode=True).to(device)
# Allocate the random batch directly on the target device instead of creating
# it on the host and copying it across.
inputs = torch.rand(32, 3, 224, 224, device=device)    # Batch size 32
profile_model(model, inputs)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                aten::cudnn_convolution         4.47%       5.505ms         6.68%       8.230ms      79.135us      50.307ms        66.40%      50.307ms     483.721us           104  
           cudnn_volta_scudnn_128x128_relu_medium_nn_v1         0.00%       0.000us         0.00%       0.000us       0.000us      26.045ms        34.37%      26.045ms     434.083us            60  
         

### Large ResNet-152

In [None]:
# Profile the forward pass of the bottleneck ResNet-152 built with large_mode=True.
model = resnet152_bottleneck(large_mode=True).to(device)
# Allocate the random batch directly on the target device instead of creating
# it on the host and copying it across.
inputs = torch.rand(32, 3, 224, 224, device=device)    # Batch size 32
profile_model(model, inputs)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                aten::cudnn_convolution         5.74%       9.684ms         9.27%      15.649ms     100.961us      71.551ms        67.34%      71.551ms     461.619us           155  
           cudnn_volta_scudnn_128x128_relu_medium_nn_v1         0.00%       0.000us         0.00%       0.000us       0.000us      38.723ms        36.44%      38.723ms     430.256us            90  
         

# Analysis

As expected, we see significant gains in performance when running the various resnets on GPU. With the standard ResNets we see up to a 50x speedup compared to running on CPU, with execution times dropping from the order of a few seconds to a few hundred milliseconds.

After adding bottleneck layers we get even faster execution times, on the order of less than one hundred milliseconds (the largest one, Bottleneck RN152, runs a forward pass in less than 150ms). Furthermore, when running on GPU we get very fast runtimes for Large ResNets, with Large ResNet-152 running in a fraction of the time it takes to run any sort of resnet on the CPU.

### Forward pass and Backward pass

Finally, we investigate the execution time of a full forward and backward pass.

In [None]:
# Profile one full step (forward pass, loss, backward pass) of the bottleneck
# ResNet-152 built with large_mode=True.
model = resnet152_bottleneck(large_mode=True).to(device)
# Create the batch and the labels directly on the target device instead of
# creating them on the host and copying them across.
inputs = torch.rand(32, 3, 224, 224, device=device)    # Batch size 32
targets = torch.randint(1000, size=(32,), device=device)    # Integer labels in [0, 1000)

# NOTE(review): this makes autograd also compute the gradient with respect to
# the input batch, which an ordinary training step does not need. It is kept
# so the profiled work matches the original measurement -- confirm intent.
inputs.requires_grad_(True)
loss = nn.CrossEntropyLoss()

# Warmup: run the forward AND the backward pass once, so that one-time
# start-up costs of the backward pass are not attributed to the profiled step.
loss(model(inputs), targets).backward()
# Drop the warm-up gradients so the profiled backward pass starts from a
# clean state rather than accumulating into existing buffers.
model.zero_grad(set_to_none=True)
inputs.grad = None
# CUDA work is queued asynchronously; wait for the warm-up to finish before
# the profiler starts. Guarded so CPU-only runs do not fail.
if torch.cuda.is_available():
    torch.cuda.synchronize()

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) as prof:
    with record_function("model_inference"):
        y = model(inputs)
    with record_function("backpropagation"):
        # The loss evaluation is part of this range together with backward().
        out = loss(y, targets)
        out.backward()

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=15))
    

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
autograd::engine::evaluate_function: ConvolutionBack...         0.63%       2.769ms        18.94%      83.292ms     537.368us       0.000us         0.00%     135.664ms     875.252us           155  
                                   ConvolutionBackward0         0.25%       1.096ms        17.93%      78.828ms     508.568us       0.000us         0.00%     130.647ms     842.884us           155  
         

We see here once again that, as expected, backpropagation runs much faster on GPUs. Training the model becomes much more feasible, as we can now run backpropagation in just a few hundred milliseconds, with a single training step taking less than a second. This makes it significantly more reasonable to train a complex model such as Large ResNet-152.