In [None]:
from resnet import resnet18, resnet34, resnet50, resnet101, resnet152
from resnet import resnet18_bottleneck, resnet34_bottleneck, resnet50_bottleneck, resnet101_bottleneck, resnet152_bottleneck
import numpy as np
import torch
from torch import nn

# Profiling resnets - Forward

Initially, we profile only the forward passes of various ResNets.

In [None]:
from torch.profiler import profile, record_function, ProfilerActivity

def profile_model(model, inputs, sort_by_self=True, row_limit=15):
    """Profile one CPU forward call of ``model`` and print the operator table.

    ``model(inputs)`` is first invoked once outside the profiler as a warmup,
    then invoked again under ``torch.profiler.profile`` (CPU activity only,
    with input shapes recorded) inside a ``record_function`` range labelled
    ``"model_inference"``.

    Args:
        model: Callable invoked as ``model(inputs)``.
        inputs: The single positional argument passed to ``model``.
        sort_by_self: If true, rows are sorted by ``self_cpu_time_total``;
            otherwise by ``cpu_time_total``.
        row_limit: Maximum number of rows in the printed table. Defaults to
            15, the value that was previously hard-coded.

    Returns:
        None. The averaged profiler table is printed to stdout.
    """
    sort_key = "self_cpu_time_total" if sort_by_self else "cpu_time_total"

    # Warmup: executed before the profiler starts so it is not measured.
    model(inputs)

    with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
        with record_function("model_inference"):
            model(inputs)

    print(prof.key_averages().table(sort_by=sort_key, row_limit=row_limit))

### ResNet-18

In [None]:
# Profile the forward pass of the plain ResNet-18 on a random batch.
model = resnet18()
inputs = torch.rand((32, 3, 224, 224))  # 32 samples per batch
profile_model(model, inputs, sort_by_self=True)

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
         aten::mkldnn_convolution        74.35%        1.955s        74.46%        1.958s      97.915ms            20  
          aten::native_batch_norm        11.88%     312.413ms        11.96%     314.581ms      15.729ms            20  
    aten::max_pool2d_with_indices        10.93%     287.447ms        10.93%     287.447ms     287.447ms             1  
                  aten::clamp_min         1.44%      37.938ms         1.44%      37.938ms       2.371ms            16  
                       aten::add_         0.69%      18.194ms         0.69%      18.194ms     649.786us            28  
                  model_inference       

### ResNet-34

In [None]:
# Profile the forward pass of the plain ResNet-34 on a random batch.
model = resnet34()
inputs = torch.rand((32, 3, 224, 224))  # 32 samples per batch
profile_model(model, inputs, sort_by_self=True)

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
         aten::mkldnn_convolution        83.78%        4.992s        83.85%        4.996s     138.788ms            36  
          aten::native_batch_norm         9.00%     536.570ms         9.06%     540.098ms      15.003ms            36  
    aten::max_pool2d_with_indices         4.90%     292.017ms         4.90%     292.017ms     292.017ms             1  
                  aten::clamp_min         1.18%      70.235ms         1.18%      70.235ms       2.195ms            32  
                       aten::add_         0.61%      36.113ms         0.61%      36.113ms     694.481us            52  
                  model_inference       

### ResNet-50

In [None]:
# Profile the forward pass of the plain ResNet-50 on a random batch.
model = resnet50()
inputs = torch.rand((32, 3, 224, 224))  # 32 samples per batch
profile_model(model, inputs, sort_by_self=True)

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
         aten::mkldnn_convolution        85.04%        5.300s        85.10%        5.304s     147.335ms            36  
          aten::native_batch_norm         8.11%     505.444ms         8.17%     509.183ms      14.144ms            36  
    aten::max_pool2d_with_indices         4.66%     290.535ms         4.66%     290.535ms     290.535ms             1  
                  aten::clamp_min         1.12%      69.988ms         1.12%      69.988ms       2.187ms            32  
                       aten::add_         0.56%      34.603ms         0.56%      34.603ms     665.442us            52  
                  model_inference       

### ResNet-101

In [None]:
# Profile the forward pass of the plain ResNet-101 on a random batch.
model = resnet101()
inputs = torch.rand((32, 3, 224, 224))  # 32 samples per batch
profile_model(model, inputs, sort_by_self=True)

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
         aten::mkldnn_convolution        86.31%        8.672s        86.38%        8.679s     123.993ms            70  
          aten::native_batch_norm         7.71%     774.586ms         7.77%     781.175ms      11.160ms            70  
    aten::max_pool2d_with_indices         3.74%     375.532ms         3.74%     375.532ms     375.532ms             1  
                  aten::clamp_min         1.12%     112.486ms         1.12%     112.486ms       1.704ms            66  
                       aten::add_         0.56%      56.467ms         0.56%      56.467ms     548.223us           103  
                  model_inference       

### ResNet-152

In [None]:
# Profile the forward pass of the plain ResNet-152 on a random batch.
model = resnet152()
inputs = torch.rand((32, 3, 224, 224))  # 32 samples per batch
profile_model(model, inputs, sort_by_self=True)

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
         aten::mkldnn_convolution        88.81%       13.168s        88.88%       13.178s     126.716ms           104  
          aten::native_batch_norm         6.98%        1.034s         7.05%        1.045s      10.046ms           104  
    aten::max_pool2d_with_indices         1.95%     289.665ms         1.95%     289.665ms     289.665ms             1  
                  aten::clamp_min         1.14%     169.396ms         1.14%     169.396ms       1.694ms           100  
                       aten::add_         0.56%      82.569ms         0.56%      82.569ms     536.162us           154  
                  model_inference       

Running standard ResNets on the CPU is fairly slow because there are several connections in each layer. Most of the time is spent in the convolution layers, which form a kind of performance "bottleneck" here.

# Bottleneck ResNets

### Bottleneck RN18

In [None]:
# Profile the forward pass of the bottleneck ResNet-18 on a random batch.
model = resnet18_bottleneck()
inputs = torch.rand((32, 3, 224, 224))  # 32 samples per batch
profile_model(model, inputs, sort_by_self=True)

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
         aten::mkldnn_convolution        39.38%     445.910ms        39.55%     447.853ms      15.995ms            28  
          aten::native_batch_norm        27.57%     312.195ms        27.69%     313.598ms      11.200ms            28  
    aten::max_pool2d_with_indices        26.14%     296.015ms        26.14%     296.015ms     296.015ms             1  
                  aten::clamp_min         2.47%      28.012ms         2.47%      28.012ms       1.167ms            24  
                  model_inference         2.13%      24.066ms       100.00%        1.132s        1.132s             1  
                       aten::add_       

### Bottleneck RN34

In [None]:
# Profile the forward pass of the bottleneck ResNet-34 on a random batch.
model = resnet34_bottleneck()
inputs = torch.rand((32, 3, 224, 224))  # 32 samples per batch
profile_model(model, inputs, sort_by_self=True)

This report only display top-level ops statistics
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                  model_inference         2.18%      32.382ms       100.00%        1.488s        1.488s             1  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.488s



### Bottleneck RN50

In [None]:
# Profile the forward pass of the bottleneck ResNet-50 on a random batch.
model = resnet50_bottleneck()
inputs = torch.rand((32, 3, 224, 224))  # 32 samples per batch
profile_model(model, inputs, sort_by_self=True)

This report only display top-level ops statistics
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                  model_inference         2.06%      28.524ms       100.00%        1.386s        1.386s             1  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.386s



### Bottleneck RN101

In [None]:
# Profile the forward pass of the bottleneck ResNet-101 on a random batch.
model = resnet101_bottleneck()
inputs = torch.rand((32, 3, 224, 224))  # 32 samples per batch
profile_model(model, inputs, sort_by_self=True)

This report only display top-level ops statistics
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                  model_inference         2.01%      38.184ms       100.00%        1.898s        1.898s             1  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.898s



### Bottleneck RN152

In [None]:
# Profile the forward pass of the bottleneck ResNet-152 on a random batch.
model = resnet152_bottleneck()
inputs = torch.rand((32, 3, 224, 224))  # 32 samples per batch
profile_model(model, inputs, sort_by_self=True)

This report only display top-level ops statistics
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                  model_inference         2.29%      60.595ms       100.00%        2.644s        2.644s             1  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.644s



As we can see, the bottleneck ResNets already run much faster, even when running on the CPU. This is because the bottleneck layers have far fewer parameters and therefore run significantly faster.

## Large ResNets

Huge gains are observed, even with large-mode ResNets where the number of internal channels has been quadrupled:


### Large ResNet-50

In [None]:
# Profile the forward pass of the large-mode bottleneck ResNet-50 on a random batch.
model = resnet50_bottleneck(large_mode=True)
inputs = torch.rand((32, 3, 224, 224))  # 32 samples per batch
profile_model(model, inputs, sort_by_self=True)

This report only display top-level ops statistics
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                  model_inference         0.98%      28.007ms       100.00%        2.845s        2.845s             1  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.845s



### Large ResNet-101 


In [None]:
# Profile the forward pass of the large-mode bottleneck ResNet-101 on a random batch.
model = resnet101_bottleneck(large_mode=True)
inputs = torch.rand((32, 3, 224, 224))  # 32 samples per batch
profile_model(model, inputs, sort_by_self=True)

This report only display top-level ops statistics
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                  model_inference         1.34%      51.297ms       100.00%        3.827s        3.827s             1  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 3.827s



### Large ResNet-152

In [None]:
# Profile the forward pass of the large-mode bottleneck ResNet-152 on a random batch.
model = resnet152_bottleneck(large_mode=True)
inputs = torch.rand((32, 3, 224, 224))  # 32 samples per batch
profile_model(model, inputs, sort_by_self=True)

This report only display top-level ops statistics
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                  model_inference         1.27%      68.666ms       100.00%        5.389s        5.389s             1  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 5.389s



# Analysis

Adding bottleneck layers drastically reduces computation time of the forward pass. Standard ResNets (without bottleneck layers) are about 5 times slower than the bottleneck counterparts. For excessively large ResNets, where the number of hidden channels is quadrupled, execution time is only approximately doubled and remains at roughly 1/3 the time of the corresponding plain ResNet.

Using bottleneck layers drastically brings down the proportion of time spent performing convolution. For the plain nets, we observe that between 80% and 90% of execution time is spent on convolution (the MKL-DNN/oneDNN implementation, `aten::mkldnn_convolution`). When employing bottlenecks, this is instead 40-45%. Even for the extra large ResNet152, the computation time spent on convolution is only 70%, and its absolute execution time is lower than that of the plain ResNet18 (3.72s vs 3.85s).

### Forward pass and Backward pass


Finally, we investigate the execution time of a full forward and backward pass.

In [None]:
# Profile a forward pass followed by loss evaluation and a backward pass
# for the large-mode bottleneck ResNet-152 on the CPU.
model = resnet152_bottleneck(large_mode=True)
inputs = torch.rand((32, 3, 224, 224))    # 32 samples per batch
targets = torch.randint(0, 1000, (32,))   # one integer target in [0, 1000) per sample

# Un-profiled warmup forward call.
model(inputs)

# From here on, autograd also tracks gradients with respect to the inputs.
inputs.requires_grad = True
loss = nn.CrossEntropyLoss()

with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
    # Forward call, recorded under its own label.
    with record_function("model_inference"):
        y = model(inputs)
    # Loss evaluation and the backward call share the "backpropagation" label.
    with record_function("backpropagation"):
        out = loss(y, targets)
        out.backward()

# Rows sorted by total CPU time, limited to 15 rows.
print(
    prof.key_averages().table(
        sort_by="cpu_time_total",
        row_limit=15,
    )
)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        backpropagation         0.13%      20.795ms        60.12%        9.469s        9.469s             1  
autograd::engine::evaluate_function: ConvolutionBack...         0.09%      13.563ms        52.29%        8.236s      53.135ms           155  
                                   ConvolutionBackward0         0.02%       3.295ms        51.67%        8.139s      52.508ms           155  
                             aten::convolution_backward        51.48%        8.108s        51.65%        8.135s      52.486ms           155  
      

We observe a large increase in execution time, where the majority of the additional time is unsurprisingly spent in the backpropagation. Though a forward pass was relatively efficient, clocking in at 6.3 seconds (with some added overhead from the previous experiments), the total wall-clock time was almost 18.2s, with 65% spent in the backward pass. Despite the increased efficiency gained from bottleneck layers, much greater speedups are required to make training feasible. This motivates the migration onto GPUs.