In [1]:
%matplotlib inline

In [2]:
import torch
import torchvision.models as models
from torch.profiler import profile, record_function, ProfilerActivity

In [20]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [21]:
activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA, ProfilerActivity.XPU]
sort_by_keyword = "self_" + device + "_time_total"

model = models.resnet18().to(device)
inputs = torch.randn(5, 3, 224, 224).to(device)

with profile(activities=activities, profile_memory=True, record_shapes=True) as prof:
    with record_function("model_inference"):
        model(inputs)

prof.export_chrome_trace("trace.json")
print(prof.key_averages().table(sort_by=sort_by_keyword, row_limit=10))

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
           aten::cudnn_batch_norm        26.52%       1.931ms        31.78%       2.314ms     115.700us       1.424ms        19.54%       2.426ms     121.300us           0 b           0 b      49.72 Mb       1.00 Kb            20  
          aten::cudnn_convolution        10.74%     782.000us        1

In [18]:
activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]
sort_by_keyword = "self_" + device + "_time_total"

model = models.resnet18().to(device)
inputs = torch.randn(5, 3, 224, 224).to(device)

with profile(activities=activities, with_stack=True, profile_memory=True) as prof:
    with record_function("model_inference"):
        model(inputs)

# prof.export_chrome_trace("trace.json")
print(prof.key_averages(group_by_stack_n=5).table(sort_by='self_cpu_time_total', row_limit=5))

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                  model_inference        30.54%       2.473ms       100.00%       8.098ms       8.098ms       1.558ms        19.21%       8.109ms       8.109ms           0 b           0 b           0 b    -109.13 Mb             1  
           aten::cudnn_batch_norm        27.01%       2.187ms        3