출처: PyTorch 튜토리얼

In [1]:
import torch
import torchvision.models as models
from torch.profiler import profile, record_function, ProfilerActivity

In [2]:
# Workload for the profiler: a ResNet-18 built with no arguments.
model = models.resnet18()
# Random batch of 5 inputs, each of shape 3 x 224 x 224.
inputs = torch.randn((5, 3, 224, 224))

In [4]:
# Profile a single forward pass on the CPU and record operator input shapes.
# The pass itself is wrapped in a user-defined "model_inference" label.
with profile(
    activities=[ProfilerActivity.CPU],
    record_shapes=True,
) as prof, record_function("model_inference"):
    model(inputs)

In [5]:
# Averaged events as a text table: at most 10 rows, sorted by "cpu_time_total".
print(
    prof.key_averages().table(
        sort_by="cpu_time_total",
        row_limit=10,
    )
)

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                  model_inference         2.09%      14.157ms        99.97%     677.043ms     677.043ms             1  
                     aten::conv2d         0.04%     250.000us        75.40%     510.641ms      25.532ms            20  
                aten::convolution         0.46%       3.082ms        75.36%     510.391ms      25.520ms            20  
               aten::_convolution         0.06%     405.000us        74.90%     507.309ms      25.365ms            20  
         aten::mkldnn_convolution        74.77%     506.404ms        74.85%     506.904ms      25.345ms            20  
                 aten::batch_norm       

In [8]:
# Same table, but events are additionally grouped by their input shapes
# (available because the profile was recorded with record_shapes=True).
print(
    prof.key_averages(group_by_input_shape=True).table(
        sort_by="cpu_time_total",
        row_limit=10,
    )
)

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  --------------------------------------------------------------------------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls                                                                      Input Shapes  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  --------------------------------------------------------------------------------  
                  model_inference         2.09%      14.157ms        99.97%     677.043ms     677.043ms             1                                                                                []  
                     aten::conv2d         0.01%      86.000us        16.90%     114.459ms      28.615ms             4                             [[5, 64, 56, 56], [64, 64, 3, 3], [], [], [], 

In [2]:
# GPU variant: move a fresh model and input batch to CUDA, then record both
# CPU and CUDA activity for one labelled forward pass.
model = models.resnet18().cuda()
inputs = torch.randn((5, 3, 224, 224)).cuda()

with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    record_shapes=True,
) as prof, record_function("model_inference"):
    model(inputs)

# At most 10 rows, sorted by "cuda_time_total".
print(
    prof.key_averages().table(
        sort_by="cuda_time_total",
        row_limit=10,
    )
)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        model_inference         1.33%       5.932ms        99.96%     447.139ms     447.139ms       0.000us         0.00%      11.719ms      11.719ms             1  
                                           aten::conv2d         0.02%      97.000us        79.63%     356.209ms      17.810ms       0.000us         0.00%       9.889ms     494.450us            20  
         

In [3]:
# CPU run with memory profiling switched on, using a fresh model and batch.
model = models.resnet18()
inputs = torch.randn((5, 3, 224, 224))

with profile(
    activities=[ProfilerActivity.CPU],
    record_shapes=True,
    profile_memory=True,
) as prof:
    model(inputs)

# At most 10 rows, sorted by "self_cpu_memory_usage".
print(
    prof.key_averages().table(
        sort_by="self_cpu_memory_usage",
        row_limit=10,
    )
)

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                      aten::empty         0.19%     961.000us         0.19%     961.000us       6.006us      94.84 Mb      94.84 Mb           160  
    aten::max_pool2d_with_indices         9.75%      48.288ms         9.75%      48.288ms      48.288ms      11.48 Mb      11.48 Mb             1  
                       aten::mean         0.07%     339.000us         1.07%       5.280ms     251.429us      28.75 Kb      28.75 Kb            21  
                      aten::addmm         1.94%       9.598ms         1.94%       9.612ms       9.612ms      19.

In [4]:
# Same memory profile, this time sorted by "cpu_memory_usage" (10 rows max).
print(
    prof.key_averages().table(
        sort_by="cpu_memory_usage",
        row_limit=10,
    )
)

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                      aten::empty         0.19%     961.000us         0.19%     961.000us       6.006us      94.84 Mb      94.84 Mb           160  
                 aten::batch_norm         0.02%     101.000us        10.73%      53.172ms       2.659ms      47.41 Mb           0 b            20  
     aten::_batch_norm_impl_index         0.04%     188.000us        10.71%      53.071ms       2.654ms      47.41 Mb           0 b            20  
          aten::native_batch_norm         9.46%      46.894ms        10.67%      52.855ms       2.643ms      47.

In [5]:
# Profile one forward pass on the GPU (CPU + CUDA activity) and export the
# recording as a Chrome trace file.
model = models.resnet18().cuda()
# Standard-normal batch, generated with torch.randn like every other input
# batch in this notebook.
inputs = torch.randn(5, 3, 224, 224).cuda()
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    model(inputs)

# Relative path: the file lands in the current working directory.
prof.export_chrome_trace("trace.json")

In [6]:
# Profile one more forward pass, this time with with_stack=True so that the
# table can show a "Source Location" for each event.
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    with_stack=True,
) as prof:
    model(inputs)

# Group by the top 5 stack frames and show the 2 rows ranked first by
# "self_cuda_time_total".
print(
    prof.key_averages(group_by_stack_n=5).table(
        sort_by="self_cuda_time_total",
        row_limit=2,
    )
)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ---------------------------------------------------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  Source Location                                            
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ---------------------------------------------------------  
                                aten::cudnn_convolution        30.62%       3.949ms        37.52%       4.839ms     241.950us       9.876ms        84.41%       9.876ms     493.800us            20  runpy.py(85): _run_code        

In [7]:
# Write the recorded stacks to a text file, using "self_cuda_time_total" as
# the metric; this file is the input for the flame graph script.
prof.export_stacks(
    path="/tmp/profiler_stacks.txt",
    metric="self_cuda_time_total",
)

In [8]:
from torch.profiler import schedule

# Example schedule: skip the first 10 steps, then repeat twice a cycle of
# 5 waiting steps, 1 warm-up step and 3 active (recorded) steps.
my_schedule = schedule(skip_first=10, wait=5, warmup=1, active=3, repeat=2)

In [12]:
def trace_handler(p):
    """Print a summary table for the profiler and save its Chrome trace.

    Intended to be passed as ``on_trace_ready`` to ``profile``.

    Args:
        p: Profiler object. It must provide ``key_averages()``,
            ``export_chrome_trace(path)`` and a ``step_num`` attribute.

    Returns:
        None. The table is printed and the trace is written to
        ``/tmp/trace_<step_num>.json``.
    """
    output = p.key_averages().table(sort_by="self_cuda_time_total", row_limit=10)
    print(output)
    # Write under /tmp (like the exported stacks file). A relative "tmp/"
    # directory is never created by this notebook, so exporting into it
    # would fail.
    p.export_chrome_trace(f"/tmp/trace_{p.step_num}.json")

# Run 8 forward passes under a wait=1 / warmup=1 / active=2 schedule;
# trace_handler is registered as the on_trace_ready callback.
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    schedule=torch.profiler.schedule(wait=1, warmup=1, active=2),
    on_trace_ready=trace_handler,
) as p:
    for step in range(8):
        model(inputs)
        # Signal the profiler that one step has completed.
        p.step()

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                aten::cudnn_convolution        19.52%       4.921ms        25.44%       6.413ms     160.325us      19.740ms        79.55%      19.740ms     493.500us            40  
           volta_scudnn_128x64_3dconv_fprop_small_nn_v1         0.00%       0.000us         0.00%       0.000us       0.000us       8.596ms        34.64%       8.596ms       1.228ms             7  
volta_scu

In [19]:
def trace_handler(p):
    """Print the top-10 summary table of the given profiler.

    Variant of the handler that only prints; nothing is written to disk.

    Args:
        p: Profiler object providing ``key_averages()``.
    """
    print(
        p.key_averages().table(
            sort_by="self_cuda_time_total",
            row_limit=10,
        )
    )
    # Chrome trace export is disabled in this variant:
    # p.export_chrome_trace("tmp/trace_" + str(p.step_num) + ".json")

# Repeat the scheduled run (8 steps, wait=1 / warmup=1 / active=2) with the
# trace_handler defined in this cell as the on_trace_ready callback.
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    schedule=torch.profiler.schedule(wait=1, warmup=1, active=2),
    on_trace_ready=trace_handler,
) as p:
    for step in range(8):
        model(inputs)
        p.step()  # mark the end of this step for the profiler

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                aten::cudnn_convolution        14.29%       4.070ms        18.42%       5.248ms     131.200us      19.741ms        71.63%      19.741ms     493.525us            40  
           volta_scudnn_128x64_3dconv_fprop_small_nn_v1         0.00%       0.000us         0.00%       0.000us       0.000us      11.053ms        40.10%      11.053ms       1.228ms             9  
volta_scu

In [15]:
# Fetch the FlameGraph scripts (brendangregg/FlameGraph) into ./FlameGraph,
# relative to the current working directory.
!git clone https://github.com/brendangregg/FlameGraph

Cloning into 'FlameGraph'...
remote: Enumerating objects: 1147, done.[K
remote: Counting objects: 100% (28/28), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 1147 (delta 13), reused 26 (delta 13), pack-reused 1119[K
Receiving objects: 100% (1147/1147), 1.90 MiB | 14.77 MiB/s, done.
Resolving deltas: 100% (659/659), done.


In [16]:
# Use the %cd magic rather than !cd: a `!` command runs in its own subshell,
# so `!cd` would not change the notebook's working directory for later cells.
%cd FlameGraph

/content/FlameGraph


In [18]:
# Shell command (left disabled) that turns the stacks file written to
# /tmp/profiler_stacks.txt into an SVG flame graph. The relative ./flamegraph.pl
# path presumably requires the working directory to be the FlameGraph checkout;
# to run it from a notebook cell, prefix the command with `!`.
#./flamegraph.pl --title "CUDA time" --countname "us." /tmp/profiler_stacks.txt > perf_viz.svg