# PyTorch 2.0 Compile

NOTE: `torch.compile` works best on recent NVIDIA GPUs such as the H100, V100, or A10.

In [None]:
# Temporary fix for Colab: refresh the dynamic-linker cache for the NVIDIA libraries in /usr/lib64-nvidia
! ldconfig /usr/lib64-nvidia

/sbin/ldconfig.real: /usr/local/lib/libtbbbind.so.3 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_0.so.3 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc_proxy.so.2 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_5.so.3 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbb.so.12 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc.so.2 is not a symbolic link



In [None]:
import torch
from torch.profiler import profile, record_function, ProfilerActivity

In [None]:
torch.__version__

'2.1.0+cu118'

In [None]:
from transformers import ViTFeatureExtractor, ViTForImageClassification
from PIL import Image
import requests
import time

In [None]:
# Every model and input tensor in this notebook is placed on the first CUDA GPU.
device = torch.device("cuda", 0)

In [None]:
# Sample image used as the input for every benchmark and profile run below.
url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
# A timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() surfaces HTTP errors here instead of letting PIL fail on
# an error page with a confusing "cannot identify image file" message.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

In [None]:
# Single source of truth for the checkpoint so the preprocessor and the model
# weights can never be loaded from two different checkpoints by accident.
MODEL_CHECKPOINT = 'google/vit-large-patch32-384'

# NOTE(review): newer transformers releases reportedly deprecate
# ViTFeatureExtractor in favour of ViTImageProcessor — confirm against the
# installed version before switching.
feature_extractor = ViTFeatureExtractor.from_pretrained(MODEL_CHECKPOINT)
model = ViTForImageClassification.from_pretrained(MODEL_CHECKPOINT)

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.23G [00:00<?, ?B/s]

In [None]:
# Move the weights to the benchmark device and make inference mode explicit
# (dropout disabled) so the timings below do not depend on how the model
# happened to be loaded.
model = model.to(device).eval()

In [None]:
def benchmark_model(model, feature_extractor, image, batch_size,
                    warmup_runs=10, timed_runs=10):
    """Measure the average forward-pass latency and throughput of ``model``.

    ``image`` is repeated ``batch_size`` times, preprocessed with
    ``feature_extractor`` and moved to the module-level ``device``. A number
    of untimed warm-up passes is followed by the timed passes, all under
    ``torch.no_grad()``.

    CUDA kernels are launched asynchronously, so the device is synchronized
    immediately before the timer starts and again before it stops. Without
    this, the wall clock can stop before the GPU has finished the work.

    Args:
        model: Callable invoked as ``model(**inputs)``.
        feature_extractor: Callable invoked as
            ``feature_extractor(images=[...], return_tensors="pt")``; the
            result must support ``.to(device)`` and ``**`` unpacking.
        image: The single image that is replicated to build the batch.
        batch_size: Number of copies of ``image`` in the batch.
        warmup_runs: Untimed forward passes executed first (default 10).
        timed_runs: Timed forward passes that are averaged (default 10).

    Returns:
        ``(avg_duration, throughput)``: mean latency in milliseconds and
        images processed per second.

    Raises:
        ValueError: If ``timed_runs`` is less than 1.
    """
    if timed_runs < 1:
        raise ValueError("timed_runs must be at least 1")

    def _sync():
        # Block until all queued CUDA work has finished; no-op on non-CUDA devices.
        if torch.device(device).type == "cuda":
            torch.cuda.synchronize(device)

    inputs = feature_extractor(images=[image] * batch_size, return_tensors="pt")
    inputs = inputs.to(device)

    print("🔥 warming up model...")
    with torch.no_grad():
        for _ in range(warmup_runs):
            _ = model(**inputs)
    _sync()

    print(f"performing benchmark with {batch_size=}")
    durations = []
    with torch.no_grad():
        for _ in range(timed_runs):
            _sync()  # make sure nothing from a previous pass is still running
            start_time = time.perf_counter()  # monotonic, high-resolution clock
            _ = model(**inputs)
            _sync()  # wait for the GPU to actually finish this pass
            end_time = time.perf_counter()
            durations.append((end_time - start_time) * 1000)  # Convert to milliseconds

    avg_duration = sum(durations) / len(durations)
    throughput = (batch_size / avg_duration) * 1000  # images per second

    print(f"Average inference time with {batch_size=}: {avg_duration=:.2f} ms")
    print(f"Model throughput with {batch_size=}: {throughput=:.2f} images/s")

    return avg_duration, throughput

In [None]:
# Eager (uncompiled) baseline, batch size 1.
benchmark_model(model=model, feature_extractor=feature_extractor, image=image, batch_size=1)

🔥 warming up model...
performing benchmark with batch_size=1
Average inference time with batch_size=1: avg_duration=34.95 ms
Model throughput with batch_size=1: throughput=28.61 images/s


(34.94832515716553, 28.613674489490318)

In [None]:
# Eager (uncompiled) baseline, batch size 2.
benchmark_model(model=model, feature_extractor=feature_extractor, image=image, batch_size=2)

🔥 warming up model...
performing benchmark with batch_size=2
Average inference time with batch_size=2: avg_duration=60.64 ms
Model throughput with batch_size=2: throughput=32.98 images/s


(60.636043548583984, 32.983682360435026)

In [None]:
# Eager (uncompiled) baseline, batch size 4.
benchmark_model(model=model, feature_extractor=feature_extractor, image=image, batch_size=4)

🔥 warming up model...
performing benchmark with batch_size=4
Average inference time with batch_size=4: avg_duration=96.27 ms
Model throughput with batch_size=4: throughput=41.55 images/s


(96.2719202041626, 41.548979095018076)

In [None]:
# Eager (uncompiled) baseline, batch size 8.
benchmark_model(model=model, feature_extractor=feature_extractor, image=image, batch_size=8)

🔥 warming up model...
performing benchmark with batch_size=8
Average inference time with batch_size=8: avg_duration=198.40 ms
Model throughput with batch_size=8: throughput=40.32 images/s


(198.40080738067627, 40.32241655473817)

In [None]:
# Eager (uncompiled) baseline, batch size 16.
benchmark_model(model=model, feature_extractor=feature_extractor, image=image, batch_size=16)

🔥 warming up model...
performing benchmark with batch_size=16
Average inference time with batch_size=16: avg_duration=433.54 ms
Model throughput with batch_size=16: throughput=36.91 images/s


(433.53726863861084, 36.9057083609975)

In [None]:
# Eager (uncompiled) baseline, batch size 32.
benchmark_model(model=model, feature_extractor=feature_extractor, image=image, batch_size=32)

🔥 warming up model...
performing benchmark with batch_size=32
Average inference time with batch_size=32: avg_duration=808.31 ms
Model throughput with batch_size=32: throughput=39.59 images/s


(808.3103656768799, 39.588753724818524)

In [None]:
# Wrap the eager model with torch.compile (default settings). The original
# `model` stays bound so both variants can be benchmarked and profiled.
compiled_model = torch.compile(model=model)

In [None]:
# Run the compiled model through the benchmark at each batch size, leaving a
# blank line between the reports.
for bs in (1, 2, 4, 8, 16):
    benchmark_model(
        compiled_model,
        feature_extractor,
        image,
        batch_size=bs,
    )
    print()

🔥 warming up model...
performing benchmark with batch_size=1
Average inference time with batch_size=1: avg_duration=32.27 ms
Model throughput with batch_size=1: throughput=30.98 images/s

🔥 warming up model...


  torch.has_cuda,
  torch.has_cudnn,
  torch.has_mps,
  torch.has_mkldnn,


performing benchmark with batch_size=2
Average inference time with batch_size=2: avg_duration=59.22 ms
Model throughput with batch_size=2: throughput=33.77 images/s

🔥 warming up model...
performing benchmark with batch_size=4
Average inference time with batch_size=4: avg_duration=93.82 ms
Model throughput with batch_size=4: throughput=42.63 images/s

🔥 warming up model...
performing benchmark with batch_size=8
Average inference time with batch_size=8: avg_duration=209.60 ms
Model throughput with batch_size=8: throughput=38.17 images/s

🔥 warming up model...
performing benchmark with batch_size=16
Average inference time with batch_size=16: avg_duration=430.69 ms
Model throughput with batch_size=16: throughput=37.15 images/s



In [None]:
def profile_model(model, image, trace_filename):
    """Profile one forward pass of ``model`` and export a Chrome trace.

    ``image`` is preprocessed with the module-level ``feature_extractor`` and
    moved to the module-level ``device``. One no-grad forward pass is recorded
    with CPU and CUDA activities, input shapes and stack information. The ten
    rows with the largest total CUDA time are printed, then the trace is
    written to ``trace_filename``.

    Args:
        model: Callable invoked as ``model(**inputs)``.
        image: Image handed to ``feature_extractor(images=...)``.
        trace_filename: Path passed to ``export_chrome_trace``.
    """
    batch = feature_extractor(images=image, return_tensors="pt").to(device)

    recorded_activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]
    with profile(
        activities=recorded_activities,
        record_shapes=True,
        with_stack=True,
    ) as profiler, torch.no_grad():
        model(**batch)

    summary_table = profiler.key_averages().table(sort_by="cuda_time_total", row_limit=10)
    print(summary_table)
    # Persist the full trace as a .json file for offline inspection.
    profiler.export_chrome_trace(trace_filename)

In [None]:
# Profile the eager (uncompiled) model and save its trace.
profile_model(model=model, image=image, trace_filename="raw_model.json")

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                           aten::linear         2.64%       2.559ms        55.25%      53.458ms     368.676us       0.000us         0.00%      58.653ms     404.503us           145  
                                            aten::addmm        40.76%      39.443ms        45.22%      43.760ms     301.793us      50.857ms        86.83%      58.653ms     404.503us           145  
         

In [None]:
# Profile the torch.compile-wrapped model and save its trace.
profile_model(model=compiled_model, image=image, trace_filename="compiled_model.json")

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                               aten::mm        10.72%       4.215ms        16.19%       6.368ms      44.222us      49.187ms        86.68%      49.187ms     341.576us           144  
                        volta_sgemm_128x32_sliced1x4_tn         0.00%       0.000us         0.00%       0.000us       0.000us      33.034ms        58.21%      33.034ms     275.283us           120  
         

You can view the exported traces (`raw_model.json` and `compiled_model.json`) by opening them at https://ui.perfetto.dev/