In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import torchvision
from transformers import AutoModelForCausalLM, AutoModelForAudioClassification
from torch.profiler import profile, record_function, ProfilerActivity


In [3]:

# --- Configuration ---
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Running on: {DEVICE}")

# --- Helper: standard profiling function ---
def profile_model(model, inputs, model_name):
    print(f"\n--- Profiling {model_name} ---")
    model.to(DEVICE)
    model.eval() # Inference mode
    
    # Ensure inputs are on the correct device
    if isinstance(inputs, dict):
        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
    else:
        inputs = inputs.to(DEVICE)

    # 2.9 Best Practice: Use a schedule to skip noise and warmup
    # wait=1 (skip first step), warmup=1 (compile/warmup), active=3 (record 3 steps)
    my_schedule = torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1)

    with torch.profiler.profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA] if DEVICE == "cuda" else [ProfilerActivity.CPU],
        schedule=my_schedule,
        on_trace_ready=torch.profiler.tensorboard_trace_handler(f'./log/{model_name}'),
        record_shapes=True,
        profile_memory=True,
        with_stack=True
    ) as p:
        
        # We simulate a loop of 5 steps (1 wait + 1 warmup + 3 active)
        for step in range(5):
            with record_function(f"inference_step_{step}"):
                if isinstance(inputs, dict):
                    _ = model(**inputs)
                else:
                    _ = model(inputs)
            p.step() # Signal to profiler that a step is done

    print(f"Profiling complete. Logs saved to ./log/{model_name}")
    # Print a quick text summary of the *active* steps
    print(p.key_averages().table(sort_by="cuda_time_total" if DEVICE == "cuda" else "cpu_time_total", row_limit=5))



Running on: cuda


In [4]:

# --- 1. Vision Model: ResNet50 (Standard, Easy to load) ---
# We use torchvision's new weight enum standard
print("Loading Vision Model...")
vision_weights = torchvision.models.ResNet50_Weights.DEFAULT
vision_model = torchvision.models.resnet50(weights=vision_weights)
vision_inputs = torch.randn(100, 3, 224, 224) # Standard ImageNet batch
profile_model(vision_model, vision_inputs, "vision_resnet50")


Loading Vision Model...

--- Profiling vision_resnet50 ---
Profiling complete. Logs saved to ./log/vision_resnet50
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                          ProfilerStep*         0.34%     173.453us        48.53%      25.090ms       8.3

In [5]:
100 * 3 * 224 * 224 * 4

60211200

In [6]:


# --- 2. LLM: GPT-2 (Small, Open Weights, Standard Architecture) ---
# We use HuggingFace transformers. GPT-2 is chosen for speed/simplicity over Llama 3 for a basic script.
print("Loading LLM...")
llm_name = "gpt2" 
llm_model = AutoModelForCausalLM.from_pretrained(llm_name)

B=64
L=128
llm_inputs = {"input_ids": torch.randint(0, 50257, (B, L)), "attention_mask": torch.ones(B, L)}
profile_model(llm_model, llm_inputs, "llm_gpt2")



Loading LLM...

--- Profiling llm_gpt2 ---


[W109 20:20:02.701987353 CPUAllocator.cpp:245] Memory block of unknown size was allocated before the profiling started, profiler results will not include the deallocation event


Profiling complete. Logs saved to ./log/llm_gpt2
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                          ProfilerStep*         0.10%     197.338us        78.53%     159.293ms      53.098ms       0.000us         0.00%     161.728ms      53.909ms      

In [7]:

# --- 3. Audio Model: Wav2Vec2 (Standard Speech Architecture) ---
# Using a small classifier version for speed
print("Loading Audio Model...")
audio_name = "facebook/wav2vec2-base-960h"
audio_model = AutoModelForAudioClassification.from_pretrained(audio_name)
# Dummy audio waveform: Batch 1, 16000 samples (1 second of audio at 16kHz)
audio_inputs = {"input_values": torch.randn(1, 16000)} 
profile_model(audio_model, audio_inputs, "audio_wav2vec2")

Loading Audio Model...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Profiling audio_wav2vec2 ---
Profiling complete. Logs saved to ./log/audio_wav2vec2
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                          ProfilerStep*         0.49%     162.012us        99.97%      33.211ms      11.070ms       0.000us        