<a href="https://colab.research.google.com/github/itsPronay/HSIC/blob/main/ResNet_336_qai_hub_benchmark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import torch
import torch.nn as nn
import torchvision.models as models
from torchvision import transforms, models, datasets
from torch.utils.data import DataLoader
import torch.optim as optim

In [21]:
import argparse

parser = argparse.ArgumentParser('ResNet')

parser.add_argument('--epoch', type=int, default=1)
parser.add_argument('--learning_rate', type=float, default=0.0005)
parser.add_argument('--batch', type=int, default=32)
parser.add_argument('--imagesize', choices=[112, 224, 256, 336, 448, 512, 1120], default=336)
parser.add_argument('--seed', type=int, default=42)
parser.add_argument('--num_class', type=int, default=2)

args, unknown = parser.parse_known_args()

In [3]:
!curl -L -o dataset.zip\
  https://www.kaggle.com/api/v1/datasets/download/chetankv/dogs-cats-images

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  434M  100  434M    0     0  64.5M      0  0:00:06  0:00:06 --:--:-- 85.4M


In [4]:
# !unzip /content/dataset.zip

In [22]:
train_dir = '/content/dataset/training_set'
test_dir = '/content/dataset/test_set'

img_size = (args.imagesize, args.imagesize)

train_transform = transforms.Compose([
    transforms.Resize(img_size),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor()
])

test_transform = transforms.Compose([
    transforms.Resize(img_size),
    transforms.ToTensor()
])

train_dataset = datasets.ImageFolder(root=train_dir, transform=train_transform)
test_dataset = datasets.ImageFolder(root=test_dir, transform=test_transform)

train_loader = DataLoader(dataset = train_dataset, batch_size=args.batch, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=args.batch, shuffle=False)


In [23]:
resnet18 = models.resnet18(weights=True)
resnet34 = models.resnet34(weights=True)
resnet50 = models.resnet50(weights=True)



In [24]:
resnet18.fc = nn.Linear(resnet18.fc.in_features, args.num_class)
resnet34.fc = nn.Linear(resnet34.fc.in_features, args.num_class)
resnet50.fc = nn.Linear(resnet50.fc.in_features, args.num_class)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device is "+ str(device))

resnet18.to(device)
resnet34.to(device)
resnet50.to(device)


Device is cuda


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [25]:
criterion = nn.CrossEntropyLoss()

resnet18optimizer = optim.Adam(resnet18.parameters(), lr=args.learning_rate)
resnet34optimizer = optim.Adam(resnet34.parameters(), lr=args.learning_rate)
resnet50optimizer = optim.Adam(resnet50.parameters(), lr=args.learning_rate)

## Training ResNet18 Model

In [26]:
print('Training Resnet 18 Model')

for epoch in range(args.epoch):
    resnet18.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        resnet18optimizer.zero_grad()
        outputs = resnet18(images)
        loss = criterion(outputs, labels)
        loss.backward()
        resnet18optimizer.step()

        running_loss += loss.item()
        _, preds = torch.max(outputs, 1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    train_acc = 100 * correct / total

    print(f"Epoch [{epoch+1}/{args.epoch}] "
          f"Loss: {running_loss:.4f} "
          f"Train Acc: {train_acc:.2f}%")

    if epoch + 1 == args.epoch:
        torch.save(resnet18.state_dict(), "Resnet18.pth")


Training Resnet 18 Model
Epoch [1/1] Loss: 32.5164 Train Acc: 94.64%


## Training Resnet 34

In [27]:
print('Training Resnet 34 Model')

for epoch in range(args.epoch):
    resnet34.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        resnet34optimizer.zero_grad()
        outputs = resnet34(images)
        loss = criterion(outputs, labels)
        loss.backward()
        resnet34optimizer.step()

        running_loss += loss.item()
        _, preds = torch.max(outputs, 1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    train_acc = 100 * correct / total

    print(f"Epoch [{epoch+1}/{args.epoch}] "
          f"Loss: {running_loss:.4f} "
          f"Train Acc: {train_acc:.2f}%")

    if epoch + 1 == args.epoch:
        torch.save(resnet34.state_dict(), "Resnet34.pth")


Training Resnet 34 Model
Epoch [1/1] Loss: 38.1960 Train Acc: 93.49%


## Training Resnet 50

In [28]:
print('Training Resnet 50 Model')

for epoch in range(args.epoch):
    resnet50.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        resnet50optimizer.zero_grad()
        outputs = resnet50(images)
        loss = criterion(outputs, labels)
        loss.backward()
        resnet50optimizer.step()

        running_loss += loss.item()
        _, preds = torch.max(outputs, 1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    train_acc = 100 * correct / total

    print(f"Epoch [{epoch+1}/{args.epoch}] "
          f"Loss: {running_loss:.4f} "
          f"Train Acc: {train_acc:.2f}%")

    if epoch + 1 == args.epoch:
        torch.save(resnet50.state_dict(), "Resnet50.pth")


Training Resnet 50 Model
Epoch [1/1] Loss: 47.4570 Train Acc: 92.17%


In [29]:
!pip install "qai-hub[torch]"
!qai-hub configure --api_token vqh9wt98ef7yptfydrf1tiuf6i5klo3q74gu52kv

2026-02-10 20:15:50.285 - INFO - Enabling verbose logging.
qai-hub configuration saved to /root/.qai_hub/client.ini
[api]
api_token = vqh9wt98ef7yptfydrf1tiuf6i5klo3q74gu52kv
api_url = https://workbench.aihub.qualcomm.com
web_url = https://workbench.aihub.qualcomm.com
verbose = True




## Moving models to CPU as ai hub expects CPU tracing, Tracing it on gpu leads to failure of compile job

Error - Unable to load torch model via torch.jit.load().  We recommend using at least torch 1.11 to trace a pytorch model. You can install the latest recommended torch via: `pip install "qai-hub[torch]"`.


In [30]:
resnet18 = resnet18.to("cpu").eval()
resnet34 = resnet34.to("cpu").eval()
resnet50 = resnet50.to("cpu").eval()

In [31]:
# !qai-hub list-devices

In [32]:
import qai_hub as hub

devices = [
    hub.Device('Dragonwing IQ-9075 EVK'),
    hub.Device('QCS8550 (Proxy)'),
    hub.Device('Google Pixel 10 Pro XL'),
    hub.Device('Samsung Galaxy S24 (Family)'),
    hub.Device('Samsung Galaxy S24 Ultra')
]

models = [
    ("resnet18", resnet18),
    ("resnet34", resnet34),
    ("resnet50", resnet50),
]

for name, model_fn in models:
    print(name)

resnet18
resnet34
resnet50


In [33]:
import torch

print(args.imagesize)
print("Height:{0}, Width:{1}".format(args.imagesize, args.imagesize))

input_shape: tuple[int, ...] = (1, 3, args.imagesize, args.imagesize)
example_input = torch.rand(input_shape)

traced_model_18 = torch.jit.trace(resnet18, example_input)
traced_model_34 = torch.jit.trace(resnet34, example_input)
traced_model_50 = torch.jit.trace(resnet50, example_input)

def getTracedModel(name):
  if name == models[0][0]:
    t = traced_model_18
  elif name == models[1][0]:
    t = traced_model_34
  else:
    t = traced_model_50

  return t


336
Height:336, Width:336


## Compile job

3 models, 5 Devices for each model

15 jobs in total

In [34]:

compile_jobs = []

for device in devices:
  for name, model in models:
    name_formatted = name + "_" + device.name
    print("Submitting compile job for: " + name_formatted)

    job = hub.submit_compile_job(
        model=getTracedModel(name),
        name=name_formatted,
        device=device,
        input_specs=dict(image=input_shape),
    )
    assert isinstance(job, hub.CompileJob)
    compile_jobs.append(job)

Submitting compile job for: resnet18_Dragonwing IQ-9075 EVK
Uploading tmprz0ieq28.pt


100%|[34m██████████[0m| 42.8M/42.8M [00:00<00:00, 46.8MB/s]


Scheduled compile job (jgjl16w8p) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jgjl16w8p/

Submitting compile job for: resnet34_Dragonwing IQ-9075 EVK
Uploading tmpeoe9f9gn.pt


100%|[34m██████████[0m| 81.5M/81.5M [00:01<00:00, 74.5MB/s]


Scheduled compile job (jpev20l05) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jpev20l05/

Submitting compile job for: resnet50_Dragonwing IQ-9075 EVK
Uploading tmprzhjfpe3.pt


100%|[34m██████████[0m| 90.3M/90.3M [00:01<00:00, 85.3MB/s]


Scheduled compile job (jgz7wq46p) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jgz7wq46p/

Submitting compile job for: resnet18_QCS8550 (Proxy)
Uploading tmpbcnp8l1n.pt


100%|[34m██████████[0m| 42.8M/42.8M [00:00<00:00, 69.0MB/s]


Scheduled compile job (j5w9x01jp) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/j5w9x01jp/

Submitting compile job for: resnet34_QCS8550 (Proxy)
Uploading tmpvnfsnha3.pt


100%|[34m██████████[0m| 81.5M/81.5M [00:01<00:00, 82.8MB/s]


Scheduled compile job (jg9487xv5) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jg9487xv5/

Submitting compile job for: resnet50_QCS8550 (Proxy)
Uploading tmp8vysmfxk.pt


100%|[34m██████████[0m| 90.3M/90.3M [00:01<00:00, 82.0MB/s]


Scheduled compile job (jp183kvlg) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jp183kvlg/

Submitting compile job for: resnet18_Google Pixel 10 Pro XL
Uploading tmpb2sm4dlx.pt


100%|[34m██████████[0m| 42.8M/42.8M [00:00<00:00, 61.8MB/s]


Scheduled compile job (jgdv0yzlg) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jgdv0yzlg/

Submitting compile job for: resnet34_Google Pixel 10 Pro XL
Uploading tmpau4ul24o.pt


100%|[34m██████████[0m| 81.5M/81.5M [00:01<00:00, 64.2MB/s]


Scheduled compile job (j57d619r5) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/j57d619r5/

Submitting compile job for: resnet50_Google Pixel 10 Pro XL
Uploading tmp5pnrdpdp.pt


100%|[34m██████████[0m| 90.3M/90.3M [00:01<00:00, 72.9MB/s]


Scheduled compile job (jp4w863lg) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jp4w863lg/

Submitting compile job for: resnet18_Samsung Galaxy S24 (Family)
Uploading tmpmsbfl8b3.pt


100%|[34m██████████[0m| 42.8M/42.8M [00:00<00:00, 64.1MB/s]


Scheduled compile job (jpx1m8x9g) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jpx1m8x9g/

Submitting compile job for: resnet34_Samsung Galaxy S24 (Family)
Uploading tmpazjt44tq.pt


100%|[34m██████████[0m| 81.5M/81.5M [00:01<00:00, 65.0MB/s]


Scheduled compile job (j5mz418qp) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/j5mz418qp/

Submitting compile job for: resnet50_Samsung Galaxy S24 (Family)
Uploading tmpbwxj_f7s.pt


100%|[34m██████████[0m| 90.3M/90.3M [00:01<00:00, 93.8MB/s]


Scheduled compile job (jgnexdkmg) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jgnexdkmg/

Submitting compile job for: resnet18_Samsung Galaxy S24 Ultra
Uploading tmpzj04w2yf.pt


100%|[34m██████████[0m| 42.8M/42.8M [00:00<00:00, 46.6MB/s]


Scheduled compile job (jpry9mweg) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jpry9mweg/

Submitting compile job for: resnet34_Samsung Galaxy S24 Ultra
Uploading tmprgn_6dsm.pt


100%|[34m██████████[0m| 81.5M/81.5M [00:01<00:00, 68.0MB/s]


Scheduled compile job (jp2mjqem5) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jp2mjqem5/

Submitting compile job for: resnet50_Samsung Galaxy S24 Ultra
Uploading tmpaolve_6j.pt


100%|[34m██████████[0m| 90.3M/90.3M [00:01<00:00, 71.2MB/s]


Scheduled compile job (jpydnkm4p) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jpydnkm4p/



In [35]:
profile_jobs = []

for job in compile_jobs:
    device = job.device
    formattedName = job.name + device.name

    print("Submitting profiling job for:" + formattedName)

    pf_job = hub.submit_profile_job(
        model=job.get_target_model(),
        device=device,
        name=job.name + "_profiling"
    )

    assert isinstance(pf_job, hub.ProfileJob)
    profile_jobs.append(pf_job)


Submitting profiling job for:resnet18_Dragonwing IQ-9075 EVKDragonwing IQ-9075 EVK
Waiting for compile job (jgjl16w8p) completion. Type Ctrl+C to stop waiting at any time.
    ✅ SUCCESS                          
Scheduled profile job (j5q2wxvm5) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/j5q2wxvm5/

Submitting profiling job for:resnet34_Dragonwing IQ-9075 EVKDragonwing IQ-9075 EVK
Waiting for compile job (jpev20l05) completion. Type Ctrl+C to stop waiting at any time.
    ✅ SUCCESS                          
Scheduled profile job (jglk79llp) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jglk79llp/

Submitting profiling job for:resnet50_Dragonwing IQ-9075 EVKDragonwing IQ-9075 EVK
Scheduled profile job (j561v9w7p) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/j561v9w7p/

Submitting profiling job for:resnet18_QCS8550 (Proxy)QCS8550 (Proxy)
Scheduled p

In [36]:
import numpy as np
import pandas as pd

def us_to_ms(x):
    return x / 1e3

def bytes_to_mb(x):
    return x / (1024 ** 2)

summary_rows = []
util_rows = []
memory_rows = []
bottleneck_rows = []  # NEW: Performance bottlenecks

for pf_job in profile_jobs:
    result = pf_job.download_profile()
    s = result["execution_summary"]
    d = pd.DataFrame(result["execution_detail"])
    times = np.array(s["all_inference_times"])

    model_name = pf_job.name.split("_")[0]
    device_name = pf_job.device.name

    # -------------------------------
    # Table 1: End-to-End Performance
    # -------------------------------
    summary_rows.append({
        "Model": model_name,
        "Device": device_name,
        "Mean Latency (ms)": round(us_to_ms(times.mean()), 4),
        "Median Latency (ms)": round(us_to_ms(np.median(times)), 4),  # NEW
        "P50 Latency (ms)": round(us_to_ms(np.percentile(times, 50)), 4),  # NEW
        "P95 Latency (ms)": round(us_to_ms(np.percentile(times, 95)), 4),
        "P99 Latency (ms)": round(us_to_ms(np.percentile(times, 99)), 4),  # NEW
        "Std Dev (ms)": round(us_to_ms(times.std()), 1),  # NEW: Variance
        "Cold Start (ms)": round(us_to_ms(s["first_load_time"]), 4),
        "Warm Start (ms)": round(us_to_ms(s["warm_load_time"]), 4),
        "Speedup (Cold→Warm)": round(s["first_load_time"] / s["warm_load_time"], 4),  # NEW
    })

    # -------------------------------
    # Table 2: Memory Footprint
    # -------------------------------
    memory_rows.append({
        "Model": model_name,
        "Device": device_name,
        "Inference Peak (MB)": round(bytes_to_mb(s["estimated_inference_peak_memory"]), 4),
        "Cold Start Peak (MB)": round(bytes_to_mb(s["first_load_peak_memory"]), 4),
        "Warm Start Peak (MB)": round(bytes_to_mb(s["warm_load_peak_memory"]), 4),
        "Memory Reduction (Cold→Warm %)": round(  # NEW
            (1 - s["warm_load_peak_memory"] / s["first_load_peak_memory"]) * 100, 1
        ),
        "Memory Reduction (Warm→Inference %)": round(  # NEW
            (1 - s["estimated_inference_peak_memory"] / s["warm_load_peak_memory"]) * 100, 1
        ),
    })

    # -------------------------------
    # Table 3: Accelerator Utilization
    # -------------------------------
    total_time = d["execution_time"].sum()
    util = d.groupby("compute_unit")["execution_time"].sum() / total_time * 100

    util_rows.append({
        "Model": model_name,
        "Device": device_name,
        "CPU (%)": round(util.get("CPU", 0.0), 4),
        "GPU (%)": round(util.get("GPU", 0.0), 4),
        "NPU (%)": round(util.get("NPU", 0.0), 4),
        "Total Time (ms)": round(us_to_ms(total_time), 4),  # NEW
        "Dominant Unit": util.idxmax() if len(util) > 0 else "N/A",  # NEW
    })

    # -------------------------------
    # Table 4: Performance Bottlenecks (NEW)
    # -------------------------------
    # Find top 5 slowest operations
    top_ops = d.nlargest(5, "execution_time")[["name", "type", "compute_unit", "execution_time"]]

    bottleneck_rows.append({
        "Model": model_name,
        "Device": device_name,
        "Slowest Op": top_ops.iloc[0]["name"].split("/")[-1],
        "Op Type": top_ops.iloc[0]["type"],
        "Op Time (ms)": round(us_to_ms(top_ops.iloc[0]["execution_time"]), 4),
        "Op Unit": top_ops.iloc[0]["compute_unit"],
        "Top 5 Ops Time (ms)": round(us_to_ms(top_ops["execution_time"].sum()), 4),
        "% of Total": round(top_ops["execution_time"].sum() / total_time * 100, 4),
    })

# Create tables
table_perf = pd.DataFrame(summary_rows)
table_mem = pd.DataFrame(memory_rows)
table_util = pd.DataFrame(util_rows)
table_bottleneck = pd.DataFrame(bottleneck_rows)  # NEW

# Display tables (Markdown format)
print("\n" + "="*100)
print("TABLE 1: End-to-End Performance")
print("="*100)
print(table_perf.to_markdown(index=False))

print("\n" + "="*100)
print("TABLE 2: Memory Footprint")
print("="*100)
print(table_mem.to_markdown(index=False))

print("\n" + "="*100)
print("TABLE 3: Accelerator Utilization")
print("="*100)
print(table_util.to_markdown(index=False))

print("\n" + "="*100)
print("TABLE 4: Performance Bottlenecks")
print("="*100)
print(table_bottleneck.to_markdown(index=False))

print(f"\n✓ Total profile jobs: {len(profile_jobs)}")
print(f"✓ Rows in each table: {len(table_perf)}")

Waiting for profile job (j5q2wxvm5) completion. Type Ctrl+C to stop waiting at any time.
    ✅ SUCCESS                          
Waiting for profile job (jglk79llp) completion. Type Ctrl+C to stop waiting at any time.
    ✅ SUCCESS                          
Waiting for profile job (j561v9w7p) completion. Type Ctrl+C to stop waiting at any time.
    ✅ SUCCESS                          
Waiting for profile job (jpvw4y7mg) completion. Type Ctrl+C to stop waiting at any time.
    ✅ SUCCESS                          
Waiting for profile job (jgjl16q8p) completion. Type Ctrl+C to stop waiting at any time.
    ✅ SUCCESS                          
Waiting for profile job (jpev20y05) completion. Type Ctrl+C to stop waiting at any time.
    ✅ SUCCESS                          
Waiting for profile job (jgz7wqn6p) completion. Type Ctrl+C to stop waiting at any time.
    ✅ SUCCESS                          
Waiting for profile job (j5mz41mqp) completion. Type Ctrl+C to stop waiting at any time.
 