In [1]:
import torch
import torch.nn as nn
import torchvision.models as models
from torchvision import transforms, models, datasets
from torch.utils.data import DataLoader
import torch.optim as optim
from torch.utils.data import Subset

In [2]:
import argparse

parser = argparse.ArgumentParser('ResNet')

parser.add_argument('--model', choices=['resnet18', 'resnet34', 'resnet50', 'all'], default='all')
parser.add_argument('--epoch', type=int, default=1)
parser.add_argument('--learning_rate', type=float, default=0.0005)
parser.add_argument('--batch', type=int, default=32)
parser.add_argument('--imagesize', type=int, nargs='+', default=[112, 224, 336, 448, 560, 672, 784, 896, 1008, 1120])
# parser.add_argument('--imagesize', type=int, nargs='+', default=[112, 224, 336])
parser.add_argument('--seed', type=int, default=42)
parser.add_argument('--num_class', type=int, default=2)

args, unknown = parser.parse_known_args()

In [3]:
!curl -L -o dataset.zip\
  https://www.kaggle.com/api/v1/datasets/download/chetankv/dogs-cats-images

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  434M  100  434M    0     0  18.6M      0  0:00:23  0:00:23 --:--:-- 19.7M


In [4]:
# only run on kaggle
import sys

sys.path.append('/kaggle/working/')

In [28]:
#!unzip dataset.zip

Helper function to control batch size based on image size

In [6]:
def getbatchSize(imagesize):
  if imagesize < 500:
    batch = 32
  elif imagesize < 800:
    batch = 12
  else:
    batch = 4
  return batch

In [7]:
train_dir = 'dataset/training_set'

loaders = []

for res in args.imagesize:
  train_transform = transforms.Compose([
      transforms.Resize((res, res)),
      transforms.RandomHorizontalFlip(),
      transforms.ToTensor()
  ])

  train_dataset = datasets.ImageFolder(root=train_dir, transform=train_transform)
  train_subset = Subset(train_dataset, range(500))

  batchsize = getbatchSize(res)
  train_loader = DataLoader(dataset = train_subset, batch_size=batchsize, shuffle=True)

  loaders.append(train_loader)

Paths where the model's would be saved

In [8]:
resnet_18_savedpath = []
resnet_34_savedpath = []
resnet_50_savedpath = []

In [9]:
def trainResnet18(size, train_loader):
  resnet18 = models.resnet18(weights=True)
  resnet18.fc = nn.Linear(resnet18.fc.in_features, args.num_class)
  resnet18.to(device)

  criterion = nn.CrossEntropyLoss()
  resnet18optimizer = optim.Adam(resnet18.parameters(), lr=args.learning_rate)

  print('Training Resnet 18 Model on size ' + str(size) + " pixel")
  for epoch in range(args.epoch):
      resnet18.train()
      running_loss = 0.0
      correct = 0
      total = 0

      for images, labels in train_loader:
          images, labels = images.to(device), labels.to(device)

          resnet18optimizer.zero_grad()
          outputs = resnet18(images)
          loss = criterion(outputs, labels)
          loss.backward()
          resnet18optimizer.step()

          running_loss += loss.item()
          _, preds = torch.max(outputs, 1)
          correct += (preds == labels).sum().item()
          total += labels.size(0)

      train_acc = 100 * correct / total

      print(f"Epoch [{epoch+1}/{args.epoch}] "
            f"Loss: {running_loss:.4f} "
            f"Train Acc: {train_acc:.2f}%")

      if epoch + 1 == args.epoch:
          save_dir = "Resnet18_" + str(size) + ".pth"
          torch.save(resnet18.state_dict(), save_dir)
          resnet_18_savedpath.append(save_dir)

### Train Function for Resnet34


In [10]:
def trainResnet34(size, train_loader):
  resnet34 = models.resnet34(weights=True)
  resnet34.fc = nn.Linear(resnet34.fc.in_features, args.num_class)
  resnet34.to(device)

  criterion = nn.CrossEntropyLoss()
  resnet34optimizer = optim.Adam(resnet34.parameters(), lr=args.learning_rate)

  print('Training Resnet 34 Model on size ' + str(size) + " pixel")
  for epoch in range(args.epoch):
      resnet34.train()
      running_loss = 0.0
      correct = 0
      total = 0

      for images, labels in train_loader:
          images, labels = images.to(device), labels.to(device)

          resnet34optimizer.zero_grad()
          outputs = resnet34(images)
          loss = criterion(outputs, labels)
          loss.backward()
          resnet34optimizer.step()

          running_loss += loss.item()
          _, preds = torch.max(outputs, 1)
          correct += (preds == labels).sum().item()
          total += labels.size(0)

      train_acc = 100 * correct / total

      print(f"Epoch [{epoch+1}/{args.epoch}] "
            f"Loss: {running_loss:.4f} "
            f"Train Acc: {train_acc:.2f}%")

      if epoch + 1 == args.epoch:
          save_dir = "Resnet34_" + str(size) + ".pth"
          torch.save(resnet34.state_dict(), save_dir)
          resnet_34_savedpath.append(save_dir)

### Training function for Resnet 50

In [11]:
def trainResnet50(size, train_loader):
  resnet50 = models.resnet50(weights=True)
  resnet50.fc = nn.Linear(resnet50.fc.in_features, args.num_class)
  resnet50.to(device)

  criterion = nn.CrossEntropyLoss()
  resnet50optimizer = optim.Adam(resnet50.parameters(), lr=args.learning_rate)

  print('Training Resnet 50 Model on size ' + str(size) + " pixel")
  for epoch in range(args.epoch):
      resnet50.train()
      running_loss = 0.0
      correct = 0
      total = 0

      for images, labels in train_loader:
          images, labels = images.to(device), labels.to(device)

          resnet50optimizer.zero_grad()
          outputs = resnet50(images)
          loss = criterion(outputs, labels)
          loss.backward()
          resnet50optimizer.step()

          running_loss += loss.item()
          _, preds = torch.max(outputs, 1)
          correct += (preds == labels).sum().item()
          total += labels.size(0)

      train_acc = 100 * correct / total

      print(f"Epoch [{epoch+1}/{args.epoch}] "
            f"Loss: {running_loss:.4f} "
            f"Train Acc: {train_acc:.2f}%")

      if epoch + 1 == args.epoch:
          save_dir = "Resnet50_" + str(size) + ".pth"
          torch.save(resnet50.state_dict(), save_dir)
          resnet_50_savedpath.append(save_dir)

In [12]:
print(trainResnet18)

<function trainResnet18 at 0x7cedaf934a40>


In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for train_loader in loaders:
  image_size = train_loader.dataset[0][0].shape[1]
  if args.model == 'resnet18':
    trainResnet18(image_size, train_loader)
  elif args.model == 'resnet34':
    trainResnet34(image_size, train_loader)
  elif args.model == 'resnet50':
    trainResnet50(image_size, train_loader)
  elif args.model == 'all':
    trainResnet18(image_size, train_loader)
    trainResnet34(image_size, train_loader)
    trainResnet50(image_size, train_loader)
  else:
    raise ValueError('Invalid model choice')




Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:00<00:00, 248MB/s]


Training Resnet 18 Model on size 112 pixel
Epoch [1/1] Loss: 1.6293 Train Acc: 97.40%




Downloading: "https://download.pytorch.org/models/resnet34-b627a593.pth" to /root/.cache/torch/hub/checkpoints/resnet34-b627a593.pth


100%|██████████| 83.3M/83.3M [00:00<00:00, 250MB/s]


Training Resnet 34 Model on size 112 pixel
Epoch [1/1] Loss: 1.8793 Train Acc: 95.20%




Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


100%|██████████| 97.8M/97.8M [00:00<00:00, 244MB/s]


Training Resnet 50 Model on size 112 pixel
Epoch [1/1] Loss: 1.3992 Train Acc: 95.40%
Training Resnet 18 Model on size 224 pixel
Epoch [1/1] Loss: 4.1791 Train Acc: 86.80%
Training Resnet 34 Model on size 224 pixel
Epoch [1/1] Loss: 3.4418 Train Acc: 87.40%
Training Resnet 50 Model on size 224 pixel
Epoch [1/1] Loss: 0.4452 Train Acc: 100.00%
Training Resnet 18 Model on size 336 pixel
Epoch [1/1] Loss: 2.8704 Train Acc: 91.60%
Training Resnet 34 Model on size 336 pixel
Epoch [1/1] Loss: 1.1240 Train Acc: 98.40%
Training Resnet 50 Model on size 336 pixel
Epoch [1/1] Loss: 0.6664 Train Acc: 100.00%
Training Resnet 18 Model on size 448 pixel
Epoch [1/1] Loss: 1.6443 Train Acc: 97.80%
Training Resnet 34 Model on size 448 pixel
Epoch [1/1] Loss: 0.4939 Train Acc: 100.00%
Training Resnet 50 Model on size 448 pixel
Epoch [1/1] Loss: 0.8376 Train Acc: 100.00%
Training Resnet 18 Model on size 560 pixel
Epoch [1/1] Loss: 1.9050 Train Acc: 98.20%
Training Resnet 34 Model on size 560 pixel
Epoch [

In [14]:
!pip install "qai-hub[torch]"
!qai-hub configure --api_token vqh9wt98ef7yptfydrf1tiuf6i5klo3q74gu52kv

Collecting qai-hub[torch]
  Downloading qai_hub-0.44.0-py3-none-any.whl.metadata (2.6 kB)
Collecting backoff>=2.2 (from qai-hub[torch])
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Collecting deprecation (from qai-hub[torch])
  Downloading deprecation-2.1.0-py2.py3-none-any.whl.metadata (4.6 kB)
Collecting s3transfer<0.14,>=0.10.3 (from qai-hub[torch])
  Downloading s3transfer-0.13.1-py3-none-any.whl.metadata (1.7 kB)
Collecting semver>=3.0 (from qai-hub[torch])
  Downloading semver-3.0.4-py3-none-any.whl.metadata (6.8 kB)
Downloading backoff-2.2.1-py3-none-any.whl (15 kB)
Downloading s3transfer-0.13.1-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.3/85.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading semver-3.0.4-py3-none-any.whl (17 kB)
Downloading deprecation-2.1.0-py2.py3-none-any.whl (11 kB)
Downloading qai_hub-0.44.0-py3-none-any.whl (113 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

## Moving models to CPU as well, as ai hub expects CPU tracing, Tracing it on gpu leads to failure of compile job

Error - Unable to load torch model via torch.jit.load().  We recommend using at least torch 1.11 to trace a pytorch model. You can install the latest recommended torch via: `pip install "qai-hub[torch]"`.


In [15]:
loaded_models = []

for path in resnet_18_savedpath:
  if args.model == 'resnet18' or args.model == 'all':
    model = models.resnet18(weights=None)
    model.fc = nn.Linear(model.fc.in_features, args.num_class)

    checkpoint = torch.load(path, map_location=device)
    model.load_state_dict(checkpoint)

    model = model.to("cpu").eval()
    loaded_models.append((path[:-4], model))

for path in resnet_34_savedpath:
  if args.model == 'resnet34' or args.model == 'all':
    model = models.resnet34(weights=None)
    model.fc = nn.Linear(model.fc.in_features, args.num_class)

    checkpoint = torch.load(path, map_location=device)
    model.load_state_dict(checkpoint)

    model = model.to("cpu").eval()
    loaded_models.append((path[:-4], model))

for path in resnet_50_savedpath:
  if args.model != 'resnet50' or args.model != 'all':
    model = models.resnet50(weights=None)
    model.fc = nn.Linear(model.fc.in_features, args.num_class)

    checkpoint = torch.load(path, map_location=device)
    model.load_state_dict(checkpoint)

    model = model.to("cpu").eval()
    loaded_models.append((path[:-4], model))

In [16]:
print("******************************************")
print('Loaded Models: ')
print("******************************************")
for name, _ in loaded_models:
  print(name)

******************************************
Loaded Models: 
******************************************
Resnet18_112
Resnet18_224
Resnet18_336
Resnet18_448
Resnet18_560
Resnet18_672
Resnet18_784
Resnet18_896
Resnet18_1008
Resnet18_1120
Resnet34_112
Resnet34_224
Resnet34_336
Resnet34_448
Resnet34_560
Resnet34_672
Resnet34_784
Resnet34_896
Resnet34_1008
Resnet34_1120
Resnet50_112
Resnet50_224
Resnet50_336
Resnet50_448
Resnet50_560
Resnet50_672
Resnet50_784
Resnet50_896
Resnet50_1008
Resnet50_1120


In [17]:
import qai_hub as hub

devices = [
    # hub.Device('Dragonwing IQ-9075 EVK'),
    # hub.Device('QCS8550 (Proxy)'),
    # hub.Device('Google Pixel 10 Pro XL'),
    # hub.Device('Samsung Galaxy S24 (Family)'),
    hub.Device('Samsung Galaxy S24 Ultra')
]

In [18]:
def get_image_res(s: str) -> int:
    """
    Returns the integer after the last underscore in a string.
    Example: "res_334" -> 334
    """
    try:
        return int(s.split('_')[-1])
    except ValueError:
        raise ValueError(f"No valid integer found after underscore in '{s}'")


In [19]:
traced_models = []

for name, model in loaded_models:
  res = get_image_res(name)
  input_shape: tuple[int, ...] = (1, 3, res, res)
  example_input = torch.rand(input_shape)

  model_name = 'traced_' + name
  traced_model = torch.jit.trace(model, example_input)

  traced_models.append((res, name, traced_model, input_shape))


## Compile job count


In [20]:
device_count = len(devices)
model_count = len(traced_models)

print('Devices: ' + str(device_count))
print('Model count: ' + str(model_count))
print('Compile jobs count ' + str(device_count * model_count))

Devices: 1
Model count: 30
Compile jobs count 30


In [21]:

compile_jobs = []

for device in devices:
  for res, name, traced_model, input_shape in traced_models:
    name_formatted = name + "_" + device.name
    print("Submitting compile job for: " + name_formatted)

    job = hub.submit_compile_job(
        model=traced_model,
        name=name_formatted,
        device=device,
        input_specs=dict(image=input_shape),
    )
    assert isinstance(job, hub.CompileJob)
    compile_jobs.append((res, name, job))

Submitting compile job for: Resnet18_112_Samsung Galaxy S24 Ultra
Uploading tmpoovmul3u.pt


100%|[34m██████████[0m| 42.8M/42.8M [00:02<00:00, 21.6MB/s]


Scheduled compile job (j561vk90p) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/j561vk90p/

Submitting compile job for: Resnet18_224_Samsung Galaxy S24 Ultra
Uploading tmpu48mhprv.pt


100%|[34m██████████[0m| 42.8M/42.8M [00:02<00:00, 21.4MB/s]


Scheduled compile job (jp3m8yllg) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jp3m8yllg/

Submitting compile job for: Resnet18_336_Samsung Galaxy S24 Ultra
Uploading tmpk0fsvy90.pt


100%|[34m██████████[0m| 42.8M/42.8M [00:01<00:00, 23.2MB/s]


Scheduled compile job (jgovmj7x5) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jgovmj7x5/

Submitting compile job for: Resnet18_448_Samsung Galaxy S24 Ultra
Uploading tmp9kuzrtzv.pt


100%|[34m██████████[0m| 42.8M/42.8M [00:01<00:00, 24.6MB/s]


Scheduled compile job (jpvw4jyjg) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jpvw4jyjg/

Submitting compile job for: Resnet18_560_Samsung Galaxy S24 Ultra
Uploading tmpf2ylxadl.pt


100%|[34m██████████[0m| 42.8M/42.8M [00:01<00:00, 22.5MB/s]


Scheduled compile job (jgjl1j6xp) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jgjl1j6xp/

Submitting compile job for: Resnet18_672_Samsung Galaxy S24 Ultra
Uploading tmp06z32rc4.pt


100%|[34m██████████[0m| 42.8M/42.8M [00:01<00:00, 22.7MB/s]


Scheduled compile job (jpev2j015) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jpev2j015/

Submitting compile job for: Resnet18_784_Samsung Galaxy S24 Ultra
Uploading tmp80ku96vq.pt


100%|[34m██████████[0m| 42.8M/42.8M [00:03<00:00, 13.7MB/s]


Scheduled compile job (jgz7w1qkp) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jgz7w1qkp/

Submitting compile job for: Resnet18_896_Samsung Galaxy S24 Ultra
Uploading tmpfg1g71m1.pt


100%|[34m██████████[0m| 42.8M/42.8M [00:01<00:00, 22.6MB/s]


Scheduled compile job (j5w9xjk6p) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/j5w9xjk6p/

Submitting compile job for: Resnet18_1008_Samsung Galaxy S24 Ultra
Uploading tmpi180dvqx.pt


100%|[34m██████████[0m| 42.8M/42.8M [00:01<00:00, 25.1MB/s]


Scheduled compile job (jg9486rl5) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jg9486rl5/

Submitting compile job for: Resnet18_1120_Samsung Galaxy S24 Ultra
Uploading tmpx1pz5add.pt


100%|[34m██████████[0m| 42.8M/42.8M [00:01<00:00, 25.7MB/s]


Scheduled compile job (jp183r92g) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jp183r92g/

Submitting compile job for: Resnet34_112_Samsung Galaxy S24 Ultra
Uploading tmpl6z47vih.pt


100%|[34m██████████[0m| 81.5M/81.5M [00:02<00:00, 38.9MB/s]


Scheduled compile job (jgdv0jkeg) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jgdv0jkeg/

Submitting compile job for: Resnet34_224_Samsung Galaxy S24 Ultra
Uploading tmphkg2rlp1.pt


100%|[34m██████████[0m| 81.5M/81.5M [00:02<00:00, 38.2MB/s]


Scheduled compile job (j57d6qml5) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/j57d6qml5/

Submitting compile job for: Resnet34_336_Samsung Galaxy S24 Ultra
Uploading tmpvgbh9hqj.pt


100%|[34m██████████[0m| 81.5M/81.5M [00:02<00:00, 33.9MB/s]


Scheduled compile job (jp4w8z7vg) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jp4w8z7vg/

Submitting compile job for: Resnet34_448_Samsung Galaxy S24 Ultra
Uploading tmp4vz_i4_j.pt


100%|[34m██████████[0m| 81.5M/81.5M [00:02<00:00, 35.8MB/s]


Scheduled compile job (jpx1mwq1g) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jpx1mwq1g/

Submitting compile job for: Resnet34_560_Samsung Galaxy S24 Ultra
Uploading tmph2z1x6ke.pt


100%|[34m██████████[0m| 81.5M/81.5M [00:02<00:00, 39.7MB/s]


Scheduled compile job (j5mz4j7wp) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/j5mz4j7wp/

Submitting compile job for: Resnet34_672_Samsung Galaxy S24 Ultra
Uploading tmp2avck3_d.pt


100%|[34m██████████[0m| 81.5M/81.5M [00:02<00:00, 36.2MB/s]


Scheduled compile job (jgnexj4rg) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jgnexj4rg/

Submitting compile job for: Resnet34_784_Samsung Galaxy S24 Ultra
Uploading tmptyw_t6n0.pt


100%|[34m██████████[0m| 81.5M/81.5M [00:02<00:00, 36.3MB/s]


Scheduled compile job (jpry9zr9g) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jpry9zr9g/

Submitting compile job for: Resnet34_896_Samsung Galaxy S24 Ultra
Uploading tmpbslo_qnq.pt


100%|[34m██████████[0m| 81.5M/81.5M [00:02<00:00, 34.8MB/s]


Scheduled compile job (jp2mj2145) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jp2mj2145/

Submitting compile job for: Resnet34_1008_Samsung Galaxy S24 Ultra
Uploading tmpnizraa_7.pt


100%|[34m██████████[0m| 81.5M/81.5M [00:02<00:00, 34.1MB/s]


Scheduled compile job (jpydn9l7p) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jpydn9l7p/

Submitting compile job for: Resnet34_1120_Samsung Galaxy S24 Ultra
Uploading tmpzh8hwbuf.pt


100%|[34m██████████[0m| 81.5M/81.5M [00:02<00:00, 34.5MB/s]


Scheduled compile job (jp0rknw6p) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jp0rknw6p/

Submitting compile job for: Resnet50_112_Samsung Galaxy S24 Ultra
Uploading tmp4rhqbiul.pt


100%|[34m██████████[0m| 90.3M/90.3M [00:02<00:00, 37.3MB/s]


Scheduled compile job (jp878lnx5) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jp878lnx5/

Submitting compile job for: Resnet50_224_Samsung Galaxy S24 Ultra
Uploading tmp_dmpbzhw.pt


100%|[34m██████████[0m| 90.3M/90.3M [00:02<00:00, 36.6MB/s]


Scheduled compile job (j5q2wjn45) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/j5q2wjn45/

Submitting compile job for: Resnet50_336_Samsung Galaxy S24 Ultra
Uploading tmp_ibqf_wa.pt


100%|[34m██████████[0m| 90.3M/90.3M [00:02<00:00, 40.5MB/s]


Scheduled compile job (jglk7jd8p) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jglk7jd8p/

Submitting compile job for: Resnet50_448_Samsung Galaxy S24 Ultra
Uploading tmpaz7dh5xh.pt


100%|[34m██████████[0m| 90.3M/90.3M [00:02<00:00, 35.4MB/s]


Scheduled compile job (j561vkx0p) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/j561vkx0p/

Submitting compile job for: Resnet50_560_Samsung Galaxy S24 Ultra
Uploading tmp5_0akmq4.pt


100%|[34m██████████[0m| 90.3M/90.3M [00:02<00:00, 41.9MB/s]


Scheduled compile job (jp3m8ydlg) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jp3m8ydlg/

Submitting compile job for: Resnet50_672_Samsung Galaxy S24 Ultra
Uploading tmpr0stkc44.pt


100%|[34m██████████[0m| 90.3M/90.3M [00:02<00:00, 41.9MB/s]


Scheduled compile job (jgovmjxx5) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jgovmjxx5/

Submitting compile job for: Resnet50_784_Samsung Galaxy S24 Ultra
Uploading tmphwh8vjwc.pt


100%|[34m██████████[0m| 90.3M/90.3M [00:02<00:00, 35.1MB/s]


Scheduled compile job (jpvw4j8jg) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jpvw4j8jg/

Submitting compile job for: Resnet50_896_Samsung Galaxy S24 Ultra
Uploading tmpuf4x6osj.pt


100%|[34m██████████[0m| 90.3M/90.3M [00:02<00:00, 41.6MB/s]


Scheduled compile job (jgjl1j9xp) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jgjl1j9xp/

Submitting compile job for: Resnet50_1008_Samsung Galaxy S24 Ultra
Uploading tmpv8j5or9s.pt


100%|[34m██████████[0m| 90.3M/90.3M [00:02<00:00, 40.5MB/s]


Scheduled compile job (jpev2jq15) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jpev2jq15/

Submitting compile job for: Resnet50_1120_Samsung Galaxy S24 Ultra
Uploading tmp8um3jwr6.pt


100%|[34m██████████[0m| 90.3M/90.3M [00:02<00:00, 40.5MB/s]


Scheduled compile job (jgz7w16kp) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jgz7w16kp/



In [22]:
profile_jobs = []

for res, name, job in compile_jobs:
    device = job.device
    formattedName = job.name + device.name

    print("Submitting profiling job for:" + formattedName)

    pf_job = hub.submit_profile_job(
        model=job.get_target_model(),
        device=device,
        name=job.name + "_profiling"
    )

    assert isinstance(pf_job, hub.ProfileJob)
    profile_jobs.append((res, name, pf_job))


Submitting profiling job for:Resnet18_112_Samsung Galaxy S24 UltraSamsung Galaxy S24 Ultra
Scheduled profile job (j5w9xjj6p) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/j5w9xjj6p/

Submitting profiling job for:Resnet18_224_Samsung Galaxy S24 UltraSamsung Galaxy S24 Ultra
Scheduled profile job (jg94866l5) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jg94866l5/

Submitting profiling job for:Resnet18_336_Samsung Galaxy S24 UltraSamsung Galaxy S24 Ultra
Scheduled profile job (jp183rr2g) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jp183rr2g/

Submitting profiling job for:Resnet18_448_Samsung Galaxy S24 UltraSamsung Galaxy S24 Ultra
Scheduled profile job (jgdv0jjeg) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jgdv0jjeg/

Submitting profiling job for:Resnet18_560_Samsung Galaxy S24 UltraSamsung Galaxy S24 Ult

In [27]:
import numpy as np
import pandas as pd

def us_to_ms(x):
    return x / 1e3

def bytes_to_mb(x):
    return x / (1024 ** 2)

def extract_resolution(model_name):
    """Extract resolution from model name (e.g., 'Resnet18_112' -> 112)"""
    parts = model_name.split("_")
    if len(parts) >= 2:
        try:
            return int(parts[-1])
        except ValueError:
            # If last part is not a number, try second-to-last
            if len(parts) >= 3:
                try:
                    return int(parts[-2])
                except ValueError:
                    return None
    return None

def extract_architecture(model_name):
    """Extract architecture from model name (e.g., 'Resnet18_112' -> 'Resnet18')"""
    parts = model_name.split("_")
    if len(parts) >= 2:
        return parts[0]
    return model_name

summary_rows = []
util_rows = []
memory_rows = []
bottleneck_rows = []

for res, name, pf_job in profile_jobs:
    result = pf_job.download_profile()
    s = result["execution_summary"]
    d = pd.DataFrame(result["execution_detail"])
    times = np.array(s["all_inference_times"])

    # Use the 'name' from the tuple (e.g., "Resnet18_112")
    # and 'res' which is the resolution value
    model_name = name
    device_name = pf_job.device.name
    resolution = res  # Use the resolution from the tuple directly
    architecture = extract_architecture(model_name)

    # -------------------------------
    # Table 1: End-to-End Performance
    # -------------------------------
    summary_rows.append({
        "Architecture": architecture,
        "Resolution": resolution,
        "Device": device_name,
        "Mean Latency (ms)": round(us_to_ms(times.mean()), 4),
        "Median Latency (ms)": round(us_to_ms(np.median(times)), 4),
        "P50 Latency (ms)": round(us_to_ms(np.percentile(times, 50)), 4),
        "P95 Latency (ms)": round(us_to_ms(np.percentile(times, 95)), 4),
        "P99 Latency (ms)": round(us_to_ms(np.percentile(times, 99)), 4),
        "Std Dev (ms)": round(us_to_ms(times.std()), 4),
        "Cold Start (ms)": round(us_to_ms(s["first_load_time"]), 4),
        "Warm Start (ms)": round(us_to_ms(s["warm_load_time"]), 4),
        "Speedup (Cold→Warm)": round(s["first_load_time"] / s["warm_load_time"], 2),
    })

    # -------------------------------
    # Table 2: Memory Footprint
    # -------------------------------
    memory_rows.append({
        "Architecture": architecture,
        "Resolution": resolution,
        "Device": device_name,
        "Inference Peak (MB)": round(bytes_to_mb(s["estimated_inference_peak_memory"]), 2),
        "Cold Start Peak (MB)": round(bytes_to_mb(s["first_load_peak_memory"]), 2),
        "Warm Start Peak (MB)": round(bytes_to_mb(s["warm_load_peak_memory"]), 2),
        "Memory Reduction Cold→Warm (%)": round(
            (1 - s["warm_load_peak_memory"] / s["first_load_peak_memory"]) * 100, 2
        ),
        "Memory Reduction Warm→Inference (%)": round(
            (1 - s["estimated_inference_peak_memory"] / s["warm_load_peak_memory"]) * 100, 2
        ),
    })

    # -------------------------------
    # Table 3: Accelerator Utilization
    # -------------------------------
    total_time = d["execution_time"].sum()
    util = d.groupby("compute_unit")["execution_time"].sum() / total_time * 100

    util_rows.append({
        "Architecture": architecture,
        "Resolution": resolution,
        "Device": device_name,
        "CPU (%)": round(util.get("CPU", 0.0), 2),
        "GPU (%)": round(util.get("GPU", 0.0), 2),
        "NPU (%)": round(util.get("NPU", 0.0), 2),
        "Total Time (ms)": round(us_to_ms(total_time), 2),
        "Dominant Unit": util.idxmax() if len(util) > 0 else "N/A",
    })

    # -------------------------------
    # Table 4: Performance Bottlenecks
    # -------------------------------
    # Find top 5 slowest operations
    top_ops = d.nlargest(15, "execution_time")[["name", "type", "compute_unit", "execution_time"]]

    bottleneck_rows.append({
        "Architecture": architecture,
        "Resolution": resolution,
        "Device": device_name,
        "Slowest Op": top_ops.iloc[0]["name"].split("/")[-1],
        "Op Type": top_ops.iloc[0]["type"],
        "Op Time (ms)": round(us_to_ms(top_ops.iloc[0]["execution_time"]), 4),
        "Op Unit": top_ops.iloc[0]["compute_unit"],
        "Top 5 Ops Time (ms)": round(us_to_ms(top_ops["execution_time"].sum()), 2),
        "% of Total": round(top_ops["execution_time"].sum() / total_time * 100, 2),
    })

# Create tables (rows will be in the same order as profile_jobs)
table_perf = pd.DataFrame(summary_rows)
table_mem = pd.DataFrame(memory_rows)
table_util = pd.DataFrame(util_rows)
table_bottleneck = pd.DataFrame(bottleneck_rows)

# Display tables (Markdown format)
print("\n" + "="*140)
print("TABLE 1: End-to-End Performance")
print("="*140)
print(table_perf.to_markdown(index=False))

print("\n" + "="*140)
print("TABLE 2: Memory Footprint")
print("="*140)
print(table_mem.to_markdown(index=False))

print("\n" + "="*140)
print("TABLE 3: Accelerator Utilization")
print("="*140)
print(table_util.to_markdown(index=False))

print("\n" + "="*140)
print("TABLE 4: Performance Bottlenecks")
print("="*140)
print(table_bottleneck.to_markdown(index=False))

print(f"\n✓ Total profile jobs: {len(profile_jobs)}")
print(f"✓ Rows in each table: {len(table_perf)}")
print(f"✓ Architectures: {table_perf['Architecture'].unique().tolist()}")
print(f"✓ Resolutions: {sorted(table_perf['Resolution'].unique().tolist())}")


TABLE 1: End-to-End Performance
| Architecture   |   Resolution | Device                   |   Mean Latency (ms) |   Median Latency (ms) |   P50 Latency (ms) |   P95 Latency (ms) |   P99 Latency (ms) |   Std Dev (ms) |   Cold Start (ms) |   Warm Start (ms) |   Speedup (Cold→Warm) |
|:---------------|-------------:|:-------------------------|--------------------:|----------------------:|-------------------:|-------------------:|-------------------:|---------------:|------------------:|------------------:|----------------------:|
| Resnet18       |          112 | Samsung Galaxy S24 Ultra |              0.4503 |                0.416  |             0.416  |             0.4642 |             0.6759 |         0.2425 |           548.258 |           158.053 |                  3.47 |
| Resnet18       |          224 | Samsung Galaxy S24 Ultra |              0.741  |                0.7055 |             0.7055 |             0.7695 |             1.0084 |         0.252  |           581.469 |        