<a href="https://colab.research.google.com/github/itsPronay/HSIC/blob/main/MVit_single_ai_hub.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/itsPronay/MViT

Cloning into 'MViT'...
remote: Enumerating objects: 98, done.[K
remote: Counting objects: 100% (98/98), done.[K
remote: Compressing objects: 100% (84/84), done.[K
remote: Total 98 (delta 28), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (98/98), 2.59 MiB | 16.92 MiB/s, done.
Resolving deltas: 100% (28/28), done.


In [2]:
import sys
import os

sys.path.append(os.path.join(os.getcwd(), 'MViT'))

import time
import math
import argparse
import numpy as np
from scipy.io import loadmat
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects


from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


import torch
import torch.nn as nn
from torch import optim
import torch.utils.data as Data
import torch.nn.functional as F
from torchsummary import summary
from einops import rearrange, repeat
from timm.models.vision_transformer import Block


from data_prepare import mirror_hsi
from data_prepare import choose_train_and_test
from data_prepare import choose_all_pixels, all_data
from data_prepare import train_and_test_data, train_and_test_label


from Utils import output_metric, plot_confusion_matrix
from CNNUtils import train, test, valid
from Utils import list_to_colormap, classification_map, print_args
from Utils import ActivationOutputData

%matplotlib inline

In [3]:
#!/bin/bash
!curl -L -o pavia-university-hsi.zip\
  https://www.kaggle.com/api/v1/datasets/download/syamkakarla/pavia-university-hsi && unzip pavia-university-hsi.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 33.2M  100 33.2M    0     0  7933k      0  0:00:04  0:00:04 --:--:-- 11.6M
Archive:  pavia-university-hsi.zip
  inflating: PaviaU.mat              
  inflating: PaviaU_gt.mat           


In [4]:
config = {
    'train_dir': 'PaviaU.mat',
    'ground_truth' : 'PaviaU_gt.mat'
}


In [5]:
parser = argparse.ArgumentParser('Pavia_HSIC')

parser.add_argument('--epoch', type=int, default=10)
parser.add_argument('--learning_rate', type=float, default=0.001)
parser.add_argument('--batch_size', type=int, default=10)
parser.add_argument('--patch_size', type=int, default=15)
parser.add_argument('--seed', type=int, default=41)
parser.add_argument('--train_number', type=int, default=25, help='num_train_per_class')
parser.add_argument('--gamma', type=float, default=0.99, help='gamma')
parser.add_argument('--weight_decay', type=float, default=0.001, help='weight_decay')

args, _ = parser.parse_known_args()

In [6]:
dataset_path = os.path.join(os.getcwd(), config['train_dir'])
ground_truth_path = os.path.join(os.getcwd(), config['ground_truth'])

dataset = loadmat(dataset_path)['paviaU']
ground_truth = loadmat(ground_truth_path)['paviaU_gt']

print('Training shape', str(dataset.shape))
print('GT shape', str(ground_truth.shape))

classes = np.max(ground_truth)
print('Number of classes in Pavia dartaset are ' + str(classes))

Training shape (610, 340, 103)
GT shape (610, 340)
Number of classes in Pavia dartaset are 9


In [7]:
#normalizing
shapeor = dataset.shape

dataset = dataset.reshape(np.prod(dataset.shape[:2]), np.prod(dataset.shape[2:]))

std_scaler = StandardScaler()
std_data = std_scaler.fit_transform(dataset)
dataset = std_data.reshape(shapeor)


In [8]:
def applyPCA(data, numComponents=30):
    new_data = np.reshape(data, (-1, data.shape[2]))
    pca = PCA(n_components=numComponents, whiten=True)
    new_data = pca.fit_transform(new_data)
    new_data = np.reshape(new_data, (data.shape[0], data.shape[1], numComponents))
    return new_data, pca

In [9]:
K = 30
print('Before PCA size ' + str(dataset.shape))
dataset, pca = applyPCA(dataset, numComponents=K)
print('After PCA size ' + str(dataset.shape))

Before PCA size (610, 340, 103)
After PCA size (610, 340, 30)


In [10]:
height, width, band = dataset.shape

mirror_data = mirror_hsi(height, width, band, dataset, patch_size=args.patch_size)

*******************************************************
patch_size : 15
mirror_data shape : [624, 354, 30]
*******************************************************


In [11]:
total_pos_train, total_pos_test, total_pos_valid, number_train, number_test, number_valid = choose_train_and_test(ground_truth, args.train_number, args.seed)

x_train, x_test, x_valid = train_and_test_data(mirror_data, band, total_pos_train, total_pos_test, total_pos_valid, args.patch_size)
y_train, y_test, y_valid = train_and_test_label(number_train, number_test, number_valid, classes)

*******************************************************
x_train shape = (225, 15, 15, 30), type = float64
x_test  shape = (42551, 15, 15, 30), type = float64
x_valid  shape = (900, 15, 15, 30), type = float64
*******************************************************
y_train: shape = (225,), type = int64
y_test: shape = (42551,), type = int64
y_valid: shape = (900,), type = int64
*******************************************************


In [12]:
total_pos_train[:10]

array([[ 91, 145],
       [271, 228],
       [ 57, 131],
       [168, 184],
       [570,  34],
       [582,  68],
       [  4,  98],
       [ 22,  87],
       [ 48,  41],
       [ 98, 147]])

In [13]:
import torch.utils.data as Data

# load data
x_train = torch.from_numpy(x_train.transpose(0, 3, 1, 2)).unsqueeze(1).type(torch.FloatTensor)  # (90, 30, 15, 15)
print(x_train.shape)
y_train = torch.from_numpy(y_train).type(torch.LongTensor)  # (13,)
train_label = Data.TensorDataset(x_train, y_train)

x_test = torch.from_numpy(x_test.transpose(0, 3, 1, 2)).unsqueeze(1).type(torch.FloatTensor)  # (5198, 30, 15, 15)
print(x_test.shape)
y_test = torch.from_numpy(y_test).type(torch.LongTensor)  # (5198,)
test_label = Data.TensorDataset(x_test, y_test)

x_valid = torch.from_numpy(x_valid.transpose(0, 3, 1, 2)).unsqueeze(1).type(torch.FloatTensor)  # (5211, 30, 15, 15)
print(x_valid.shape)
y_valid = torch.from_numpy(y_valid).type(torch.LongTensor)
valid_label = Data.TensorDataset(x_valid, y_valid)

train_loader = Data.DataLoader(train_label, batch_size=30, shuffle=True)
test_loader = Data.DataLoader(test_label, batch_size=128, shuffle=True)
valid_loader = Data.DataLoader(valid_label, batch_size=64, shuffle=True)

torch.Size([225, 1, 30, 15, 15])
torch.Size([42551, 1, 30, 15, 15])
torch.Size([900, 1, 30, 15, 15])


# Model

In [14]:
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M, )
    out: (M, D)
    """
    assert embed_dim % 2 == 0
    omega = np.arange(embed_dim // 2, dtype=np.float32)
    omega /= embed_dim / 2.
    omega = 1. / 10000**omega  # (D/2, )  32

    pos = pos.reshape(-1)  # (M, )  169
    out = np.einsum('m, d -> md', pos, omega)  # (M, D/2), outer product

    emb_sin = np.sin(out)  # (M, D/2)
    emb_cos = np.cos(out) # (M, D/2)

    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
    return emb


def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    assert embed_dim % 2 == 0

    # use half of dimensions to encode grid_h
    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)

    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
    return emb


def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=True):
    """
    grid_size: int of the grid height and width
    return:
    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
    """
    grid_h = np.arange(grid_size, dtype=np.float32)
    grid_w = np.arange(grid_size, dtype=np.float32)
    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
    grid = np.stack(grid, axis=0)

    grid = grid.reshape([2, 1, grid_size, grid_size])
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)  # (H*W, D)
    if cls_token:
        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)  # (1+H*W, D)
    return pos_embed

In [15]:
from einops import rearrange, repeat
import torch.nn as nn
from timm.models.vision_transformer import Block

class MViT(nn.Module):
    def __init__(self, in_chans=1, bands=30, num_classes=9, dim=64, heads=4, depth=3, dropout=0.2):
        super(MViT, self).__init__()
        self.conv3d = nn.Sequential(nn.Conv3d(1, 8, 3), nn.BatchNorm3d(8), nn.ReLU())
        self.conv2d = nn.Sequential(nn.Conv2d(8*28, 64, 3), nn.BatchNorm2d(64), nn.ReLU())

        self.cls_token = nn.Parameter(torch.zeros(1, 1, dim))
        nn.init.normal_(self.cls_token, std=.02)

        self.pos_embed = nn.Parameter(torch.zeros(1, 121 + 1, dim), requires_grad=False)
        pos_embed = get_2d_sincos_pos_embed(dim, 11, cls_token=True)
        self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))

        self.blocks = nn.ModuleList([Block(dim, heads, qkv_bias=True, attn_drop=0.1) for _ in range(depth)])

        self.norm = nn.LayerNorm(dim)

        self.cls_head = nn.Linear(dim, num_classes)
        self.dropout = nn.Dropout(dropout)

    def random_masking(self, x, mask_ratio=0.75):
        N, L, D = x.shape  # batch, length, dim
        len_keep = int(L * (1 - mask_ratio))
        noise = torch.rand(N, L, device=x.device)  # noise in [0, 1]  (N, L)
        # sort noise for each sample
        ids_shuffle = torch.argsort(noise, dim=1)  # 从小到大排序，返回索引 (N, L)
        # keep the first subset
        ids_keep = ids_shuffle[:, :len_keep]
        x_masked = torch.gather(x, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, D))
        return x_masked

    def forward(self, x, mask_ratio=0.75):
        # (B, 1, 30, 15, 15)
        x = self.conv3d(x)  # (B, 8, 28, 13, 13)
        x = rearrange(x, 'b c d h w -> b (c d) h w')  # (B, 224, 13, 13)
        x = self.conv2d(x)  # (B, 64, 11, 11)
        x = rearrange(x, 'b c h w -> b (h w) c')  # (B, 121, 64)

        x = x + self.pos_embed[:, 1:, :]

        center_embed = x[:, 60, :].unsqueeze(1)  # (B, 1, 64)
        x = torch.cat([x[:, :60, :], x[:, 61:, :]], dim=1)  # (B, 120, 64)

        if self.training:
            x = self.random_masking(x, mask_ratio)  # (B, 30, 64)

        x = torch.cat([center_embed, x], dim=1)

        cls_token = self.cls_token + self.pos_embed[:, :1, :]
        cls_token = repeat(cls_token, '1 n d -> b n d', b = x.shape[0])  # (B, 1, 64)

        x = torch.cat((cls_token, x), dim = 1)

        x = self.dropout(x)  # (B, 31, 64)

        for blk in self.blocks:
            x = blk(x)

        latent = self.norm(x[:, 0, :])
        x = self.cls_head(latent)
        return x

In [16]:
model = MViT(num_classes=classes).cuda()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=args.gamma)

In [17]:
print('start training')
acc_list = [0.00]
path = 'mvit.pt'
tic = time.time()
for epoch in range(args.epoch):
    # 计算的是移动平均准确率
    train_acc, train_loss = train(model, train_loader, criterion, optimizer)
    valid_acc, valid_loss = valid(model, valid_loader, criterion)
    print("Epoch: {:03d} - train_loss: {:.4f} - train_acc: {:.4f} - valid_loss: {:.4f} - valid_acc: {:.4f}".\
          format(epoch+1, train_loss, train_acc, valid_loss, valid_acc))
    scheduler.step()

    acc_list.append(valid_acc)
    if acc_list[-1] > acc_list[-2]:
        print("val_acc improved from {:.4f} to {:.4f}, saving model to mvit.pt".format(acc_list[-2], acc_list[-1]))
        torch.save(model.state_dict(), path)
    else:
        print("val_acc did not improve from {:.4f}".format(acc_list[-2]))
        acc_list[-1] = acc_list[-2]

toc = time.time()
print("Running Time: {:.2f}".format(toc-tic))
print("**************************************************")

start training
Epoch: 001 - train_loss: 1.6653 - train_acc: 39.1111 - valid_loss: 1.3176 - valid_acc: 64.0000
val_acc improved from 0.0000 to 64.0000, saving model to mvit.pt
Epoch: 002 - train_loss: 0.8390 - train_acc: 76.0000 - valid_loss: 0.6720 - valid_acc: 83.8889
val_acc improved from 64.0000 to 83.8889, saving model to mvit.pt
Epoch: 003 - train_loss: 0.5940 - train_acc: 83.1111 - valid_loss: 0.5579 - valid_acc: 78.3333
val_acc did not improve from 83.8889
Epoch: 004 - train_loss: 0.4816 - train_acc: 87.5556 - valid_loss: 0.4277 - valid_acc: 88.5556
val_acc improved from 83.8889 to 88.5556, saving model to mvit.pt
Epoch: 005 - train_loss: 0.3766 - train_acc: 89.3333 - valid_loss: 0.3887 - valid_acc: 89.1111
val_acc improved from 88.5556 to 89.1111, saving model to mvit.pt
Epoch: 006 - train_loss: 0.2749 - train_acc: 92.4445 - valid_loss: 0.3443 - valid_acc: 88.5556
val_acc did not improve from 89.1111
Epoch: 007 - train_loss: 0.1945 - train_acc: 96.8889 - valid_loss: 0.2336 - va

In [18]:
model.load_state_dict(torch.load(path))
model.eval()

tar_test, pre_test = test(model, test_loader)
OA_test, AA_mean_test, Kappa_test, AA_test = output_metric(tar_test, pre_test)
AA_test = np.around(AA_test*100, 2)

In [19]:

print("*******************************************************************")
print("Final result:")
print("OA: {:.2f}, AA: {:.2f}, Kappa: {:.4f}".format(OA_test * 100., AA_mean_test*100., Kappa_test))
print("*******************************************************************")
print("Recal: {}".format(AA_test))
print("*******************************************************************")
print("Parameter:")
print_args(vars(args))
print("*******************************************************************")

*******************************************************************
Final result:
OA: 94.59, AA: 94.40, Kappa: 0.9286
*******************************************************************
Recal: [ 91.31  96.28  90.74  90.79 100.    99.94  98.54  86.22  95.77]
*******************************************************************
Parameter:
epoch: 10
learning_rate: 0.001
batch_size: 10
patch_size: 15
seed: 41
train_number: 25
gamma: 0.99
weight_decay: 0.001
*******************************************************************




```
*******************************************************************
Final result:
OA: 91.92, AA: 93.49, Kappa: 0.8947
*******************************************************************
Recal: [ 88.83  90.8   86.02  89.47 100.   100.    99.46  90.84  95.99]
*******************************************************************
Parameter:
epoch: 300
learning_rate: 0.001
batch_size: 10
patch_size: 15
seed: 41
train_number: 25
gamma: 0.99
weight_decay: 0.001
*******************************************************************```



In [20]:
!pip install "qai-hub[torch]"
!qai-hub configure --api_token znlq94irgqllstitbp39jzkdvvkk7cmu6lrcjy33

Collecting qai-hub[torch]
  Downloading qai_hub-0.44.0-py3-none-any.whl.metadata (2.6 kB)
Collecting backoff>=2.2 (from qai-hub[torch])
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Collecting s3transfer<0.14,>=0.10.3 (from qai-hub[torch])
  Downloading s3transfer-0.13.1-py3-none-any.whl.metadata (1.7 kB)
Collecting semver>=3.0 (from qai-hub[torch])
  Downloading semver-3.0.4-py3-none-any.whl.metadata (6.8 kB)
Collecting botocore<2.0a.0,>=1.37.4 (from s3transfer<0.14,>=0.10.3->qai-hub[torch])
  Downloading botocore-1.42.49-py3-none-any.whl.metadata (5.9 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from botocore<2.0a.0,>=1.37.4->s3transfer<0.14,>=0.10.3->qai-hub[torch])
  Downloading jmespath-1.1.0-py3-none-any.whl.metadata (7.6 kB)
Downloading backoff-2.2.1-py3-none-any.whl (15 kB)
Downloading s3transfer-0.13.1-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.3/85.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading semve

In [21]:
model.load_state_dict(torch.load(path))
model = model.to('cpu').eval()
print('Model moved to cpu and in evaluation mode')

Model moved to cpu and in evaluation mode


In [22]:

import qai_hub as hub

devices = [
    # hub.Device('Dragonwing IQ-9075 EVK'),
    # hub.Device('QCS8550 (Proxy)'),
    # hub.Device('Google Pixel 10 Pro XL'),
    # hub.Device('Samsung Galaxy S24 (Family)'),
    hub.Device('Samsung Galaxy S24 Ultra')
]

In [23]:

traced_models = []

shape = (1, 1, 30, 15, 15)

input_shape: tuple[int, ...] = shape
example_input = torch.rand(input_shape)

model_name = 'traced_MVIT'
traced_model = torch.jit.trace(model, example_input)

In [24]:

compile_jobs = []

for device in devices:
    name_formatted =  "MViT_" + device.name
    print("Submitting compile job for: " + name_formatted)

    job = hub.submit_compile_job(
        model=traced_model,
        name=name_formatted,
        device=device,
        input_specs=dict(image=input_shape),
    )
    assert isinstance(job, hub.CompileJob)
    compile_jobs.append((name_formatted, job))

Submitting compile job for: MViT_Samsung Galaxy S24 Ultra
Uploading tmpttm7ksi4.pt


100%|[34m██████████[0m| 1.22M/1.22M [00:01<00:00, 838kB/s]


Scheduled compile job (jgdvz3jzg) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jgdvz3jzg/



In [27]:

profile_jobs = []

for name, job in compile_jobs:
    device = job.device

    print("Submitting profiling job for:" + name)

    pf_job = hub.submit_profile_job(
        model=job.get_target_model(),
        device=device,
        name=job.name + "_profiling"
    )

    assert isinstance(pf_job, hub.ProfileJob)
    profile_jobs.append((name, pf_job))

Submitting profiling job for:MViT_Samsung Galaxy S24 Ultra
Scheduled profile job (jp18vjrng) successfully. To see the status and results:
    https://workbench.aihub.qualcomm.com/jobs/jp18vjrng/



In [28]:
import torch
import qai_hub as hub
import numpy as np
import pandas as pd

# ============================================================================
# STEP 3: Extract and Display Results
# ============================================================================

def us_to_ms(x):
    """Convert microseconds to milliseconds"""
    return x / 1e3

def bytes_to_mb(x):
    """Convert bytes to megabytes"""
    return x / (1024 ** 2)

def extract_architecture(model_name):
    """Extract architecture from model name"""
    return model_name.split("_")[0]

summary_rows = []
util_rows = []
memory_rows = []
bottleneck_rows = []

for name, pf_job in profile_jobs:
    result = pf_job.download_profile()
    s = result["execution_summary"]
    d = pd.DataFrame(result["execution_detail"])
    times = np.array(s["all_inference_times"])

    model_name = name
    device_name = pf_job.device.name
    architecture = extract_architecture(model_name)

    # For HSI model, resolution is fixed at 15x15 patches with 30 bands
    resolution = "15x15x30"  # Height x Width x Bands

    # -------------------------------
    # Table 1: End-to-End Performance
    # -------------------------------
    summary_rows.append({
        "Model": model_name,
        "Input Size": resolution,
        "Device": device_name,
        "Mean Latency (ms)": round(us_to_ms(times.mean()), 4),
        "Median Latency (ms)": round(us_to_ms(np.median(times)), 4),
        "P95 Latency (ms)": round(us_to_ms(np.percentile(times, 95)), 4),
        "P99 Latency (ms)": round(us_to_ms(np.percentile(times, 99)), 4),
        "Std Dev (ms)": round(us_to_ms(times.std()), 4),
        "Cold Start (ms)": round(us_to_ms(s["first_load_time"]), 4),
        "Warm Start (ms)": round(us_to_ms(s["warm_load_time"]), 4),
    })

    # -------------------------------
    # Table 2: Memory Footprint
    # -------------------------------
    memory_rows.append({
        "Model": model_name,
        "Device": device_name,
        "Inference Peak (MB)": round(bytes_to_mb(s["estimated_inference_peak_memory"]), 2),
        "Cold Start Peak (MB)": round(bytes_to_mb(s["first_load_peak_memory"]), 2),
        "Warm Start Peak (MB)": round(bytes_to_mb(s["warm_load_peak_memory"]), 2),
    })

    # -------------------------------
    # Table 3: Accelerator Utilization
    # -------------------------------
    total_time = d["execution_time"].sum()
    util = d.groupby("compute_unit")["execution_time"].sum() / total_time * 100

    util_rows.append({
        "Model": model_name,
        "Device": device_name,
        "CPU (%)": round(util.get("CPU", 0.0), 2),
        "GPU (%)": round(util.get("GPU", 0.0), 2),
        "NPU (%)": round(util.get("NPU", 0.0), 2),
        "Total Time (ms)": round(us_to_ms(total_time), 2),
        "Dominant Unit": util.idxmax() if len(util) > 0 else "N/A",
    })

    # -------------------------------
    # Table 4: Performance Bottlenecks
    # -------------------------------
    top_ops = d.nlargest(5, "execution_time")[["name", "type", "compute_unit", "execution_time"]]

    bottleneck_rows.append({
        "Model": model_name,
        "Device": device_name,
        "Slowest Op": top_ops.iloc[0]["name"].split("/")[-1] if len(top_ops) > 0 else "N/A",
        "Op Type": top_ops.iloc[0]["type"] if len(top_ops) > 0 else "N/A",
        "Op Time (ms)": round(us_to_ms(top_ops.iloc[0]["execution_time"]), 4) if len(top_ops) > 0 else 0,
        "Op Unit": top_ops.iloc[0]["compute_unit"] if len(top_ops) > 0 else "N/A",
        "Top 5 Ops Time (ms)": round(us_to_ms(top_ops["execution_time"].sum()), 2),
        "% of Total": round(top_ops["execution_time"].sum() / total_time * 100, 2),
    })

# Create DataFrames
table_perf = pd.DataFrame(summary_rows)
table_mem = pd.DataFrame(memory_rows)
table_util = pd.DataFrame(util_rows)
table_bottleneck = pd.DataFrame(bottleneck_rows)

# Display tables
print("\n" + "="*140)
print("TABLE 1: End-to-End Performance")
print("="*140)
print(table_perf.to_markdown(index=False))

print("\n" + "="*140)
print("TABLE 2: Memory Footprint")
print("="*140)
print(table_mem.to_markdown(index=False))

print("\n" + "="*140)
print("TABLE 3: Accelerator Utilization")
print("="*140)
print(table_util.to_markdown(index=False))

print("\n" + "="*140)
print("TABLE 4: Performance Bottlenecks")
print("="*140)
print(table_bottleneck.to_markdown(index=False))

print(f"\n✅ Total profile jobs: {len(profile_jobs)}")
print(f"✅ Models tested: {table_perf['Model'].unique().tolist()}")
print(f"✅ Devices tested: {table_perf['Device'].unique().tolist()}")

Waiting for profile job (jp18vjrng) completion. Type Ctrl+C to stop waiting at any time.
    ✅ SUCCESS                          

TABLE 1: End-to-End Performance
| Model                         | Input Size   | Device                   |   Mean Latency (ms) |   Median Latency (ms) |   P95 Latency (ms) |   P99 Latency (ms) |   Std Dev (ms) |   Cold Start (ms) |   Warm Start (ms) |
|:------------------------------|:-------------|:-------------------------|--------------------:|----------------------:|-------------------:|-------------------:|---------------:|------------------:|------------------:|
| MViT_Samsung Galaxy S24 Ultra | 15x15x30     | Samsung Galaxy S24 Ultra |              0.3236 |                 0.275 |             0.3311 |             0.5297 |         0.3376 |           436.078 |           144.507 |

TABLE 2: Memory Footprint
| Model                         | Device                   |   Inference Peak (MB) |   Cold Start Peak (MB) |   Warm Start Peak (MB) |
|:----------