In [None]:
# Cell 0: Basic setup (run once)
# Installs the Qualcomm AI Hub client ("qai-hub") and Pillow into the runtime.
# --quiet suppresses most of pip's output. Versions are not pinned, so a fresh
# runtime picks up whatever release is current at install time.
!pip install --quiet "qai-hub" pillow


In [None]:
# Cell A: Configure Qualcomm AI Hub and list available devices
import os
from getpass import getpass

import qai_hub as hub

# Never paste the API token into the notebook source: it would be saved (and
# shared) together with the file. The token is read from the QAI_HUB_API_TOKEN
# environment variable (a name chosen by this notebook) when it is set;
# otherwise it is requested interactively without being echoed.
# The token itself comes from the AI Hub settings page.
api_token = (os.environ.get("QAI_HUB_API_TOKEN") or getpass("Qualcomm AI Hub API token: ")).strip()
if not api_token:
    raise ValueError("An AI Hub API token is required to continue.")
hub.set_session_token(api_token)

# The printed index of each entry is what the device-selection cell uses.
devices = hub.get_devices()
print("Available devices:")
for i, d in enumerate(devices):
    print(f"[{i}] {d}")


In [None]:
# Cell B: Create folder structure for model + calibration data (no copying)
from pathlib import Path

ROOT = Path("/content")  # Colab root
AIHUB_DIR = ROOT / "aihub"
MODEL_DIR = AIHUB_DIR / "model"
CALIB_DIR = AIHUB_DIR / "calib"

MODEL_DIR.mkdir(parents=True, exist_ok=True)
CALIB_DIR.mkdir(parents=True, exist_ok=True)

print("Created (or confirmed) directories:")
print("MODEL_DIR:", MODEL_DIR)
print("CALIB_DIR:", CALIB_DIR)

# Show empty tree (will be filled after you upload files)
!apt-get -qq install tree >/dev/null
!tree -L 3 /content/aihub


In [None]:
# Cell C: Wrap ONNX + .data into ONNX directory container for AI Hub
from pathlib import Path
import shutil

ROOT = Path("/content/aihub")

# If your exported model has a different name, change this:
MODEL_NAME = "ufldv2_tusimple_res18_800x320.onnx"

MODEL_UPLOAD_DIR = ROOT / "model"
onnx_file = MODEL_UPLOAD_DIR / MODEL_NAME
data_file = MODEL_UPLOAD_DIR / (MODEL_NAME + ".data")

assert onnx_file.exists(), f"ONNX file not found: {onnx_file}"
assert data_file.exists(), f"External weights file not found: {data_file}"

# This is the ONNX "container" directory that AI Hub expects:
#   /content/aihub/ufldv2_tusimple_res18_800x320.onnx/
#     ├── ufldv2_tusimple_res18_800x320.onnx
#     └── ufldv2_tusimple_res18_800x320.onnx.data
CONTAINER_DIR = ROOT / MODEL_NAME
CONTAINER_DIR.mkdir(exist_ok=True)

shutil.copy2(onnx_file, CONTAINER_DIR / onnx_file.name)
shutil.copy2(data_file, CONTAINER_DIR / data_file.name)

print("Created ONNX container dir:", CONTAINER_DIR)
!apt-get -qq install tree >/dev/null
!tree -L 2 /content/aihub


In [None]:
# Cell C: Select target Snapdragon device

import qai_hub as hub

devices = hub.get_devices()

# Position of the target device in the list printed by Cell A.
# NOTE: the list comes from the AI Hub service, so a fixed index is only valid
# as long as that list does not change. Check the "Selected device" output
# below every time this notebook is re-run.
DEVICE_INDEX = 61

# Fail with an actionable message instead of a bare "list index out of range".
if not -len(devices) <= DEVICE_INDEX < len(devices):
    raise IndexError(
        f"DEVICE_INDEX={DEVICE_INDEX} is out of range: AI Hub returned "
        f"{len(devices)} devices. Re-run Cell A and pick a valid index."
    )

device = devices[DEVICE_INDEX]
print("Selected device:", device)


In [None]:
# Cell D: Build calibration_data dict from uploaded KITTI frames
from pathlib import Path
from PIL import Image
import numpy as np

CALIB_DIR = Path("/content/aihub/calib")
# Network input resolution (width, height); matches the (1, 3, 320, 800)
# input shape used when compiling the quantized model.
IMG_W, IMG_H = 800, 320

png_paths = sorted(CALIB_DIR.glob("*.png"))
print("Found", len(png_paths), "calibration images in", CALIB_DIR)

assert len(png_paths) > 0, "No calibration PNGs found. Upload some images first."

# Limit number of calibration samples if many
MAX_CALIB = 200
png_paths = png_paths[:MAX_CALIB]
print("Using", len(png_paths), "images for calibration.")

# NOTE(review): only resizing and [0, 1] scaling are applied here. If the
# exported model expects additional preprocessing (e.g. mean/std normalization
# or cropping), mirror it here — TODO confirm against the export script.
calib_samples = []
for p in png_paths:
    # The context manager closes the file handle once the pixels are converted.
    with Image.open(p) as src:
        # Uploaded frames are not guaranteed to have the network resolution,
        # so every frame is resized to (IMG_W, IMG_H) before use.
        img = src.convert("RGB").resize((IMG_W, IMG_H), Image.BILINEAR)
    arr = np.asarray(img, dtype=np.float32) / 255.0   # HWC, values in [0, 1]
    arr = np.transpose(arr, (2, 0, 1))[None, ...]      # -> NCHW with batch of 1
    assert arr.shape == (1, 3, IMG_H, IMG_W), f"Unexpected sample shape {arr.shape} for {p}"
    calib_samples.append(arr)

# Assumes the ONNX graph's input tensor is named "input" (the same name is
# used for input_specs in the INT8 compile step) — TODO confirm.
calibration_data = {"input": calib_samples}
print("Built calibration_data with", len(calibration_data["input"]), "samples.")


In [None]:
# Cell E: Compile and profile FP32 ONNX model on Snapdragon via AI Hub
# Requires `device` from the device-selection cell and the ONNX container
# directory on disk. Produces `compiled_fp32_model` and `profile_fp32`.
import qai_hub as hub
from pathlib import Path

ROOT = Path("/content/aihub")
MODEL_NAME = "ufldv2_tusimple_res18_800x320.onnx"

# ONNX *directory container* created in Cell B3
CONTAINER_DIR = ROOT / MODEL_NAME
assert CONTAINER_DIR.is_dir(), f"ONNX container dir not found: {CONTAINER_DIR}"

print("Submitting FP32 compile job with ONNX container:", CONTAINER_DIR)

compile_fp32_job = hub.submit_compile_job(
    model=CONTAINER_DIR,
    device=device,
    options="--target_runtime qnn_context_binary",
    name="ufldv2_tusimple_fp32_compile",
)

# Wait for completion
compile_fp32_job.wait()

# Get detailed status
status = compile_fp32_job.get_status()
print("FP32 compile job status code:", status.code)
print("FP32 compile job message:", status.message)
print("Job URL:", compile_fp32_job.url)

# Bail out early unless the job succeeded. Use the JobStatus.success property
# rather than comparing `code` with a string literal, so the check does not
# depend on how the status code happens to be represented.
if not status.success:
    raise RuntimeError(
        f"Compile job failed with code={status.code}, message={status.message}. "
        f"Open this URL in your browser for details: {compile_fp32_job.url}"
    )

# Otherwise, fetch the compiled target model
compiled_fp32_model = compile_fp32_job.get_target_model()
if compiled_fp32_model is None:
    raise RuntimeError(
        "Compile job reported SUCCESS but returned no target model. "
        f"Check logs at: {compile_fp32_job.url}"
    )

print("✅ FP32 compile job done. Target model id:", compiled_fp32_model.model_id)

# Now profile on-device
print("Submitting FP32 profile job...")
profile_fp32_job = hub.submit_profile_job(
    model=compiled_fp32_model,
    device=device,
)

# Block until the profile job finishes
profile_status = profile_fp32_job.wait()
print("FP32 profile job status:", profile_status)
print("FP32 profile job URL:", profile_fp32_job.url)

# A failed profile job has no usable results; stop here with a clear message
# instead of failing later while downloading or comparing profiles.
if not profile_status.success:
    raise RuntimeError(
        f"FP32 profile job failed with message={profile_status.message}. "
        f"Open this URL in your browser for details: {profile_fp32_job.url}"
    )

# Download the profile results as a Python dict
profile_fp32 = profile_fp32_job.download_profile()
print("\n=== FP32 Snapdragon Profile (raw dict) ===")
print(profile_fp32)


In [None]:
# Cell F: Quantize (INT8 PTQ), compile, and profile on Snapdragon
# Produces `quantized_onnx_model`, `compiled_int8_model` and `profile_int8`.
import qai_hub as hub
from pathlib import Path

# Assumes:
# - calibration_data is already built in Cell D
# - device is selected in Cell C
# - ONNX container dir created in Cell B3

ROOT = Path("/content/aihub")
MODEL_NAME = "ufldv2_tusimple_res18_800x320.onnx"
CONTAINER_DIR = ROOT / MODEL_NAME
assert CONTAINER_DIR.is_dir(), f"ONNX container dir not found: {CONTAINER_DIR}"

# 1) Quantize FP32 ONNX -> INT8 (w8a8) using calibration_data
print("Submitting INT8 quantize job (w8a8 PTQ)...")
quant_job = hub.submit_quantize_job(
    model=CONTAINER_DIR,
    calibration_data=calibration_data,
    weights_dtype=hub.QuantizeDtype.INT8,
    activations_dtype=hub.QuantizeDtype.INT8,
    name="ufldv2_tusimple_int8_w8a8",
)

quant_job.wait()
q_status = quant_job.get_status()
print("INT8 quantize job status code:", q_status.code)
print("INT8 quantize job message:", q_status.message)
print("INT8 quantize job URL:", quant_job.url)

# Use JobStatus.success instead of comparing `code` with a string literal, so
# the check does not depend on how the status code is represented.
if not q_status.success:
    raise RuntimeError(
        f"Quantize job failed with code={q_status.code}, message={q_status.message}. "
        f"Open this URL in your browser for details: {quant_job.url}"
    )

quantized_onnx_model = quant_job.get_target_model()
if quantized_onnx_model is None:
    raise RuntimeError(
        "Quantize job reported SUCCESS but returned no target model. "
        f"Check logs at: {quant_job.url}"
    )

print("✅ INT8 quantize job done. Quantized model id:", quantized_onnx_model.model_id)

# 2) Compile quantized ONNX to QNN context binary with quantized IO
print("\nSubmitting INT8 compile job...")
compile_int8_job = hub.submit_compile_job(
    model=quantized_onnx_model,
    device=device,
    input_specs=dict(input=(1, 3, 320, 800)),  # NCHW: batch 1, RGB, 320 x 800
    options="--target_runtime qnn_context_binary --quantize_io",
    name="ufldv2_tusimple_int8_compile",
)

compile_int8_job.wait()
c_status = compile_int8_job.get_status()
print("INT8 compile job status code:", c_status.code)
print("INT8 compile job message:", c_status.message)
print("INT8 compile job URL:", compile_int8_job.url)

if not c_status.success:
    raise RuntimeError(
        f"INT8 compile job failed with code={c_status.code}, message={c_status.message}. "
        f"Open this URL in your browser for details: {compile_int8_job.url}"
    )

compiled_int8_model = compile_int8_job.get_target_model()
if compiled_int8_model is None:
    raise RuntimeError(
        "INT8 compile job reported SUCCESS but returned no target model. "
        f"Check logs at: {compile_int8_job.url}"
    )

print("✅ INT8 compile job done. Target model id:", compiled_int8_model.model_id)

# 3) Profile INT8 model on-device
print("\nSubmitting INT8 profile job...")
profile_int8_job = hub.submit_profile_job(
    model=compiled_int8_model,
    device=device,
)

profile_status_int8 = profile_int8_job.wait()
print("INT8 profile job status:", profile_status_int8)
print("INT8 profile job URL:", profile_int8_job.url)

# A failed profile job has no usable results; stop here with a clear message
# instead of failing later while downloading or comparing profiles.
if not profile_status_int8.success:
    raise RuntimeError(
        f"INT8 profile job failed with message={profile_status_int8.message}. "
        f"Open this URL in your browser for details: {profile_int8_job.url}"
    )

# Download the profile results as a Python dict
profile_int8 = profile_int8_job.download_profile()
print("\n=== INT8 Snapdragon Profile (raw dict) ===")
print(profile_int8)


In [None]:
# Cell G: Compare key performance metrics between FP32 and INT8 profiles

def _extract_summary(profile_dict):
    """
    Return the summary section of a downloaded profile.

    Two layouts are accepted:
      - a dict with an "execution_summary" key -> that nested value is returned
      - anything else (including a dict without that key) -> returned unchanged
    """
    is_nested = isinstance(profile_dict, dict) and "execution_summary" in profile_dict
    return profile_dict["execution_summary"] if is_nested else profile_dict

def _get_latency_us(summary):
    """
    Look up the inference latency in a profile summary.

    "estimated_inference_time" takes precedence over "execution_time".
    Returns None when `summary` is None or contains neither key.
    """
    if summary is None:
        return None
    for key in ("estimated_inference_time", "execution_time"):
        if key in summary:
            return summary[key]
    return None

def _get_peak_mem_bytes(summary):
    """
    Look up the peak inference memory in a profile summary.

    Uses "estimated_inference_peak_memory" when that key exists; otherwise the
    second element of a non-empty "inference_memory_peak_range". Returns None
    when `summary` is None or neither source is available.
    """
    if summary is None:
        return None

    if "estimated_inference_peak_memory" in summary:
        return summary["estimated_inference_peak_memory"]

    range_key = "inference_memory_peak_range"
    if range_key not in summary or not summary[range_key]:
        return None
    # Index 1 is treated as the upper bound of the range.
    return summary[range_key][1]

def _ms(us):
    """Divide by 1000 (microseconds -> milliseconds, per the parameter name); None passes through."""
    if us is None:
        return None
    return us / 1000.0

def _mb(bytes_):
    """Divide a byte count by 1024 * 1024 to get MiB; None passes through."""
    if bytes_ is None:
        return None
    return bytes_ / (1024.0 * 1024.0)

# ---- Pull the headline numbers out of both profiles ----
# Every value may be None when the corresponding key is missing from a profile.
summary_fp32, summary_int8 = map(_extract_summary, (profile_fp32, profile_int8))

lat_us_fp32, lat_us_int8 = map(_get_latency_us, (summary_fp32, summary_int8))
mem_b_fp32, mem_b_int8 = map(_get_peak_mem_bytes, (summary_fp32, summary_int8))

lat_ms_fp32, lat_ms_int8 = _ms(lat_us_fp32), _ms(lat_us_int8)
mem_mb_fp32, mem_mb_int8 = _mb(mem_b_fp32), _mb(mem_b_int8)

# Frames per second implied by the single-inference latency; left as None when
# the latency is missing or zero.
fps_fp32 = None if not lat_ms_fp32 else 1000.0 / lat_ms_fp32
fps_int8 = None if not lat_ms_int8 else 1000.0 / lat_ms_int8

print("=== Snapdragon Performance Comparison (QNN context binary) ===\n")

def fmt(x, digits=3):
    """Render an int/float with `digits` decimal places; anything else (including None) becomes "N/A"."""
    if not isinstance(x, (int, float)):
        return "N/A"
    return f"{x:.{digits}f}"

# Side-by-side table: one row per metric, FP32 vs INT8 columns.
print(f"{'Metric':<25} {'FP32':>12} {'INT8 (w8a8)':>15}")
print("-" * 54)
for label, fp32_value, int8_value in (
    ("Latency (ms)", lat_ms_fp32, lat_ms_int8),
    ("Throughput (FPS)", fps_fp32, fps_int8),
    ("Peak infer mem (MB)", mem_mb_fp32, mem_mb_int8),
):
    print(f"{label:<25} {fmt(fp32_value):>12} {fmt(int8_value):>15}")

# Optional extras: first/warm load times, read only from non-empty summaries.
fl_us_fp32 = None if not summary_fp32 else summary_fp32.get("first_load_time")
fl_us_int8 = None if not summary_int8 else summary_int8.get("first_load_time")
wl_us_fp32 = None if not summary_fp32 else summary_fp32.get("warm_load_time")
wl_us_int8 = None if not summary_int8 else summary_int8.get("warm_load_time")

# Only print the section when at least one load time is present (and non-zero).
if any((fl_us_fp32, fl_us_int8, wl_us_fp32, wl_us_int8)):
    print("\nLoad-time details (ms):")
    for label, fp32_value, int8_value in (
        ("First load", fl_us_fp32, fl_us_int8),
        ("Warm load", wl_us_fp32, wl_us_int8),
    ):
        print(f"{label:<25} {fmt(_ms(fp32_value)):>12} {fmt(_ms(int8_value)):>15}")
