In [1]:
import torch
import tensorrt as trt

print("Torch:", torch.__version__, "CUDA:", torch.cuda.is_available())
print("TensorRT:", trt.__version__)

Torch: 2.9.1+cu128 CUDA: True
TensorRT: 10.14.1.48.post1


In [2]:
import sys
from pathlib import Path
import torch
import torch.nn as nn
import tensorrt as trt
import numpy as np

sys.path.insert(0, str(Path.cwd().parents[1]))

from models.squeezenet_model import SqueezeNetCIFAR10

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

Device: cuda


In [3]:
model = SqueezeNetCIFAR10(num_classes=10)
model.load_state_dict(
    torch.load("../../pth/squeezenet_70.pth", map_location="cpu")
)
model.eval()

SqueezeNetCIFAR10(
  (conv1): Conv2d(3, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fire2): Fire(
    (conv1): Conv2d(96, 16, kernel_size=(1, 1), stride=(1, 1))
    (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu1): ReLU(inplace=True)
    (conv2): Conv2d(16, 64, kernel_size=(1, 1), stride=(1, 1))
    (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv3): Conv2d(16, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (bn3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu2): ReLU(inplace=True)
  )
  (fire3): Fire(
    (conv1): Conv2d(128, 16, kernel_size=(1, 1), stride=(1, 1))
    (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, af

In [4]:
import torch

model.eval()
dummy_map = {
    1:   torch.randn(1,   3, 32, 32),
    64:  torch.randn(64,  3, 32, 32),
    128: torch.randn(128, 3, 32, 32),
}

for bs, dummy in dummy_map.items():
    out_path = f"squeezenet_70_fp32_b{bs}_op13.onnx"
    torch.onnx.export(
        model, dummy, out_path,
        opset_version=13,
        do_constant_folding=True,
        input_names=["input"],
        output_names=["logits"],
        dynamic_axes=None,   # <-- IMPORTANT: static
        dynamo=False
    )
    print("Exported", out_path)

  torch.onnx.export(


Exported squeezenet_70_fp32_b1_op13.onnx
Exported squeezenet_70_fp32_b64_op13.onnx
Exported squeezenet_70_fp32_b128_op13.onnx


In [5]:
!ls -lh squeezenet_70_fp32_b1_op13.onnx
!ls -lh squeezenet_70_fp32_b64_op13.onnx
!ls -lh squeezenet_70_fp32_b128_op13.onnx

-rw-r--r-- 1 ihsiao ihsiao 2.8M Dec 14 09:26 squeezenet_70_fp32_b1_op13.onnx
-rw-r--r-- 1 ihsiao ihsiao 2.8M Dec 14 09:26 squeezenet_70_fp32_b64_op13.onnx
-rw-r--r-- 1 ihsiao ihsiao 2.8M Dec 14 09:26 squeezenet_70_fp32_b128_op13.onnx


In [6]:
import onnx
m = onnx.load("squeezenet_70_fp32_b1_op13.onnx")
onnx.checker.check_model(m)
print([(op.domain, op.version) for op in m.opset_import])

[('', 13)]


In [7]:
import tensorrt as trt
print(trt.__version__)

10.14.1.48.post1


In [8]:
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.INFO)

# IMPORTANT: these ONNX files must be exported with FIXED batch sizes (static)
onnx_map = {
    1:   "squeezenet_70_fp32_b1_op13.onnx",
    64:  "squeezenet_70_fp32_b64_op13.onnx",
    128: "squeezenet_70_fp32_b128_op13.onnx",
}

def build_static_engine(onnx_path, engine_path):
    with trt.Builder(TRT_LOGGER) as builder, \
         builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) as network, \
         trt.OnnxParser(network, TRT_LOGGER) as parser:

        with open(onnx_path, "rb") as f:
            if not parser.parse(f.read()):
                for i in range(parser.num_errors):
                    print(parser.get_error(i))
                raise RuntimeError(f"ONNX parse failed for {onnx_path}")

        config = builder.create_builder_config()
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)

        # NO optimization profile => static engine (uses whatever fixed shape is in ONNX)
        serialized = builder.build_serialized_network(network, config)
        if serialized is None:
            raise RuntimeError(f"Engine build failed for {onnx_path}")

        with open(engine_path, "wb") as f:
            f.write(serialized)

    print("Saved:", engine_path)

for bs, onnx_path in onnx_map.items():
    engine_path = f"squeezenet_70_fp32_b{bs}.engine"
    build_static_engine(onnx_path, engine_path)

[12/14/2025-09:26:57] [TRT] [I] [MemUsageChange] Init CUDA: CPU -23, GPU +0, now: CPU 558, GPU 1354 (MiB)
[12/14/2025-09:26:57] [TRT] [I] ----------------------------------------------------------------
[12/14/2025-09:26:57] [TRT] [I] ONNX IR version:  0.0.7
[12/14/2025-09:26:57] [TRT] [I] Opset version:    13
[12/14/2025-09:26:57] [TRT] [I] Producer name:    pytorch
[12/14/2025-09:26:57] [TRT] [I] Producer version: 2.9.1
[12/14/2025-09:26:57] [TRT] [I] Domain:           
[12/14/2025-09:26:57] [TRT] [I] Model version:    0
[12/14/2025-09:26:57] [TRT] [I] Doc string:       
[12/14/2025-09:26:57] [TRT] [I] ----------------------------------------------------------------
[12/14/2025-09:26:57] [TRT] [I] [MemUsageChange] Init builder kernel library: CPU +280, GPU +8, now: CPU 673, GPU 1362 (MiB)
[12/14/2025-09:26:57] [TRT] [I] Local timing cache in use. Profiling results in this builder pass will not be stored.
[12/14/2025-09:27:30] [TRT] [I] Detected 1 inputs and 1 output network tensors.


In [9]:
!ls -lh squeezenet_70_fp32_b1.engine
!ls -lh squeezenet_70_fp32_b64.engine
!ls -lh squeezenet_70_fp32_b128.engine

-rw-r--r-- 1 ihsiao ihsiao 3.1M Dec 14 09:27 squeezenet_70_fp32_b1.engine
-rw-r--r-- 1 ihsiao ihsiao 3.4M Dec 14 09:27 squeezenet_70_fp32_b64.engine
-rw-r--r-- 1 ihsiao ihsiao 3.4M Dec 14 09:28 squeezenet_70_fp32_b128.engine


In [10]:
import tensorrt as trt
import torch

def benchmark_engine_static(engine_path, batch_size, iters=1000):
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

    with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    context = engine.create_execution_context()

    names = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]
    inp = [n for n in names if engine.get_tensor_mode(n) == trt.TensorIOMode.INPUT][0]
    out = [n for n in names if engine.get_tensor_mode(n) == trt.TensorIOMode.OUTPUT][0]

    # ✅ STATIC engine: DO NOT set_input_shape()
    # context.set_input_shape(inp, (batch_size, 3, 32, 32))

    x = torch.randn(batch_size, 3, 32, 32, device="cuda", dtype=torch.float32)
    y = torch.empty(tuple(context.get_tensor_shape(out)), device="cuda", dtype=torch.float32)

    context.set_tensor_address(inp, int(x.data_ptr()))
    context.set_tensor_address(out, int(y.data_ptr()))

    stream = torch.cuda.Stream()

    # warmup
    for _ in range(50):
        context.execute_async_v3(stream_handle=stream.cuda_stream)
    stream.synchronize()

    start = torch.cuda.Event(enable_timing=True)
    end   = torch.cuda.Event(enable_timing=True)

    start.record(stream)
    for _ in range(iters):
        context.execute_async_v3(stream_handle=stream.cuda_stream)
    end.record(stream)
    stream.synchronize()

    elapsed_ms = start.elapsed_time(end)

    batch_latency_ms = elapsed_ms / iters
    throughput = (iters * batch_size) / (elapsed_ms / 1000.0)
    ms_per_img = batch_latency_ms / batch_size

    print(f"{engine_path} | batch={batch_size}")
    print(f"  latency:     {batch_latency_ms:.3f} ms/batch")
    print(f"  per-image:   {ms_per_img:.6f} ms/image")
    print(f"  throughput:  {throughput:.1f} images/sec")


print("Starting benchmark (3 static engines)...")
benchmark_engine_static("squeezenet_70_fp32_b1.engine",   batch_size=1,   iters=1000)
benchmark_engine_static("squeezenet_70_fp32_b64.engine",  batch_size=64,  iters=1000)
benchmark_engine_static("squeezenet_70_fp32_b128.engine", batch_size=128, iters=1000)

Starting benchmark (3 static engines)...
squeezenet_70_fp32_b1.engine | batch=1
  latency:     0.226 ms/batch
  per-image:   0.226146 ms/image
  throughput:  4421.9 images/sec
squeezenet_70_fp32_b64.engine | batch=64
  latency:     0.457 ms/batch
  per-image:   0.007145 ms/image
  throughput:  139949.1 images/sec
squeezenet_70_fp32_b128.engine | batch=128
  latency:     0.743 ms/batch
  per-image:   0.005806 ms/image
  throughput:  172244.6 images/sec


In [11]:
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.INFO)

# IMPORTANT: these ONNX files must be exported with FIXED batch sizes (static)
onnx_map = {
    1:   "squeezenet_70_fp32_b1_op13.onnx",
    64:  "squeezenet_70_fp32_b64_op13.onnx",
    128: "squeezenet_70_fp32_b128_op13.onnx",
}

def build_static_fp16_engine(onnx_path, engine_path):
    with trt.Builder(TRT_LOGGER) as builder, \
         builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) as network, \
         trt.OnnxParser(network, TRT_LOGGER) as parser:

        with open(onnx_path, "rb") as f:
            if not parser.parse(f.read()):
                for i in range(parser.num_errors):
                    print(parser.get_error(i))
                raise RuntimeError(f"ONNX parse failed for {onnx_path}")

        config = builder.create_builder_config()
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)

        # ✅ FP16 enabled
        config.set_flag(trt.BuilderFlag.FP16)

        # ✅ NO optimization profile => static engine (fixed shape from ONNX)
        serialized = builder.build_serialized_network(network, config)
        if serialized is None:
            raise RuntimeError(f"FP16 engine build failed for {onnx_path}")

        with open(engine_path, "wb") as f:
            f.write(serialized)

    print("Saved:", engine_path)

for bs, onnx_path in onnx_map.items():
    engine_path = f"squeezenet_70_fp16_b{bs}.engine"
    build_static_fp16_engine(onnx_path, engine_path)

[12/14/2025-09:28:27] [TRT] [I] ----------------------------------------------------------------
[12/14/2025-09:28:27] [TRT] [I] ONNX IR version:  0.0.7
[12/14/2025-09:28:27] [TRT] [I] Opset version:    13
[12/14/2025-09:28:27] [TRT] [I] Producer name:    pytorch
[12/14/2025-09:28:27] [TRT] [I] Producer version: 2.9.1
[12/14/2025-09:28:27] [TRT] [I] Domain:           
[12/14/2025-09:28:27] [TRT] [I] Model version:    0
[12/14/2025-09:28:27] [TRT] [I] Doc string:       
[12/14/2025-09:28:27] [TRT] [I] ----------------------------------------------------------------
[12/14/2025-09:28:28] [TRT] [I] [MemUsageChange] Init builder kernel library: CPU -277, GPU +0, now: CPU 1090, GPU 1664 (MiB)
[12/14/2025-09:28:28] [TRT] [I] Local timing cache in use. Profiling results in this builder pass will not be stored.
[12/14/2025-09:29:38] [TRT] [I] Detected 1 inputs and 1 output network tensors.
[12/14/2025-09:29:39] [TRT] [I] Total Host Persistent Memory: 193632 bytes
[12/14/2025-09:29:39] [TRT] [I

In [12]:
!ls -lh squeezenet_70_fp16_b1.engine
!ls -lh squeezenet_70_fp16_b64.engine
!ls -lh squeezenet_70_fp16_b128.engine

-rw-r--r-- 1 ihsiao ihsiao 1.7M Dec 14 09:29 squeezenet_70_fp16_b1.engine
-rw-r--r-- 1 ihsiao ihsiao 1.9M Dec 14 09:30 squeezenet_70_fp16_b64.engine
-rw-r--r-- 1 ihsiao ihsiao 2.0M Dec 14 09:31 squeezenet_70_fp16_b128.engine


In [13]:
import tensorrt as trt
import torch

def run_engine_static(engine_path, batch, iters=1000, warmup=50):
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

    with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    context = engine.create_execution_context()

    tensor_names = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]
    inp_name  = [n for n in tensor_names if engine.get_tensor_mode(n) == trt.TensorIOMode.INPUT][0]
    out_name  = [n for n in tensor_names if engine.get_tensor_mode(n) == trt.TensorIOMode.OUTPUT][0]

    # ❌ NO set_input_shape() for static engines

    x = torch.randn(batch, 3, 32, 32, device="cuda", dtype=torch.float32)
    y = torch.empty(tuple(context.get_tensor_shape(out_name)), device="cuda", dtype=torch.float32)

    context.set_tensor_address(inp_name, int(x.data_ptr()))
    context.set_tensor_address(out_name, int(y.data_ptr()))

    stream = torch.cuda.Stream()

    # warmup
    for _ in range(warmup):
        context.execute_async_v3(stream_handle=stream.cuda_stream)
    stream.synchronize()

    start = torch.cuda.Event(enable_timing=True)
    end   = torch.cuda.Event(enable_timing=True)

    start.record(stream)
    for _ in range(iters):
        context.execute_async_v3(stream_handle=stream.cuda_stream)
    end.record(stream)
    stream.synchronize()

    elapsed_ms = start.elapsed_time(end)
    batch_latency_ms = elapsed_ms / iters
    ms_per_img = batch_latency_ms / batch
    img_per_sec = (iters * batch) / (elapsed_ms / 1000.0)

    print(f"{engine_path} | batch={batch}")
    print(f"  batch latency: {batch_latency_ms:.3f} ms/batch")
    print(f"  per-image:     {ms_per_img:.6f} ms/image")
    print(f"  throughput:    {img_per_sec:.1f} images/sec")

run_engine_static("squeezenet_70_fp16_b1.engine",   1)
run_engine_static("squeezenet_70_fp16_b64.engine",  64)
run_engine_static("squeezenet_70_fp16_b128.engine", 128)

squeezenet_70_fp16_b1.engine | batch=1
  batch latency: 0.197 ms/batch
  per-image:     0.197310 ms/image
  throughput:    5068.2 images/sec
squeezenet_70_fp16_b64.engine | batch=64
  batch latency: 0.310 ms/batch
  per-image:     0.004851 ms/image
  throughput:    206142.7 images/sec
squeezenet_70_fp16_b128.engine | batch=128
  batch latency: 0.412 ms/batch
  per-image:     0.003217 ms/image
  throughput:    310892.7 images/sec


In [14]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465),
                         (0.2023, 0.1994, 0.2010)),
])

test_dataset = datasets.CIFAR10(
    root="./data",
    train=False,
    download=True,
    transform=test_transform
)

test_loader = DataLoader(
    test_dataset,
    batch_size=128,
    shuffle=False,
    num_workers=2,
    pin_memory=True
)

57.7%IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

100.0%


In [15]:
from torch.utils.data import DataLoader

test_loader_b1   = DataLoader(test_dataset, batch_size=1,   shuffle=False, num_workers=2, pin_memory=True, drop_last=True)
test_loader_b64  = DataLoader(test_dataset, batch_size=64,  shuffle=False, num_workers=2, pin_memory=True, drop_last=True)
test_loader_b128 = DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=2, pin_memory=True, drop_last=True)

In [16]:
import torch
import tensorrt as trt

@torch.no_grad()
def trt_accuracy_static(engine_path, test_loader, num_batches=None):
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

    with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    context = engine.create_execution_context()

    names = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]
    inp = [n for n in names if engine.get_tensor_mode(n) == trt.TensorIOMode.INPUT][0]
    out = [n for n in names if engine.get_tensor_mode(n) == trt.TensorIOMode.OUTPUT][0]

    # engine fixed shapes
    in_shape = tuple(engine.get_tensor_shape(inp))
    out_shape = tuple(engine.get_tensor_shape(out))
    fixed_bsz = in_shape[0]  # should be 1 or 64 or 128

    # output dtype
    trt_dtype = engine.get_tensor_dtype(out)
    torch_dtype = {
        trt.DataType.FLOAT: torch.float32,
        trt.DataType.HALF:  torch.float16,
        trt.DataType.INT8:  torch.int8,
        trt.DataType.INT32: torch.int32,
    }[trt_dtype]

    stream = torch.cuda.current_stream()
    correct = 0
    total = 0

    for bi, (x_cpu, y_cpu) in enumerate(test_loader):
        if num_batches is not None and bi >= num_batches:
            break

        x = x_cpu.to("cuda", non_blocking=True)
        y = y_cpu.to("cuda", non_blocking=True)

        if x.shape[0] != fixed_bsz:
            raise RuntimeError(f"Batch mismatch: loader={x.shape[0]} but engine expects {fixed_bsz}")

        yhat = torch.empty(out_shape, device="cuda", dtype=torch_dtype)

        context.set_tensor_address(inp, int(x.data_ptr()))
        context.set_tensor_address(out, int(yhat.data_ptr()))

        ok = context.execute_async_v3(stream_handle=stream.cuda_stream)
        if not ok:
            raise RuntimeError("TRT execute failed")

        pred = yhat.float().argmax(dim=1)
        correct += (pred == y).sum().item()
        total += x.shape[0]

    torch.cuda.synchronize()
    return 100.0 * correct / total

In [17]:
acc1   = trt_accuracy_static("squeezenet_70_fp32_b1.engine",   test_loader_b1)
acc64  = trt_accuracy_static("squeezenet_70_fp32_b64.engine",  test_loader_b64)
acc128 = trt_accuracy_static("squeezenet_70_fp32_b128.engine", test_loader_b128)

print(f"FP32 TRT Acc b1:   {acc1:.2f}%")
print(f"FP32 TRT Acc b64:  {acc64:.2f}%")
print(f"FP32 TRT Acc b128: {acc128:.2f}%")

[12/14/2025-09:31:52] [TRT] [W] Using default stream in enqueueV3() may lead to performance issues due to additional calls to cudaStreamSynchronize() by TensorRT to ensure correct synchronization. Please use non-default stream instead.
[12/14/2025-09:31:59] [TRT] [W] Using default stream in enqueueV3() may lead to performance issues due to additional calls to cudaStreamSynchronize() by TensorRT to ensure correct synchronization. Please use non-default stream instead.
[12/14/2025-09:32:00] [TRT] [W] Using default stream in enqueueV3() may lead to performance issues due to additional calls to cudaStreamSynchronize() by TensorRT to ensure correct synchronization. Please use non-default stream instead.
FP32 TRT Acc b1:   89.71%
FP32 TRT Acc b64:  89.72%
FP32 TRT Acc b128: 89.71%


In [18]:
acc1   = trt_accuracy_static("squeezenet_70_fp16_b1.engine",   test_loader_b1)
acc64  = trt_accuracy_static("squeezenet_70_fp16_b64.engine",  test_loader_b64)
acc128 = trt_accuracy_static("squeezenet_70_fp16_b128.engine", test_loader_b128)

print(f"FP16 TRT Acc b1:   {acc1:.2f}%")
print(f"FP16 TRT Acc b64:  {acc64:.2f}%")
print(f"FP16 TRT Acc b128: {acc128:.2f}%")

[12/14/2025-09:32:01] [TRT] [W] Using default stream in enqueueV3() may lead to performance issues due to additional calls to cudaStreamSynchronize() by TensorRT to ensure correct synchronization. Please use non-default stream instead.
[12/14/2025-09:32:08] [TRT] [W] Using default stream in enqueueV3() may lead to performance issues due to additional calls to cudaStreamSynchronize() by TensorRT to ensure correct synchronization. Please use non-default stream instead.
[12/14/2025-09:32:09] [TRT] [W] Using default stream in enqueueV3() may lead to performance issues due to additional calls to cudaStreamSynchronize() by TensorRT to ensure correct synchronization. Please use non-default stream instead.
FP16 TRT Acc b1:   89.71%
FP16 TRT Acc b64:  89.72%
FP16 TRT Acc b128: 89.67%


In [19]:
import tensorrt as trt
import torch

# ✅ reduce spam
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

# ✅ use your 3 fixed-shape ONNX files
onnx_map = {
    1:   "squeezenet_70_fp32_b1_op13.onnx",
    64:  "squeezenet_70_fp32_b64_op13.onnx",
    128: "squeezenet_70_fp32_b128_op13.onnx",
}

class EntropyCalibrator(trt.IInt8EntropyCalibrator2):
    def __init__(self, calib_loader, max_batches=200, cache_file="calib.cache"):
        super().__init__()
        self.cache_file = cache_file
        self.data_iter = iter(calib_loader)
        self.max_batches = max_batches
        self.batch_count = 0

        x0, _ = next(iter(calib_loader))
        self.batch_size = x0.shape[0]
        self.device_input = torch.empty_like(x0, device="cuda")

    def get_batch_size(self):
        return self.batch_size

    def get_batch(self, names):
        if self.batch_count >= self.max_batches:
            return None
        try:
            x, _ = next(self.data_iter)
        except StopIteration:
            return None

        self.batch_count += 1
        x = x.to("cuda", non_blocking=True)
        self.device_input.resize_(x.shape).copy_(x)
        return [int(self.device_input.data_ptr())]

    def read_calibration_cache(self):
        try:
            with open(self.cache_file, "rb") as f:
                return f.read()
        except FileNotFoundError:
            return None

    def write_calibration_cache(self, cache):
        with open(self.cache_file, "wb") as f:
            f.write(cache)

def build_int8_engine_static(onnx_path, engine_path, calib_loader, max_calib_batches=200):
    with trt.Builder(TRT_LOGGER) as builder, \
         builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) as network, \
         trt.OnnxParser(network, TRT_LOGGER) as parser:

        with open(onnx_path, "rb") as f:
            if not parser.parse(f.read()):
                for i in range(parser.num_errors):
                    print(parser.get_error(i))
                raise RuntimeError(f"ONNX parse failed for {onnx_path}")

        config = builder.create_builder_config()
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)

        # INT8 PTQ
        config.set_flag(trt.BuilderFlag.INT8)
        config.int8_calibrator = EntropyCalibrator(
            calib_loader,
            max_batches=max_calib_batches,
            cache_file=engine_path.replace(".engine", ".cache")
        )

        # ✅ static ONNX => NO optimization profile needed
        serialized = builder.build_serialized_network(network, config)
        if serialized is None:
            raise RuntimeError(f"INT8 Engine build failed for {onnx_path}")

        with open(engine_path, "wb") as f:
            f.write(serialized)

    print(f"Saved: {engine_path} (calib_batches={max_calib_batches})")

# ✅ make sure these match the ONNX batch size:
calib_loader_map = {
    1:   test_loader_b1,
    64:  test_loader_b64,
    128: test_loader_b128,
}

CALIB_BATCHES = 200

for bs, onnx_path in onnx_map.items():
    build_int8_engine_static(
        onnx_path=onnx_path,
        engine_path=f"squeezenet_70_int8_b{bs}.engine",
        calib_loader=calib_loader_map[bs],
        max_calib_batches=CALIB_BATCHES
    )

  config.int8_calibrator = EntropyCalibrator(


Saved: squeezenet_70_int8_b1.engine (calib_batches=200)
Saved: squeezenet_70_int8_b64.engine (calib_batches=200)
Saved: squeezenet_70_int8_b128.engine (calib_batches=200)


In [20]:
!ls -lh squeezenet_70_int8_b1.engine
!ls -lh squeezenet_70_int8_b64.engine
!ls -lh squeezenet_70_int8_b128.engine

-rw-r--r-- 1 ihsiao ihsiao 1.2M Dec 14 09:33 squeezenet_70_int8_b1.engine
-rw-r--r-- 1 ihsiao ihsiao 1.4M Dec 14 09:35 squeezenet_70_int8_b64.engine
-rw-r--r-- 1 ihsiao ihsiao 1.3M Dec 14 09:36 squeezenet_70_int8_b128.engine


In [21]:
acc1   = trt_accuracy_static("squeezenet_70_int8_b1.engine",   test_loader_b1)
acc64  = trt_accuracy_static("squeezenet_70_int8_b64.engine",  test_loader_b64)
acc128 = trt_accuracy_static("squeezenet_70_int8_b128.engine", test_loader_b128)

print(f"INT8 TRT Acc b1:   {acc1:.2f}%")
print(f"INT8 TRT Acc b64:  {acc64:.2f}%")
print(f"INT8 TRT Acc b128: {acc128:.2f}%")

[12/14/2025-09:36:37] [TRT] [W] Using default stream in enqueueV3() may lead to performance issues due to additional calls to cudaStreamSynchronize() by TensorRT to ensure correct synchronization. Please use non-default stream instead.
[12/14/2025-09:36:44] [TRT] [W] Using default stream in enqueueV3() may lead to performance issues due to additional calls to cudaStreamSynchronize() by TensorRT to ensure correct synchronization. Please use non-default stream instead.
[12/14/2025-09:36:45] [TRT] [W] Using default stream in enqueueV3() may lead to performance issues due to additional calls to cudaStreamSynchronize() by TensorRT to ensure correct synchronization. Please use non-default stream instead.
INT8 TRT Acc b1:   89.75%
INT8 TRT Acc b64:  89.71%
INT8 TRT Acc b128: 89.78%


In [22]:
@torch.no_grad()
def torch_acc(model, loader, device="cuda"):
    model.eval().to(device)
    correct = total = 0
    for x,y in loader:
        x,y = x.to(device), y.to(device)
        pred = model(x).argmax(1)
        correct += (pred==y).sum().item()
        total += y.size(0)
    return 100*correct/total

print("Torch acc:", torch_acc(model, test_loader))

Torch acc: 89.69


In [23]:
print(test_loader.dataset.transform)

Compose(
    ToTensor()
    Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.201))
)
