In [1]:
import torch
import tensorrt as trt

# Report the installed framework versions and whether CUDA is usable.
print(f"Torch: {torch.__version__} CUDA: {torch.cuda.is_available()}")
print(f"TensorRT: {trt.__version__}")

Torch: 2.9.1+cu128 CUDA: True
TensorRT: 10.14.1.48.post1


In [2]:
import torch
import torch.nn as nn
import tensorrt as trt
import numpy as np

from models.cifar_resnet32 import ResNet32

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

Device: cuda


In [3]:
# Instantiate ResNet32 with 10 output classes and restore the saved weights,
# mapping every tensor in the checkpoint onto the CPU.
model = ResNet32(num_classes=10)
model.load_state_dict(
    torch.load(
        "checkpoints/resnet32_fp32_best.pt",
        map_location="cpu",
        # Limit unpickling to tensors and plain containers so loading the
        # checkpoint cannot run arbitrary pickled code.
        weights_only=True,
    )
)
# Switch to evaluation mode; as the cell's last expression this also displays
# the module structure.
model.eval()

ResNet(
  (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (shortcut): Sequential()
    )
    (1): BasicBlock(
      (conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=

In [10]:
import torch

# Export one ONNX file per batch size, each traced from a random input of that
# exact shape with no dynamic axes.
model.eval()
dummy_map = {batch: torch.randn(batch, 3, 32, 32) for batch in (1, 64, 128)}

# Options shared by every export call.
export_options = dict(
    opset_version=13,
    do_constant_folding=True,
    input_names=["input"],
    output_names=["logits"],
    dynamic_axes=None,   # no dynamic axes: all dimensions stay static
    dynamo=False,
)

for bs, dummy in dummy_map.items():
    out_path = f"resnet32_fp32_b{bs}_op13.onnx"
    torch.onnx.export(model, dummy, out_path, **export_options)
    print("Exported", out_path)

  torch.onnx.export(


Exported resnet32_fp32_b1_op13.onnx
Exported resnet32_fp32_b64_op13.onnx
Exported resnet32_fp32_b128_op13.onnx


In [11]:
!ls -lh resnet32_fp32_b1_op13.onnx
!ls -lh resnet32_fp32_b64_op13.onnx
!ls -lh resnet32_fp32_b128_op13.onnx

-rw-r--r-- 1 ihsiao ihsiao 1.8M Dec 13 08:21 resnet32_fp32_b1_op13.onnx
-rw-r--r-- 1 ihsiao ihsiao 1.8M Dec 13 08:21 resnet32_fp32_b64_op13.onnx
-rw-r--r-- 1 ihsiao ihsiao 1.8M Dec 13 08:21 resnet32_fp32_b128_op13.onnx


In [12]:
import onnx
# Load the batch-1 export, run the ONNX checker on it, and show the
# (domain, version) pair of every opset it imports.
m = onnx.load("resnet32_fp32_b1_op13.onnx")
onnx.checker.check_model(m)
opset_pairs = []
for opset in m.opset_import:
    opset_pairs.append((opset.domain, opset.version))
print(opset_pairs)

[('', 13)]


In [13]:
import tensorrt as trt
# Print the TensorRT version that the engine-building cells will use.
print(trt.__version__)

10.14.1.48.post1


In [15]:
import tensorrt as trt

# Logger shared by the builder and the ONNX parser (INFO severity).
TRT_LOGGER = trt.Logger(trt.Logger.INFO)

# Batch size -> ONNX file. Each file must have been exported with a FIXED
# (static) batch size.
onnx_map = dict(
    zip(
        (1, 64, 128),
        (
            "resnet32_fp32_b1_op13.onnx",
            "resnet32_fp32_b64_op13.onnx",
            "resnet32_fp32_b128_op13.onnx",
        ),
    )
)

def build_static_engine(onnx_path, engine_path):
    """Parse an ONNX file and write the serialized TensorRT engine to disk.

    No optimization profile is added to the builder config, so the engine is
    built for whatever fixed input shape the ONNX file carries.

    Args:
        onnx_path: Path of the ONNX model to parse.
        engine_path: Destination file for the serialized engine.

    Raises:
        RuntimeError: If the parser rejects the model (every parser error is
            printed first) or the builder returns no serialized network.
    """
    network_flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

    with trt.Builder(TRT_LOGGER) as builder, \
         builder.create_network(network_flags) as network, \
         trt.OnnxParser(network, TRT_LOGGER) as parser:

        with open(onnx_path, "rb") as onnx_file:
            parsed_ok = parser.parse(onnx_file.read())

        if not parsed_ok:
            for err_idx in range(parser.num_errors):
                print(parser.get_error(err_idx))
            raise RuntimeError(f"ONNX parse failed for {onnx_path}")

        config = builder.create_builder_config()
        # Cap the builder workspace pool at 1 << 30 bytes.
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)

        engine_bytes = builder.build_serialized_network(network, config)
        if engine_bytes is None:
            raise RuntimeError(f"Engine build failed for {onnx_path}")

        with open(engine_path, "wb") as engine_file:
            engine_file.write(engine_bytes)

    print("Saved:", engine_path)

# Build one engine file per exported batch size.
for bs in onnx_map:
    onnx_path = onnx_map[bs]
    engine_path = f"resnet32_fp32_b{bs}.engine"
    build_static_engine(onnx_path, engine_path)

[12/13/2025-08:23:25] [TRT] [I] ----------------------------------------------------------------
[12/13/2025-08:23:25] [TRT] [I] ONNX IR version:  0.0.7
[12/13/2025-08:23:25] [TRT] [I] Opset version:    13
[12/13/2025-08:23:25] [TRT] [I] Producer name:    pytorch
[12/13/2025-08:23:25] [TRT] [I] Producer version: 2.9.1
[12/13/2025-08:23:25] [TRT] [I] Domain:           
[12/13/2025-08:23:25] [TRT] [I] Model version:    0
[12/13/2025-08:23:25] [TRT] [I] Doc string:       
[12/13/2025-08:23:25] [TRT] [I] ----------------------------------------------------------------
[12/13/2025-08:23:25] [TRT] [I] Local timing cache in use. Profiling results in this builder pass will not be stored.
[12/13/2025-08:23:32] [TRT] [I] Compiler backend is used during engine build.
[12/13/2025-08:23:33] [TRT] [I] Detected 1 inputs and 1 output network tensors.
[12/13/2025-08:23:33] [TRT] [I] Total Host Persistent Memory: 188784 bytes
[12/13/2025-08:23:33] [TRT] [I] Total Device Persistent Memory: 0 bytes
[12/13

In [16]:
!ls -lh resnet32_fp32_b1.engine
!ls -lh resnet32_fp32_b64.engine
!ls -lh resnet32_fp32_b128.engine
!ls -lh resnet32_fp32_b1to128.engine

-rw-r--r-- 1 ihsiao ihsiao 2.0M Dec 13 08:23 resnet32_fp32_b1.engine
-rw-r--r-- 1 ihsiao ihsiao 2.1M Dec 13 08:23 resnet32_fp32_b64.engine
-rw-r--r-- 1 ihsiao ihsiao 2.1M Dec 13 08:23 resnet32_fp32_b128.engine
-rw-r--r-- 1 ihsiao ihsiao 2.1M Dec 12 23:49 resnet32_fp32_b1to128.engine


In [None]:
import tensorrt as trt
import torch

def benchmark_engine(engine_path, batch_size, iters=1000):
    """Time back-to-back executions of a serialized TensorRT engine and print the results.

    The engine runs on random input of shape (batch_size, 3, 32, 32). Fifty
    untimed warm-up executions precede the timed loop, which is measured with
    CUDA events recorded on a dedicated stream.

    Args:
        engine_path: Path of the serialized engine file.
        batch_size: Batch dimension requested for the input tensor.
        iters: Number of timed executions; must be positive.

    Raises:
        ValueError: If ``iters`` is not positive.
        RuntimeError: If the engine cannot be deserialized, the requested
            input shape is rejected, or an execution cannot be enqueued.
    """
    if iters <= 0:
        raise ValueError(f"iters must be positive, got {iters}")

    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

    with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    # Deserialization signals failure by returning None instead of raising.
    if engine is None:
        raise RuntimeError(f"Failed to deserialize engine: {engine_path}")
    context = engine.create_execution_context()
    if context is None:
        raise RuntimeError(f"Failed to create execution context: {engine_path}")

    names = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]
    inp = [n for n in names if engine.get_tensor_mode(n) == trt.TensorIOMode.INPUT][0]
    out = [n for n in names if engine.get_tensor_mode(n) == trt.TensorIOMode.OUTPUT][0]

    in_shape = (batch_size, 3, 32, 32)
    # A False result means the engine does not accept this shape.
    if not context.set_input_shape(inp, in_shape):
        raise RuntimeError(f"{engine_path} rejected input shape {in_shape}")

    x = torch.randn(*in_shape, device="cuda", dtype=torch.float32)
    y = torch.empty(tuple(context.get_tensor_shape(out)), device="cuda", dtype=torch.float32)

    context.set_tensor_address(inp, int(x.data_ptr()))
    context.set_tensor_address(out, int(y.data_ptr()))

    stream = torch.cuda.Stream()
    # x and y were produced on the current stream; make the benchmark stream
    # wait for that work before TensorRT touches the buffers.
    stream.wait_stream(torch.cuda.current_stream())

    # warmup
    for _ in range(50):
        if not context.execute_async_v3(stream_handle=stream.cuda_stream):
            raise RuntimeError(f"TensorRT enqueue failed during warm-up: {engine_path}")
    stream.synchronize()

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    start.record(stream)
    for _ in range(iters):
        if not context.execute_async_v3(stream_handle=stream.cuda_stream):
            raise RuntimeError(f"TensorRT enqueue failed during timing: {engine_path}")
    end.record(stream)
    stream.synchronize()

    elapsed_ms = start.elapsed_time(end)  # total ms across all timed iterations

    batch_latency_ms = elapsed_ms / iters
    throughput = (iters * batch_size) / (elapsed_ms / 1000.0)
    ms_per_img = batch_latency_ms / batch_size

    print(f"{engine_path} | batch={batch_size}")
    print(f"  latency:    {batch_latency_ms:.3f} ms/batch")
    print(f"  per-image:     {ms_per_img:.6f} ms/image")
    print(f"  throughput: {throughput:.1f} images/sec")

# Actually run the benchmark; defining the function alone prints nothing.
# NOTE(review): no visible cell builds resnet32_fp32_b1to128.engine (the file
# listing above shows it dated a day earlier than the other engines) - confirm
# it is produced before this cell runs.
print("Starting benchmark...")
for bs in (1, 64, 128):
    benchmark_engine("resnet32_fp32_b1to128.engine", batch_size=bs, iters=1000)

In [None]:
import tensorrt as trt
import torch
import time

def run_engine(engine_path, batch, iters=1000, warmup=50):
    """Benchmark a serialized TensorRT engine at one batch size and print the results.

    Random input of shape (batch, 3, 32, 32) is fed to the engine. Timing uses
    CUDA events recorded on a dedicated non-default stream.

    Args:
        engine_path: Path of the serialized engine file.
        batch: Batch dimension requested for the input tensor.
        iters: Number of timed executions; must be positive.
        warmup: Number of untimed executions run before timing starts.

    Raises:
        ValueError: If ``iters`` is not positive.
        RuntimeError: If the engine cannot be deserialized, the requested
            input shape is rejected, or an execution cannot be enqueued.
    """
    if iters <= 0:
        raise ValueError(f"iters must be positive, got {iters}")

    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

    # load engine
    with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    # Deserialization signals failure by returning None instead of raising.
    if engine is None:
        raise RuntimeError(f"Failed to deserialize engine: {engine_path}")
    context = engine.create_execution_context()
    if context is None:
        raise RuntimeError(f"Failed to create execution context: {engine_path}")

    # names
    tensor_names = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]
    inp_name = [n for n in tensor_names if engine.get_tensor_mode(n) == trt.TensorIOMode.INPUT][0]
    out_name = [n for n in tensor_names if engine.get_tensor_mode(n) == trt.TensorIOMode.OUTPUT][0]

    # shape + buffers
    in_shape = (batch, 3, 32, 32)
    if not context.set_input_shape(inp_name, in_shape):
        raise RuntimeError(f"{engine_path} rejected input shape {in_shape}")
    x = torch.randn(*in_shape, device="cuda", dtype=torch.float32)
    y = torch.empty(tuple(context.get_tensor_shape(out_name)), device="cuda", dtype=torch.float32)

    context.set_tensor_address(inp_name, int(x.data_ptr()))
    context.set_tensor_address(out_name, int(y.data_ptr()))

    # Use a non-default CUDA stream (avoids the TRT warning). The handle is
    # passed explicitly, so the process-wide current stream is left unchanged.
    stream = torch.cuda.Stream()
    # x and y were produced on the current stream; order this stream after it.
    stream.wait_stream(torch.cuda.current_stream())

    # warmup
    for _ in range(warmup):
        if not context.execute_async_v3(stream_handle=stream.cuda_stream):
            raise RuntimeError(f"TensorRT enqueue failed during warm-up: {engine_path}")
    stream.synchronize()

    # timed (CUDA events measure elapsed time on the stream itself)
    starter = torch.cuda.Event(enable_timing=True)
    ender = torch.cuda.Event(enable_timing=True)

    starter.record(stream)
    for _ in range(iters):
        if not context.execute_async_v3(stream_handle=stream.cuda_stream):
            raise RuntimeError(f"TensorRT enqueue failed during timing: {engine_path}")
    ender.record(stream)
    stream.synchronize()

    elapsed_ms = starter.elapsed_time(ender)  # total ms over iters
    batch_latency_ms = elapsed_ms / iters
    img_per_sec = (iters * batch) / (elapsed_ms / 1000.0)
    ms_per_img = batch_latency_ms / batch

    print(f"{engine_path} | batch={batch}")
    print(f"  batch latency: {batch_latency_ms:.3f} ms")
    print(f"  per-image:     {ms_per_img:.6f} ms/image")
    print(f"  throughput:    {img_per_sec:.1f} images/sec")

# A single engine file is benchmarked at batch sizes 1, 64 and 128.
engine = "resnet32_fp32_b1to128.engine"
for bs in (1, 64, 128):
    run_engine(engine_path=engine, batch=bs)

In [None]:
import time

# Stand-alone timing of an execution context: 50 warm-up runs, then `iters`
# timed runs measured with CUDA events recorded on `stream`.
# NOTE(review): `context`, `stream` and `batch_size` are not assigned at
# notebook level in any visible cell (they appear only as locals/parameters of
# the benchmark functions), so this cell presumably raises NameError on a
# fresh kernel - confirm, or replace it with a call to one of those functions.

# Warm-up (untimed)
for _ in range(50):
    context.execute_async_v3(stream_handle=stream.cuda_stream)
stream.synchronize()

iters = 1000

starter = torch.cuda.Event(enable_timing=True)
ender   = torch.cuda.Event(enable_timing=True)

# Both events are recorded on the same stream as the enqueued executions.
starter.record(stream)
for _ in range(iters):
    context.execute_async_v3(stream_handle=stream.cuda_stream)
ender.record(stream)

# Wait for the stream to drain so both events have completed before reading them.
stream.synchronize()

elapsed_ms = starter.elapsed_time(ender)  # total ms between the two events

latency_ms = elapsed_ms / iters                  # ms per batch
throughput = (iters * batch_size) / (elapsed_ms / 1000.0)

print(f"Latency:    {latency_ms:.3f} ms / batch")
print(f"Throughput:{throughput:.1f} images/sec")

In [None]:
import tensorrt as trt

# Logger handed to the builder and the ONNX parser (INFO severity).
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
# Read as a notebook-level global by build_fp16_engine_dynamic.
# NOTE(review): no visible cell exports this file (only the static
# b1/b64/b128 ONNX files are exported above) - confirm it exists.
onnx_path = "resnet32_fp32_dyn.onnx"   # must be exported with dynamic batch

def build_fp16_engine_dynamic(engine_path, max_bs=128):
    """Build an FP16-enabled TensorRT engine with a dynamic batch dimension and save it.

    The ONNX model is read from the notebook-level ``onnx_path``. A single
    optimization profile covers input shapes from (1, 3, 32, 32) up to
    (max_bs, 3, 32, 32), with the optimum at batch ``min(64, max_bs)``.

    Args:
        engine_path: Destination file for the serialized engine.
        max_bs: Largest batch size the profile allows; must be at least 1.

    Raises:
        ValueError: If ``max_bs`` is smaller than 1.
        RuntimeError: If ONNX parsing fails (every parser error is printed
            first), the optimization profile is rejected, or the builder
            returns no serialized network.
    """
    # The profile minimum is batch 1, so a smaller maximum can never be valid.
    if max_bs < 1:
        raise ValueError(f"max_bs must be >= 1, got {max_bs}")

    with trt.Builder(TRT_LOGGER) as builder, \
         builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) as network, \
         trt.OnnxParser(network, TRT_LOGGER) as parser:

        with open(onnx_path, "rb") as f:
            if not parser.parse(f.read()):
                for i in range(parser.num_errors):
                    print(parser.get_error(i))
                raise RuntimeError("ONNX parse failed")

        config = builder.create_builder_config()
        # Cap the builder workspace pool at 1 << 30 bytes.
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)

        # enable FP16
        config.set_flag(trt.BuilderFlag.FP16)

        # one profile supports batch 1..max_bs
        profile = builder.create_optimization_profile()
        inp_name = network.get_input(0).name
        profile.set_shape(
            inp_name,
            min=(1, 3, 32, 32),
            opt=(min(64, max_bs), 3, 32, 32),
            max=(max_bs, 3, 32, 32),
        )
        # add_optimization_profile returns the profile index, or -1 when the
        # profile is invalid; fail here rather than deep inside the build.
        if config.add_optimization_profile(profile) < 0:
            raise RuntimeError(
                f"Optimization profile for '{inp_name}' (batch 1..{max_bs}) was rejected"
            )

        serialized = builder.build_serialized_network(network, config)
        if serialized is None:
            raise RuntimeError("Engine build failed")

        with open(engine_path, "wb") as f:
            f.write(serialized)

    print("Saved:", engine_path)

# Build the FP16 engine whose single profile spans batch sizes 1..128.
build_fp16_engine_dynamic("resnet32_fp16_b1to128.engine", max_bs=128)

In [None]:
!ls -lh resnet32_fp16_b1.engine
!ls -lh resnet32_fp16_b64.engine
!ls -lh resnet32_fp16_b128.engine
!ls -lh resnet32_fp16_b1to128.engine

In [None]:
import tensorrt as trt
import torch

def run_engine(engine_path, batch, iters=1000, warmup=50):
    """Benchmark a serialized TensorRT engine at one batch size and print the results.

    Random input of shape (batch, 3, 32, 32) is fed to the engine. Timing uses
    CUDA events recorded on a dedicated non-default stream.

    Args:
        engine_path: Path of the serialized engine file.
        batch: Batch dimension requested for the input tensor.
        iters: Number of timed executions; must be positive.
        warmup: Number of untimed executions run before timing starts.

    Raises:
        ValueError: If ``iters`` is not positive.
        RuntimeError: If the engine cannot be deserialized, the requested
            input shape is rejected, or an execution cannot be enqueued.
    """
    if iters <= 0:
        raise ValueError(f"iters must be positive, got {iters}")

    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

    with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    # Deserialization signals failure by returning None instead of raising.
    if engine is None:
        raise RuntimeError(f"Failed to deserialize engine: {engine_path}")
    context = engine.create_execution_context()
    if context is None:
        raise RuntimeError(f"Failed to create execution context: {engine_path}")

    tensor_names = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]
    inp_name = [n for n in tensor_names if engine.get_tensor_mode(n) == trt.TensorIOMode.INPUT][0]
    out_name = [n for n in tensor_names if engine.get_tensor_mode(n) == trt.TensorIOMode.OUTPUT][0]

    in_shape = (batch, 3, 32, 32)
    # A False result means the engine does not accept this shape.
    if not context.set_input_shape(inp_name, in_shape):
        raise RuntimeError(f"{engine_path} rejected input shape {in_shape}")

    x = torch.randn(*in_shape, device="cuda", dtype=torch.float32)
    y = torch.empty(tuple(context.get_tensor_shape(out_name)), device="cuda", dtype=torch.float32)

    context.set_tensor_address(inp_name, int(x.data_ptr()))
    context.set_tensor_address(out_name, int(y.data_ptr()))

    # non-default stream + CUDA-event timing
    stream = torch.cuda.Stream()
    # x and y were produced on the current stream; order this stream after it
    # so TensorRT never reads or writes the buffers early.
    stream.wait_stream(torch.cuda.current_stream())

    # warmup
    for _ in range(warmup):
        if not context.execute_async_v3(stream_handle=stream.cuda_stream):
            raise RuntimeError(f"TensorRT enqueue failed during warm-up: {engine_path}")
    stream.synchronize()

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    start.record(stream)
    for _ in range(iters):
        if not context.execute_async_v3(stream_handle=stream.cuda_stream):
            raise RuntimeError(f"TensorRT enqueue failed during timing: {engine_path}")
    end.record(stream)
    stream.synchronize()

    elapsed_ms = start.elapsed_time(end)          # total ms for iters
    batch_latency_ms = elapsed_ms / iters         # ms per batch
    ms_per_img = batch_latency_ms / batch
    img_per_sec = (iters * batch) / (elapsed_ms / 1000.0)

    print(f"{engine_path} | batch={batch}")
    print(f"  batch latency: {batch_latency_ms:.3f} ms/batch")
    print(f"  per-image:     {ms_per_img:.6f} ms/image")
    print(f"  throughput:    {img_per_sec:.1f} images/sec")

# Batch 1 is the latency case; batches 64 and 128 are the throughput cases.
# NOTE(review): no visible cell builds these per-batch FP16 engine files (only
# resnet32_fp16_b1to128.engine is built above) - confirm they exist.
for engine_file, bs in (
    ("resnet32_fp16_b1.engine", 1),
    ("resnet32_fp16_b64.engine", 64),
    ("resnet32_fp16_b128.engine", 128),
):
    run_engine(engine_file, bs)

In [None]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Per-channel mean / std passed to Normalize for the test images.
norm_mean = (0.4914, 0.4822, 0.4465)
norm_std = (0.2023, 0.1994, 0.2010)

# Convert each image to a tensor, then normalize it channel-wise.
test_transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize(norm_mean, norm_std)]
)

# CIFAR-10 test split, downloaded into ./data when not already present.
dataset_kwargs = dict(root="./data", train=False, download=True, transform=test_transform)
test_dataset = datasets.CIFAR10(**dataset_kwargs)

# Fixed-order loader with batches of 128.
loader_kwargs = dict(batch_size=128, shuffle=False, num_workers=2, pin_memory=True)
test_loader = DataLoader(test_dataset, **loader_kwargs)

In [None]:
import torch
import tensorrt as trt

@torch.no_grad()
def trt_accuracy(engine_path, test_loader, num_batches=None):
    """Compute top-1 accuracy (in percent) of a serialized TensorRT engine.

    Each batch from ``test_loader`` is copied to the GPU and run through the
    engine on the current CUDA stream. The arg-max over dimension 1 of the
    output is compared with the labels.

    Args:
        engine_path: Path of the serialized engine file.
        test_loader: Iterable yielding ``(inputs, labels)`` batches.
        num_batches: If given, stop after this many batches; ``None``
            evaluates the whole loader.

    Returns:
        Accuracy as a float in the range 0..100.

    Raises:
        RuntimeError: If the engine cannot be deserialized, a batch shape is
            rejected, or an execution cannot be enqueued.
        ValueError: If no samples were evaluated.
    """
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

    with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    # Deserialization signals failure by returning None instead of raising.
    if engine is None:
        raise RuntimeError(f"Failed to deserialize engine: {engine_path}")
    context = engine.create_execution_context()
    if context is None:
        raise RuntimeError(f"Failed to create execution context: {engine_path}")

    names = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]
    inp = [n for n in names if engine.get_tensor_mode(n) == trt.TensorIOMode.INPUT][0]
    out = [n for n in names if engine.get_tensor_mode(n) == trt.TensorIOMode.OUTPUT][0]

    # map TRT dtype -> torch dtype for output buffer
    trt_dtype = engine.get_tensor_dtype(out)
    torch_dtype = {
        trt.DataType.FLOAT: torch.float32,
        trt.DataType.HALF:  torch.float16,
        trt.DataType.INT8:  torch.int8,
        trt.DataType.INT32: torch.int32,
    }[trt_dtype]

    correct = 0
    total = 0
    # Running on the current stream keeps TensorRT ordered with the torch
    # copies before it and the arg-max after it.
    stream = torch.cuda.current_stream()

    for bi, (x_cpu, y_cpu) in enumerate(test_loader):
        if num_batches is not None and bi >= num_batches:
            break

        # TensorRT reads the raw device pointer, so the batch must be dense.
        x = x_cpu.to("cuda", non_blocking=True).contiguous()
        y = y_cpu.to("cuda", non_blocking=True)
        bsz = x.shape[0]

        # Set the shape for every batch (the final one may be smaller), using
        # the batch's real shape so it always matches the bound buffer.
        in_shape = tuple(x.shape)
        if not context.set_input_shape(inp, in_shape):
            raise RuntimeError(f"{engine_path} rejected input shape {in_shape}")
        out_shape = tuple(context.get_tensor_shape(out))

        yhat = torch.empty(out_shape, device="cuda", dtype=torch_dtype)

        context.set_tensor_address(inp, int(x.data_ptr()))
        context.set_tensor_address(out, int(yhat.data_ptr()))

        ok = context.execute_async_v3(stream_handle=stream.cuda_stream)
        if not ok:
            raise RuntimeError("TRT execute failed")

        pred = yhat.float().argmax(dim=1)
        correct += (pred == y).sum().item()
        total += bsz

    torch.cuda.synchronize()
    # An empty loader or num_batches=0 would otherwise divide by zero.
    if total == 0:
        raise ValueError("No samples were evaluated (empty loader or num_batches=0)")
    return 100.0 * correct / total

In [None]:
# Evaluate the b1to128 FP32 engine over the whole test loader (num_batches is
# left at its default of None) and report the accuracy with two decimals.
acc_fp32 = trt_accuracy("resnet32_fp32_b1to128.engine", test_loader)
print(f"TRT FP32 Test Acc: {acc_fp32:.2f}%")

In [None]:
import torch

# PyTorch reference accuracy: reload the FP32 checkpoint, move the model to the
# GPU in evaluation mode, and count correct top-1 predictions over test_loader.
model = ResNet32(num_classes=10)
model.load_state_dict(
    torch.load(
        "checkpoints/resnet32_fp32_best.pt",
        map_location="cpu",
        # Limit unpickling to tensors and plain containers so loading the
        # checkpoint cannot run arbitrary pickled code.
        weights_only=True,
    )
)
model.eval().cuda()

correct = 0
total = 0
with torch.no_grad():
    for x, y in test_loader:
        x = x.cuda(non_blocking=True)
        y = y.cuda(non_blocking=True)
        pred = model(x).argmax(dim=1)
        correct += (pred == y).sum().item()
        total += y.size(0)

print("PyTorch FP32 Test Acc:", 100*correct/total)