## 1. Import packages

In [1]:
import glob
import time
import cv2
import torch
import numpy as np
from tqdm import tqdm
from ultralytics import YOLO

In [1]:
# Shell escape: print the GPU model, driver/CUDA versions and current memory
# usage so the benchmark numbers below can be tied to the hardware they ran on.
!nvidia-smi

Fri Jan 16 16:12:02 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.95.05              Driver Version: 580.95.05      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3060        Off |   00000000:01:00.0 Off |                  N/A |
|  0%   48C    P8             19W /  170W |     639MiB /  12288MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+----------------------------------------------

## 2. Config

In [2]:
# --- Input locations -------------------------------------------------------
# Serialized model file handed to YOLO(), and the glob pattern that selects
# the images to run through it.
ENGINE_PATH = "/home/mlops/Repository/aio2025-onnx-tensorrt/models/yolo26l.engine"
IMAGE_GLOB = "/home/mlops/Repository/aio2025-onnx-tensorrt/images/val2014/*.jpg"

# --- Inference settings ----------------------------------------------------
# Side length (pixels) of the square input fed to the model.
IMG_SIZE = 640
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# --- Benchmark settings ----------------------------------------------------
# Number of untimed inferences executed before the measurement starts.
WARMUP = 20
# Upper bound on the number of images to process; -1 means "use all of them".
SAMPLE_SIZE = 1000

## 3. Preprocess function and image list

In [3]:
def preprocess(img, size=640):
    """Convert a BGR image array into a normalized, batched NCHW tensor.

    The image is stretched to ``size`` x ``size`` (the aspect ratio is not
    preserved), converted from BGR to RGB, cast to float32 and divided by
    255, then reordered from HWC to CHW with a leading batch axis of 1.

    Args:
        img: Image array as returned by ``cv2.imread`` (assumed to be an
            8-bit, 3-channel BGR image; other bit depths are still divided
            by 255).
        size: Side length, in pixels, of the square output.

    Returns:
        A CPU ``torch.Tensor`` of shape ``(1, 3, size, size)`` and dtype
        ``torch.float32`` whose memory is C-contiguous.

    Raises:
        ValueError: If ``img`` is ``None``, which is what ``cv2.imread``
            returns for a missing or undecodable file.
    """
    if img is None:
        # Fail with a readable message instead of an opaque OpenCV assertion
        # raised from inside cv2.resize.
        raise ValueError(
            "preprocess() received None; cv2.imread returns None when the "
            "image file is missing or cannot be decoded"
        )

    img = cv2.resize(img, (size, size))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = img.astype(np.float32) / 255.0

    # np.transpose only returns a strided view: the data would still be laid
    # out as HWC in memory even though the shape says CHW. Materialize a dense
    # copy so that any consumer reading the raw buffer sees true NCHW data.
    img = np.ascontiguousarray(np.transpose(img, (2, 0, 1)))

    # Adding the batch axis keeps the array contiguous.
    img = np.expand_dims(img, 0)
    return torch.from_numpy(img)

# Gather every file matching IMAGE_GLOB and put the list in sorted order so
# the same subset is selected on every run.
image_paths = glob.glob(IMAGE_GLOB)
image_paths.sort()
assert len(image_paths) > 0, "No images found"

# SAMPLE_SIZE == -1 keeps the full list; any other value truncates it by slicing.
image_paths = image_paths if SAMPLE_SIZE == -1 else image_paths[:SAMPLE_SIZE]

## 4. Model

In [4]:
# Instantiate the detector from the engine file given in the config cell.
model = YOLO(ENGINE_PATH, task='detect')

# Warmup: push an all-zero batch of the benchmark's input shape through the
# model WARMUP times before any timing takes place (presumably to absorb
# one-time initialization costs).
dummy = torch.zeros(1, 3, IMG_SIZE, IMG_SIZE, device=DEVICE)
for _warmup_step in range(WARMUP):
    _ = model(dummy, verbose=False)

Loading /home/mlops/Repository/aio2025-onnx-tensorrt/models/yolo26l.engine for TensorRT inference...
[01/16/2026-16:04:43] [TRT] [I] Loaded engine size: 116 MiB
[01/16/2026-16:04:43] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +1, GPU +75, now: CPU 1, GPU 188 (MiB)


## 5. Test loop

In [5]:
# FPS test
# The timed region is end-to-end: it covers reading each image from disk,
# preprocessing, the host-to-device copy and the model call.

# CUDA work is queued asynchronously, so the clock must only be read once the
# device is idle; otherwise leftover warmup work is billed to the benchmark
# and the tail of the last inference may be left out of it.
sync_gpu = str(DEVICE).startswith("cuda")

if sync_gpu:
    torch.cuda.synchronize()
start = time.perf_counter()

for p in tqdm(image_paths):
    img = cv2.imread(p)
    inp = preprocess(img, IMG_SIZE).to(DEVICE)
    _ = model(inp, verbose=False)

if sync_gpu:
    torch.cuda.synchronize()
end = time.perf_counter()

total_time = end - start
fps = len(image_paths) / total_time

100%|██████████| 1000/1000 [00:17<00:00, 57.94it/s]


In [6]:
# Assemble the summary first, then emit it with a single print call.
separator = "=" * 40
report_lines = [
    separator,
    f"Device           : {DEVICE}",
    f"Images processed : {len(image_paths)}",
    f"Total time       : {total_time:.2f} s",
    f"FPS              : {fps:.2f}",
    separator,
]
print("\n".join(report_lines))

Device           : cuda
Images processed : 1000
Total time       : 17.26 s
FPS              : 57.93
