### Export ONNX

ONNX tạo export đồ thị độc lập với phần cứng, tức là dù export trên CPU hay GPU thì graph được tạo ra cũng không đổi. ONNXRuntime mới tối ưu phụ thuộc vào phần cứng, đây là nơi tạo ra sự khác biệt.

In [1]:
import torch
from torchvision.models import resnet50

model = resnet50(pretrained=True)
model = model.eval()



In [2]:
ONNX_PATH = "resnet50.onnx"
IMG_SIZE = 224
OPSET = 18

In [3]:
dummy_input = torch.randn(1, 3, IMG_SIZE, IMG_SIZE)
torch.onnx.export(
    model,
    dummy_input,
    ONNX_PATH,
    opset_version=OPSET,
    input_names=['input'],
    output_names=['output'],
    dynamic_axes=None # Optional: specify dynamic axes if needed
)

  torch.onnx.export(


### Evaluate:

Đánh giá trên các EP khác nhau.

In [4]:
import onnxruntime as ort
from tqdm.notebook import tqdm
import numpy as np
import time

SAMPLES = 1000
IMG_SIZE = 224

In [5]:
def eval_fn(session):
    print(f"Evaluating ONNX model with providers: {session._providers}")
    inference_times = []
    for _ in tqdm(range(SAMPLES), desc=f"Running ONNX Inference: "):
        input_data = np.random.randn(1, 3, IMG_SIZE, IMG_SIZE).astype(np.float32)
        start_time = time.perf_counter()
        __ = session.run(['output'], {'input': input_data})
        inference_times.append((time.perf_counter() - start_time) * 1e3) # Convert to milliseconds
    # Calculate statistics
    mean_time = np.mean(inference_times)
    std_time = np.std(inference_times)
    min_time = np.min(inference_times)
    max_time = np.max(inference_times)
    median_time = np.median(inference_times)
    p95_time = np.percentile(inference_times, 95)
    p99_time = np.percentile(inference_times, 99)

    print(f"Inference Time Statistics (milliseconds):")
    print(f"  Mean: {mean_time:.6f}")
    print(f"  Std: {std_time:.6f}")
    print(f"  Min: {min_time:.6f}")
    print(f"  Max: {max_time:.6f}")
    print(f"  Median: {median_time:.6f}")
    print(f"  95th percentile: {p95_time:.6f}")
    print(f"  99th percentile: {p99_time:.6f}")
    print(f"\nThroughput: {SAMPLES / (sum(inference_times) / 1e3):.2f} samples/sec")

#### CPUExcutionProvider:

In [19]:
session_options = ort.SessionOptions()
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
session_options.intra_op_num_threads = 4  # giá trị mặc định là 0 (tự động chọn số luồng dựa trên số lõi CPU)
session = ort.InferenceSession(ONNX_PATH, sess_options=session_options, providers=['CPUExecutionProvider'])
eval_fn(session)

Evaluating ONNX model with providers: ['CPUExecutionProvider']


Running ONNX Inference:   0%|          | 0/1000 [00:00<?, ?it/s]

Inference Time Statistics (milliseconds):
  Mean: 21.561758
  Std: 6.173452
  Min: 18.684868
  Max: 95.983619
  Median: 19.552237
  95th percentile: 33.455162
  99th percentile: 49.699594

Throughput: 46.38 samples/sec


#### CUDAExecutionProvider:

Nếu thiếu cudnn, cài đặt bằng: `conda install cudnn=9 cuda=12 -c nvidia`

In [None]:
session_options = ort.SessionOptions()
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
session = ort.InferenceSession(
    ONNX_PATH, 
    sess_options=session_options, 
    providers = [
        ("CUDAExecutionProvider", {
            "device_id": 0,
            "arena_extend_strategy": "kNextPowerOfTwo",
            "cudnn_conv_algo_search": "EXHAUSTIVE",
            "enable_cuda_graph": True, # Enable CUDA Graphs for better performance on repeated workloads
            "gpu_mem_limit": 4 * 1024 * 1024 * 1024,
        }),
    ]
)
eval_fn(session)

Evaluating ONNX model with providers: ['CUDAExecutionProvider', 'CPUExecutionProvider']


Running ONNX Inference:   0%|          | 0/1000 [00:00<?, ?it/s]

Inference Time Statistics (milliseconds):
  Mean: 1.543070
  Std: 3.997516
  Min: 1.368160
  Max: 127.825740
  Median: 1.419127
  95th percentile: 1.453051
  99th percentile: 1.473385

Throughput: 648.06 samples/sec


#### TensorRT:

In [6]:
session_options = ort.SessionOptions()
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
session = ort.InferenceSession(
    ONNX_PATH, 
    sess_options=session_options, 
    providers = ['TensorRTExecutionProvider']
)
eval_fn(session)



*************** EP Error ***************
EP Error Unknown Provider Type: TensorRTExecutionProvider when using ['TensorRTExecutionProvider']
Falling back to ['CPUExecutionProvider'] and retrying.
****************************************
Evaluating ONNX model with providers: ['CPUExecutionProvider']


Running ONNX Inference:   0%|          | 0/1000 [00:00<?, ?it/s]

KeyboardInterrupt: 