### Step 1: Import Required Libraries

In [1]:
import os
import shutil
import time

import torch
import torch.hub
import onnx
from torchvision import transforms
import numpy as np
import onnxruntime as ort
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # imported for its side effect: initialises a CUDA context


### Step 2: Load the Pre-Trained Vision Transformer Model

In [2]:
# Fetch the DINOv2 ViT-S/14 backbone through Torch Hub (served from the local hub cache when present).
model = torch.hub.load(
    repo_or_dir='facebookresearch/dinov2',
    model='dinov2_vits14',
)

# Inference only: switch to eval mode. Left as the last expression so the module summary is displayed.
model.eval()


Using cache found in /home/z004x2xz/.cache/torch/hub/facebookresearch_dinov2_main


DinoVisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 384, kernel_size=(14, 14), stride=(14, 14))
    (norm): Identity()
  )
  (blocks): ModuleList(
    (0-11): 12 x NestedTensorBlock(
      (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (attn): MemEffAttention(
        (qkv): Linear(in_features=384, out_features=1152, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): LayerScale()
      (drop_path1): Identity()
      (norm2): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=384, out_features=1536, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=1536, out_features=384, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
      (ls2): LayerScale()
      (drop_path2): Identity()
    )
  )
  (n

### Step 3: Export the Model to ONNX Format

In [33]:
# Pick the GPU when one is visible, otherwise stay on the CPU, and place the model there.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Tracing needs one example batch living on the same device as the model.
dummy_input = torch.randn(1, 3, 224, 224, device=device)

# Destination of the exported graph; later cells read this name.
onnx_file_path = "vit_model.onnx"

# Only the leading (batch) axis is exported as dynamic, for both the input and the output.
# NOTE(review): the recorded run of this cell raised
# "RuntimeError: Found an unsupported argument type c10::SymInt in the JIT tracer";
# the cause is not visible from this cell - confirm the export succeeds in your environment.
dynamic_batch_axes = {
    'input': {0: 'batch_size'},
    'output': {0: 'batch_size'},
}
torch.onnx.export(
    model,
    dummy_input,
    onnx_file_path,
    input_names=['input'],
    output_names=['output'],
    opset_version=13,
    dynamic_axes=dynamic_batch_axes,
)
print(f"Model exported to ONNX format at {onnx_file_path}")


RuntimeError: Found an unsupported argument type c10::SymInt in the JIT tracer. File a bug report.

### Step 4: Convert the ONNX Model to TensorRT

In [43]:
# Use the `trtexec` tool to convert the ONNX model to TensorRT.
# Run the following command in your terminal:
!trtexec --onnx=vit_model.onnx --saveEngine=vit_model.trt --fp16


/bin/sh: 1: trtexec: not found


### Step 5: Run Inference and Measure Performance (ONNX Runtime Baseline, Then TensorRT)

In [None]:
# Baseline: time the exported ONNX model with ONNX Runtime.
ort_session = ort.InferenceSession(onnx_file_path)

# Random batch with the same shape that was used for the export.
input_data = np.random.randn(1, 3, 224, 224).astype(np.float32)

# Warm-up run so one-time initialisation cost is not averaged into the measurement.
outputs = ort_session.run(None, {'input': input_data})

# perf_counter() is monotonic and high-resolution, unlike the wall clock returned by time.time().
start_time = time.perf_counter()
for _ in range(100):  # Run multiple iterations for better averaging
    outputs = ort_session.run(None, {'input': input_data})
onnx_time = (time.perf_counter() - start_time) / 100
print(f"ONNX Model Average Inference Time: {onnx_time * 1000:.2f} ms")


### Load the TensorRT Engine and Measure Inference Time

In [37]:
# Deserialize the engine file written by trtexec.
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
with open("vit_model.trt", "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
if engine is None:
    # deserialize_cuda_engine reports failure by returning None instead of raising.
    raise RuntimeError("Failed to deserialize the TensorRT engine from 'vit_model.trt'")

context = engine.create_execution_context()

# Host input in page-locked memory, which the async copies below expect.
input_shape = (1, 3, 224, 224)
inputs = cuda.pagelocked_empty(input_shape, dtype=np.float32)
inputs[...] = np.random.randn(*input_shape)

# The ONNX export marks the batch axis as dynamic; pin it before asking for the output shape.
# NOTE(review): this uses the binding-index API, the same API generation as execute_async_v2,
# and assumes binding 0 is the input and binding 1 the output (as the original cell did).
if -1 in tuple(engine.get_binding_shape(0)):
    context.set_binding_shape(0, input_shape)

# Read the output shape/dtype from the engine instead of guessing a classifier-style (1, 1000).
output_shape = tuple(context.get_binding_shape(1))
output_dtype = trt.nptype(engine.get_binding_dtype(1))
trt_output = cuda.pagelocked_empty(output_shape, dtype=output_dtype)

# Device buffers sized from the host arrays (nbytes is a plain Python int, as mem_alloc requires).
d_input = cuda.mem_alloc(inputs.nbytes)
d_output = cuda.mem_alloc(trt_output.nbytes)
bindings = [int(d_input), int(d_output)]

# Upload the input once; it is reused for every timed iteration.
stream = cuda.Stream()
cuda.memcpy_htod_async(d_input, inputs, stream)

# Warm-up run so lazy initialisation is not averaged into the measurement.
context.execute_async_v2(bindings, stream.handle)
cuda.memcpy_dtoh_async(trt_output, d_output, stream)
stream.synchronize()

start_time = time.perf_counter()
for _ in range(100):  # Multiple iterations
    context.execute_async_v2(bindings, stream.handle)
    cuda.memcpy_dtoh_async(trt_output, d_output, stream)
    stream.synchronize()
trt_time = (time.perf_counter() - start_time) / 100
print(f"TensorRT Model Average Inference Time: {trt_time * 1000:.2f} ms")


FileNotFoundError: [Errno 2] No such file or directory: 'vit_model.trt'

### Step 6: Compare Inference Times

In [None]:
# Ratio of the two per-inference averages; a value above 1 means the TensorRT run was faster.
speedup = onnx_time / trt_time
speedup_report = f"Speedup Achieved by TensorRT: {speedup:.2f}x"
print(speedup_report)


In [42]:
# `!alias ...` only defines the alias inside the one-shot subshell spawned for that line,
# so it never affects later `!trtexec ...` calls. Extend the kernel's PATH instead: every
# shell started by a later `!` command inherits it. Run this before any cell that calls trtexec.
TRTEXEC_DIR = "/usr/src/tensorrt/bin"
current_path = os.environ.get("PATH", "")
if TRTEXEC_DIR not in current_path.split(os.pathsep):
    # Append rather than prepend so existing tools keep their precedence.
    os.environ["PATH"] = f"{current_path}{os.pathsep}{TRTEXEC_DIR}" if current_path else TRTEXEC_DIR