In [1]:
# Check the GPU is enabled
!nvidia-smi

# Install PyTorch with CUDA and torch_tensorrt
!pip install torch torchvision tensorrt
!pip install --pre torch-tensorrt --extra-index-url https://download.pytorch.org/whl/nightly/cu118

Mon Mar 24 21:46:48 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   46C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
!pip install --upgrade torchvision

Collecting torch==2.6.0 (from torchvision)
  Downloading torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting sympy==1.13.1 (from torch==2.6.0->torchvision)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Downloading torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl (766.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m766.7/766.7 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sympy-1.13.1-py3-none-any.whl (6.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m107.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sympy, torch
  Attempting uninstall: sympy
    Found existing installation: sympy 1.13.3
    Uninstalling sympy-1.13.3:
      Successfully uninstalled sympy-1.13.3
  Attempting uninstall: torch
    Found existing installation: torch 2.7.0.dev20250312+cu118
    Uninstalling torch-2.7.0.dev20250312+cu118:
      Successfully uninstalled torch-2.7.0.dev20250312+

In [4]:
import torch
import torch_tensorrt
import time
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from transformers import ViTFeatureExtractor, AutoTokenizer, VisionEncoderDecoderModel
from torchvision import transforms
import io

print("Libraries imported successfully")

# Download a sample image
!wget -q -O sample_image.jpg "https://tensorflow.org/images/surf.jpg"
from IPython.display import Image, display
display(Image("sample_image.jpg"))

# Load ViT-GPT2 image captioning model
model_name = "nlpconnect/vit-gpt2-image-captioning"
model = VisionEncoderDecoderModel.from_pretrained(model_name).to("cuda")
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Prepare image preprocessing
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load and preprocess the test image
image = Image.open("sample_image.jpg").convert('RGB')
image_tensor = preprocess(image).unsqueeze(0).to("cuda")

# Define function to generate captions
def generate_caption(model, image_tensor, tokenizer):
    model.eval()
    with torch.no_grad():
        outputs = model.generate(
            image_tensor,
            max_length=16,
            num_beams=4,
            return_dict_in_generate=True
        )
    caption = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
    return caption

# Benchmark original PyTorch model
def benchmark_pytorch_model(iterations=50):
    latencies = []

    # Warm-up
    for _ in range(10):
        _ = generate_caption(model, image_tensor, tokenizer)

    # Actual benchmark
    for i in range(iterations):
        torch.cuda.synchronize()
        start_time = time.time()
        _ = generate_caption(model, image_tensor, tokenizer)
        torch.cuda.synchronize()
        end_time = time.time()
        latencies.append((end_time - start_time) * 1000)  # ms

    # Generate one final caption for verification
    final_caption = generate_caption(model, image_tensor, tokenizer)

    return {
        "mean_latency_ms": np.mean(latencies),
        "median_latency_ms": np.median(latencies),
        "min_latency_ms": np.min(latencies),
        "max_latency_ms": np.max(latencies),
        "std_latency_ms": np.std(latencies),
        "caption": final_caption
    }

print("Starting PyTorch benchmark...")
pytorch_results = benchmark_pytorch_model()
print("PyTorch Model Results:")
print(f"Caption: {pytorch_results['caption']}")
print(f"Mean latency: {pytorch_results['mean_latency_ms']:.2f} ms")
print(f"Median latency: {pytorch_results['median_latency_ms']:.2f} ms")
print(f"Min latency: {pytorch_results['min_latency_ms']:.2f} ms")
print(f"Max latency: {pytorch_results['max_latency_ms']:.2f} ms")
print(f"Std Dev: {pytorch_results['std_latency_ms']:.2f} ms")

RuntimeError: Failed to import transformers.models.vit.feature_extraction_vit because of the following error (look up to see its traceback):
partially initialized module 'torchvision' has no attribute 'extension' (most likely due to a circular import)

In [None]:
# Extract encoder module for optimization since it's the most compute-intensive part
encoder = model.encoder

# Create inputs for tracing
dummy_inputs = [image_tensor]

# Check inputs
print(f"Input shape: {image_tensor.shape}")
print(f"Input dtype: {image_tensor.dtype}")

# Compile with torch_tensorrt
print("Compiling with torch_tensorrt...")
try:
    # Use the torch.compile API with tensorrt backend
    optimized_encoder = torch.compile(
        encoder,
        backend="tensorrt",
        options={
            "enabled": True,
            "min_block_size": 1,
            "torch_executed_ops": [],
            "debug": False,
            "strict_types": False,
            "max_workspace_size": 1 << 25,
            "precision": "fp16"  # Use FP16 precision for better performance on T4
        },
    )

    print("Compilation successful")

    # Create a wrapper model that uses the optimized encoder
    class OptimizedCaptioningModel(torch.nn.Module):
        def __init__(self, encoder, decoder):
            super().__init__()
            self.encoder = encoder
            self.decoder = decoder

        def generate(self, pixel_values, **kwargs):
            # Get encoder outputs
            encoder_outputs = self.encoder(pixel_values)
            # Pass to decoder for generation
            return self.decoder.generate(encoder_hidden_states=encoder_outputs.last_hidden_state, **kwargs)

    optimized_model = OptimizedCaptioningModel(optimized_encoder, model.decoder)

    # Benchmark function for TensorRT optimized model
    def benchmark_trt_model(iterations=50):
        latencies = []

        # Define generation function
        def generate_with_trt():
            with torch.no_grad():
                outputs = optimized_model.generate(
                    image_tensor,
                    max_length=16,
                    num_beams=4,
                    return_dict_in_generate=True
                )
            caption = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
            return caption

        # Warm-up
        for _ in range(10):
            _ = generate_with_trt()

        # Actual benchmark
        for i in range(iterations):
            torch.cuda.synchronize()
            start_time = time.time()
            _ = generate_with_trt()
            torch.cuda.synchronize()
            end_time = time.time()
            latencies.append((end_time - start_time) * 1000)  # ms

        # Generate one final caption for verification
        final_caption = generate_with_trt()

        return {
            "mean_latency_ms": np.mean(latencies),
            "median_latency_ms": np.median(latencies),
            "min_latency_ms": np.min(latencies),
            "max_latency_ms": np.max(latencies),
            "std_latency_ms": np.std(latencies),
            "caption": final_caption
        }

    print("Starting TensorRT benchmark...")
    trt_results = benchmark_trt_model()
    print("TensorRT Model Results:")
    print(f"Caption: {trt_results['caption']}")
    print(f"Mean latency: {trt_results['mean_latency_ms']:.2f} ms")
    print(f"Median latency: {trt_results['median_latency_ms']:.2f} ms")
    print(f"Min latency: {trt_results['min_latency_ms']:.2f} ms")
    print(f"Max latency: {trt_results['max_latency_ms']:.2f} ms")
    print(f"Std Dev: {trt_results['std_latency_ms']:.2f} ms")

    # Compare results
    speedup = pytorch_results['mean_latency_ms'] / trt_results['mean_latency_ms']
    print(f"\nLatency Comparison:")
    print(f"Original model: {pytorch_results['mean_latency_ms']:.2f} ms")
    print(f"TensorRT model: {trt_results['mean_latency_ms']:.2f} ms")
    print(f"Speedup: {speedup:.2f}x")

    # Visualize with a bar chart
    labels = ['PyTorch Model', 'TensorRT Model']
    latencies = [pytorch_results['mean_latency_ms'], trt_results['mean_latency_ms']]

    plt.figure(figsize=(10, 6))
    bars = plt.bar(labels, latencies, color=['blue', 'green'])
    plt.title('Image Captioning Model Latency Comparison')
    plt.ylabel('Latency (ms)')
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    # Add latency values above bars
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 5,
                 f'{height:.1f} ms', ha='center', va='bottom')

    plt.savefig('latency_comparison.png')
    plt.show()

except Exception as e:
    print(f"Compilation failed: {str(e)}")
    print("Falling back to alternative optimization method...")

    # Fallback to half-precision (FP16) optimization as an alternative
    model_fp16 = model.half()
    image_tensor_fp16 = image_tensor.half()

    def benchmark_fp16_model(iterations=50):
        latencies = []

        # Define generation function
        def generate_with_fp16():
            with torch.no_grad():
                outputs = model_fp16.generate(
                    image_tensor_fp16,
                    max_length=16,
                    num_beams=4,
                    return_dict_in_generate=True
                )
            caption = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
            return caption

        # Warm-up
        for _ in range(10):
            _ = generate_with_fp16()

        # Actual benchmark
        for i in range(iterations):
            torch.cuda.synchronize()
            start_time = time.time()
            _ = generate_with_fp16()
            torch.cuda.synchronize()
            end_time = time.time()
            latencies.append((end_time - start_time) * 1000)  # ms

        # Generate one final caption for verification
        final_caption = generate_with_fp16()

        return {
            "mean_latency_ms": np.mean(latencies),
            "median_latency_ms": np.median(latencies),
            "min_latency_ms": np.min(latencies),
            "max_latency_ms": np.max(latencies),
            "std_latency_ms": np.std(latencies),
            "caption": final_caption
        }

    print("Starting FP16 benchmark...")
    fp16_results = benchmark_fp16_model()
    print("FP16 Model Results:")
    print(f"Caption: {fp16_results['caption']}")
    print(f"Mean latency: {fp16_results['mean_latency_ms']:.2f} ms")
    print(f"Median latency: {fp16_results['median_latency_ms']:.2f} ms")
    print(f"Min latency: {fp16_results['min_latency_ms']:.2f} ms")
    print(f"Max latency: {fp16_results['max_latency_ms']:.2f} ms")
    print(f"Std Dev: {fp16_results['std_latency_ms']:.2f} ms")

    # Compare results
    speedup = pytorch_results['mean_latency_ms'] / fp16_results['mean_latency_ms']
    print(f"\nLatency Comparison:")
    print(f"Original model: {pytorch_results['mean_latency_ms']:.2f} ms")
    print(f"FP16 model: {fp16_results['mean_latency_ms']:.2f} ms")
    print(f"Speedup: {speedup:.2f}x")

    # Visualize with a bar chart
    labels = ['PyTorch Model', 'FP16 Model']
    latencies = [pytorch_results['mean_latency_ms'], fp16_results['mean_latency_ms']]

    plt.figure(figsize=(10, 6))
    bars = plt.bar(labels, latencies, color=['blue', 'green'])
    plt.title('Image Captioning Model Latency Comparison')
    plt.ylabel('Latency (ms)')
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    # Add latency values above bars
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 5,
                 f'{height:.1f} ms', ha='center', va='bottom')

    plt.savefig('latency_comparison.png')
    plt.show()