In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from ultralytics import YOLO 
from transformers import CLIPVisionModel, CLIPImageProcessor


In [60]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [23]:
class YOLOv11(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.model = YOLO("yolov11l-face.pt").model
        # self.backbone = torch.nn.Sequential(*list(self.model.model.children())[:7])  # Stops after C3k2 (layer 6)
        self.feature_model = torch.nn.Sequential(*list(self.model.model.children())[:10])  # Stops after SPPF (layer 9)
        
    def forward(self, x):
        return self.feature_model(x)

extractor = YOLOv11()
features = extractor(torch.rand(1, 3, 224, 224)) 
features.shape

torch.Size([1, 512, 7, 7])

In [59]:
# Method 2: Custom forward pass by modifying the encoder
class CLIP_Method2(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.clip_model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
        self.processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
        
    def forward(self, images):
        # Process images
        if isinstance(images, Image.Image):
            images = [images]
        elif not isinstance(images, list):
            images = images
            
        inputs = self.processor(images=images, return_tensors="pt")
        pixel_values = inputs['pixel_values']
        
        # Get embeddings
        vision_model = self.clip_model.vision_model
        embeddings = vision_model.embeddings(pixel_values)
        embeddings = vision_model.pre_layrnorm(embeddings)
        
        # Pass through first 10 encoder layers only
        hidden_states = embeddings
        for idx, encoder_layer in enumerate(vision_model.encoder.layers[:10]):
            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask=None,
                causal_attention_mask=None,
                output_attentions=False,
            )
            hidden_states = layer_outputs[0]
        
        return hidden_states

model2 = CLIP_Method2()
output2 = model2(image)

In [58]:
class CLIP(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.clip_model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")

        # CLIP's final hidden state before projection (not the projection itself)
        self.clip_output_dim = self.clip_model.config.hidden_size  # usually 512

    def forward(self, x):
        # Resize and normalize input for CLIP
        processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
        inputs = processor(images=x, return_tensors="pt")#.to(device)
        outputs = self.clip_model(**inputs)
        pooled_output = outputs.pooler_output  # shape: [batch_size, 512]
        return pooled_output

model = CLIP()
image = Image.open("image.jpg")
inputs = model(image) 
outputs.shape

model.clip_model

CLIPVisionModel(
  (vision_model): CLIPVisionTransformer(
    (embeddings): CLIPVisionEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
      (position_embedding): Embedding(50, 768)
    )
    (pre_layrnorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
        

In [53]:
class MyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()

        self.yolov11 = YOLOv11()
        self.clip = CLIP()

        # TODO:
        self.linear_head = pass
        self.softmax = pass

    def forward(self, x):
        # TODO: preprocess accordingly, since the models expect different normalizations
        yolo_output = self.yolov11(x)
        clip_output = self.clip(x)

        # concat yolo_output + clip_output

        # run through linear and softmax
        

SyntaxError: invalid syntax (4277434784.py, line 9)

In [28]:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image

# 1. Load model and processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# 2. Process image
image = Image.open("image.jpg")
inputs = processor(images=image, return_tensors="pt")

# 3. Get image features
with torch.no_grad():
    outputs = model.get_image_features(**inputs)

outputs.shape
# outputs.shape = [1, 512] (512-dimensional embedding)

torch.Size([1, 512])

In [69]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from transformers import CLIPVisionModel, CLIPImageProcessor
import numpy as np
import time
from concurrent.futures import ThreadPoolExecutor
import threading

# Set device
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name()}")
    # Enable TF32 for Ampere GPUs (A100, RTX 30xx)
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

# SOLUTION 1: CPU Preprocessing Bottleneck Fix
class CLIP_ParallelPreprocess(torch.nn.Module):
    """Addresses CPU preprocessing bottleneck by parallelizing image preprocessing"""
    def __init__(self, num_workers=4):
        super().__init__()
        self.clip_model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
        self.clip_model = self.clip_model.to(device)
        self.clip_model.eval()
        self.processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.num_workers = num_workers
        
    def preprocess_batch(self, images):
        """Preprocess images in parallel using ThreadPoolExecutor"""
        with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
            # Process each image in parallel
            processed = list(executor.map(
                lambda img: self.processor(images=img, return_tensors="pt")['pixel_values'].squeeze(0),
                images
            ))
        # Stack into batch tensor
        return torch.stack(processed).to(device, non_blocking=True)
    
    def forward(self, images):
        if isinstance(images, Image.Image):
            images = [images]
        
        # Parallel preprocessing
        pixel_values = self.preprocess_batch(images)
        
        # Get embeddings directly
        vision_model = self.clip_model.vision_model
        with torch.no_grad():
            with torch.cuda.amp.autocast(dtype=torch.float16):
                embeddings = vision_model.embeddings(pixel_values)
                embeddings = vision_model.pre_layrnorm(embeddings)
                
                # First 10 layers only
                hidden_states = embeddings
                for layer in vision_model.encoder.layers[:10]:
                    layer_outputs = layer(hidden_states, None, None, False)
                    hidden_states = layer_outputs[0]
        
        return hidden_states


# SOLUTION 2: Use Larger Model / Different Architecture
class CLIP_LargerModel(torch.nn.Module):
    """Use a larger model that better utilizes GPU compute"""
    def __init__(self, model_name="openai/clip-vit-large-patch14"):
        super().__init__()
        # ViT-L/14 is 4x larger than ViT-B/32, better GPU utilization
        self.clip_model = CLIPVisionModel.from_pretrained(model_name)
        self.clip_model = self.clip_model.to(device)
        self.clip_model.eval()
        self.processor = CLIPImageProcessor.from_pretrained(model_name)
        
    def forward(self, images):
        if isinstance(images, Image.Image):
            images = [images]
            
        inputs = self.processor(images=images, return_tensors="pt")
        pixel_values = inputs['pixel_values'].to(device, non_blocking=True)
        
        with torch.no_grad():
            with torch.cuda.amp.autocast(dtype=torch.float16):
                outputs = self.clip_model(pixel_values=pixel_values, output_hidden_states=True)
        
        # Return 10th layer (or adjust based on model depth)
        return outputs.hidden_states[min(10, len(outputs.hidden_states)-1)]


# SOLUTION 3: Memory-Compute Trade-off with Flash Attention
class CLIP_OptimizedAttention(torch.nn.Module):
    """
    Optimize attention computation for better memory bandwidth utilization.
    Note: Requires PyTorch 2.0+ with Flash Attention support
    """
    def __init__(self):
        super().__init__()
        self.clip_model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
        
        # Enable Flash Attention if available (PyTorch 2.0+)
        if hasattr(torch.nn.functional, 'scaled_dot_product_attention'):
            print("Flash Attention available - enabling optimized attention")
            # This enables memory-efficient attention computation
            torch.backends.cuda.enable_flash_sdp(True)
            torch.backends.cuda.enable_mem_efficient_sdp(True)
        
        self.clip_model = self.clip_model.to(device)
        self.clip_model.eval()
        self.processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
    
    def forward(self, images):
        if isinstance(images, Image.Image):
            images = [images]
            
        inputs = self.processor(images=images, return_tensors="pt")
        pixel_values = inputs['pixel_values'].to(device, non_blocking=True)
        
        with torch.no_grad():
            with torch.cuda.amp.autocast(dtype=torch.float16):
                outputs = self.clip_model(pixel_values=pixel_values, output_hidden_states=True)
        
        return outputs.hidden_states[10]


# SOLUTION 4: True Batch Processing with Pre-allocated Buffers
class CLIP_PreallocatedBatch(torch.nn.Module):
    """Pre-allocate buffers to avoid memory allocation overhead"""
    def __init__(self, max_batch_size=64):
        super().__init__()
        self.clip_model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
        self.clip_model = self.clip_model.to(device)
        self.clip_model.eval()
        self.processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
        
        # Pre-allocate buffers
        self.max_batch_size = max_batch_size
        self.pixel_buffer = torch.zeros(
            (max_batch_size, 3, 224, 224), 
            device=device, 
            dtype=torch.float16
        )
        
    def forward(self, images):
        if isinstance(images, Image.Image):
            images = [images]
        
        batch_size = len(images)
        assert batch_size <= self.max_batch_size
        
        # Process images and fill pre-allocated buffer
        for i, img in enumerate(images):
            processed = self.processor(images=img, return_tensors="pt")['pixel_values']
            self.pixel_buffer[i] = processed.squeeze(0).to(device, dtype=torch.float16)
        
        # Use only the filled portion of the buffer
        pixel_values = self.pixel_buffer[:batch_size]
        
        vision_model = self.clip_model.vision_model
        with torch.no_grad():
            with torch.cuda.amp.autocast():
                embeddings = vision_model.embeddings(pixel_values)
                embeddings = vision_model.pre_layrnorm(embeddings)
                
                hidden_states = embeddings
                for layer in vision_model.encoder.layers[:10]:
                    layer_outputs = layer(hidden_states, None, None, False)
                    hidden_states = layer_outputs[0]
        
        return hidden_states


# SOLUTION 5: The Real Fix - Understanding the Problem
def diagnose_gpu_bottleneck():
    """Diagnose why batching isn't helping"""
    
    print("=== GPU Bottleneck Diagnosis ===\n")
    
    # 1. Check if model is compute or memory bound
    model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32").to(device).eval()
    
    # Count parameters and compute requirements
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Model parameters: {total_params/1e6:.1f}M")
    
    # 2. Measure memory bandwidth vs compute
    batch_sizes = [1, 8, 32, 128]
    
    for bs in batch_sizes:
        dummy_input = torch.randn(bs, 3, 224, 224, device=device, dtype=torch.float16)
        
        # Warmup
        for _ in range(5):
            with torch.no_grad():
                _ = model(dummy_input)
        
        torch.cuda.synchronize()
        start = time.time()
        
        with torch.no_grad():
            for _ in range(20):
                _ = model(dummy_input)
        
        torch.cuda.synchronize()
        elapsed = time.time() - start
        
        # Calculate theoretical FLOPS
        # ViT-B/32: ~4.4 GFLOPs per image
        gflops_per_image = 4.4
        total_gflops = gflops_per_image * bs * 20
        achieved_tflops = total_gflops / elapsed / 1000
        
        print(f"Batch size {bs:3d}: {elapsed:.3f}s, {achieved_tflops:.2f} TFLOPS")
    
    print("\nDiagnosis: If TFLOPS doesn't scale with batch size, model is memory-bandwidth bound.")
    print("Solutions: Use larger models, optimize memory access patterns, or use multiple models in parallel.")


# Benchmark all solutions
if __name__ == "__main__":
    image = Image.open("image.jpg")
    batch_sizes = [1, 8, 16, 32]
    
    print("\n=== Testing Different Solutions ===\n")
    
    # Test each solution
    solutions = [
        ("Parallel Preprocessing", CLIP_ParallelPreprocess()),
        ("Pre-allocated Buffers", CLIP_PreallocatedBatch()),
        # ("Larger Model (ViT-L)", CLIP_LargerModel()),  # Uncomment if you have enough memory
    ]

    
    # Run diagnosis
    print("\n")
    diagnose_gpu_bottleneck()
    
    # Additional recommendations
    print("\n=== Recommendations ===")
    print("1. Your GPU is likely memory-bandwidth limited for this small model")
    print("2. Consider using ViT-L models which better utilize GPU compute")
    print("3. For production, consider model serving frameworks like TorchServe or Triton")
    print("4. If you must use ViT-B/32, run multiple models in parallel on different CUDA streams")

Using device: cuda
GPU: NVIDIA GeForce RTX 3090

=== Testing Different Solutions ===



=== GPU Bottleneck Diagnosis ===

Model parameters: 87.5M
Batch size   1: 0.127s, 0.69 TFLOPS
Batch size   8: 0.128s, 5.51 TFLOPS
Batch size  32: 0.282s, 9.98 TFLOPS
Batch size 128: 1.133s, 9.95 TFLOPS

Diagnosis: If TFLOPS doesn't scale with batch size, model is memory-bandwidth bound.
Solutions: Use larger models, optimize memory access patterns, or use multiple models in parallel.

=== Recommendations ===
1. Your GPU is likely memory-bandwidth limited for this small model
2. Consider using ViT-L models which better utilize GPU compute
3. For production, consider model serving frameworks like TorchServe or Triton
4. If you must use ViT-B/32, run multiple models in parallel on different CUDA streams
