## 1. Load Pre-Trained BLIP for Image Captioning

In [1]:
import os
from PIL import Image
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration

# The captioning model and its processor are both loaded from the same
# Hugging Face checkpoint, so the identifier is spelled out only once.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"

blip_model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)
blip_processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)

2025-04-19 20:05:21.300431: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745093121.537878      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745093121.610720      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

## 2. Generate a Caption for Each Image

In [3]:
# Folder holding the sample images to caption.
SAMPLES_DIR = "/kaggle/input/cv-assgn3/samples"

# Maps each image path to the caption BLIP generated for it.
captions = dict()

# Loop through all images in the folder. os.listdir() returns entries in
# arbitrary order, so sort them to keep the run (and every later cell that
# iterates over `captions`) reproducible.
for filename in sorted(os.listdir(SAMPLES_DIR)):
    image_path = os.path.join(SAMPLES_DIR, filename)
    if not os.path.isfile(image_path):
        # Skip sub-directories; Image.open() cannot read them.
        continue

    # Open inside a context manager so the file handle is released right away;
    # convert() returns a separate, fully loaded RGB copy that stays usable
    # after the file is closed.
    with Image.open(image_path) as image_file:
        image = image_file.convert("RGB")

    # Preprocess and caption
    inputs = blip_processor(images=image, return_tensors="pt").to(blip_model.device)
    with torch.no_grad():  # inference only, no gradients needed
        output = blip_model.generate(**inputs)
    caption = blip_processor.decode(output[0], skip_special_tokens=True)

    captions[image_path] = caption

captions

{'/kaggle/input/cv-assgn3/samples/ILSVRC2012_test_00000004.jpg': 'a small dog running across a green field',
 '/kaggle/input/cv-assgn3/samples/ILSVRC2012_test_00000022.jpg': 'a small dog standing on a stone ledge',
 '/kaggle/input/cv-assgn3/samples/ILSVRC2012_test_00000023.jpg': 'a man riding a bike down a wet street',
 '/kaggle/input/cv-assgn3/samples/ILSVRC2012_test_00000026.jpg': 'a man in a suit and tie sitting on a couch',
 '/kaggle/input/cv-assgn3/samples/ILSVRC2012_test_00000018.jpg': 'a family sitting in a pool with a towel',
 '/kaggle/input/cv-assgn3/samples/ILSVRC2012_test_00000003.jpg': 'a small dog walking on a green carpet',
 '/kaggle/input/cv-assgn3/samples/ILSVRC2012_test_00000019.jpg': 'a small bird sitting on a plant',
 '/kaggle/input/cv-assgn3/samples/ILSVRC2012_test_00000030.jpg': 'a duck drinking water from a pond',
 '/kaggle/input/cv-assgn3/samples/ILSVRC2012_test_00000034.jpg': 'a coffee machine with two cups on it',
 '/kaggle/input/cv-assgn3/samples/ILSVRC2012_te

## 3. Evaluation using CLIP

In [23]:
from transformers import CLIPProcessor, CLIPModel

# Load CLIP
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Maps each image path to its CLIP image-caption score, so the numbers can be
# reused by later cells instead of only being printed.
clip_scores = {}

for image_path, caption in captions.items():
    # Close the file handle as soon as the RGB copy has been created.
    with Image.open(image_path) as image_file:
        image = image_file.convert("RGB")

    # Keep the inputs on the same device as the model, as the captioning cell does.
    clip_inputs = clip_processor(
        text=[caption], images=image, return_tensors="pt", padding=True
    ).to(clip_model.device)

    with torch.no_grad():
        clip_outputs = clip_model(**clip_inputs)
        # Get features
        image_features = clip_outputs.image_embeds
        text_features = clip_outputs.text_embeds
        # Normalize features to unit length so the dot product is a cosine similarity
        image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True)
        # Compute cosine similarity, scaled by 100
        similarity = (100 * image_features @ text_features.T).item()

    clip_scores[image_path] = similarity

    # ----- Output -----
    print(f"{image_path} | CLIP Similarity: {similarity:.4f} ")

/kaggle/input/cv-assgn3/samples/ILSVRC2012_test_00000004.jpg | CLIP Similarity: 32.7026 
/kaggle/input/cv-assgn3/samples/ILSVRC2012_test_00000022.jpg | CLIP Similarity: 31.0347 
/kaggle/input/cv-assgn3/samples/ILSVRC2012_test_00000023.jpg | CLIP Similarity: 30.8345 
/kaggle/input/cv-assgn3/samples/ILSVRC2012_test_00000026.jpg | CLIP Similarity: 28.8953 
/kaggle/input/cv-assgn3/samples/ILSVRC2012_test_00000018.jpg | CLIP Similarity: 31.3365 
/kaggle/input/cv-assgn3/samples/ILSVRC2012_test_00000003.jpg | CLIP Similarity: 31.5660 
/kaggle/input/cv-assgn3/samples/ILSVRC2012_test_00000019.jpg | CLIP Similarity: 28.9383 
/kaggle/input/cv-assgn3/samples/ILSVRC2012_test_00000030.jpg | CLIP Similarity: 30.5252 
/kaggle/input/cv-assgn3/samples/ILSVRC2012_test_00000034.jpg | CLIP Similarity: 27.9613 
/kaggle/input/cv-assgn3/samples/ILSVRC2012_test_00000025.jpg | CLIP Similarity: 28.9163 


## 4. Evaluation using CLIPS

In [28]:
!pip install open_clip_torch --quiet
from open_clip import create_model_from_pretrained, get_tokenizer

# Load CLIP 
clips_model, clips_preprocess = create_model_from_pretrained("hf-hub:UCSC-VLAA/ViT-L-14-CLIPS-224-Recap-DataComp-1B")
clips_tokenizer = get_tokenizer("hf-hub:UCSC-VLAA/ViT-L-14-CLIPS-224-Recap-DataComp-1B")

for image_path, caption in captions.items():
    image = Image.open(image_path).convert("RGB")

    image_input = clips_preprocess(image).unsqueeze(0)
    text_input = clips_tokenizer([caption])

    with torch.no_grad():
        
        # Get features correctly
        image_features = clips_model.encode_image(image_input)
        text_features = clips_model.encode_text(text_input)
        # Normalize features
        image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True)
        # Compute cosine similarity
        similarity = (100*image_features @ text_features.T).item()

    # ----- Output -----
    print(f"{image_path} | CLIPS Similarity: {similarity:.4f} ")

/kaggle/input/cv-assgn3/samples/ILSVRC2012_test_00000004.jpg | CLIPS Similarity: 19.1549 
/kaggle/input/cv-assgn3/samples/ILSVRC2012_test_00000022.jpg | CLIPS Similarity: 15.1255 
/kaggle/input/cv-assgn3/samples/ILSVRC2012_test_00000023.jpg | CLIPS Similarity: 16.9236 
/kaggle/input/cv-assgn3/samples/ILSVRC2012_test_00000026.jpg | CLIPS Similarity: 12.7573 
/kaggle/input/cv-assgn3/samples/ILSVRC2012_test_00000018.jpg | CLIPS Similarity: 14.8474 
/kaggle/input/cv-assgn3/samples/ILSVRC2012_test_00000003.jpg | CLIPS Similarity: 18.1808 
/kaggle/input/cv-assgn3/samples/ILSVRC2012_test_00000019.jpg | CLIPS Similarity: 16.9768 
/kaggle/input/cv-assgn3/samples/ILSVRC2012_test_00000030.jpg | CLIPS Similarity: 16.5172 
/kaggle/input/cv-assgn3/samples/ILSVRC2012_test_00000034.jpg | CLIPS Similarity: 15.6065 
/kaggle/input/cv-assgn3/samples/ILSVRC2012_test_00000025.jpg | CLIPS Similarity: 16.7442 
