## 1. CLIP Dependencies

In [8]:
!pip install transformers torch torchvision  open_clip_torch --quiet

## 2. Download CLIP Model and Load Weights

In [23]:
from transformers import CLIPProcessor, CLIPModel
import torch.nn.functional as F
import torch
from PIL import Image
import requests

# Fetch the pretrained CLIP weights and the matching processor from a single
# checkpoint id so the two are always loaded from the same checkpoint.
clip_checkpoint = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(clip_checkpoint)
processor = CLIPProcessor.from_pretrained(clip_checkpoint)

## 3. Generate CLIP Similarity Scores

In [24]:
# Load the sample image from disk (replace the path with your own image file).
image = Image.open("/kaggle/input/cv-assgn3/sample_image.jpg").convert("RGB")

# Candidate captions to score against the image.
texts = [
    "A man holding a dog",
    "A woman holding a dog",
    "A man holding an elephant",
    "A woman holding an elephant",
    "A man cooking in a kitchen",
    "A beautiful sunset",
    "A person sitting on a bench",
    "A dog and its owner",
    "A city skyline at night",
    "A Room full of toys",
    "A dog looking at a human"
]

# Preprocess: tokenize the captions (padded to a common length) and prepare
# the image, returning PyTorch tensors.
inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)

# Forward pass. This is inference only, so disable gradient tracking.
with torch.no_grad():
    outputs = model(**inputs)

# Get image and text embeddings
image_embeds = outputs.image_embeds
text_embeds = outputs.text_embeds

# L2-normalize so the dot product below is a cosine similarity.
image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

# Cosine similarity scaled by 100. squeeze(0) removes only the leading
# (single-image) axis, so the result is still a list when `texts` has one entry.
similarity_scores = (100*image_embeds @ text_embeds.T).squeeze(0).tolist()

# Print one score per caption.
for desc, score in zip(texts, similarity_scores):
    print(f"{desc} --> Similarity Score: {score:.4f}")


A man holding a dog --> Similarity Score: 30.5882
A woman holding a dog --> Similarity Score: 26.2149
A man holding an elephant --> Similarity Score: 26.6754
A woman holding an elephant --> Similarity Score: 21.1037
A man cooking in a kitchen --> Similarity Score: 21.0087
A beautiful sunset --> Similarity Score: 16.3746
A person sitting on a bench --> Similarity Score: 20.0851
A dog and its owner --> Similarity Score: 27.6654
A city skyline at night --> Similarity Score: 12.8141
A Room full of toys --> Similarity Score: 19.5991
A dog looking at a human --> Similarity Score: 25.2832


## 4. CLIPS dependency

In [11]:
# Optional setup, currently disabled: clone the CLIPS repository and install
# its requirements.
# !git clone https://github.com/UCSC-VLAA/CLIPS.git
# !pip install -r CLIPS/requirements.txt

## 5. Download CLIPS Model and Load Weights

In [25]:
# These factory helpers come from open_clip (installed above as
# open_clip_torch). Import them here so the cell runs on a fresh kernel.
from open_clip import create_model_from_pretrained, get_tokenizer

# Load the CLIPS model, its image preprocessing transform, and its tokenizer
# from the same hub checkpoint.
# NOTE: this rebinds `model`, replacing the CLIP model loaded earlier.
clips_checkpoint = "hf-hub:UCSC-VLAA/ViT-L-14-CLIPS-224-Recap-DataComp-1B"
model, preprocess = create_model_from_pretrained(clips_checkpoint)
tokenizer = get_tokenizer(clips_checkpoint)

## 6. Generate CLIPS Similarity Scores

In [26]:
# Preprocess: transform the image and add a leading batch axis; tokenize the
# captions. Reuses `image` and `texts` from the CLIP section above.
image_input = preprocess(image).unsqueeze(0)
text_input = tokenizer(texts)

# Forward pass (inference only, so no gradient tracking).
with torch.no_grad():
    image_embeds = model.encode_image(image_input)
    text_embeds = model.encode_text(text_input)

# L2-normalize so the dot product below is a cosine similarity.
image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

# Cosine similarity scaled by 100. squeeze(0) removes only the leading batch
# axis added above, so the result is still a list when `texts` has one entry.
similarity_scores = (100*image_embeds @ text_embeds.T).squeeze(0).tolist()

# Print one score per caption.
for desc, score in zip(texts, similarity_scores):
    print(f"{desc} --> Similarity Score: {score:.4f}")

A man holding a dog --> Similarity Score: 16.3031
A woman holding a dog --> Similarity Score: 10.1649
A man holding an elephant --> Similarity Score: 16.4388
A woman holding an elephant --> Similarity Score: 10.8382
A man cooking in a kitchen --> Similarity Score: 6.4267
A beautiful sunset --> Similarity Score: 3.7355
A person sitting on a bench --> Similarity Score: -0.2385
A dog and its owner --> Similarity Score: 15.3522
A city skyline at night --> Similarity Score: -0.2308
A Room full of toys --> Similarity Score: 6.8924
A dog looking at a human --> Similarity Score: 12.7015
