## Idea 1: Image–text similarity with CLIP via Sentence Transformers

In [None]:
from io import BytesIO

import requests
from PIL import Image
from sentence_transformers import SentenceTransformer

In [None]:
# Download the sample image bytes.
# A timeout keeps the cell from hanging forever if the host never answers.
response = requests.get(
    "https://github.com/PacktPublishing/LLM-Engineering/blob/main/images/crazy_cat.jpg?raw=true",
    timeout=30,
)
# Fail here on an HTTP error (4xx/5xx) instead of passing an error page to PIL later.
response.raise_for_status()

In [None]:
# Wrap the downloaded bytes in an in-memory buffer so PIL can read them like a file.
image_buffer = BytesIO(response.content)
image = Image.open(image_buffer)

In [None]:
# Load the CLIP checkpoint used to embed both the image and the captions.
# The id is spelled exactly as published ("clip-ViT-B-32") so that any
# case-sensitive lookup (e.g. a local cache directory) resolves it.
model = SentenceTransformer("clip-ViT-B-32")

In [None]:
# Candidate captions to compare against the image.
captions = [
    "A crazy cat smiling",
    "A white and brown cat with a yellow banana",
    "A man eating in the garden",
]

# Embed the image first, then the captions, with the same model.
img_embed = model.encode(image)
text_embed = model.encode(captions)

In [None]:
# Sanity check on the caption embeddings: three captions were encoded, so the
# first dimension is presumably 3 — TODO confirm the embedding width for this model.
print(text_embed.shape)

In [None]:
# Score the image embedding against every caption embedding.
# NOTE(review): `model.similarity` uses the model's configured similarity
# function (cosine by default per the Sentence Transformers docs) — confirm for the installed version.
similarity_scores = model.similarity(img_embed, text_embed)

In [None]:
# Show the image-vs-caption scores as the cell output.
similarity_scores

## Idea 2: Image–text matching with OpenAI's `clip` package and a score threshold

In [1]:
import torch
import clip
from PIL import Image

In [2]:
# Use the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# `clip.load` returns the model together with its matching image preprocessing transform.
model, preprocess = clip.load("ViT-B/32", device=device)

In [3]:
# Load the picture, apply CLIP's preprocessing, then add a leading batch
# dimension of size 1 and move the tensor to the selected device.
cyclist_image = Image.open("../../Images/Cyclist.png")
image = preprocess(cyclist_image).unsqueeze(0).to(device)

In [4]:
# Candidate descriptions to score against the image.
texts = [
    "a cyclist",
    "a sunny beach",
    "a painting of a castle",
    "a woman cyclist",
    "A cyclist cruises along the canal on a crisp autumn afternoon.",
]

# Tokenize all descriptions in one batch and move the tokens to the model's device.
text_tokens = clip.tokenize(texts).to(device)

In [5]:
# Inference only: disable gradient tracking while embedding both modalities.
with torch.no_grad():
    text_features = model.encode_text(text_tokens)
    image_features = model.encode_image(image)

In [6]:
# Scale every embedding to unit L2 norm along the feature axis, so a dot
# product between an image and a text embedding equals their cosine similarity.
image_norms = image_features.norm(dim=-1, keepdim=True)
text_norms = text_features.norm(dim=-1, keepdim=True)
image_features = image_features / image_norms
text_features = text_features / text_norms

In [7]:
# The (image x text) product has a single row because only one image was
# encoded; squeeze it away to get one score per description.
similarity = (image_features @ text_features.T).squeeze(0)
for idx, desc in enumerate(texts):
    print(f"{desc}: {similarity[idx].item():.4f}")

a cyclist: 0.3110
a sunny beach: 0.2058
a painting of a castle: 0.1813
a woman cyclist: 0.3324
A cyclist cruises along the canal on a crisp autumn afternoon.: 0.3265


In [8]:
# Scores strictly above this cut-off are reported as matches.
# NOTE(review): 0.30 appears to be hand-picked for this example — confirm before reusing elsewhere.
threshold = 0.30
for idx, desc in enumerate(texts):
    score = similarity[idx]
    if not score > threshold:
        print(f"❌ Not a good match: '{desc}' (Score: {score:.4f})")
        continue
    print(f"✔️ Match: '{desc}' (Score: {score:.4f})")

✔️ Match: 'a cyclist' (Score: 0.3110)
❌ Not a good match: 'a sunny beach' (Score: 0.2058)
❌ Not a good match: 'a painting of a castle' (Score: 0.1813)
✔️ Match: 'a woman cyclist' (Score: 0.3324)
✔️ Match: 'A cyclist cruises along the canal on a crisp autumn afternoon.' (Score: 0.3265)
