In [2]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image

In [None]:
from transformers import CLIPProcessor, CLIPModel
import torch
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# The model weights and the matching preprocessor are loaded from the same
# pretrained checkpoint identifier.
clip_checkpoint = "openai/clip-vit-base-patch32"
processor = CLIPProcessor.from_pretrained(clip_checkpoint)
model = CLIPModel.from_pretrained(clip_checkpoint)

# Read the input picture, force it to 3-channel RGB, and keep a NumPy copy of
# its pixels alongside the PIL object.
image = Image.open("view_1.png").convert("RGB")
image_np = np.array(image)

# Text query that the image (and, later, each patch) is scored against.
text_prompt = "rabbit"

# Tokenize the prompt and preprocess the full image into one set of model inputs.
inputs = processor(
    text=text_prompt,
    images=image,
    return_tensors="pt",
    padding=True,
)

# Forward pass only; gradients are not needed for scoring.
with torch.no_grad():
    outputs = model(**inputs)

image_embeds, text_embeds = outputs.image_embeds, outputs.text_embeds

# Whole-image score: cosine similarity between the image and prompt embeddings.
similarity = torch.cosine_similarity(image_embeds, text_embeds)
print(f"Cosine similarity between text and image: {similarity.item()}")

patch_size = 32
height, width, _ = image_np.shape

# Tile the image row by row in patch_size steps. Each entry pairs the pixel
# tile with its top-left (row, col) offset. Tiles on the bottom/right edge are
# smaller when the image size is not a multiple of patch_size, because the
# slice is simply clipped at the array bounds.
patches = [
    (image_np[top:top + patch_size, left:left + patch_size], (top, left))
    for top in range(0, height, patch_size)
    for left in range(0, width, patch_size)
]

# Score every patch against the prompt embedding computed for the whole-image
# pass (text_embeds). Patches go through CLIP in mini-batches instead of one
# forward pass per patch, so the prompt is tokenized/encoded once per batch
# rather than once per patch and the vision encoder runs on full batches.
#
# Each entry of patch_similarities is (score, (row, col, patch_pixels)).
patch_similarities = []
batch_size = 64  # patches per forward pass; lower this if memory is tight

for start in range(0, len(patches), batch_size):
    batch = patches[start:start + batch_size]
    patch_images = [Image.fromarray(patch) for patch, _ in batch]
    inputs_patch = processor(text=text_prompt, images=patch_images, return_tensors="pt", padding=True)

    with torch.no_grad():
        outputs_patch = model(**inputs_patch)
        image_embeds_patch = outputs_patch.image_embeds
        # One row per patch in image_embeds_patch; text_embeds comes from a
        # single prompt, so it broadcasts and yields one score per patch.
        similarity_patch = torch.cosine_similarity(image_embeds_patch, text_embeds)

    for score, (patch, (i, j)) in zip(similarity_patch.tolist(), batch):
        patch_similarities.append((score, (i, j, patch)))

# Pull the raw scores out of the (score, location) records.
similarities = [entry[0] for entry in patch_similarities]

# MinMaxScaler works on 2-D input, so present the scores as a single column.
# The result keeps that (n_patches, 1) shape.
similarity_column = np.array(similarities).reshape(-1, 1)
scaler = MinMaxScaler()
scaler.fit(similarity_column)
normalized_similarities = scaler.transform(similarity_column)

# Cut-off applied to the min-max-normalized patch scores. It is defined once
# so the plot title always reports the threshold that was actually used
# (the title previously said 0.8 while the code compared against 0.7).
similarity_threshold = 0.7

# Paint on a copy so the original pixel array stays untouched.
highlighted_image = image_np.copy()

# normalized_similarities is a column array; flatten it so each patch is
# compared via a plain scalar instead of a length-1 array.
for (similarity_value, (i, j, patch)), norm_similarity in zip(patch_similarities, np.ravel(normalized_similarities)):
    if norm_similarity > similarity_threshold:
        # Fill the whole tile with solid red; the slice clips at image edges.
        highlighted_image[i:i+patch_size, j:j+patch_size] = np.array([255, 0, 0])

plt.imshow(highlighted_image)
plt.title(f"Highlighted Regions (Similarity > {similarity_threshold})")
plt.axis("off")
plt.show()


Cosine similarity between text and image: 0.2859526574611664
