<a href="https://colab.research.google.com/github/harunkurtdev/ayvos_staj/blob/master/clipanddinov2_comparasion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch==2.5.1 torchvision==0.20.1 pytorch_lightning==2.4.0 matplotlib

In [23]:
import io

import requests
import timm
import torch
import torchvision.transforms as T
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# --- Load the image ---
image_url = "https://avatars.mds.yandex.net/i?id=bc3fc5bd425db819f5ae6cd6f03c69b64d5387af-4551637-images-thumbs&n=13"
# Download the whole body with a timeout and fail loudly on HTTP errors, so a
# 4xx/5xx error page is never handed to PIL as if it were image bytes.
# `response.content` is content-decoded by requests, unlike the raw stream.
response = requests.get(image_url, timeout=30)
response.raise_for_status()
image = Image.open(io.BytesIO(response.content)).convert("RGB")

# --- CLIP setup ---
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# The processor handles resizing/normalisation for CLIP.
clip_inputs = clip_processor(images=image, return_tensors="pt")
with torch.no_grad():
    clip_features = clip_model.get_image_features(**clip_inputs)  # (1, 512)

# --- DINOv2 setup ---
transform = T.Compose([
    T.Resize((518, 518)),  # input size used for this DINOv2 model
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225])
])
image_tensor = transform(image).unsqueeze(0)  # add batch axis -> (1, 3, 518, 518)


# Load the DINOv2 (vit_large_patch14) backbone through timm.
dino_model = timm.create_model('vit_large_patch14_dinov2.lvd142m', pretrained=True)
dino_model.eval()  # inference mode for the forward pass below
with torch.no_grad():
    dino_features = dino_model(image_tensor)  # (1, 1024)

# --- L2-normalise each embedding ---
# Normalising per model before concatenation keeps either model from
# dominating the combined vector purely through its feature scale.
clip_features = torch.nn.functional.normalize(clip_features, dim=-1)
dino_features = torch.nn.functional.normalize(dino_features, dim=-1)

# --- Concatenate ---
combined_features = torch.cat([clip_features, dino_features], dim=-1)  # (1, 1536)

print("CLIP Feature Shape:     ", clip_features.shape)
print("DINOv2 Feature Shape:   ", dino_features.shape)
print("Combined Feature Shape: ", combined_features.shape)


CLIP Feature Shape:      torch.Size([1, 512])
DINOv2 Feature Shape:    torch.Size([1, 1024])
Combined Feature Shape:  torch.Size([1, 1536])


In [24]:
import torch.nn.functional as F
# --- CLIP preprocessing ---
clip_inputs = clip_processor(images=image, return_tensors="pt")

# --- DINOv2 preprocessing ---
transform_dino = T.Compose([
    T.Resize((518, 518)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225])
])

image_tensor_dino = transform_dino(image).unsqueeze(0)  # add batch axis

# --- Image embeddings ---
with torch.no_grad():
    clip_image_emb = clip_model.get_image_features(**clip_inputs)
    dino_emb = dino_model(image_tensor_dino)

# --- L2-normalise and concatenate ---
clip_image_emb = F.normalize(clip_image_emb, dim=-1)
dino_emb = F.normalize(dino_emb, dim=-1)
image_embedding = torch.cat([clip_image_emb, dino_emb], dim=-1)
# Re-normalise so the concatenated vector is unit-length as well.
image_embedding = F.normalize(image_embedding, dim=-1)

# --- Candidate text prompts ---
text_list = ["a dog", "a cat", "a mount","a person", "a truck", "a tree", "a firefighter"]
text_inputs = clip_processor(text=text_list, return_tensors="pt", padding=True)

# --- Text embeddings ---
with torch.no_grad():
    text_emb = clip_model.get_text_features(**text_inputs)
text_emb = F.normalize(text_emb, dim=-1)

# --- Similarity ---
# The combined image embedding is (1, 1536) while the text embeddings are
# (len(text_list), 512), so they cannot be multiplied directly. Comparing them
# would require projecting the text side up to 1536 dimensions (e.g. with an
# MLP). For now the DINOv2 half is left out and only the CLIP image embedding
# is compared against the CLIP text embeddings.
#
# Both sides are L2-normalised above, so the dot product is cosine similarity.
# squeeze(0) drops only the image-batch axis: a bare squeeze() would also
# collapse the text axis when text_list has a single entry, and the indexing
# in the loop below would then fail on a 0-dim tensor.
similarity_scores = (clip_image_emb @ text_emb.T).squeeze(0)

# --- Report ---
print("📷 Görsel URL:", image_url)
print("\n🔍 Metin Benzerlikleri:")
for i, text in enumerate(text_list):
    print(f"  \"{text}\" : {similarity_scores[i]:.4f}")

📷 Görsel URL: https://avatars.mds.yandex.net/i?id=bc3fc5bd425db819f5ae6cd6f03c69b64d5387af-4551637-images-thumbs&n=13

🔍 Metin Benzerlikleri:
  "a dog" : 0.2544
  "a cat" : 0.1864
  "a mount" : 0.2200
  "a person" : 0.2045
  "a truck" : 0.1906
  "a tree" : 0.1568
  "a firefighter" : 0.1859
