In [1]:
import torch
import requests
from PIL import Image
from transformers import AutoProcessor, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

model = AutoModel.from_pretrained("google/siglip2-base-patch16-512", dtype=torch.float16, device_map="auto")
processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-512")

image_path = "datasets/juice_bottle/juice_bottle/train/good/000.png"
image = Image.open(image_path).convert("RGB")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [75]:

candidate_labels = [
    "yellow orange juice bottle", 
    # "red apple juice bottle", 
    # "green vegetable juice bottle"
]

# follows the pipeline prompt template to get same results
texts = [f'This is a photo of {label}.' for label in candidate_labels]

# IMPORTANT: we pass `padding=max_length` and `max_length=64` since the model was trained with this
inputs = processor(
    text=texts, 
    images=image, 
    padding="max_length", 
    max_length=64,
    return_tensors="pt"
).to(model.device)

In [76]:
with torch.no_grad():
    outputs = model(**inputs)

In [77]:
outputs.keys()

odict_keys(['logits_per_image', 'logits_per_text', 'text_embeds', 'image_embeds', 'text_model_output', 'vision_model_output'])

In [78]:
text_embeds = outputs.text_embeds  # (num_labels, embed_dim)
image_embeds = outputs.image_embeds  # (1, embed_dim)

In [79]:
image_embeds.shape, text_embeds.shape

(torch.Size([1, 768]), torch.Size([1, 768]))

In [92]:
logit_scale = model.logit_scale.exp()
bias = model.logit_bias
sim_1 = (image_embeds @ text_embeds.T) * logit_scale + bias


In [101]:
sim_contrib = image_embeds*text_embeds.T * logit_scale + bias


In [102]:
sim_contrib.shape

torch.Size([768, 768])

In [103]:
sim_contrib.sum(dim=-1) / text_embeds.shape[-1]

tensor([-16.7656, -16.7656, -16.7656, -16.7656, -16.7500, -16.7656, -16.7656,
        -16.7656, -16.7656, -16.7656, -16.7656, -16.7656, -16.7656, -16.7656,
        -16.7656, -16.7656, -16.7656, -16.7656, -16.7656, -16.7656, -16.7656,
        -16.7656, -16.7656, -16.7656, -16.7656, -16.7656, -16.7656, -16.7656,
        -16.7656, -16.7656, -16.7656, -16.7656, -16.7656, -16.7656, -16.7656,
        -16.7656, -16.7656, -16.7656, -16.7656, -16.7656, -16.7656, -16.7656,
        -16.7656, -16.7656, -16.7656, -16.7656, -16.7656, -16.7656, -16.7656,
        -16.7656, -16.7656, -16.7656, -16.7656, -16.7656, -16.7656, -16.7656,
        -16.7656, -16.7656, -16.7656, -16.7656, -16.7656, -16.7656, -16.7656,
        -16.7656, -16.7656, -16.7656, -16.7656, -16.7656, -16.7656, -16.7656,
        -16.7656, -16.7656, -16.7656, -16.7656, -16.7656, -16.7656, -16.7656,
        -16.7656, -16.7656, -16.7656, -16.7656, -16.7656, -16.7656, -16.7656,
        -16.7656, -16.7656, -16.7656, -16.7656, -16.7656, -16.76

In [109]:
# sim_contrib 중 0.5 이상인 것들만 뽑아서 더해보기

sim_contrib_thresholded = sim_contrib.clone()
sim_contrib_thresholded[sim_contrib_thresholded < 0.5] = 0
# 0보다 큰 index
print((sim_contrib_thresholded > 0).nonzero())

tensor([[  4, 242],
        [626, 242]], device='cuda:0')


In [57]:


logits_per_image = outputs.logits_per_image
probs = torch.softmax(logits_per_image, dim=-1)
for i, label in enumerate(candidate_labels):
    print(f"{probs[0][i]:.1%} that image 0 is '{label}'")

99.3% that image 0 is 'yellow orange juice bottle'
0.6% that image 0 is 'red apple juice bottle'
0.1% that image 0 is 'green vegetable juice bottle'
