In [None]:
! pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m42.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m35.6 MB/s[0m eta [36m0:00:0

Simple Version

In [None]:
from PIL import Image
import requests
import torch

from transformers import CLIPProcessor, CLIPModel

# Load the pretrained CLIP checkpoint and its matching pre-processor.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Download the sample image. The timeout keeps the cell from hanging on a
# stalled connection, and raise_for_status() reports an HTTP failure directly
# instead of letting PIL try to decode an error page.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

# Tokenize the candidate captions and preprocess the image into one batch.
inputs = processor(
    text=["a photo of a cat", "a photo of a giraffe"],
    images=image,
    return_tensors="pt",
    padding=True,
)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # image-text similarity scores
probs = logits_per_image.softmax(dim=1)  # softmax across the captions gives label probabilities

In [None]:
# Raw image-text similarity scores: one row for the image, one column per caption.
logits_per_image

tensor([[24.5701, 27.9732]])

In [None]:
# Softmax of the scores above, taken across the captions (dim=1).
probs

tensor([[0.0322, 0.9678]])

Expanded Version

In [None]:
from PIL import Image
import requests
import torch

from transformers import CLIPProcessor, CLIPModel

In [None]:
# Model weights and pre-processor are loaded from the same checkpoint id.
checkpoint = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(checkpoint)
processor = CLIPProcessor.from_pretrained(checkpoint)

In [None]:
# Download the sample image. The timeout keeps the cell from hanging on a
# stalled connection, and raise_for_status() reports an HTTP failure directly
# instead of letting PIL try to decode an error page.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

In [None]:
# Tokenize both captions (padded to a common length) and preprocess the image,
# returning PyTorch tensors.
inputs = processor(
    text=["a photo of a cat", "a photo of a dog"],
    images=image,
    return_tensors="pt",
    padding=True,
)

In [None]:
# List which tensors the processor produced for this batch.
inputs.keys()

dict_keys(['input_ids', 'attention_mask', 'pixel_values'])

In [None]:
# Pass only the image tensor. Unpacking the whole batch with **inputs also
# forwards input_ids and attention_mask, which this image-only method rejects
# with a TypeError.
with torch.no_grad():
    image_features = model.get_image_features(pixel_values=inputs['pixel_values'])

TypeError: ignored

In [None]:
# Compare the image batch shape with the token-id batch shape
# (one image, two tokenized captions).
inputs['pixel_values'].shape, inputs['input_ids'].shape

(torch.Size([1, 3, 224, 224]), torch.Size([2, 7]))

In [None]:
# Token ids of the two captions, one row per caption.
inputs['input_ids']

tensor([[49406,   320,  1125,   539,   320,  2368, 49407],
        [49406,   320,  1125,   539,   320,  1929, 49407]])

In [None]:
# Step-by-step version of the forward pass: run each encoder separately,
# project, normalize, and compare. Inference only, so no gradients are tracked.
with torch.no_grad():
    vision_outputs = model.vision_model(
        pixel_values=inputs['pixel_values']
    )

    # Forward the attention mask as well, so the text encoder receives the same
    # tensors that model(**inputs) would be given.
    text_outputs = model.text_model(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
    )

    # Element 1 of each encoder output is a single vector per image / caption;
    # the projection layers then map both modalities to the same width.
    image_embeds_ = vision_outputs[1]
    image_embeds = model.visual_projection(image_embeds_)

    text_embeds_ = text_outputs[1]
    text_embeds = model.text_projection(text_embeds_)

    # normalized features: divide by the L2 norm so the dot product below is a
    # cosine similarity
    image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
    text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

    # cosine similarity as logits; logit_scale is kept as a log value, hence exp()
    logit_scale = model.logit_scale.exp()
    logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
    # Same scores viewed from the image side: rows are images, columns are captions.
    logits_per_image = logits_per_text.t()

In [None]:
# Shapes of the encoder outputs before the projection layers are applied.
image_embeds_.shape, text_embeds_.shape

(torch.Size([1, 768]), torch.Size([2, 512]))

In [None]:
# Shapes after projection and normalization; both modalities now share the last dimension.
image_embeds.shape, text_embeds.shape

(torch.Size([1, 512]), torch.Size([2, 512]))

In [None]:
# The two logit matrices are transposes of each other, so their shapes are swapped.
logits_per_text.shape, logits_per_image.shape

(torch.Size([2, 1]), torch.Size([1, 2]))

In [None]:
# Turn the image's similarity scores into a probability distribution over the captions.
probs = torch.softmax(logits_per_image, dim=1)

In [None]:
# Caption probabilities for the image, in the order the captions were given.
probs

tensor([[0.9949, 0.0051]])

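The model should treat a positive pair the same way regardless of the order in which it is written: (Image1, Text1) and (Text1, Image1) are the same positive pair. `logits_per_text` and `logits_per_image` hold the same similarity scores and are simply transposes of each other, one indexed text-first and the other image-first. By using both, we ensure that both directions of each positive pair (text-to-image and image-to-text) are handled correctly.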