In [1]:
from io import BytesIO

import requests
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# Load the pretrained CLIP checkpoint and the processor that prepares both the
# text prompts and the image for it.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Download the sample image. The timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status() surfaces HTTP errors directly
# instead of letting PIL fail later on an error page. The body is read fully
# into memory so no streamed connection is left open behind the PIL image.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
response = requests.get(url, timeout=30)
response.raise_for_status()
image = Image.open(BytesIO(response.content))

# Tokenize the candidate captions (padded to a common length) and preprocess
# the image; everything is returned as PyTorch tensors.
inputs = processor(text=["a photo of a cat", "cat sleeping", "cats sleeping"], images=image, return_tensors="pt", padding=True)

# Inference only: disable autograd so no graph is built for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities

In [2]:
# Display the raw image-text similarity scores: one row for the single image,
# one column per text prompt passed to the processor.
logits_per_image


tensor([[24.5701, 26.4551, 29.2379]], grad_fn=<TBackward0>)

In [3]:
# Display the softmax of the similarity scores taken across the text prompts
# (dim=1), i.e. the per-prompt label probabilities for the image.
probs


tensor([[0.0088, 0.0577, 0.9335]], grad_fn=<SoftmaxBackward0>)

In [4]:
# Display everything the processor produced for the model: the padded token ids
# and attention mask for the text prompts, and the preprocessed image tensor.
# NOTE(review): this prints the full tensors; showing only the shapes would keep
# the saved notebook output small.
inputs

{'input_ids': tensor([[49406,   320,  1125,   539,   320,  2368, 49407],
        [49406,  2368,  6982, 49407, 49407, 49407, 49407],
        [49406,  3989,  6982, 49407, 49407, 49407, 49407]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 0, 0, 0]]), 'pixel_values': tensor([[[[ 0.5873,  0.5873,  0.6165,  ...,  0.0617,  0.0471, -0.0259],
          [ 0.5727,  0.5727,  0.6603,  ...,  0.1201,  0.0763,  0.0909],
          [ 0.5873,  0.5435,  0.6165,  ...,  0.0325,  0.1201,  0.0617],
          ...,
          [ 1.8719,  1.8573,  1.8719,  ...,  1.3902,  1.4340,  1.4194],
          [ 1.8281,  1.8719,  1.8427,  ...,  1.4486,  1.4340,  1.5070],
          [ 1.8573,  1.9011,  1.8281,  ...,  1.3756,  1.3610,  1.4486]],

         [[-1.3169, -1.3019, -1.3169,  ..., -1.4970, -1.4369, -1.4820],
          [-1.2418, -1.2718, -1.2268,  ..., -1.4369, -1.4669, -1.4519],
          [-1.2568, -1.3169, -1.2268,  ..., -1.4669, -1.4069, -1.4519],
          ...

In [5]:
# Display the full object returned by model(**inputs): the logits in both
# directions plus the text and image embeddings.
# NOTE(review): this prints the full embedding tensors; consider displaying
# individual fields or their shapes instead.
outputs

CLIPOutput(loss=None, logits_per_image=tensor([[24.5701, 26.4551, 29.2379]], grad_fn=<TBackward0>), logits_per_text=tensor([[24.5701],
        [26.4551],
        [29.2379]], grad_fn=<MulBackward0>), text_embeds=tensor([[ 0.0148,  0.0070, -0.0234,  ..., -0.0508, -0.0438,  0.0033],
        [ 0.0118, -0.0099, -0.0444,  ...,  0.0002, -0.0540, -0.0055],
        [ 0.0160, -0.0136, -0.0537,  ...,  0.0076, -0.0544, -0.0077]],
       grad_fn=<DivBackward0>), image_embeds=tensor([[-9.7877e-03,  1.2770e-02, -2.7419e-02,  1.9675e-03, -5.9326e-03,
         -1.5613e-02, -1.2514e-02, -2.2675e-04,  4.3869e-02, -1.6322e-02,
          2.2630e-02, -3.5160e-02,  4.4747e-03, -1.2946e-02, -3.1524e-02,
         -1.1737e-02, -2.1543e-02, -2.7556e-02,  1.6562e-02,  4.5935e-03,
         -1.2106e-01, -3.0035e-03,  3.9024e-02, -3.0893e-02, -4.3866e-03,
          2.7598e-02,  2.2140e-02, -1.7065e-02,  1.4509e-02, -4.5196e-03,
         -7.1842e-03,  2.3971e-02, -6.8107e-03,  1.6382e-02, -5.3629e-02,
         -4.555

In [6]:
import clip
import torch
from PIL import Image

# Load the model
# NOTE(review): the `import clip` above this cell body raised
# ModuleNotFoundError when the notebook was last run, so the package is not
# installed in this kernel. It is presumably OpenAI's reference CLIP package
# (https://github.com/openai/CLIP) — confirm and install it before running.
# NOTE(review): this cell rebinds `model`, `image`, `logits_per_image` and
# `probs`, which the first cell also defines; after running it those names no
# longer refer to the Hugging Face objects.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Load an image
# Open the file in a context manager so its handle is released as soon as the
# pixels have been turned into a tensor. unsqueeze(0) adds the leading batch
# dimension before the tensor is moved to `device`.
with Image.open("path/to/your/image.jpg") as pil_image:
    image = preprocess(pil_image).unsqueeze(0).to(device)

# Prepare text
# NOTE: with a single prompt the softmax below is always 1.0. List two or more
# candidate descriptions to obtain meaningful probabilities.
text = clip.tokenize(["a description of what you're looking for"]).to(device)

# Calculate features
with torch.no_grad():
    # Standalone embeddings, kept for inspection; the model(image, text) call
    # below does not consume them.
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    # Compute similarity
    logits_per_image, logits_per_text = model(image, text)
    # Softmax across the text prompts, then move to host memory as a NumPy array.
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

print("Probabilities:", probs)

ModuleNotFoundError: No module named 'clip'