# CLIP Vision Model

In [8]:
from PIL import Image
import requests
from transformers import CLIPProcessor, CLIPVisionModel
from transformers import logging

In [9]:
# Load only the vision encoder plus the matching processor from the CLIP checkpoint.
# Transformers log output is limited to errors while loading
# (presumably to hide the notice about checkpoint weights the vision-only model does not use).
previous_verbosity = logging.get_verbosity()
logging.set_verbosity_error()
try:
    model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
finally:
    # Restore whatever level was active before, even if loading fails,
    # instead of unconditionally forcing WARNING.
    logging.set_verbosity(previous_verbosity)

In [22]:
# Download the sample image and open it with PIL.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# Without a timeout, requests can block forever on a stalled connection.
response = requests.get(url, stream=True, timeout=30)
# Fail with a clear HTTP error instead of passing an error page to Image.open.
response.raise_for_status()
image = Image.open(response.raw)

In [11]:
# torch is imported here because this notebook's import cells do not bring it in.
import torch

# Preprocess the image into PyTorch tensors (return_tensors="pt").
inputs = processor(images=image, return_tensors="pt")

# Inference only: disable gradient tracking so no autograd graph is built
# and the returned tensors do not hold on to intermediate activations.
with torch.no_grad():
    outputs = model(**inputs)

In [34]:
# Size of the preprocessed pixel tensor produced by the processor.
inputs["pixel_values"].size()

torch.Size([1, 3, 224, 224])

In [16]:
# Pull the two tensors of interest off the model output in one step:
# the final hidden states and the pooled output.
last_hidden_states, pooler_output = (
    outputs.last_hidden_state,
    outputs.pooler_output,
)

In [18]:
# Size of the final hidden-state tensor.
last_hidden_states.size()

torch.Size([1, 50, 768])

In [17]:
# Size of the pooled output tensor.
pooler_output.size()

torch.Size([1, 768])

# CLIP Model

In [1]:
from PIL import Image
import requests
from transformers import CLIPProcessor, CLIPModel

In [2]:
# Load the full CLIP model and its processor from one shared checkpoint id,
# so the two can never point at different checkpoints.
CHECKPOINT = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CHECKPOINT)

In [None]:
# Download the sample image and open it with PIL.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# Without a timeout, requests can block forever on a stalled connection.
response = requests.get(url, stream=True, timeout=30)
# Fail with a clear HTTP error instead of passing an error page to Image.open.
response.raise_for_status()
image = Image.open(response.raw)

In [None]:
# torch is imported here because this notebook's import cells do not bring it in.
import torch

# Preprocess the image into PyTorch tensors (return_tensors="pt").
inputs = processor(images=image, return_tensors="pt")

# Inference only: disable gradient tracking so no autograd graph is built
# while the image features are computed.
with torch.no_grad():
    image_features = model.get_image_features(**inputs)

In [None]:
# Size of the image feature tensor returned by get_image_features.
image_features.shape

# Multimodal Example

In [None]:
from PIL import Image
import requests
from transformers import CLIPProcessor, CLIPModel

In [None]:
# Load the full CLIP model and its processor from one shared checkpoint id,
# so the two can never point at different checkpoints.
CHECKPOINT = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CHECKPOINT)

In [None]:
# Download the sample image and open it with PIL.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# Without a timeout, requests can block forever on a stalled connection.
response = requests.get(url, stream=True, timeout=30)
# Fail with a clear HTTP error instead of passing an error page to Image.open.
response.raise_for_status()
image = Image.open(response.raw)

In [None]:
# torch is imported here because this notebook's import cells do not bring it in.
import torch

# Encode both candidate captions together with the image; padding=True pads the
# text inputs so the two prompts can be batched as PyTorch tensors.
inputs = processor(
    text=["a photo of a cat", "a photo of a dog"],
    images=image,
    return_tensors="pt",
    padding=True,
)

# Inference only: disable gradient tracking so no autograd graph is built.
with torch.no_grad():
    outputs = model(**inputs)

In [None]:
# Image-text similarity scores produced by the model.
logits_per_image = outputs.logits_per_image
# Softmax over dim=1 turns the similarity scores into label probabilities.
probs = logits_per_image.softmax(dim=1)

In [None]:
# Display the label probabilities computed for the two text prompts.
probs