In [14]:
from PIL import Image
import requests

from transformers import CLIPProcessor, CLIPModel, CLIPFeatureExtractor, CLIPTextConfig, CLIPVisionConfig, CLIPImageProcessor

# Checkpoint used for this zero-shot classification demo; the commented line
# below is the larger alternative.
model_ref = "openai/clip-vit-base-patch32"
# model_ref = "openai/clip-vit-large-patch14"
model = CLIPModel.from_pretrained(model_ref)
processor = CLIPProcessor.from_pretrained(model_ref)

# Download the sample image. The timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status() surfaces HTTP errors right here
# instead of letting PIL fail later on a non-image error body.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

# Preprocess both candidate captions and the image into a single model input.
inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)

outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
probs = logits_per_image.softmax(dim=1)  # softmax along dim 1 turns the scores into label probabilities

In [15]:
from PIL import Image
import requests
from transformers import CLIPProcessor, CLIPModel
import numpy as np
import torch

model_ref = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_ref)
processor = CLIPProcessor.from_pretrained(model_ref)

# Download the sample image. The timeout prevents an indefinite hang on a
# stalled connection; raise_for_status() reports HTTP errors immediately rather
# than as a confusing image-decoding failure.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

# Convert the image to a tensor once and stack two copies of it, so the
# processor receives a batch of two identical images as one tensor.
image_tensor = torch.from_numpy(np.array(image))
pixel_values1 = processor(images=torch.stack([image_tensor, image_tensor]), return_tensors="pt").pixel_values
embedding = model.get_image_features(pixel_values=pixel_values1)
embedding = embedding.unsqueeze(1)  # insert a singleton axis at position 1

In [27]:

# Compare the embedding of a channels-first copy of the image against the
# embedding of a float32 copy of it.
# Assuming `image` is the RGB PIL image loaded in an earlier cell, np.array(image)
# is laid out (height, width, channels); the permutation that produces
# channels-first (C, H, W) is (2, 0, 1). Using (1, 2, 0) here would give
# (W, C, H), leaving the channel axis in the middle.
pixel_values1 = processor(images=torch.from_numpy(np.array(image).transpose(2, 0, 1)), return_tensors="pt").pixel_values
embedding1 = model.get_image_features(pixel_values=pixel_values1)
# NOTE(review): the float32 copy keeps its values in the 0..255 range; how the
# processor rescales float input may depend on the transformers version — confirm.
pixel_values2 = processor(images=torch.from_numpy(np.array(image).astype('float32')), return_tensors="pt").pixel_values
embedding2 = model.get_image_features(pixel_values=pixel_values2)
torch.allclose(embedding1, embedding2)

In [11]:
from transformers import ViTFeatureExtractor, ViTImageProcessor

model_name_or_path = 'google/vit-base-patch16-224-in21k'
# ViTFeatureExtractor is deprecated in favour of ViTImageProcessor, so the
# processor is instantiated from the replacement class. The variable keeps its
# original name so cells that call `feature_extractor(...)` continue to work.
feature_extractor = ViTImageProcessor.from_pretrained(model_name_or_path)



In [15]:
# Run the image through the ViT preprocessor and display the shape of the
# resulting pixel tensor.
feature_extractor(images=image, return_tensors="pt").pixel_values.shape

torch.Size([1, 3, 224, 224])

In [24]:
# Load the vision configuration published with the large CLIP checkpoint and
# show it as the cell output.
config = CLIPVisionConfig.from_pretrained(
    pretrained_model_name_or_path="openai/clip-vit-large-patch14"
)
config

CLIPVisionConfig {
  "attention_dropout": 0.0,
  "dropout": 0.0,
  "hidden_act": "quick_gelu",
  "hidden_size": 1024,
  "image_size": 224,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "model_type": "clip_vision_model",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "patch_size": 14,
  "projection_dim": 768,
  "transformers_version": "4.28.1"
}