In [37]:
# Import necessary libraries

from transformers import CLIPProcessor, CLIPModel
import torch
import numpy as np
import os
import sys
from PIL import Image
import base64
from IPython.display import HTML, display
from torch.nn.functional import cosine_similarity

In [3]:
# Download the CLIP checkpoints from the Hugging Face Hub.
# `model` / `processor` (ViT-B/32) are the pair used by the embedding helper functions in this notebook.
model_name = "openai/clip-vit-base-patch32"
# Larger ViT-L/14 checkpoint.
# NOTE(review): `latest_model` / `latest_processor` are not referenced by the cells shown here —
# confirm they are needed, since the download log shows an additional ~1.7 GB weights file.
latest_model_name = "openai/clip-vit-large-patch14"
model = CLIPModel.from_pretrained(model_name)
processor = CLIPProcessor.from_pretrained(model_name)
latest_model = CLIPModel.from_pretrained(latest_model_name)
latest_processor = CLIPProcessor.from_pretrained(latest_model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.52k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

The code above downloads the required `CLIPModel` and `CLIPProcessor` for both CLIP Vision Transformer checkpoints from the Hugging Face Hub.
The downloaded files include the model weights, `tokenizer.json`, `vocab.json`, `merges.txt`, `special_tokens_map.json`, and their config files.

In [6]:
# Cosine similarity between a single image embedding and a single text embedding
def calculate_cosine_similarity(image_embedding, text_embedding, eps=1e-8):
  """Return the cosine similarity of two 1-D embedding vectors.

  Args:
    image_embedding: 1-D vector (NumPy array, sequence of numbers, or a
      tensor-like object exposing ``detach()``/``cpu()``/``numpy()``).
    text_embedding: 1-D vector of the same length, same accepted types.
    eps: Lower bound applied to the product of the two norms so that a
      zero vector yields 0.0 instead of NaN from a 0/0 division. Mirrors
      the ``eps`` argument of ``torch.nn.functional.cosine_similarity``.

  Returns:
    A NumPy scalar: dot(a, b) / max(||a|| * ||b||, eps).
  """
  def _as_array(vector):
    # NumPy cannot directly convert tensors that live on an accelerator or
    # that track gradients, so detach and move them to the CPU first.
    if hasattr(vector, "detach"):
      vector = vector.detach().cpu().numpy()
    return np.asarray(vector)

  image_vec = _as_array(image_embedding)
  text_vec = _as_array(text_embedding)

  # Named `norm_product` / returned directly so the imported torch
  # `cosine_similarity` function is not shadowed by a local variable.
  norm_product = np.linalg.norm(image_vec) * np.linalg.norm(text_vec)
  return np.dot(image_vec, text_vec) / max(norm_product, eps)


In [11]:
# Compute CLIP image embeddings for a list of image files
def get_image_embedding(image_paths):
  """Embed the images at `image_paths` with the notebook-level CLIP model.

  Args:
    image_paths: Iterable of paths to image files readable by PIL.

  Returns:
    The tensor returned by `model.get_image_features` for the batch, with
    one row per input image. No gradients are tracked.
  """
  images = []
  for image_path in image_paths:
    # The context manager closes the underlying file handle as soon as the
    # image is decoded; `convert` returns an independent, fully loaded copy.
    with Image.open(image_path) as image_file:
      images.append(image_file.convert("RGB"))

  # Only the forward pass needs gradient tracking disabled.
  with torch.no_grad():
    image_inputs = processor(images=images, return_tensors="pt")
    image_features = model.get_image_features(**image_inputs)

  return image_features


In [12]:
# Compute CLIP text embeddings for a list of strings
def get_text_embedding(texts):
  """Embed `texts` with the notebook-level CLIP model.

  The texts are tokenized as one batch (padded to a common length and
  truncated where needed) and passed through `model.get_text_features`
  with gradient tracking disabled.

  Args:
    texts: The text prompts to embed.

  Returns:
    The tensor returned by `model.get_text_features` for the batch.
  """
  tokenizer_options = dict(return_tensors="pt", truncation=True, padding=True)

  with torch.no_grad():
    encoded_texts = processor(text=texts, **tokenizer_options)
    return model.get_text_features(**encoded_texts)

In [30]:
# Sample images (uploaded to the Colab session storage under /content)
input_image_paths = [
    f"/content/{file_name}"
    for file_name in (
        "airplane_pic.png",
        "apple_middle_right.png",
        "apple_pic.png",
        "blue_square.png",
        "green_square_center.png",
        "house.png",
    )
]

# Candidate text descriptions, listed in the same order as the images above
input_texts = [
    "photo of an airplane",
    "small apple",
    "photo of big apple",
    "blue color square photo",
    "photo of green colored square",
    "photo of nice house",
]


In [43]:
# Embed every sample image and every candidate text as one batch each,
# using the helper functions defined earlier in this notebook.
image_embeddings = get_image_embedding(input_image_paths)
text_embeddings = get_text_embedding(input_texts)

In [44]:
# Sanity check: show the shapes of the image and text embedding batches
image_embeddings.shape, text_embeddings.shape

(torch.Size([6, 512]), torch.Size([6, 512]))

In [45]:
# Insert singleton axes so the two batches can broadcast against each other
# (images along dim 0, texts along dim 1) when computing pairwise similarity.
image_embeddings.unsqueeze(1).shape, text_embeddings.unsqueeze(0).shape

(torch.Size([6, 1, 512]), torch.Size([1, 6, 512]))

In [46]:
# Spot check with the NumPy helper: first image (airplane_pic.png) against
# the first text ("photo of an airplane").
similarity_value = calculate_cosine_similarity(image_embeddings[0], text_embeddings[0])
similarity_value

0.25439373

In [48]:
# Full similarity matrix via torch's cosine_similarity: the unsqueezed batches
# broadcast to every (image, text) pair and the similarity is reduced over the
# embedding axis (dim=2). Rows index images, columns index texts.
cosine_similarity_matrix = cosine_similarity(image_embeddings.unsqueeze(1), text_embeddings.unsqueeze(0), dim=2)
print(cosine_similarity_matrix)

tensor([[0.2544, 0.1765, 0.1812, 0.1482, 0.1525, 0.1574],
        [0.2104, 0.2979, 0.2740, 0.1916, 0.2377, 0.2092],
        [0.1865, 0.2914, 0.2771, 0.1825, 0.2092, 0.1931],
        [0.2458, 0.2100, 0.2334, 0.2960, 0.2392, 0.2210],
        [0.2198, 0.2352, 0.2204, 0.2715, 0.3437, 0.2217],
        [0.1534, 0.1896, 0.1749, 0.1911, 0.1872, 0.2551]])


_________________________________
# Result Analysis

[**0.2544**, 0.1765, 0.1812, 0.1482, 0.1525, 0.1574]

[0.2104, **0.2979**, 0.2740, 0.1916, 0.2377, 0.2092]

[0.1865, **0.2914**, 0.2771, 0.1825, 0.2092, 0.1931]

[0.2458, 0.2100, 0.2334, **0.2960**, 0.2392, 0.2210]

[0.2198, 0.2352, 0.2204, 0.2715, **0.3437**, 0.2217]

[0.1534, 0.1896, 0.1749, 0.1911, 0.1872, **0.2551**]

**********************

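In the similarity matrix above, each row corresponds to an image and each column to a text prompt (both in the order they were listed); the bold value is the highest score in each row, i.e. the text that CLIP considers the most probable description of that image. For five of the six images this is the intended caption; the exception is `apple_pic.png` (row 3), which scores slightly higher for "small apple" (0.2914) than for "photo of big apple" (0.2771).
This demonstrates the zero-shot capability of the CLIP model, which was trained on 400 million image–text description pairs collected from the internet.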

********************************************
_________________________________________
**Advantages** of CLIP model:

1. It learns to perform a wide set of tasks during pre-training, including **OCR**, **geolocalization**, action recognition, **classification**, and many more.


****************************** ****
_______________________________


**Limitations** of using CLIP model:
1. The CLIP model is not good at performing **mathematical operations** written in an image.
2. The CLIP model performs poorly at understanding the **directions/positions** of objects.
