# MultiModal Models

![image](https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg)

## Image Embedding Models

In [1]:
import torch
import requests
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# Load the model
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
# Load the processor
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")



In [2]:
# Load an example image - DOG
img = Image.open("img/dog.jpg")
# Preprocess the image using the processor

# Add a text input to the processor
text = "a photo of a dog"
inputs = processor(text=[text], images=img, return_tensors="pt", padding=True)

# Geneprate the embeddings using the model
outputs = model(**inputs)

# Extract the image embeddings from the outputs
img_embeds_dog = outputs.image_embeds

In [3]:
# Load an example image - DOG2
img = Image.open("img/dog2.jpg")
# Preprocess the image using the processor

# Add a text input to the processor
text = "a photo of a dog"
inputs = processor(text=[text], images=img, return_tensors="pt", padding=True)

# Geneprate the embeddings using the model
outputs = model(**inputs)

# Extract the image embeddings from the outputs
img_embeds_dog2 = outputs.image_embeds

In [4]:
# Load an example image - cat
img = Image.open("img/cat.jpg")
# Preprocess the image using the processor

# Add a text input to the processor
text = "a photo of a cat"
inputs = processor(text=[text], images=img, return_tensors="pt", padding=True)

# Geneprate the embeddings using the model
outputs = model(**inputs)

# Extract the image embeddings from the outputs
img_embeds_cat = outputs.image_embeds

In [5]:
# Load an example image - cat
img = Image.open("img/server.jpg")
# Preprocess the image using the processor

# Add a text input to the processor
text = "a photo of a server"
inputs = processor(text=[text], images=img, return_tensors="pt", padding=True)

# Geneprate the embeddings using the model
outputs = model(**inputs)

# Extract the image embeddings from the outputs
img_embeds_server = outputs.image_embeds

In [6]:
## Diference between dogs:
# Compute Euclidean distance
euclidean_distance = torch.dist(img_embeds_dog, img_embeds_dog2)
print("Euclidean distance between dogs:", euclidean_distance)

euclidean_distance = torch.dist(img_embeds_dog, img_embeds_cat)
print("Euclidean distance between dog and cat:", euclidean_distance)

euclidean_distance = torch.dist(img_embeds_dog, img_embeds_server)
print("Euclidean distance between dog and server:", euclidean_distance)


Euclidean distance between dogs: tensor(0.8688, grad_fn=<DistBackward0>)
Euclidean distance between dog and cat: tensor(0.9053, grad_fn=<DistBackward0>)
Euclidean distance between dog and server: tensor(0.8935, grad_fn=<DistBackward0>)


## Multimodality

In [7]:
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForQuestionAnswering

processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

img = Image.open("img/cat.jpg")

question = "Which animal do you see?"
inputs = processor(img, question, return_tensors="pt")

out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))

cat


