In [None]:
%uv add transformers faiss-cpu torch sentence-transformers PIL opencv-python

In [None]:
import torch
import faiss
import cv2
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration, AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer

In [None]:
# Sentence encoder used for every text string in this notebook.
text_model = SentenceTransformer("all-MiniLM-L6-v2")

# The BLIP processor and model are loaded from the same checkpoint so that the
# image preprocessing matches the weights.
blip_checkpoint = "Salesforce/blip-image-captioning-large"
processor = BlipProcessor.from_pretrained(blip_checkpoint)
image_model = BlipForConditionalGeneration.from_pretrained(blip_checkpoint)

In [None]:
# Toy corpus: four text snippets and four image files. The image paths are
# relative, so the files must exist in the notebook's working directory.
dataset_texts = ["A cat sitting on a table", "A dog playing in the park", "A red sports car", "A bowl of fresh fruit"]
dataset_images = ["cat.jpg", "dog.jpg", "car.jpg", "fruit.jpg"]

text_embeddings = text_model.encode(dataset_texts, convert_to_tensor=True)

# BLIP's generate() returns caption *token ids* (an integer tensor whose length
# varies per image), not feature vectors. Those ids cannot be stacked reliably
# and do not live in the same space as the sentence embeddings above. Instead,
# decode each caption to text and embed it with the same sentence encoder, so
# text rows and image rows share one vector space and one dimensionality.
image_captions = []
for img_path in dataset_images:
    # The context manager closes the file handle; convert() returns a loaded
    # copy that remains valid after the file is closed.
    with Image.open(img_path) as img_file:
        image = img_file.convert("RGB")
    inputs = processor(image, return_tensors="pt")
    caption_ids = image_model.generate(**inputs)
    image_captions.append(processor.decode(caption_ids[0], skip_special_tokens=True))

# One row per image, same width and device as text_embeddings.
image_embeddings = text_model.encode(image_captions, convert_to_tensor=True)

In [None]:
# Stack text rows first, then image rows, so a FAISS row id below
# len(text_embeddings) refers to a text item and anything above to an image.
# .cpu() is required before .numpy(): the sentence encoder places its output
# on the GPU when one is available, and CUDA tensors cannot be converted
# to NumPy directly.
data_embeddings = torch.cat((text_embeddings, image_embeddings)).detach().cpu().numpy()

# Exact (brute-force) L2 index sized to the embedding width.
index = faiss.IndexFlatL2(data_embeddings.shape[1])
index.add(data_embeddings)

In [None]:
query_text = "A cute kitten"
# Ask the encoder for a NumPy array directly: it is moved to host memory for
# us, so this works whether the model runs on CPU or GPU.
query_embedding = text_model.encode([query_text], convert_to_numpy=True)

# distances and indices both have shape (1, k); row 0 belongs to our only query.
distances, indices = index.search(query_embedding, k=3)

print("Top 3 nearest MultiModal results:", indices)

# Resolve row ids to the items they represent. Rows were added to the index as
# texts followed by images, so the lookup list uses the same order.
dataset_items = dataset_texts + dataset_images
for rank, (item_idx, dist) in enumerate(zip(indices[0], distances[0]), start=1):
    print(f"{rank}. {dataset_items[item_idx]} (squared L2 distance: {dist:.4f})")