<a href="https://colab.research.google.com/github/keshavkrk/SDC/blob/main/multimodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install Pillow




In [4]:
!pip install faiss-cpu transformers sentence-transformers torchvision Pillow


Collecting faiss-cpu
  Using cached faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_

In [4]:
from sentence_transformers import SentenceTransformer
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import faiss
import torch
import os
import numpy as np


# Load models
text_model = SentenceTransformer("all-MiniLM-L6-v2")
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

In [2]:
def get_text_embedding(text):
    return text_model.encode([text])[0]

def get_image_embedding(image_path):
    image = Image.open(image_path)
    inputs = clip_processor(text=[""], images=image, return_tensors="pt", padding=True)
    with torch.no_grad():
        outputs = clip_model(**inputs)
    return outputs.image_embeds[0].numpy()


In [6]:
docs = [
    "Diabetes is a chronic condition that affects blood sugar levels.",
    "Common symptoms of loose motion include diarrhea and dehydration.",
    "MRI scans provide detailed images of the brain and spinal cord.",
    "Paracetamol is often used to treat fever and mild pain."
]

text_embeddings = [get_text_embedding(doc) for doc in docs]
dimension = len(text_embeddings[0])
index = faiss.IndexFlatL2(dimension)
index.add(np.array(text_embeddings).astype("float32"))


In [7]:
def retrieve(query_text=None, image_path=None, top_k=2):
    if query_text and image_path:
        text_vec = get_text_embedding(query_text)
        image_vec = get_image_embedding(image_path)
        final_vec = (text_vec + image_vec) / 2  # simple fusion
    elif query_text:
        final_vec = get_text_embedding(query_text)
    elif image_path:
        final_vec = get_image_embedding(image_path)
    else:
        raise ValueError("Provide text or image")

    scores, indices = index.search(np.array([final_vec]), top_k)
    return [docs[i] for i in indices[0]]


In [8]:
def generate_response(query, retrieved_docs):
    context = "\n".join(retrieved_docs)
    return f"User asked: {query}\n\nBased on documents:\n{context}\n\n→ Advice: Stay hydrated and consult a doctor if symptoms persist."


In [9]:
query = "What causes loose motion?"
relevant = retrieve(query_text=query)
answer = generate_response(query, relevant)
print(answer)


User asked: What causes loose motion?

Based on documents:
Common symptoms of loose motion include diarrhea and dehydration.
Diabetes is a chronic condition that affects blood sugar levels.

→ Advice: Stay hydrated and consult a doctor if symptoms persist.


In [10]:
# Step 1: Install required libraries
!pip install faiss-cpu sentence-transformers torchvision transformers




In [11]:
# Step 2: Import libraries
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from torchvision import transforms
from PIL import Image
import torch
from transformers import CLIPProcessor, CLIPModel
import os


In [12]:
# Step 3: Load embedding models
text_model = SentenceTransformer("all-MiniLM-L6-v2")

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


In [13]:
# Step 4: Define functions

# Text embedding
def get_text_embedding(text):
    return text_model.encode(text, convert_to_tensor=False).astype("float32")

# Image embedding
def get_image_embedding(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = clip_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = clip_model.get_image_features(**inputs)
    return outputs[0].cpu().numpy().astype("float32")

# Simple answer generation
def generate_answer(query, retrieved_docs):
    print(f"Query: {query}")
    print("\nRetrieved relevant context:")
    for i, doc in enumerate(retrieved_docs, 1):
        print(f"{i}. {doc}")
    return "This is a mock answer based on retrieved context."


In [14]:
# Step 5: Prepare your data (text and image paths)

docs = [
    "Diabetes is a chronic condition that affects blood sugar levels.",
    "Loose motion is often caused by infections and leads to dehydration.",
    "MRI scans provide detailed brain and spinal cord images.",
    "Paracetamol is used to treat fever and mild pain."
]

images = [
    "https://upload.wikimedia.org/wikipedia/commons/6/69/Diabetes_blue_circle_symbol.svg",
    "https://upload.wikimedia.org/wikipedia/commons/2/2c/Toilet_bowl.svg"
]

# Download images
os.makedirs("images", exist_ok=True)
import requests

image_paths = []
for i, url in enumerate(images):
    img_path = f"images/img{i}.png"
    with open(img_path, "wb") as f:
        f.write(requests.get(url).content)
    image_paths.append(img_path)


In [16]:
# Download images correctly
import requests
from PIL import Image

os.makedirs("images", exist_ok=True)
image_paths = []

images = [
    "https://upload.wikimedia.org/wikipedia/commons/6/69/Diabetes_blue_circle_symbol.svg",
    "https://upload.wikimedia.org/wikipedia/commons/2/2c/Toilet_bowl.svg"
]

for i, url in enumerate(images):
    img_path = f"images/img{i}.png"
    try:
        # Fetch image
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful
        with open(img_path, "wb") as f:
            f.write(response.content)
        # Validate image format
        img = Image.open(img_path)
        img.verify()  # Verifies that the file is indeed an image
        image_paths.append(img_path)
    except Exception as e:
        print(f"Failed to download or validate image at {url}: {e}")

print(f"Downloaded image paths: {image_paths}")


Failed to download or validate image at https://upload.wikimedia.org/wikipedia/commons/6/69/Diabetes_blue_circle_symbol.svg: 403 Client Error: Forbidden. Please comply with the User-Agent policy: https://meta.wikimedia.org/wiki/User-Agent_policy for url: https://upload.wikimedia.org/wikipedia/commons/6/69/Diabetes_blue_circle_symbol.svg
Failed to download or validate image at https://upload.wikimedia.org/wikipedia/commons/2/2c/Toilet_bowl.svg: 403 Client Error: Forbidden. Please comply with the User-Agent policy: https://meta.wikimedia.org/wiki/User-Agent_policy for url: https://upload.wikimedia.org/wikipedia/commons/2/2c/Toilet_bowl.svg
Downloaded image paths: []


In [19]:
from transformers import pipeline
qa = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

context = " ".join(retrieved)
qa(question=query, context=context)


config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Device set to use cpu


{'score': 0.8497876524925232, 'start': 32, 'end': 42, 'answer': 'infections'}

In [17]:
# Step 6: Create embeddings and build FAISS index

text_embeddings = [get_text_embedding(doc) for doc in docs]
image_embeddings = [get_image_embedding(path) for path in image_paths]

all_embeddings = np.array(text_embeddings + image_embeddings)
faiss_index = faiss.IndexFlatL2(all_embeddings.shape[1])
faiss_index.add(all_embeddings)

# Keep track of metadata
doc_metadata = docs + image_paths  # First few are texts, then image paths


In [20]:
# Step 7: Query and retrieve similar documents/images

query = "What causes diarrhea?"
query_vector = get_text_embedding(query)
k = 3  # top results
distances, indices = faiss_index.search(np.array([query_vector]), k)

retrieved = [doc_metadata[i] for i in indices[0]]

# Display results
response = generate_answer(query, retrieved)
print("\nGenerated Answer:", response)


Query: What causes diarrhea?

Retrieved relevant context:
1. Loose motion is often caused by infections and leads to dehydration.
2. Diabetes is a chronic condition that affects blood sugar levels.
3. Paracetamol is used to treat fever and mild pain.

Generated Answer: This is a mock answer based on retrieved context.
