In [2]:
# Imports
import torch
import clip
import faiss
import os
from PIL import Image
from langchain.vectorstores import FAISS as LangFAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.schema import Document
from langchain_huggingface import HuggingFacePipeline
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import CharacterTextSplitter
from faster_whisper import WhisperModel

# Device: run models on the GPU when CUDA is available, otherwise fall back to CPU.
# Read as a module-level global by the model-loading and embedding helpers below.
device = "cuda" if torch.cuda.is_available() else "cpu"

# 1. Whisper (audio to text)
# Loaded Whisper models keyed by (model_size, device) so repeated audio
# queries in one session do not reload the weights each time.
_WHISPER_MODELS = {}


def transcribe_audio(file_path, model_size="base"):
    """Transcribe an audio file to plain text with faster-whisper.

    Args:
        file_path: Path of the audio file to transcribe.
        model_size: Whisper model name passed to ``WhisperModel``;
            defaults to ``"base"``.

    Returns:
        The transcription as a single string: the text of every segment,
        stripped of surrounding whitespace and joined with single spaces.
    """
    print("🎤 Transcribing audio with faster-whisper...")
    cache_key = (model_size, device)
    model = _WHISPER_MODELS.get(cache_key)
    if model is None:
        model = WhisperModel(model_size, device=device)
        _WHISPER_MODELS[cache_key] = model
    segments, _ = model.transcribe(file_path)
    # Segment texts may carry their own leading/trailing whitespace; strip
    # them so the join does not produce doubled or leading spaces.
    return " ".join(segment.text.strip() for segment in segments)

# 2. CLIP (image to embedding)
def embed_image(file_path, model, preprocess):
    """Embed an image file with a CLIP model.

    Args:
        file_path: Path of the image to embed.
        model: CLIP model exposing ``encode_image``.
        preprocess: Callable turning a PIL image into the tensor the model
            expects (as returned alongside the model by ``clip.load``).

    Returns:
        A float32 NumPy array holding the image features, with a leading
        batch dimension of 1.

    Raises:
        Whatever ``PIL.Image.open`` raises for a missing or unreadable file.
    """
    print("🖼️ Embedding image using CLIP...")
    # Use a context manager so the file handle is released once the pixels
    # have been turned into a tensor.
    with Image.open(file_path) as img:
        image = preprocess(img).unsqueeze(0).to(device)
    with torch.no_grad():
        image_features = model.encode_image(image)
    # CLIP may run in half precision on GPU; FAISS only accepts float32,
    # so normalise the dtype before leaving torch.
    return image_features.float().cpu().numpy()

# 3. Text embedding
def embed_text_chunks(docs):
    """Index ``docs`` in a LangChain FAISS vector store.

    The documents are embedded with the
    ``sentence-transformers/all-MiniLM-L6-v2`` model.

    Args:
        docs: Documents to embed and index.

    Returns:
        The FAISS-backed vector store built from ``docs``.
    """
    print("📚 Embedding text chunks...")
    return LangFAISS.from_documents(
        docs,
        HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
    )

# 4. Split large text corpus
def split_documents(texts):
    """Wrap raw strings as Documents and split them into chunks.

    Args:
        texts: Iterable of strings making up the corpus.

    Returns:
        The chunked documents produced by a ``CharacterTextSplitter``
        configured with ``chunk_size=500`` and ``chunk_overlap=50``.
    """
    wrapped = []
    for text in texts:
        wrapped.append(Document(page_content=text))
    return CharacterTextSplitter(chunk_size=500, chunk_overlap=50).split_documents(wrapped)

# 5. Load LLM (Light model for Colab/PC)
def load_llm():
    """Load FLAN-T5-Base as a LangChain-compatible text2text LLM.

    Returns:
        A ``HuggingFacePipeline`` wrapping a ``text2text-generation``
        pipeline that generates at most 256 new tokens per call.
    """
    print("🧠 Loading FLAN-T5-Base...")
    model_id = "google/flan-t5-base"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
    pipe = pipeline(
        "text2text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,
        # Place the LLM on the same device the other models in this file
        # use, instead of relying on the pipeline's own default.
        device=device,
    )
    return HuggingFacePipeline(pipeline=pipe)

# 6. Load CLIP
def init_clip():
    """Load the CLIP ``ViT-B/32`` model onto the module-level ``device``.

    Returns:
        The ``(model, preprocess)`` pair produced by ``clip.load``.
    """
    print("⚙️ Loading CLIP model...")
    return clip.load("ViT-B/32", device=device)

# 7. Create QA chain
def create_multimodal_qa(corpus):
    """Build a retrieval-augmented QA chain over ``corpus``.

    The corpus is chunked, embedded into a vector store, and paired with
    the LLM from ``load_llm``; the retriever returns the top 3 chunks.

    Args:
        corpus: Iterable of strings to answer questions from.

    Returns:
        A ``RetrievalQA`` chain that also returns its source documents.
    """
    store = embed_text_chunks(split_documents(corpus))
    language_model = load_llm()
    return RetrievalQA.from_chain_type(
        llm=language_model,
        retriever=store.as_retriever(search_kwargs={"k": 3}),
        return_source_documents=True,
    )

# 8. Chat interface
def _build_caption_index(captions, text_encoder):
    """Return a cosine-similarity FAISS index over the encoded ``captions``."""
    vectors = text_encoder.encode(captions, convert_to_numpy=True)
    vectors = vectors.astype("float32", order="C")
    # Unit-length vectors + inner product == cosine similarity, the usual
    # way to compare CLIP embeddings; raw L2 distance is dominated by
    # differences in embedding norm.
    faiss.normalize_L2(vectors)
    index = faiss.IndexFlatIP(vectors.shape[1])
    index.add(vectors)
    return index


def _nearest_caption(image_features, index, captions):
    """Return the caption whose embedding is most similar to ``image_features``."""
    # astype() copies, so the in-place normalisation below never mutates the
    # caller's array; FAISS needs contiguous float32 input.
    query_vec = image_features.astype("float32", order="C")
    faiss.normalize_L2(query_vec)
    _, ids = index.search(query_vec, 1)
    return captions[ids[0][0]]


def _clean_path(raw):
    """Strip whitespace and wrapping quotes from a user-entered path."""
    return raw.strip().strip("\"'")


def start_multimodal_chat():
    """Run an interactive text/image/audio question-answering loop.

    Builds a small demo corpus, a retrieval QA chain over it, and a CLIP
    caption index used to map an image to its closest corpus sentence.
    Each turn asks for an input mode, turns the input into a text query
    (typed question, nearest caption for an image, or audio transcript),
    and prints the chain's answer. Entering ``exit`` as the mode, or
    sending EOF / Ctrl-C at a prompt, ends the loop.
    """
    clip_model, clip_preprocess = init_clip()

    corpus = [
        "The sun is a star at the center of the Solar System.",
        "Cats are small, carnivorous mammals that are often kept as pets.",
        "The Eiffel Tower is a famous landmark in Paris, France."
    ]

    qa_chain = create_multimodal_qa(corpus)
    clip_text_model = SentenceTransformer("clip-ViT-B-32")
    index = _build_caption_index(corpus, clip_text_model)

    print("\n🧠 Multimedia RAG Ready! Type 'exit' to quit.\n")

    try:
        while True:
            mode = input("Choose input mode (text/image/audio): ").strip().lower()
            if mode == "exit":
                print("👋 Exiting.")
                break
            elif mode == "text":
                query = input("📝 Your question: ")
            elif mode == "image":
                path = _clean_path(input("📁 Enter image path: "))
                try:
                    image_features = embed_image(path, clip_model, clip_preprocess)
                    query = _nearest_caption(image_features, index, corpus)
                    print(f"🧠 Interpreted image as: {query}")
                except Exception as e:
                    print(f"⚠️ Error loading image: {e}")
                    continue
            elif mode == "audio":
                path = _clean_path(input("🎵 Enter audio file path: "))
                try:
                    query = transcribe_audio(path)
                    print(f"🧠 Transcribed audio: {query}")
                except Exception as e:
                    print(f"⚠️ Error transcribing audio: {e}")
                    continue
            else:
                print("❌ Invalid mode. Please choose text, image, or audio.")
                continue

            # Nothing useful can be retrieved for a blank question or a
            # silent audio clip, so skip the LLM call.
            if not query.strip():
                print("⚠️ Empty query, nothing to ask.")
                continue

            try:
                result = qa_chain.invoke({"query": query})
                print(f"\n🤖 Answer: {result['result']}\n")
            except Exception as e:
                print(f"⚠️ Error during QA: {e}")
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or Ctrl-C at a prompt: leave the loop quietly
        # instead of surfacing a traceback.
        print("\n👋 Exiting.")

# Run: start the interactive chat only when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    start_multimodal_chat()


⚙️ Loading CLIP model...
📚 Embedding text chunks...
🧠 Loading FLAN-T5-Base...


Device set to use cpu


modules.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.91k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/604 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.03k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.



🧠 Multimedia RAG Ready! Type 'exit' to quit.



Choose input mode (text/image/audio):  image
📁 Enter image path:  E:\CV_LAB\Inputs\car.png


🖼️ Embedding image using CLIP...
🧠 Interpreted image as: The Eiffel Tower is a famous landmark in Paris, France.

🤖 Answer: Cats are small, carnivorous mammals that are often kept as pets.



Choose input mode (text/image/audio):  E:\CV_LAB\Inputs\dog.jpg


❌ Invalid mode. Please choose text, image, or audio.


Choose input mode (text/image/audio):  exit


👋 Exiting.
