In [47]:
# Install the notebook's third-party dependencies; -q keeps pip's output quiet.
# NOTE(review): no versions are pinned, so a re-run may resolve different releases.
!pip install -q torch faiss-cpu transformers datasets langchain gradio soundfile librosa accelerate pypdf sentence-transformers langchain-community HuggingFace huggingface_hub

In [48]:
# NOTE(review): `transformers` is already in the install list of the previous cell and pip
# treats package names case-insensitively, so this cell looks redundant — confirm before removing.
!pip install Transformers



In [54]:
# Load Hugging Face API token from Kaggle secrets
# The import lives in this cell because the notebook's main import cell comes later;
# without it a fresh kernel (Restart & Run All) fails here with NameError.
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
huggingfacehub_api_token = user_secrets.get_secret("HUGGINGFACEHUB_API_TOKEN")

In [63]:
import os
import json
import faiss
import numpy as np
import torch
from PIL import Image
from huggingface_hub import login
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import (
    CLIPModel, CLIPProcessor,
    WhisperProcessor, WhisperForConditionalGeneration,
    AutoModelForCausalLM, AutoTokenizer
)
import librosa
import gradio as gr
from kaggle_secrets import UserSecretsClient

# Configuration
# Hugging Face Hub repo IDs for every model MultimodalProcessor loads.
MODEL_CONFIG = {
    "clip_model": "openai/clip-vit-base-patch32",
    "whisper_model": "openai/whisper-tiny.en",
    "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",  # 384-dim
    # The key must be "llm_repo": that is the name the processor looks up when it
    # loads the tokenizer and the causal LM.
    "llm_repo": "deepseek-ai/deepseek-coder-1.3b-instruct",  # Verified working model
}

# Hugging Face Login
# Reads the "HUGGINGFACEHUB_API_TOKEN" secret through the Kaggle secrets client and
# passes it to huggingface_hub.login() before any model is loaded below.
# NOTE(review): an earlier cell fetches the same secret into `huggingfacehub_api_token`;
# the two look like duplicates — confirm which name the rest of the notebook relies on.
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HUGGINGFACEHUB_API_TOKEN")
login(token=hf_token)

class MultimodalProcessor:
    """Holds the models used to turn images, audio and PDFs into retrievable data.

    Attributes:
        clip_model, clip_processor: CLIP checkpoint used to embed images.
        whisper_model, whisper_processor: Whisper checkpoint used to transcribe audio.
        text_embedder: sentence-embedding model (used by callers via ``embed_query``).
        llm_tokenizer, llm_model: causal language model and its tokenizer.
        projection: linear layer mapping CLIP image features to the text-embedding width.
    """

    # Output width of the image projection; matches the 384-dim text embedding model.
    TEXT_EMBEDDING_DIM = 384
    # Whisper's feature extractor pads or truncates every input to a 30-second window,
    # so longer recordings have to be fed window by window.
    WHISPER_WINDOW_SECONDS = 30
    # Fixed seed for the projection's initial weights so image vectors are reproducible.
    PROJECTION_SEED = 0

    def __init__(self):
        """Download/load every model named in MODEL_CONFIG and build the projection layer."""
        # Initialize models
        self.clip_model = CLIPModel.from_pretrained(MODEL_CONFIG["clip_model"])
        self.clip_processor = CLIPProcessor.from_pretrained(MODEL_CONFIG["clip_model"])
        self.whisper_model = WhisperForConditionalGeneration.from_pretrained(MODEL_CONFIG["whisper_model"])
        self.whisper_processor = WhisperProcessor.from_pretrained(MODEL_CONFIG["whisper_model"])

        # Text embedding model
        self.text_embedder = HuggingFaceEmbeddings(model_name=MODEL_CONFIG["embedding_model"])

        # LLM model
        self.llm_tokenizer = AutoTokenizer.from_pretrained(MODEL_CONFIG["llm_repo"])
        self.llm_model = AutoModelForCausalLM.from_pretrained(
            MODEL_CONFIG["llm_repo"],
            torch_dtype=torch.bfloat16,
            device_map="auto"
        )

        # Projection layer to match text embedding dimension (384).
        # The input width is read from the loaded CLIP config (512 for the configured
        # checkpoint) so a different CLIP model does not silently mismatch.
        clip_dim = self.clip_model.config.projection_dim
        # Seed the initial weights inside a forked RNG scope: the layer becomes
        # reproducible across kernel restarts while the global torch RNG is untouched.
        # NOTE(review): the projection is never trained in this file, so image vectors
        # are presumably not semantically aligned with the text vectors — confirm intent.
        with torch.random.fork_rng(devices=[]):
            torch.manual_seed(self.PROJECTION_SEED)
            self.projection = torch.nn.Linear(clip_dim, self.TEXT_EMBEDDING_DIM)

    def process_image(self, path):
        """Embed the image at ``path`` and return a flat float32 numpy vector.

        The CLIP image features are passed through ``self.projection`` so the result has
        the same width as the text embeddings.
        """
        # Context manager closes the file handle; convert() forces the pixel data to be
        # read before that happens and normalises palette/alpha images to RGB.
        with Image.open(path) as img:
            image = img.convert("RGB")
        inputs = self.clip_processor(images=image, return_tensors="pt")
        with torch.no_grad():
            features = self.clip_model.get_image_features(**inputs)
            projected = self.projection(features)
        return projected.numpy().flatten().astype('float32')

    def process_audio(self, path):
        """Transcribe the audio file at ``path`` with Whisper and return the text.

        The file is resampled to 16 kHz and transcribed in consecutive 30-second windows,
        because a single call would only cover the first window of a longer recording.
        Clips that fit in one window produce exactly the single-call transcript.
        Returns an empty string when the file contains no samples.
        """
        audio, sr = librosa.load(path, sr=16000)
        window = self.WHISPER_WINDOW_SECONDS * sr

        pieces = []
        for start in range(0, len(audio), window):
            features = self.whisper_processor(
                audio[start:start + window],
                sampling_rate=sr,
                return_tensors="pt"
            ).input_features

            with torch.no_grad():
                predicted_ids = self.whisper_model.generate(features)

            pieces.append(
                self.whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
            )

        if len(pieces) == 1:
            # Single window: return the decoded text untouched.
            return pieces[0]
        # Several windows: a word cut at a window boundary may be transcribed imperfectly.
        return " ".join(piece.strip() for piece in pieces if piece.strip())

    def process_pdf(self, path):
        """Load the PDF at ``path`` and split it into overlapping text chunks.

        Returns the list of documents produced by the splitter (1000-character chunks
        with a 200-character overlap).
        """
        loader = PyPDFLoader(path)
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )
        return text_splitter.split_documents(loader.load())

def create_vector_store(processor, image_paths, audio_paths, pdf_paths, output_dir="/kaggle/working"):
    """Embed every input file, build a FAISS L2 index and persist the artifacts.

    Args:
        processor: object exposing ``process_image``, ``process_audio``, ``process_pdf``
            and ``text_embedder.embed_query`` (e.g. a MultimodalProcessor).
        image_paths: image files; each is embedded directly via ``process_image``.
        audio_paths: audio files; each is transcribed and the transcript is embedded.
        pdf_paths: PDF files; each is split into chunks and every chunk is embedded.
        output_dir: directory receiving ``embeddings.npy``, ``faiss_index.bin`` and
            ``metadata.json``. Defaults to the Kaggle working directory.

    Returns:
        ``(index, metadata)`` where ``metadata[i]`` describes the vector stored at FAISS
        position ``i`` (keys: ``id``, ``type``, ``path``, ``content``).

    Raises:
        ValueError: if the inputs yield no embedding at all, since the index
            dimensionality cannot be determined from an empty corpus.
    """
    embeddings = []
    documents = []
    metadata = []

    # Process images (`or []` lets callers pass None for a modality they do not use)
    for idx, path in enumerate(image_paths or []):
        emb = processor.process_image(path)
        metadata.append({
            "id": f"img_{idx}",
            "type": "image",
            "path": path,
            "content": ""  # Images don't have text content
        })
        documents.append(f"Image {idx}")
        embeddings.append(emb)

    # Process audio
    for idx, path in enumerate(audio_paths or []):
        transcript = processor.process_audio(path)
        emb = processor.text_embedder.embed_query(transcript)
        metadata.append({
            "id": f"aud_{idx}",
            "type": "audio",
            "path": path,
            "content": transcript
        })
        documents.append(transcript)
        embeddings.append(emb)

    # Process PDFs
    for doc_idx, path in enumerate(pdf_paths or []):
        for text_idx, text in enumerate(processor.process_pdf(path)):
            content = text.page_content
            emb = processor.text_embedder.embed_query(content)
            metadata.append({
                "id": f"pdf_{doc_idx}_{text_idx}",
                "type": "pdf",
                "path": path,
                "content": content
            })
            documents.append(content)
            embeddings.append(emb)

    # Fail early with a clear message: an empty list becomes a 1-D array below and
    # `embeddings.shape[1]` would otherwise raise an opaque IndexError.
    if not embeddings:
        raise ValueError(
            "create_vector_store needs at least one image, audio file or PDF chunk to index"
        )

    # Convert to numpy array
    embeddings = np.array(embeddings, dtype='float32')

    # Create FAISS index; vectors are added in the same order as `metadata`
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    # Save artifacts
    os.makedirs(output_dir, exist_ok=True)
    np.save(os.path.join(output_dir, "embeddings.npy"), embeddings)
    faiss.write_index(index, os.path.join(output_dir, "faiss_index.bin"))

    # Save metadata with ID-to-doc mapping
    with open(os.path.join(output_dir, "metadata.json"), "w") as f:
        json.dump({
            "metadata": metadata,
            "documents": documents,
            "id_map": [m["id"] for m in metadata]  # Map FAISS IDs to metadata IDs
        }, f)

    return index, metadata
# Initialize processor
# Instantiating the class loads every model, so this is done once at module level.
processor = MultimodalProcessor()

# Create vector store
# Corpus files grouped by modality; each key is the matching keyword argument of
# create_vector_store, so the dict can be unpacked straight into the call.
corpus_paths = {
    "image_paths": [
        "/kaggle/input/rag-multi/Climate Change.jpg",
        "/kaggle/input/rag-multi/Layers of Skin.png",
    ],
    "audio_paths": [
        "/kaggle/input/rag-multi/ACT Audio.wav",
        "/kaggle/input/rag-multi/Attention is All you need Audio.wav",
    ],
    "pdf_paths": [
        "/kaggle/input/pdf-inputs/0704.0008.pdf",
        "/kaggle/input/pdf-inputs/0704.0009.pdf",
    ],
}
faiss_index, metadata = create_vector_store(processor, **corpus_paths)

def process_input(audio, image, text):
    """Answer a query given as audio, image or text using retrieval plus the LLM.

    Exactly one modality drives retrieval, in priority order audio > image > text.
    Relies on the module-level ``processor``, ``faiss_index`` and ``metadata``.

    Args:
        audio: path to an audio file, or a falsy value when absent.
        image: path to an image file, or a falsy value when absent.
        text: free-text query, may be empty or None.

    Returns:
        dict with ``"response"`` (generated text, without the prompt) and ``"sources"``
        (list of dicts describing the retrieved entries). When no input is supplied at
        all, ``"response"`` is a short instruction and ``"sources"`` is empty.
    """
    has_text = bool(text and text.strip())
    if not audio and not image and not has_text:
        # Nothing to embed: avoid passing None / "" to the embedder and the LLM.
        return {
            "response": "Please provide an audio clip, an image, or a text query.",
            "sources": []
        }

    # Process input
    if audio:
        query = processor.process_audio(audio)
        query_emb = processor.text_embedder.embed_query(query)
    elif image:
        query_emb = processor.process_image(image)  # Already projected to 384-dim
        # An image has no textual query of its own; without this assignment the prompt
        # below raised UnboundLocalError. Use accompanying text when there is some.
        query = text if has_text else "Describe the content related to the uploaded image."
    else:
        query = text
        query_emb = processor.text_embedder.embed_query(query)

    # Search FAISS
    query_emb = np.array([query_emb], dtype='float32')
    _, indices = faiss_index.search(query_emb, k=2)  # Retrieve top 2 matches

    # FAISS pads missing results with -1 when the index holds fewer than k vectors;
    # drop those so they do not index metadata[-1] by accident.
    hits = [metadata[int(i)] for i in indices[0] if i >= 0]

    # Retrieve documents using metadata
    sources = []
    for source in hits:
        if source["type"] == "image":
            # Return the image path for Gradio to display
            sources.append({"type": "image", "path": source["path"], "content": source["content"]})
        else:
            # Return text content for non-image sources
            sources.append({"type": source["type"], "content": source["content"]})

    # Generate response using LLM
    context = [source['content'] for source in hits]
    inputs = processor.llm_tokenizer(
        f"Query: {query}\nContext: {context}",
        return_tensors="pt"
    ).to(processor.llm_model.device)

    with torch.no_grad():
        outputs = processor.llm_model.generate(
            **inputs,
            max_new_tokens=256
        )

    # Prepare output: generate() returns prompt tokens followed by new tokens for a
    # causal LM, so slice off the prompt to avoid echoing query and context back.
    prompt_len = inputs["input_ids"].shape[1]
    response = processor.llm_tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

    return {
        "response": response,
        "sources": sources
    }
    
# Update Gradio interface to handle images
def display_output(audio, image, text):
    """Gradio callback: run ``process_input`` and shape its result for the UI.

    Returns:
        ``(response, source_output)`` where ``response`` is the generated text and
        ``source_output`` is a list of plain strings describing each retrieved source.
        Image sources contribute two entries: an ``"Image: ..."`` label and the file path.
    """
    result = process_input(audio, image, text)
    output = result["response"]
    sources = result["sources"]

    # Prepare source display
    source_output = []
    for source in sources:
        if source["type"] == "image":
            source_output.append(f"Image: {source['content']}")
            # The sources list is rendered by a JSON component, so it must stay
            # JSON-serialisable: emit the path string rather than an opened PIL image
            # (which also left a file handle open).
            source_output.append(source["path"])
        else:
            # Display text
            source_output.append(f"{source['type'].capitalize()}: {source['content']}")

    return output, source_output

# Gradio interface
# Input widgets are listed in the positional order display_output(audio, image, text)
# expects; audio and image are handed over as file paths.
input_components = [
    gr.Audio(type="filepath", label="Audio Input"),
    gr.Image(type="filepath", label="Image Input"),
    gr.Textbox(label="Text Input"),
]

# Output widgets receive display_output's two return values, in order.
output_components = [
    gr.Textbox(label="Response"),
    gr.JSON(label="Sources"),
]

interface = gr.Interface(
    fn=display_output,
    inputs=input_components,
    outputs=output_components,
    allow_flagging="never",
)

# debug=True keeps the cell attached to the server so its logs show up in the output.
interface.launch(debug=True)



* Running on local URL:  http://127.0.0.1:7860
Kaggle notebooks require sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://b8f7af14a52d89a823.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://b8f7af14a52d89a823.gradio.live


