In [1]:
!pip install langchain chromadb sentence-transformers openai-whisper pydub transformers torch pdfminer

Collecting chromadb
  Downloading chromadb-1.0.7-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pdfminer
  Downloading pdfminer-20191125.tar.gz (4.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m71.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_6

In [2]:
import os
import pandas as pd

# List all PDF files in the dataset
pdf_files = [f for f in os.listdir('../input/dataset-of-pdf-files/Pdf') if f.endswith('.pdf')]
print(f"Total PDF files: {len(pdf_files)}")
print(f"Sample files: {pdf_files[:5]}")

Total PDF files: 1076
Sample files: ['IEAJEYOK5ACMQUZGX7QDHS7ZR6XXSVYV.pdf', '6HPMFPOTKN7J772QGZBHKGKYSNEYTF3I.pdf', 'WIXGOEH55ET7IKPZ7WA63JSQT6HB4PJR.pdf', 'R2IMEGYDIXZXCNVIRC3SN2DVVGBVH5ZD.pdf', 'GXN6NIJAPDVKETP2WCAL523Z6OESGKDD.pdf']


In [3]:
!pip install pdfminer.six

Collecting pdfminer.six
  Downloading pdfminer_six-20250416-py3-none-any.whl.metadata (4.1 kB)
Downloading pdfminer_six-20250416-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m47.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pdfminer.six
Successfully installed pdfminer.six-20250416


In [4]:
from pdfminer.high_level import extract_text

def extract_text_from_pdf(pdf_path):
    try:
        text = extract_text(pdf_path)
        return text
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return ""

# Test with one PDF
sample_pdf = os.path.join('../input/dataset-of-pdf-files/Pdf', pdf_files[0])
text = extract_text_from_pdf(sample_pdf)
print(f"Extracted {len(text)} characters from {pdf_files[0]}")

Extracted 725 characters from IEAJEYOK5ACMQUZGX7QDHS7ZR6XXSVYV.pdf


In [5]:
def chunk_text(text, chunk_size=1000, overlap=200):
    chunks = []
    start = 0
    
    # Handle empty text case
    if not text or len(text.strip()) == 0:
        return chunks
        
    while start < len(text):
        end = min(start + chunk_size, len(text))
        if end < len(text):
            # Try to find a sentence or paragraph break
            for i in range(end, max(start, end - overlap), -1):
                if i < len(text) and text[i] in ['.', '!', '?', '\n']:
                    end = i + 1
                    break
        
        chunk = text[start:end].strip()
        if chunk:  # Only add non-empty chunks
            chunks.append(chunk)
        
        start = end - overlap if end < len(text) else len(text)
    
    return chunks

# Test chunking on our sample text
if text:
    chunks = chunk_text(text)
    print(f"Created {len(chunks)} chunks")
    print(f"First chunk: {chunks[0][:200]}...")

Created 1 chunks
First chunk: VOTER REGISTRATION TRANSFER 

Mail Request to: 

Clinton County Clerk’s Office 
PO Box 308 
Carlyle, IL 62231 

*** You must currently be registered in Clinton County *** 

Old Address________________...


In [6]:
from sentence_transformers import SentenceTransformer
import chromadb

# Initialize the embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

def get_embedding(text):
    return embedding_model.encode(text)

# Initialize ChromaDB
chroma_client = chromadb.Client()
collection = chroma_client.create_collection("pdf_chunks")

def add_document_to_db(doc_id, chunks):
    if not chunks:
        print(f"No chunks to add for {doc_id}")
        return
        
    # Generate embeddings for each chunk
    embeddings = [get_embedding(chunk) for chunk in chunks]
    
    # Add to ChromaDB
    metadata = [{"source": doc_id, "chunk_id": i} for i in range(len(chunks))]
    collection.add(
        embeddings=embeddings,
        documents=chunks,
        metadatas=metadata,
        ids=[f"{doc_id}_chunk_{i}" for i in range(len(chunks))]
    )
    print(f"Added {len(chunks)} chunks from {doc_id} to the database")

2025-04-26 10:34:22.021169: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745663662.217236      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745663662.272940      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Start with a smaller subset for testing (first 3 files)
test_pdfs = pdf_files[:3]
processed_count = 0

for i, pdf_file in enumerate(test_pdfs):
    pdf_path = os.path.join('../input/dataset-of-pdf-files/Pdf', pdf_file)
    print(f"Processing {i+1}/{len(test_pdfs)}: {pdf_file}")
    
    # Extract text
    text = extract_text_from_pdf(pdf_path)
    if not text:
        print(f"  Skipping {pdf_file}: No text extracted")
        continue
        
    # Chunk text
    chunks = chunk_text(text)
    print(f"  Created {len(chunks)} chunks from {pdf_file}")
    
    # Add to vector DB
    add_document_to_db(pdf_file, chunks)
    processed_count += 1

print(f"Finished processing {processed_count} PDFs")

Processing 1/3: IEAJEYOK5ACMQUZGX7QDHS7ZR6XXSVYV.pdf
  Created 1 chunks from IEAJEYOK5ACMQUZGX7QDHS7ZR6XXSVYV.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Added 1 chunks from IEAJEYOK5ACMQUZGX7QDHS7ZR6XXSVYV.pdf to the database
Processing 2/3: 6HPMFPOTKN7J772QGZBHKGKYSNEYTF3I.pdf
  Created 0 chunks from 6HPMFPOTKN7J772QGZBHKGKYSNEYTF3I.pdf
No chunks to add for 6HPMFPOTKN7J772QGZBHKGKYSNEYTF3I.pdf
Processing 3/3: WIXGOEH55ET7IKPZ7WA63JSQT6HB4PJR.pdf
  Created 12 chunks from WIXGOEH55ET7IKPZ7WA63JSQT6HB4PJR.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Added 12 chunks from WIXGOEH55ET7IKPZ7WA63JSQT6HB4PJR.pdf to the database
Finished processing 3 PDFs


In [8]:
def query_vector_db(query, k=5):
    # Generate embedding for the query
    query_embedding = get_embedding(query)
    
    # Search for similar chunks
    results = collection.query(
        query_embeddings=[query_embedding.tolist()],
        n_results=k
    )
    
    # Extract the retrieved chunks and their metadata
    if not results['documents'][0]:
        return {
            "retrieved_chunks": [],
            "sources": [],
            "context": "No relevant information found."
        }
    
    chunks = results['documents'][0]
    metadatas = results['metadatas'][0]
    
    # Format context for the LLM
    context = "\n\n".join([f"From {meta['source']}:\n{chunk}" 
                         for chunk, meta in zip(chunks, metadatas)])
    
    return {
        "retrieved_chunks": chunks,
        "sources": [meta['source'] for meta in metadatas],
        "context": context
    }

In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load a smaller model suitable for Kaggle's constraints
# You might need to adjust based on Kaggle's available resources
try:
    model_name = "meta-llama/Llama-2-7b-chat-hf"  # Try with the full model first
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, 
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        device_map="auto" if device == "cuda" else None,
        load_in_8bit=True if device == "cuda" else False  # Quantize to save memory
    )
except Exception as e:
    print(f"Error loading Llama-2-7b: {e}")
    print("Falling back to smaller model...")
    # Fall back to a smaller, more accessible model
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        device_map="auto" if device == "cuda" else None
    )

print(f"Loaded model: {model_name}")

Using device: cuda
Error loading Llama-2-7b: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Llama-2-7b-chat-hf.
401 Client Error. (Request ID: Root=1-680cb6e0-064c3c4020bc47997ffeafc3;a74f8a97-335f-4d1d-819c-d18b703f36ec)

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/resolve/main/config.json.
Access to model meta-llama/Llama-2-7b-chat-hf is restricted. You must have access to it and be authenticated to access it. Please log in.
Falling back to smaller model...


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Loaded model: TinyLlama/TinyLlama-1.1B-Chat-v1.0


In [10]:
def generate_answer(query, context, max_length=512):
    # Format the prompt based on model's expected format
    if "llama" in model_name.lower():
        # Llama 2 chat format
        prompt = f"""
        [INST] You are a helpful assistant that answers questions based only on the provided context.
        
        Context:
        {context}
        
        Question: {query}
        
        Answer the question based only on the provided context. If the context doesn't contain relevant information, say "I don't have enough information to answer this question." [/INST]
        """
    else:
        # Generic instruction format
        prompt = f"""
        You are a helpful assistant that answers questions based only on the provided context.
        
        Context:
        {context}
        
        Question: {query}
        
        Answer the question based only on the provided context. If the context doesn't contain relevant information, say "I don't have enough information to answer this question."
        """
    
    # Convert to device
    inputs = tokenizer(prompt, return_tensors="pt")
    if device == "cuda":
        inputs = inputs.to(device)
    
    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            **inputs, 
            max_new_tokens=max_length,
            temperature=0.7,
            do_sample=True
        )
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Extract only the assistant's response
    if "[/INST]" in response:
        response = response.split("[/INST]")[1].strip()
    
    return response

In [11]:
def generate_summary(chunks, max_length=512):
    # Combine chunks (limit to avoid token limits)
    if not chunks:
        return "No text available to summarize."
        
    # Take the first few chunks to stay within token limits
    combined_text = "\n\n".join(chunks[:5])
    
    # Format prompt based on model
    if "llama" in model_name.lower():
        prompt = f"""
        [INST] Create a concise abstractive summary of the following text:
        
        {combined_text}
        
        Provide a well-structured summary that captures the main points and key information. [/INST]
        """
    else:
        prompt = f"""
        Create a concise abstractive summary of the following text:
        
        {combined_text}
        
        Provide a well-structured summary that captures the main points and key information.
        """
    
    # Convert to device
    inputs = tokenizer(prompt, return_tensors="pt")
    if device == "cuda":
        inputs = inputs.to(device)
    
    # Generate summary
    with torch.no_grad():
        outputs = model.generate(
            **inputs, 
            max_new_tokens=max_length,
            temperature=0.7,
            do_sample=True
        )
    
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Extract only the assistant's response
    if "[/INST]" in summary:
        summary = summary.split("[/INST]")[1].strip()
    
    return summary

In [16]:
def process_query(query):
    print(f"Processing query: {query}")
    
    # Get relevant document chunks
    print("Retrieving relevant chunks...")
    results = query_vector_db(query)
    
    # Check if we have retrieved chunks
    if not results["retrieved_chunks"]:
        return {
            "query": query,
            "answer": "I couldn't find any relevant information in the documents.",
            "summary": "No content available to summarize.",
            "sources": [],
            "chunks": []
        }
    
    # Generate answer
    print("Generating answer...")
    answer = generate_answer(query, results["context"])
    
    # Generate summary of retrieved chunks
    print("Generating summary...")
    summary = generate_summary(results["retrieved_chunks"])
    
    return {
        "query": query,
        "answer": answer,
        "summary": summary,
        "sources": results["sources"],
        "chunks": results["retrieved_chunks"]
    }

# Test with a sample query
sample_query = "What is the main topic of this document?"
print("\nTesting with sample query...")
results = process_query(sample_query)

print("\n===== RESULTS =====")
print(f"Query: {results['query']}")
print(f"\nAnswer: {results['answer']}")
print(f"\nSummary: {results['summary']}")
print(f"\nSources: {results['sources']}")


Testing with sample query...
Processing query: What is the main topic of this document?
Retrieving relevant chunks...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2669 > 2048). Running this sequence through the model will result in indexing errors


Generating answer...


This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


Generating summary...

===== RESULTS =====
Query: What is the main topic of this document?

Answer: ANDFTOD ANDORMEUMN ORMEWNOTOROLDESAMED STSTOMPORDEDERIVE ANDORONMENTSMENTORMPOT AND MORS.
SE IS 4.1TMEMINAM1 ANDAMP ANDAMPOROR ANDMINOMODORMPOR 6ORURECMENT Subs 

FAMETL.3 SMYMPP
Student. 
BANG MA
 
  FA CS WE Valid Directs Pros Coms Pro
1

   Supp Inst Let 9   FAMP 68 AND CS
18007
SE1
 Re11 Cert.  On Subs
s ReMENT  PROS
Personted‬E. Pro
DOING FA MEOMING

 


  A: Install. Let ECSMENTT Inst: FOR RE: E DES CA: Personal FASEMP Not Best Insts “E NOT ONOM RED------------  In    Inst: Ex: EST. 

  Res: Inst
 E RE P Inst - FA Sent   Vo Best and Pres         P RE
  
 

 Personal 

Person:  Call “ A “E  P ” Context ’ Inst:
 Inst:  

 N A
 A CA SE L 
 Inst
  Personal  Inst:

  
 ``` 


 S  [FA: AL CFA Let................ PRO T Not SE RE SE A 
 FO B SE F
A  Insts:  E “ F

  FA  F 

 To Inst Cert F  

 Insts   All F “ F Inst  Re:  Insts Re for   F W  
 Not:  Not F  Cert. Inst
                    A 

In [12]:
from ipywidgets import widgets
from IPython.display import display, clear_output

# Create widgets
query_input = widgets.Text(
    value='',
    placeholder='Enter your question here',
    description='Query:',
    disabled=False,
    layout={'width': '80%'}
)

submit_button = widgets.Button(
    description='Submit',
    disabled=False,
    button_style='primary',
    tooltip='Submit query',
    icon='search'
)

output_area = widgets.Output()

def on_submit_clicked(b):
    with output_area:
        clear_output()
        print("Processing query, please wait...")
        results = process_query(query_input.value)
        clear_output()
        
        print(f"Query: {results['query']}\n")
        print(f"Answer: {results['answer']}\n")
        print(f"Summary: {results['summary']}\n")
        print(f"Sources: {results['sources'] if results['sources'] else 'No sources found'}")

submit_button.on_click(on_submit_clicked)

# Display the UI
print("Enter your question about the PDFs:")
display(query_input, submit_button, output_area)

Enter your question about the PDFs:


Text(value='', description='Query:', layout=Layout(width='80%'), placeholder='Enter your question here')

Button(button_style='primary', description='Submit', icon='search', style=ButtonStyle(), tooltip='Submit query…

Output()

In [13]:
!pip install pdfminer.six chromadb sentence-transformers transformers nltk ipywidgets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [14]:
# Voice-Interactive RAG System for PDF Documents
# Kaggle Compatible Implementation

# Import necessary libraries
import os
import re
import time
import torch
import numpy as np
import tempfile
from io import BytesIO
import base64
import json
import threading

# PDF processing
from pdfminer.high_level import extract_text
from pdfminer.layout import LAParams
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt', quiet=True)

# Embedding and vector storage
import chromadb
from sentence_transformers import SentenceTransformer

# LLM
from transformers import AutoModelForCausalLM, AutoTokenizer

# For evaluation
from sklearn.metrics import precision_recall_fscore_support
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize

# Note: For Kaggle, we'll simulate ASR and TTS functionality
# In a real implementation, we would use:
# - whisper for ASR
# - webrtcvad for voice activity detection
# - gTTS or another TTS library for speech synthesis
# - pyaudio and sounddevice for audio processing

# ===== PDF Processing Functions =====

def extract_text_from_pdf(pdf_file):
    """Extract text from a PDF file using pdfminer."""
    try:
        # Create a temporary file if the input is a BytesIO object
        if isinstance(pdf_file, BytesIO):
            with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as temp_file:
                temp_file.write(pdf_file.getvalue())
                temp_path = temp_file.name
            text = extract_text(temp_path, laparams=LAParams())
            os.unlink(temp_path)  # Delete the temporary file
        else:
            text = extract_text(pdf_file, laparams=LAParams())
        return text
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
        return ""

def clean_text(text):
    """Clean extracted text by removing excess whitespace and non-ASCII characters."""
    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text)
    # Remove control characters but keep normal punctuation
    text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
    return text.strip()

def chunk_text(text, chunk_size=1000, chunk_overlap=200):
    """Split text into overlapping chunks."""
    if not text or len(text.strip()) == 0:
        return []
    
    # Clean the text first
    text = clean_text(text)
    
    # Split into sentences
    sentences = sent_tokenize(text)
    
    chunks = []
    current_chunk = []
    current_length = 0
    
    for sentence in sentences:
        sentence_length = len(sentence)
        
        # If adding this sentence would exceed the chunk size,
        # save the current chunk and start a new one
        if current_length + sentence_length > chunk_size and current_chunk:
            chunks.append(' '.join(current_chunk))
            
            # Keep some overlap by retaining the last few sentences
            overlap_sentences = []
            overlap_length = 0
            for s in reversed(current_chunk):
                if overlap_length + len(s) <= chunk_overlap:
                    overlap_sentences.insert(0, s)
                    overlap_length += len(s) + 1  # +1 for the space
                else:
                    break
            
            current_chunk = overlap_sentences
            current_length = overlap_length
        
        current_chunk.append(sentence)
        current_length += sentence_length + 1  # +1 for the space
    
    # Add the last chunk if it's not empty
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    
    return chunks

# ===== Vector Database Functions =====

class VectorStore:
    def __init__(self, collection_name="pdf_chunks"):
        """Initialize the vector store with ChromaDB."""
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.chroma_client = chromadb.Client()
        
        # Try to get the collection if it exists, otherwise create it
        try:
            self.collection = self.chroma_client.get_collection(collection_name)
        except:
            self.collection = self.chroma_client.create_collection(collection_name)
    
    def add_document(self, doc_id, chunks):
        """Add document chunks to the vector store."""
        if not chunks:
            return 0
        
        # Generate embeddings for chunks
        embeddings = [self.embedding_model.encode(chunk).tolist() for chunk in chunks]
        
        # Create metadata for each chunk
        metadatas = [{"source": doc_id, "chunk_id": i} for i in range(len(chunks))]
        
        # Add to ChromaDB
        self.collection.add(
            embeddings=embeddings,
            documents=chunks,
            metadatas=metadatas,
            ids=[f"{doc_id}_chunk_{i}" for i in range(len(chunks))]
        )
        
        return len(chunks)
    
    def query(self, query_text, k=5):
        """Query the vector store for relevant chunks."""
        query_embedding = self.embedding_model.encode(query_text).tolist()
        
        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=k
        )
        
        if not results['documents'][0]:
            return {
                "chunks": [],
                "metadatas": [],
                "sources": [],
                "distances": []
            }
        
        return {
            "chunks": results['documents'][0],
            "metadatas": results['metadatas'][0],
            "sources": [meta['source'] for meta in results['metadatas'][0]],
            "distances": results['distances'][0] if 'distances' in results else []
        }
    
    def clear(self):
        """Clear the collection."""
        self.chroma_client.delete_collection(self.collection.name)
        self.collection = self.chroma_client.create_collection(self.collection.name)

# ===== Simulated ASR (Speech-to-Text) Functions =====

class SimulatedAudioProcessor:
    """Simulates voice input for Kaggle environment."""
    
    def __init__(self):
        """Initialize with mock responses."""
        self.mock_responses = [
            "What is the main topic of this document?",
            "Can you summarize the key points?",
            "Who is the author of this document?",
            "What are the conclusions in this paper?",
            "When was this document published?",
            "What methodology was used in this research?",
            "What is the purpose of this document?",
            "Are there any references to external sources?",
            "What are the limitations mentioned in this study?",
            "Could you explain the technical terms in this document?"
        ]
    
    def simulate_voice_query(self):
        """Simulate a voice query by returning a mock response."""
        import random
        return random.choice(self.mock_responses)

# ===== Simulated TTS (Text-to-Speech) Functions =====

def simulate_text_to_speech(text):
    """Simulate text-to-speech conversion."""
    print("🔊 [TTS would play here]: ", text[:100], "..." if len(text) > 100 else "")
    return True

# ===== LLM Functions =====

class LLMProcessor:
    def __init__(self):
        """Initialize the language model for text generation."""
        # Check if GPU is available
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")
        
        # Use TinyLlama as a lightweight alternative
        model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
        
        # Load model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        try:
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                device_map="auto" if self.device == "cuda" else None,
                load_in_8bit=True if self.device == "cuda" else False
            )
        except:
            # Fall back to CPU without quantization
            self.model = AutoModelForCausalLM.from_pretrained(model_name)
            if self.device == "cuda":
                self.model = self.model.to("cuda")
    
    def generate_answer(self, query, context, max_length=300):
        """Generate an answer based on the query and context."""
        # Create a prompt that will elicit a helpful, grounded response
        prompt = f"""
        Answer the following question using only the information provided in the context. 
        If the answer is not contained in the context, say "I don't have enough information to answer this question."
        
        Context:
        {context}
        
        Question: {query}
        
        Answer:
        """
        
        try:
            # Tokenize the prompt
            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
            if self.device == "cuda":
                inputs = inputs.to(self.device)
            
            # Generate the response
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=max_length,
                    temperature=0.7,
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id
                )
            
            # Decode the response
            full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            
            # Extract just the answer part - make sure we don't include the prompt
            if "Answer:" in full_response:
                answer = full_response.split("Answer:")[-1].strip()
            else:
                # If the model doesn't include "Answer:" in output, take everything after the query
                try:
                    answer = full_response.split(query)[-1].strip()
                except:
                    answer = full_response  # Fallback
            
            # Remove any remaining prompt text that might have been duplicated in the output
            answer = answer.replace("Answer the following question using only the information provided in the context.", "")
            answer = answer.replace("If the answer is not contained in the context, say", "")
            answer = answer.replace("I don't have enough information to answer this question.", "I don't have enough information to answer this question.")
            
            return answer.strip()
        except Exception as e:
            return f"Error generating answer: {e}"
    
    def generate_summary(self, chunks, max_length=500):
        """Generate a summary of the provided chunks."""
        # Combine chunks (limit to avoid token limits)
        if not chunks:
            return "No text available to summarize."
        
        # Join first few chunks to stay within token limits
        combined_text = "\n\n".join(chunks[:3])  # Limit even further for TinyLlama
        
        # Create a summarization prompt
        prompt = f"""
        Create a concise, informative summary of the following text:
        
        {combined_text}
        
        Summary:
        """
        
        try:
            # Tokenize the prompt
            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
            if self.device == "cuda":
                inputs = inputs.to(self.device)
            
            # Generate the summary
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=max_length,
                    temperature=0.7,
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id
                )
            
            # Decode the summary
            full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            
            # Extract just the summary part
            summary = full_response.split("Summary:")[-1].strip()
            return summary
        except Exception as e:
            return f"Error generating summary: {e}"

# ===== Evaluation Functions =====

class Evaluator:
    def __init__(self):
        """Initialize the evaluation system."""
        pass
    
    def calculate_recall_precision(self, retrieved_chunks, relevant_chunks):
        """Calculate recall and precision for retrieval evaluation."""
        if not relevant_chunks:
            return 0, 0
        
        # Count relevant retrieved chunks
        relevant_retrieved = sum(1 for chunk in retrieved_chunks if chunk in relevant_chunks)
        
        # Calculate metrics
        recall = relevant_retrieved / len(relevant_chunks) if relevant_chunks else 0
        precision = relevant_retrieved / len(retrieved_chunks) if retrieved_chunks else 0
        
        return recall, precision
    
    def calculate_f1(self, prediction, reference):
        """Calculate F1 score for answer evaluation."""
        # Tokenize
        pred_tokens = word_tokenize(prediction.lower())
        ref_tokens = word_tokenize(reference.lower())
        
        # Calculate precision and recall
        common_tokens = set(pred_tokens).intersection(set(ref_tokens))
        if not common_tokens:
            return 0
        
        precision = len(common_tokens) / len(pred_tokens)
        recall = len(common_tokens) / len(ref_tokens)
        
        # Calculate F1
        if precision + recall == 0:
            return 0
        f1 = 2 * (precision * recall) / (precision + recall)
        
        return f1
    
    def calculate_exact_match(self, prediction, reference):
        """Calculate exact match score."""
        return 1.0 if prediction.strip().lower() == reference.strip().lower() else 0.0
    
    def evaluate_benchmark(self, rag_system, benchmark_data):
        """Evaluate the RAG system on benchmark QA pairs."""
        results = {
            "recall": [],
            "precision": [],
            "f1": [],
            "exact_match": []
        }
        
        for item in benchmark_data:
            query = item["question"]
            reference_answer = item["answer"]
            relevant_chunks = item["relevant_chunks"]
            
            # Query the system
            retrieved_results = rag_system.query(query)
            answer = rag_system.answer_query(query)
            
            # Calculate retrieval metrics
            recall, precision = self.calculate_recall_precision(
                retrieved_results["chunks"], 
                relevant_chunks
            )
            
            # Calculate answer metrics
            f1 = self.calculate_f1(answer, reference_answer)
            exact_match = self.calculate_exact_match(answer, reference_answer)
            
            # Add to results
            results["recall"].append(recall)
            results["precision"].append(precision)
            results["f1"].append(f1)
            results["exact_match"].append(exact_match)
        
        # Calculate averages
        avg_results = {
            "avg_recall": sum(results["recall"]) / len(results["recall"]),
            "avg_precision": sum(results["precision"]) / len(results["precision"]),
            "avg_f1": sum(results["f1"]) / len(results["f1"]),
            "avg_exact_match": sum(results["exact_match"]) / len(results["exact_match"])
        }
        
        return avg_results

# ===== Main RAG System =====

class RAGSystem:
    def __init__(self):
        """Initialize the complete RAG system."""
        self.vector_store = VectorStore()
        self.llm = LLMProcessor()
        self.audio_processor = SimulatedAudioProcessor()
        self.evaluator = Evaluator()
        self.current_pdf_text = ""
        self.current_pdf_chunks = []
        self.current_pdf_name = ""
    
    def process_pdf(self, pdf_file, pdf_name="uploaded_pdf"):
        """Process a PDF file and add it to the vector store."""
        # Extract text from PDF
        print(f"Processing PDF: {pdf_name}")
        self.current_pdf_text = extract_text_from_pdf(pdf_file)
        
        if not self.current_pdf_text:
            return 0, "Failed to extract text from PDF"
        
        # Chunk the text
        self.current_pdf_chunks = chunk_text(self.current_pdf_text)
        
        if not self.current_pdf_chunks:
            return 0, "Failed to create chunks from PDF text"
        
        # Store the PDF name
        self.current_pdf_name = pdf_name
        
        # Add to vector store
        num_chunks = self.vector_store.add_document(pdf_name, self.current_pdf_chunks)
        
        return num_chunks, f"Successfully processed {pdf_name} into {num_chunks} chunks"
    
    def simulate_voice_query(self):
        """Simulate a voice query for Kaggle environment."""
        return self.audio_processor.simulate_voice_query()
    
    def query(self, query_text, k=5):
        """Query the vector store for relevant chunks."""
        return self.vector_store.query(query_text, k)
    
    def answer_query(self, query_text, k=5):
        """Answer a query using the RAG pipeline."""
        # Get relevant chunks
        results = self.query(query_text, k)
        
        if not results["chunks"]:
            return "I couldn't find any relevant information to answer your question."
        
        # Format context for the LLM - limit context length to prevent token overflow
        context_items = []
        total_length = 0
        max_context_length = 2000  # Set a reasonable length limit
        
        for chunk, source in zip(results["chunks"], results["sources"]):
            # Truncate very long chunks
            if len(chunk) > 500:
                chunk = chunk[:500] + "..."
            
            # Add the chunk if we haven't exceeded max length
            formatted_chunk = f"From {source}:\n{chunk}"
            if total_length + len(formatted_chunk) <= max_context_length:
                context_items.append(formatted_chunk)
                total_length += len(formatted_chunk)
            else:
                break
        
        context = "\n\n".join(context_items)
        
        # Generate answer
        answer = self.llm.generate_answer(query_text, context)
        
        return answer
    
    def generate_summary(self, use_retrieved=True, query_text="", k=5):
        """Generate a summary of either the full PDF or retrieved chunks."""
        if use_retrieved and query_text:
            # Summarize based on retrieved chunks
            results = self.query(query_text, k)
            if not results["chunks"]:
                return "No relevant information found for summarization."
            
            summary = self.llm.generate_summary(results["chunks"])
        else:
            # Summarize the full PDF
            if not self.current_pdf_chunks:
                return "No PDF has been processed for summarization."
            
            # Use a subset of chunks if the PDF is large
            chunks_to_summarize = self.current_pdf_chunks[:5]  # Limit to first 5 chunks
            summary = self.llm.generate_summary(chunks_to_summarize)
        
        return summary
    
    def simulate_speak_answer(self, text):
        """Simulate text-to-speech for Kaggle environment."""
        return simulate_text_to_speech(text)
    
    def clear(self):
        """Clear the current state."""
        self.vector_store.clear()
        self.current_pdf_text = ""
        self.current_pdf_chunks = []
        self.current_pdf_name = ""

# ===== Kaggle Notebook Interface =====

def create_kaggle_interface():
    """Create a simple text-based interface for Kaggle notebooks."""
    from IPython.display import display, HTML, clear_output
    import ipywidgets as widgets
    
    # Initialize the RAG system
    rag_system = RAGSystem()
    chat_history = []
    
    # Create widgets
    header = HTML("<h2>Voice-Interactive RAG System for PDFs</h2>")
    
    pdf_path_input = widgets.Text(
        value='../input/dataset-of-pdf-files/Pdf/IEAJEYOK5ACMQUZGX7QDHS7ZR6XXSVYV.pdf',
        description='PDF Path:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='80%')
    )
    
    process_button = widgets.Button(
        description='Process PDF',
        button_style='primary',
        icon='file-pdf-o'
    )
    
    status_output = widgets.Output()
    
    query_input = widgets.Text(
        value='',
        placeholder='Type your question here',
        description='Query:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='80%')
    )
    
    voice_button = widgets.Button(
        description='🎤 Simulate Voice',
        button_style='info',
        icon='microphone'
    )
    
    submit_button = widgets.Button(
        description='Submit Question',
        button_style='success',
        icon='search'
    )
    
    summary_button = widgets.Button(
        description='Generate Summary',
        button_style='warning',
        icon='file-text-o'
    )
    
    clear_button = widgets.Button(
        description='Clear All',
        button_style='danger',
        icon='trash'
    )
    
    main_output = widgets.Output()
    
    # Define button callbacks
    def on_process_button_clicked(b):
        with status_output:
            clear_output()
            print("Processing PDF...")
            pdf_path = pdf_path_input.value
            try:
                num_chunks, message = rag_system.process_pdf(pdf_path)
                print(message)
            except Exception as e:
                print(f"Error: {e}")
    
    def on_voice_button_clicked(b):
        with status_output:
            clear_output()
            print("Simulating voice query...")
            query = rag_system.simulate_voice_query()
            query_input.value = query
            print(f"Transcribed query: {query}")
    
    def on_submit_button_clicked(b):
        with status_output:
            clear_output()
            print("Processing question...")
        
        query = query_input.value
        if not query:
            with status_output:
                print("Please enter a question.")
            return
        
        with main_output:
            clear_output()
            
            # Get answer
            answer = rag_system.answer_query(query)
            
            # Get retrieved chunks
            results = rag_system.query(query)
            
            # Add to chat history
            chat_history.append({
                "query": query,
                "answer": answer,
                "chunks": results["chunks"],
                "sources": results["sources"]
            })
            
            # Display chat history
            for i, chat in enumerate(chat_history):
                print(f"\n{'='*50}")
                print(f"QUERY {i+1}: {chat['query']}")
                print(f"{'='*50}")
                print(f"\nANSWER: {chat['answer']}")
                
                print("\nREFERENCE SOURCES:")
                for j, (chunk, source) in enumerate(zip(chat.get('chunks', []), chat.get('sources', []))):
                    print(f"\n{'-'*40}")
                    print(f"Source {j+1}: {source}")
                    print(f"{'-'*40}")
                    # Display a cleaner version of the chunk
                    clean_chunk = re.sub(r'\s+', ' ', chunk)
                    print(clean_chunk[:300] + "..." if len(clean_chunk) > 300 else clean_chunk)
                
                print(f"\n{'='*50}\n")
            
            # Simulate TTS
            rag_system.simulate_speak_answer(answer)
            
        with status_output:
            clear_output()
            print("Ready for next question.")
    
    def on_summary_button_clicked(b):
        with status_output:
            clear_output()
            print("Generating summary...")
        
        with main_output:
            # Generate summary
            if chat_history:
                summary = rag_system.generate_summary(
                    use_retrieved=True,
                    query_text=chat_history[-1]["query"]
                )
            else:
                summary = rag_system.generate_summary(use_retrieved=False)
            
            print("\n=== Document Summary ===")
            print(summary)
            
            # Simulate TTS
            rag_system.simulate_speak_answer(summary)
        
        with status_output:
            clear_output()
            print("Summary generated.")
    
    def on_clear_button_clicked(b):
        with status_output:
            clear_output()
            print("Clearing system...")
        
        rag_system.clear()
        chat_history.clear()
        
        with main_output:
            clear_output()
        
        with status_output:
            clear_output()
            print("System cleared.")
    
    # Attach callbacks
    process_button.on_click(on_process_button_clicked)
    voice_button.on_click(on_voice_button_clicked)
    submit_button.on_click(on_submit_button_clicked)
    summary_button.on_click(on_summary_button_clicked)
    clear_button.on_click(on_clear_button_clicked)
    
    # Create layout
    input_area = widgets.VBox([
        widgets.HBox([pdf_path_input, process_button]),
        widgets.HBox([query_input, voice_button, submit_button]),
        widgets.HBox([summary_button, clear_button])
    ])
    
    # Display interface
    display(header)
    display(input_area)
    display(status_output)
    display(main_output)
    
    with status_output:
        print("System ready. Process a PDF to begin.")

# ===== For Standalone Web Application =====

def create_standalone_app():
    """
    This function would create a Streamlit or Flask application for the standalone version.
    For a complete application, this would be expanded with proper UI elements and real voice interaction.
    
    In a real implementation, you would:
    1. Use Whisper for ASR
    2. Implement voice activity detection
    3. Use a TTS library for speech synthesis
    4. Create a web interface with proper UI elements
    """
    pass

# ===== Main Function =====

def main():
    # For Kaggle notebook environment
    create_kaggle_interface()
    
    # For standalone application (commented out for Kaggle)
    # create_standalone_app()

if __name__ == "__main__":
    main()

Using device: cuda


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


VBox(children=(HBox(children=(Text(value='../input/dataset-of-pdf-files/Pdf/IEAJEYOK5ACMQUZGX7QDHS7ZR6XXSVYV.p…

Output()

Output()

In [None]:
## --------------------------------------------------------------------------

In [15]:
!pip install -q PyMuPDF==1.22.5 sentence-transformers==2.2.2 chromadb==0.4.18 nltk==3.8.1


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.2/14.2 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m502.4/502.4 kB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m58.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m63.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m62.7 MB/s[0m eta [36m0:00:00[0m00:01[0m:00:01[0m
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are in

In [16]:
import os
import fitz  # PyMuPDF
import numpy as np
from typing import List, Dict, Any, Tuple
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
from tqdm.notebook import tqdm
import re
import logging
import nltk
from nltk.tokenize import sent_tokenize
import hashlib
import pandas as pd
import time

# Download NLTK resources
nltk.download('punkt', quiet=True)

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class PDFProcessor:
    def __init__(self, embedding_model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the PDF processor with specified embedding model
        
        Args:
            embedding_model_name: The name of the SentenceTransformer model to use
        """
        self.embedding_model = SentenceTransformer(embedding_model_name)
        logger.info(f"Initialized embedding model: {embedding_model_name}")
    
    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """
        Extract all text from a PDF file
        
        Args:
            pdf_path: Path to the PDF file
            
        Returns:
            Extracted text as a single string
        """
        try:
            doc = fitz.open(pdf_path)
            text = ""
            
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)
                text += page.get_text()
                
            doc.close()
            return text
        except Exception as e:
            logger.error(f"Error extracting text from {pdf_path}: {e}")
            return ""
    
    def chunk_text(self, text: str, chunk_size: int = 3, 
                  chunk_overlap: int = 1, by_paragraph: bool = True) -> List[str]:
        """
        Split text into chunks by paragraphs or sentences
        
        Args:
            text: The text to chunk
            chunk_size: Number of paragraphs/sentences per chunk
            chunk_overlap: Number of paragraphs/sentences to overlap between chunks
            by_paragraph: If True, chunk by paragraphs; otherwise by sentences
            
        Returns:
            List of text chunks
        """
        # Clean and normalize text
        text = re.sub(r'\s+', ' ', text).strip()
        
        if by_paragraph:
            # Split by paragraphs (defined by double newlines or similar)
            paragraphs = [p.strip() for p in re.split(r'\n\s*\n', text) if p.strip()]
            units = paragraphs
        else:
            # Split by sentences
            sentences = sent_tokenize(text)
            units = sentences
        
        # Create chunks with overlap
        chunks = []
        for i in range(0, len(units), chunk_size - chunk_overlap):
            chunk = ' '.join(units[i:i + chunk_size])
            if chunk:  # Only add non-empty chunks
                chunks.append(chunk)
        
        return chunks
    
    def compute_embeddings(self, chunks: List[str]) -> np.ndarray:
        """
        Compute embeddings for a list of text chunks
        
        Args:
            chunks: List of text chunks
            
        Returns:
            numpy array of embeddings
        """
        return self.embedding_model.encode(chunks)
    
    def process_pdf(self, pdf_path: str, chunk_size: int = 3, 
                   chunk_overlap: int = 1, by_paragraph: bool = True) -> Tuple[List[str], np.ndarray]:
        """
        Process a single PDF file: extract text, chunk it, and compute embeddings
        
        Args:
            pdf_path: Path to the PDF file
            chunk_size: Number of paragraphs/sentences per chunk
            chunk_overlap: Number of paragraphs/sentences to overlap between chunks
            by_paragraph: If True, chunk by paragraphs; otherwise by sentences
            
        Returns:
            Tuple of (chunks, embeddings)
        """
        logger.info(f"Processing PDF: {os.path.basename(pdf_path)}")
        text = self.extract_text_from_pdf(pdf_path)
        
        if not text:
            logger.warning(f"No text extracted from {pdf_path}")
            return [], np.array([])
        
        chunks = self.chunk_text(text, chunk_size, chunk_overlap, by_paragraph)
        
        if not chunks:
            logger.warning(f"No chunks created from {pdf_path}")
            return [], np.array([])
        
        embeddings = self.compute_embeddings(chunks)
        
        logger.info(f"Created {len(chunks)} chunks from {pdf_path}")
        return chunks, embeddings

class ChromaDBHandler:
    def __init__(self, persist_directory: str = "./chroma_db"):
        """
        Initialize the ChromaDB handler
        
        Args:
            persist_directory: Directory to persist the ChromaDB
        """
        self.client = chromadb.PersistentClient(path=persist_directory)
        logger.info(f"Initialized ChromaDB with persist directory: {persist_directory}")
    
    def create_collection(self, collection_name: str) -> Any:
        """
        Create or get a ChromaDB collection
        
        Args:
            collection_name: Name of the collection
            
        Returns:
            ChromaDB collection
        """
        try:
            # First try to get existing collection
            collection = self.client.get_collection(name=collection_name)
            logger.info(f"Using existing collection: {collection_name}")
        except:
            # If it doesn't exist, create a new one
            collection = self.client.create_collection(name=collection_name)
            logger.info(f"Created new collection: {collection_name}")
        
        return collection
    
    def add_to_collection(self, collection: Any, chunks: List[str], 
                         embeddings: np.ndarray, metadata: List[Dict[str, Any]], 
                         ids: List[str]) -> None:
        """
        Add chunks and their embeddings to a ChromaDB collection
        
        Args:
            collection: ChromaDB collection
            chunks: List of text chunks
            embeddings: numpy array of embeddings
            metadata: List of metadata dictionaries for each chunk
            ids: List of unique IDs for each chunk
        """
        if not chunks:
            logger.warning("No chunks to add to collection")
            return
        
        # ChromaDB expects embeddings as a list of lists
        embeddings_list = embeddings.tolist()
        
        # Add chunks to collection in batches to prevent memory issues
        batch_size = 100
        for i in range(0, len(chunks), batch_size):
            end_idx = min(i + batch_size, len(chunks))
            
            collection.add(
                documents=chunks[i:end_idx],
                embeddings=embeddings_list[i:end_idx],
                metadatas=metadata[i:end_idx],
                ids=ids[i:end_idx]
            )
        
        logger.info(f"Added {len(chunks)} chunks to collection")

# Configuration
KAGGLE_INPUT_DIR = "../input/dataset-of-pdf-files/Pdf"  # Path to dataset in Kaggle
CHROMA_DB_DIR = "./chroma_db"  # Where to store the vector DB
COLLECTION_NAME = "pdf_collection"  # Name of the ChromaDB collection
EMBEDDING_MODEL = "all-MiniLM-L6-v2"  # SentenceTransformer model
CHUNK_SIZE = 1  # Number of paragraphs per chunk
CHUNK_OVERLAP = 1  # Overlap between chunks
BY_PARAGRAPH = False  # Chunk by paragraph (True) or sentence (False)
MAX_PDFS = 200  # Limit number of PDFs to process (set to None for all)

# Initialize processors
pdf_processor = PDFProcessor(EMBEDDING_MODEL)
chroma_handler = ChromaDBHandler(CHROMA_DB_DIR)
collection = chroma_handler.create_collection(COLLECTION_NAME)

# Get list of PDF files
pdf_files = [f for f in os.listdir(KAGGLE_INPUT_DIR) if f.lower().endswith('.pdf')]
logger.info(f"Found {len(pdf_files)} PDF files in {KAGGLE_INPUT_DIR}")

# Limit the number of PDFs to process if specified
if MAX_PDFS:
    pdf_files = pdf_files[:MAX_PDFS]
    logger.info(f"Processing {len(pdf_files)} PDFs (limited by MAX_PDFS setting)")

# Process metrics
processed_count = 0
total_chunks_count = 0
failed_count = 0
start_time = time.time()

# Process each PDF file
for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
    pdf_path = os.path.join(KAGGLE_INPUT_DIR, pdf_file)
    
    # Generate a unique file ID
    file_id = hashlib.md5(pdf_file.encode()).hexdigest()
    
    try:
        # Process the PDF
        chunks, embeddings = pdf_processor.process_pdf(
            pdf_path, CHUNK_SIZE, CHUNK_OVERLAP, BY_PARAGRAPH
        )
        
        if not chunks:
            failed_count += 1
            continue
        
        # Create metadata and IDs for each chunk
        metadata = []
        ids = []
        
        for i, chunk in enumerate(chunks):
            chunk_id = f"{file_id}_{i}"
            chunk_metadata = {
                "source": pdf_file,
                "chunk_index": i,
                "total_chunks": len(chunks)
            }
            
            metadata.append(chunk_metadata)
            ids.append(chunk_id)
        
        # Add to ChromaDB collection
        chroma_handler.add_to_collection(collection, chunks, embeddings, metadata, ids)
        
        processed_count += 1
        total_chunks_count += len(chunks)
        
    except Exception as e:
        logger.error(f"Error processing {pdf_file}: {e}")
        failed_count += 1

# Calculate processing time
processing_time = time.time() - start_time

# Display summary results
print("\n--- Processing Summary ---")
print(f"Total PDFs found: {len(pdf_files)}")
print(f"Successfully processed: {processed_count}")
print(f"Failed: {failed_count}")
print(f"Total chunks created: {total_chunks_count}")
print(f"Processing time: {processing_time:.2f} seconds")
print(f"Vector database location: {CHROMA_DB_DIR}")

# Test the database with a simple query
print("\n--- Testing Vector Database ---")
results = collection.query(
    query_texts=["What is the main topic of this document?"],
    n_results=3
)

print("Query results:")
for i, (doc, metadata) in enumerate(zip(results['documents'][0], results['metadatas'][0])):
    print(f"\nResult {i+1}:")
    print(f"Source: {metadata['source']}")
    print(f"Chunk: {metadata['chunk_index']} of {metadata['total_chunks']}")
    print(f"Content: {doc[:200]}...")  # Show first 200 chars

Processing PDFs:   0%|          | 0/200 [00:00<?, ?it/s]


--- Processing Summary ---
Total PDFs found: 200
Successfully processed: 0
Failed: 200
Total chunks created: 0
Processing time: 22.47 seconds
Vector database location: ./chroma_db

--- Testing Vector Database ---


/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:07<00:00, 11.8MiB/s]


Query results:


In [17]:
!pip install -q openai-whisper gradio chromadb sentence-transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.0/54.0 MB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.6/322.6 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.5/11.5 MB[0m [31m107.9 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25h

In [19]:
import os
import numpy as np
import whisper
import gradio as gr
import torch
import chromadb
from sentence_transformers import SentenceTransformer
import time
import logging
from typing import Dict, List, Any

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Path to the ChromaDB created in the previous step
CHROMA_DB_DIR = "./chroma_db"
COLLECTION_NAME = "pdf_collection"

class ASRProcessor:
    def __init__(self, model_size="base"):
        """
        Initialize the ASR processor with Whisper model
        
        Args:
            model_size: Size of the Whisper model to use ('tiny', 'base', 'small', 'medium', 'large')
        """
        logger.info(f"Loading Whisper model: {model_size}")
        self.model = whisper.load_model(model_size)
        logger.info("Whisper model loaded successfully")
    
    def transcribe_audio(self, audio_path: str) -> Dict[str, Any]:
        """
        Transcribe audio file to text
        
        Args:
            audio_path: Path to the audio file
            
        Returns:
            Dictionary with transcription results
        """
        logger.info(f"Transcribing audio file: {audio_path}")
        start_time = time.time()
        
        # Transcribe audio
        result = self.model.transcribe(audio_path)
        
        processing_time = time.time() - start_time
        logger.info(f"Transcription completed in {processing_time:.2f} seconds")
        
        return result

class RAGQueryProcessor:
    def __init__(self, db_path: str = CHROMA_DB_DIR, collection_name: str = COLLECTION_NAME):
        """
        Initialize the RAG query processor
        
        Args:
            db_path: Path to the ChromaDB
            collection_name: Name of the collection to query
        """
        # Initialize embedding model
        self.embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
        
        # Connect to ChromaDB
        self.client = chromadb.PersistentClient(path=db_path)
        try:
            self.collection = self.client.get_collection(name=collection_name)
            logger.info(f"Connected to existing collection: {collection_name}")
        except:
            logger.error(f"Collection {collection_name} not found in {db_path}")
            raise ValueError(f"Collection {collection_name} not found. Please run the PDF ingestion step first.")
    
    def query(self, query_text: str, n_results: int = 5) -> Dict[str, Any]:
        """
        Process a query using RAG
        
        Args:
            query_text: The query text
            n_results: Number of results to retrieve
            
        Returns:
            Dictionary with query results
        """
        logger.info(f"Processing query: {query_text}")
        
        # Query the collection
        results = self.collection.query(
            query_texts=[query_text],
            n_results=n_results
        )
        
        # Format results
        formatted_results = []
        for i, (doc, metadata) in enumerate(zip(results['documents'][0], results['metadatas'][0])):
            formatted_results.append({
                "rank": i + 1,
                "source": metadata['source'],
                "chunk_index": metadata['chunk_index'],
                "content": doc
            })
        
        return {
            "query": query_text,
            "results": formatted_results
        }

class VoiceRAGSystem:
    def __init__(self, asr_model_size: str = "base"):
        """
        Initialize the Voice RAG system
        
        Args:
            asr_model_size: Size of the Whisper ASR model
        """
        self.asr_processor = ASRProcessor(model_size=asr_model_size)
        self.rag_processor = RAGQueryProcessor()
        logger.info("Voice RAG system initialized")
    
    def process_audio_query(self, audio_path: str, n_results: int = 5) -> Dict[str, Any]:
        """
        Process an audio query through the ASR and RAG pipeline
        
        Args:
            audio_path: Path to the audio file
            n_results: Number of results to retrieve
            
        Returns:
            Dictionary with processing results
        """
        # Transcribe audio to text
        transcription = self.asr_processor.transcribe_audio(audio_path)
        query_text = transcription["text"]
        
        # Process the transcribed query through RAG
        rag_results = self.rag_processor.query(query_text, n_results)
        
        # Combine results
        return {
            "transcription": query_text,
            "rag_results": rag_results["results"]
        }

# Function to handle Gradio interface
def process_audio(audio_file, num_results):
    """
    Process audio file through the Voice RAG system
    
    Args:
        audio_file: Path to the uploaded audio file
        num_results: Number of results to retrieve
        
    Returns:
        Formatted results for display
    """
    try:
        # Initialize the Voice RAG system
        # Using the small model for better performance while still having good accuracy
        voice_rag = VoiceRAGSystem(asr_model_size="small")
        
        # Process the audio query
        results = voice_rag.process_audio_query(audio_file, n_results=num_results)
        
        # Format results for display
        transcription = results["transcription"]
        
        # Format RAG results
        rag_results_formatted = ""
        for i, result in enumerate(results["rag_results"]):
            rag_results_formatted += f"**Result {i+1}:** Source: {result['source']}\n\n"
            rag_results_formatted += f"{result['content'][:500]}...\n\n"
            rag_results_formatted += "---\n\n"
        
        return transcription, rag_results_formatted
    
    except Exception as e:
        logger.error(f"Error processing audio: {e}")
        return f"Error: {str(e)}", "Failed to process query"

# Create Gradio interface
def create_interface():
    """
    Create a Gradio interface for the Voice RAG system
    
    Returns:
        Gradio interface
    """
    # Define the interface
    interface = gr.Interface(
        fn=process_audio,
        inputs=[
            gr.Audio(type="filepath", label="Record or Upload Audio"),
            gr.Slider(minimum=1, maximum=10, value=5, step=1, label="Number of Results")
        ],
        outputs=[
            gr.Textbox(label="Transcribed Query"),
            gr.Markdown(label="Retrieval Results")
        ],
        title="Voice-Driven RAG System",
        description="Speak or upload an audio file containing your question about the documents. The system will transcribe your speech and retrieve relevant information from the PDF database.",
        examples=[
            ["example_query.mp3", 5]
        ]
    )
    
    return interface

# Download Whisper model when the notebook runs
def download_whisper_model(model_size="small"):
    """
    Download the Whisper model proactively
    
    Args:
        model_size: Size of the model to download
    """
    logger.info(f"Pre-downloading Whisper {model_size} model...")
    whisper.load_model(model_size)
    logger.info("Model download complete")

# Main execution
if __name__ == "__main__":
    # Pre-download the model
    download_whisper_model("small")
    
    # Create and launch the interface
    demo = create_interface()
    demo.launch(share=True)

  checkpoint = torch.load(fp, map_location=device)


* Running on local URL:  http://127.0.0.1:7861
* Running on public URL: https://f8047e259d5979355d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


  checkpoint = torch.load(fp, map_location=device)


Using existing dataset file at: .gradio/flagged/dataset1.csv


In [20]:
# Install required packages
!pip install -q transformers==4.35.2 accelerate==0.25.0 bitsandbytes==0.41.1 chromadb==0.4.18 sentence-transformers==2.2.2


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.5/123.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m67.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25h

In [21]:
# ChromaDB Diagnostics and Fix
# Run this code to diagnose and fix your ChromaDB connection issues

# Install required packages
!pip install -q chromadb sentence-transformers

import os
import chromadb
import logging
from sentence_transformers import SentenceTransformer
import time

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Common ChromaDB paths to check
possible_paths = [
    "./chroma_db",
    "./kaggle/working/chroma_db",
    "/kaggle/working/chroma_db",
    "../chroma_db",
    "/kaggle/input/dataset-of-pdf-files/chroma_db",  # In case you exported it
]

# Possible collection names
possible_collections = [
    "pdf_collection",
    "document_collection",
    "vector_store"
]

def find_chroma_db():
    """Search for ChromaDB in all possible locations"""
    print("\n=== Searching for ChromaDB ===")
    
    for path in possible_paths:
        if os.path.exists(path):
            print(f"✅ Found directory: {path}")
            try:
                client = chromadb.PersistentClient(path=path)
                collections = client.list_collections()
                if collections:
                    print(f"✅ Connected to ChromaDB at {path}")
                    print(f"Found {len(collections)} collections:")
                    for coll in collections:
                        print(f"  - {coll.name} (count: {coll.count()})")
                    return path, collections
                else:
                    print(f"❌ No collections found in {path}")
            except Exception as e:
                print(f"❌ Error connecting to ChromaDB at {path}: {e}")
        else:
            print(f"❌ Directory not found: {path}")
    
    return None, None

def test_retrieval(db_path, collection_name):
    """Test retrieval from a specific collection"""
    print(f"\n=== Testing Retrieval from {collection_name} ===")
    
    try:
        # Connect to ChromaDB
        client = chromadb.PersistentClient(path=db_path)
        collection = client.get_collection(name=collection_name)
        
        # Get collection stats
        count = collection.count()
        print(f"Collection '{collection_name}' contains {count} documents")
        
        if count == 0:
            print("❌ Collection is empty - no documents to retrieve")
            return False
        
        # Try a generic query
        print("Testing retrieval with generic query...")
        results = collection.query(
            query_texts=["document information"],
            n_results=5
        )
        
        if results['documents'] and len(results['documents'][0]) > 0:
            print(f"✅ Successfully retrieved {len(results['documents'][0])} documents")
            print("\nSample document content:")
            print(f"Source: {results['metadatas'][0][0]['source']}")
            print(f"Content: {results['documents'][0][0][:200]}...")
            return True
        else:
            print("❌ No documents retrieved for generic query")
            return False
    
    except Exception as e:
        print(f"❌ Error testing retrieval: {e}")
        return False

def rebuild_index(db_path):
    """Rebuild the index if needed with basic documents from scratch"""
    print("\n=== Rebuilding Index ===")
    
    try:
        # Initialize ChromaDB
        client = chromadb.PersistentClient(path=db_path)
        
        # Check if pdf_collection already exists
        collections = client.list_collections()
        collection_names = [c.name for c in collections]
        
        if "pdf_collection" in collection_names:
            print("Removing existing pdf_collection...")
            client.delete_collection("pdf_collection")
        
        # Create new collection
        collection = client.create_collection(name="pdf_collection")
        print("✅ Created new pdf_collection")
        
        # Add some sample documents
        print("Adding sample documents...")
        
        # Sample dummy documents to initialize with
        docs = [
            "This is a sample document about artificial intelligence and machine learning.",
            "PDF files contain structured information in a portable document format.",
            "Natural language processing helps computers understand human language.",
            "Document retrieval systems help find relevant information quickly.",
            "Vector databases store embeddings for semantic search applications."
        ]
        
        # Generate IDs and metadata
        ids = [f"doc_{i}" for i in range(len(docs))]
        metadatas = [{"source": f"sample_doc_{i}.pdf", "chunk_index": 0, "total_chunks": 1} for i in range(len(docs))]
        
        # Initialize embedding model
        embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
        
        # Compute embeddings
        embeddings = embedding_model.encode(docs).tolist()
        
        # Add documents to collection
        collection.add(
            documents=docs,
            embeddings=embeddings,
            metadatas=metadatas,
            ids=ids
        )
        
        print(f"✅ Added {len(docs)} sample documents to collection")
        
        # Test retrieval
        test_retrieval(db_path, "pdf_collection")
        
        return True
    
    except Exception as e:
        print(f"❌ Error rebuilding index: {e}")
        return False

# Main execution
print("=== ChromaDB Diagnostics ===")
print(f"Current working directory: {os.getcwd()}")

# Find ChromaDB
db_path, collections = find_chroma_db()

if db_path and collections:
    # Try to test retrieval on each collection
    retrieval_success = False
    for collection in collections:
        if test_retrieval(db_path, collection.name):
            retrieval_success = True
            print(f"\n✅ Successfully tested retrieval with collection: {collection.name}")
            print(f"Use this collection name: {collection.name}")
            print(f"And this DB path: {db_path}")
            break
    
    if not retrieval_success:
        print("\n❌ Could not retrieve documents from any existing collection")
        print("Attempting to rebuild the index...")
        rebuild_index(db_path)
else:
    print("\n❌ Could not find a valid ChromaDB directory")
    print("Creating a new ChromaDB with sample documents...")
    
    # Create a new directory for ChromaDB
    new_db_path = "./chroma_db"
    os.makedirs(new_db_path, exist_ok=True)
    
    # Rebuild index
    rebuild_index(new_db_path)
    print(f"\nCreated new ChromaDB at: {new_db_path}")
    print("Use the collection name: pdf_collection")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


=== ChromaDB Diagnostics ===
Current working directory: /kaggle/working

=== Searching for ChromaDB ===
✅ Found directory: ./chroma_db
✅ Connected to ChromaDB at ./chroma_db
Found 1 collections:
  - pdf_collection (count: 0)

=== Testing Retrieval from pdf_collection ===
Collection 'pdf_collection' contains 0 documents
❌ Collection is empty - no documents to retrieve

❌ Could not retrieve documents from any existing collection
Attempting to rebuild the index...

=== Rebuilding Index ===
Removing existing pdf_collection...
✅ Created new pdf_collection
Adding sample documents...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Added 5 sample documents to collection

=== Testing Retrieval from pdf_collection ===
Collection 'pdf_collection' contains 5 documents
Testing retrieval with generic query...
✅ Successfully retrieved 5 documents

Sample document content:
Source: sample_doc_1.pdf
Content: PDF files contain structured information in a portable document format....


In [22]:
# RAG System with Llama 2 via Replicate API
# This implementation uses Replicate to access Llama 2

# Install required packages
!pip install -q gradio chromadb sentence-transformers openai-whisper PyMuPDF replicate

import os
import numpy as np
import whisper
import gradio as gr
import time
import logging
from typing import Dict, List, Any
import chromadb
from sentence_transformers import SentenceTransformer
import fitz  # PyMuPDF
import re
import hashlib
import replicate
import nltk
from nltk.tokenize import sent_tokenize

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Path configurations
PDF_DIR = "../input/dataset-of-pdf-files/Pdf"
CHROMA_DB_DIR = "./chroma_db"
COLLECTION_NAME = "pdf_collection"

# Replicate API token - replace with your own
# You'll need to sign up at https://replicate.com/ and get an API token
os.environ["REPLICATE_API_TOKEN"] = "r8_9EHL1giipzTCFKnRHLm5Ht72PSZaTYD3YEymn"  # You'll need to replace this

# Download NLTK resources
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# ====== PDF Processing Component ======
# (Same as before, keeping code for completeness)

class PDFProcessor:
    def __init__(self, embedding_model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the PDF processor with specified embedding model
        
        Args:
            embedding_model_name: The name of the SentenceTransformer model to use
        """
        self.embedding_model = SentenceTransformer(embedding_model_name)
        logger.info(f"Initialized embedding model: {embedding_model_name}")
    
    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """
        Extract all text from a PDF file
        
        Args:
            pdf_path: Path to the PDF file
            
        Returns:
            Extracted text as a single string
        """
        try:
            doc = fitz.open(pdf_path)
            text = ""
            
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)
                text += page.get_text()
                
            doc.close()
            return text
        except Exception as e:
            logger.error(f"Error extracting text from {pdf_path}: {e}")
            return ""
    
    def chunk_text(self, text: str, chunk_size: int = 3, 
                  chunk_overlap: int = 1, by_paragraph: bool = True) -> List[str]:
        """
        Split text into chunks by paragraphs or sentences
        
        Args:
            text: The text to chunk
            chunk_size: Number of paragraphs/sentences per chunk
            chunk_overlap: Number of paragraphs/sentences to overlap between chunks
            by_paragraph: If True, chunk by paragraphs; otherwise by sentences
            
        Returns:
            List of text chunks
        """
        # Clean and normalize text
        text = re.sub(r'\s+', ' ', text).strip()
        
        if by_paragraph:
            # Split by paragraphs (defined by double newlines or similar)
            paragraphs = [p.strip() for p in re.split(r'\n\s*\n', text) if p.strip()]
            units = paragraphs
        else:
            # Split by sentences
            sentences = sent_tokenize(text)
            units = sentences
        
        # Create chunks with overlap
        chunks = []
        for i in range(0, len(units), max(1, chunk_size - chunk_overlap)):
            chunk = ' '.join(units[i:i + chunk_size])
            if chunk:  # Only add non-empty chunks
                chunks.append(chunk)
        
        return chunks
    
    def compute_embeddings(self, chunks: List[str]) -> np.ndarray:
        """
        Compute embeddings for a list of text chunks
        
        Args:
            chunks: List of text chunks
            
        Returns:
            numpy array of embeddings
        """
        return self.embedding_model.encode(chunks)
    
    def process_pdf(self, pdf_path: str, chunk_size: int = 3, 
                   chunk_overlap: int = 1, by_paragraph: bool = True) -> Dict[str, Any]:
        """
        Process a single PDF file: extract text, chunk it, and compute embeddings
        
        Args:
            pdf_path: Path to the PDF file
            chunk_size: Number of paragraphs/sentences per chunk
            chunk_overlap: Number of paragraphs/sentences to overlap between chunks
            by_paragraph: If True, chunk by paragraphs; otherwise by sentences
            
        Returns:
            Dictionary with chunks, embeddings, and metadata
        """
        logger.info(f"Processing PDF: {os.path.basename(pdf_path)}")
        text = self.extract_text_from_pdf(pdf_path)
        
        if not text:
            logger.warning(f"No text extracted from {pdf_path}")
            return {"chunks": [], "embeddings": np.array([]), "metadata": []}
        
        chunks = self.chunk_text(text, chunk_size, chunk_overlap, by_paragraph)
        
        if not chunks:
            logger.warning(f"No chunks created from {pdf_path}")
            return {"chunks": [], "embeddings": np.array([]), "metadata": []}
        
        embeddings = self.compute_embeddings(chunks)
        
        # Create metadata
        file_id = hashlib.md5(os.path.basename(pdf_path).encode()).hexdigest()
        metadata = []
        for i in range(len(chunks)):
            metadata.append({
                "source": os.path.basename(pdf_path),
                "chunk_index": i,
                "total_chunks": len(chunks)
            })
        
        logger.info(f"Created {len(chunks)} chunks from {pdf_path}")
        return {
            "chunks": chunks,
            "embeddings": embeddings,
            "metadata": metadata,
            "ids": [f"{file_id}_{i}" for i in range(len(chunks))]
        }

# ====== ChromaDB Handler Component ======

class ChromaDBHandler:
    def __init__(self, persist_directory: str = CHROMA_DB_DIR):
        """
        Initialize the ChromaDB handler
        
        Args:
            persist_directory: Directory to persist the ChromaDB
        """
        # Create directory if it doesn't exist
        os.makedirs(persist_directory, exist_ok=True)
        
        self.client = chromadb.PersistentClient(path=persist_directory)
        logger.info(f"Initialized ChromaDB with persist directory: {persist_directory}")
    
    def create_collection(self, collection_name: str) -> Any:
        """
        Create or get a ChromaDB collection
        
        Args:
            collection_name: Name of the collection
            
        Returns:
            ChromaDB collection
        """
        try:
            # First try to get existing collection
            collection = self.client.get_collection(name=collection_name)
            logger.info(f"Using existing collection: {collection_name}")
        except:
            # If it doesn't exist, create a new one
            collection = self.client.create_collection(name=collection_name)
            logger.info(f"Created new collection: {collection_name}")
        
        return collection
    
    def add_to_collection(self, collection: Any, chunks: List[str], 
                         embeddings: np.ndarray, metadata: List[Dict[str, Any]], 
                         ids: List[str]) -> None:
        """
        Add chunks and their embeddings to a ChromaDB collection
        
        Args:
            collection: ChromaDB collection
            chunks: List of text chunks
            embeddings: numpy array of embeddings
            metadata: List of metadata dictionaries for each chunk
            ids: List of unique IDs for each chunk
        """
        if not chunks:
            logger.warning("No chunks to add to collection")
            return
        
        # ChromaDB expects embeddings as a list of lists
        embeddings_list = embeddings.tolist()
        
        # Add chunks to collection in batches to prevent memory issues
        batch_size = 100
        for i in range(0, len(chunks), batch_size):
            end_idx = min(i + batch_size, len(chunks))
            
            collection.add(
                documents=chunks[i:end_idx],
                embeddings=embeddings_list[i:end_idx],
                metadatas=metadata[i:end_idx],
                ids=ids[i:end_idx]
            )
        
        logger.info(f"Added {len(chunks)} chunks to collection")

# ====== ASR Component ======

class ASRProcessor:
    def __init__(self, model_size="small"):
        """
        Initialize the ASR processor with Whisper model
        
        Args:
            model_size: Size of the Whisper model to use ('tiny', 'base', 'small', 'medium', 'large')
        """
        logger.info(f"Loading Whisper model: {model_size}")
        self.model = whisper.load_model(model_size)
        logger.info("Whisper model loaded successfully")
    
    def transcribe_audio(self, audio_path: str) -> Dict[str, Any]:
        """
        Transcribe audio file to text
        
        Args:
            audio_path: Path to the audio file
            
        Returns:
            Dictionary with transcription results
        """
        logger.info(f"Transcribing audio file: {audio_path}")
        start_time = time.time()
        
        # Transcribe audio
        result = self.model.transcribe(audio_path)
        
        processing_time = time.time() - start_time
        logger.info(f"Transcription completed in {processing_time:.2f} seconds")
        
        return result

# ====== RAG Retriever Component ======

class RAGRetriever:
    def __init__(self, db_path: str = CHROMA_DB_DIR, collection_name: str = COLLECTION_NAME):
        """
        Initialize the RAG retriever
        
        Args:
            db_path: Path to the ChromaDB
            collection_name: Name of the collection to query
        """
        # Initialize embedding model
        self.embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
        
        # Connect to ChromaDB
        try:
            self.client = chromadb.PersistentClient(path=db_path)
            
            # Check if collection exists
            collections = self.client.list_collections()
            collection_names = [c.name for c in collections]
            
            if collection_name in collection_names:
                self.collection = self.client.get_collection(name=collection_name)
                logger.info(f"Connected to existing collection: {collection_name}")
            else:
                # Collection doesn't exist, create it
                logger.info(f"Collection {collection_name} not found. Creating new collection.")
                self.collection = self.client.create_collection(name=collection_name)
                
                # Process some PDFs to populate the collection
                self._populate_collection()
        except Exception as e:
            logger.error(f"Error connecting to ChromaDB: {e}")
            # Create a new ChromaDB
            os.makedirs(db_path, exist_ok=True)
            self.client = chromadb.PersistentClient(path=db_path)
            self.collection = self.client.create_collection(name=collection_name)
            
            # Process some PDFs to populate the collection
            self._populate_collection()
    
    def _populate_collection(self, max_pdfs: int = 10):
        """
        Populate the collection with some PDFs
        
        Args:
            max_pdfs: Maximum number of PDFs to process
        """
        logger.info("Populating collection with PDFs...")
        
        # Initialize PDF processor
        pdf_processor = PDFProcessor()
        
        # Get list of PDF files
        if os.path.exists(PDF_DIR):
            pdf_files = [f for f in os.listdir(PDF_DIR) if f.lower().endswith('.pdf')]
            logger.info(f"Found {len(pdf_files)} PDF files in {PDF_DIR}")
            
            # Limit the number of PDFs to process
            pdf_files = pdf_files[:max_pdfs]
            
            # Process each PDF
            for pdf_file in pdf_files:
                pdf_path = os.path.join(PDF_DIR, pdf_file)
                
                # Process the PDF
                result = pdf_processor.process_pdf(pdf_path, chunk_size=1, by_paragraph=False)
                
                if result["chunks"]:
                    # Add to collection
                    self.collection.add(
                        documents=result["chunks"],
                        embeddings=result["embeddings"].tolist(),
                        metadatas=result["metadata"],
                        ids=result["ids"]
                    )
                    
                    logger.info(f"Added {len(result['chunks'])} chunks from {pdf_file}")
        else:
            logger.warning(f"PDF directory not found: {PDF_DIR}")
            
            # Create some dummy documents if no PDFs are available
            dummy_docs = [
                "This is a sample document about PDFs and document processing.",
                "PDFs contain structured information that can be extracted and analyzed.",
                "RAG systems combine retrieval with generation to provide accurate answers.",
                "Vector databases store embeddings for semantic search applications.",
                "Natural language processing helps computers understand human language."
            ]
            
            # Generate embeddings
            embeddings = pdf_processor.compute_embeddings(dummy_docs)
            
            # Create metadata
            metadata = []
            ids = []
            for i, _ in enumerate(dummy_docs):
                metadata.append({
                    "source": f"sample_doc_{i}.pdf",
                    "chunk_index": 0,
                    "total_chunks": 1
                })
                ids.append(f"dummy_{i}")
            
            # Add to collection
            self.collection.add(
                documents=dummy_docs,
                embeddings=embeddings.tolist(),
                metadatas=metadata,
                ids=ids
            )
            
            logger.info(f"Added {len(dummy_docs)} dummy documents to collection")
    
    def retrieve(self, query_text: str, n_results: int = 5) -> Dict[str, Any]:
        """
        Retrieve relevant chunks for a query
        
        Args:
            query_text: The query text
            n_results: Number of results to retrieve
            
        Returns:
            Dictionary with query results
        """
        logger.info(f"Retrieving documents for query: {query_text}")
        
        # Check if collection is empty
        if self.collection.count() == 0:
            logger.warning("Collection is empty. Populating with sample documents.")
            self._populate_collection()
        
        # Query the collection
        results = self.collection.query(
            query_texts=[query_text],
            n_results=n_results
        )
        
        # Extract documents and metadata
        documents = results['documents'][0] if results['documents'] and results['documents'][0] else []
        metadatas = results['metadatas'][0] if results['metadatas'] and results['metadatas'][0] else []
        
        # Format results
        formatted_results = []
        for i, (doc, metadata) in enumerate(zip(documents, metadatas)):
            formatted_results.append({
                "rank": i + 1,
                "source": metadata['source'],
                "chunk_index": metadata['chunk_index'],
                "content": doc
            })
        
        return {
            "query": query_text,
            "results": formatted_results
        }
    
    def get_context_string(self, results: Dict[str, Any]) -> str:
        """
        Create a context string from retrieval results
        
        Args:
            results: The retrieval results
            
        Returns:
            Formatted context string
        """
        context = ""
        for i, result in enumerate(results["results"]):
            context += f"Document {i+1} (Source: {result['source']}):\n{result['content']}\n\n"
        
        return context.strip()

# ====== Llama 2 Generator using Replicate ======

class Llama2Generator:
    def __init__(self):
        """
        Initialize the Llama 2 generator using Replicate API
        """
        logger.info("Initializing Llama 2 generator using Replicate API")
        
        # Check if API token is set
        if "REPLICATE_API_TOKEN" not in os.environ or not os.environ["REPLICATE_API_TOKEN"] or os.environ["REPLICATE_API_TOKEN"] == "YOUR_REPLICATE_API_TOKEN":
            logger.warning("REPLICATE_API_TOKEN not set or using default value.")
            logger.warning("Please set your Replicate API token to use Llama 2.")
            self.api_available = False
        else:
            self.api_available = True
        
        # Define the model
        self.model = "meta/llama-2-7b-chat:f1d50bb24186c52daae319ca8366e53debdaa9e0ae7ff976e918df752732ccc4"
        
        logger.info("Llama 2 generator initialized successfully")
    
    def generate_answer(self, query: str, context: str) -> str:
        """
        Generate an answer based on query and context using Llama 2 via Replicate
        
        Args:
            query: The query text
            context: The retrieved context
            
        Returns:
            Generated answer
        """
        logger.info("Generating answer with Llama 2 via Replicate")
        
        if not self.api_available:
            return ("To use Llama 2 for generating answers, please set your Replicate API token.\n\n"
                   "1. Sign up at https://replicate.com/\n"
                   "2. Get your API token from your account settings\n"
                   "3. Set it in the code with: os.environ[\"REPLICATE_API_TOKEN\"] = \"your_token\"")
        
        # Create prompt with context and query
        prompt = self._create_prompt(query, context)
        
        start_time = time.time()
        
        try:
            # Generate response with Replicate
            output = replicate.run(
                self.model,
                input={
                    "prompt": prompt,
                    "temperature": 0.7,
                    "top_p": 0.9,
                    "max_new_tokens": 500,
                    "repetition_penalty": 1.1
                }
            )
            
            # Collect the streaming output
            full_response = ""
            for item in output:
                full_response += item
            
            processing_time = time.time() - start_time
            logger.info(f"Answer generation completed in {processing_time:.2f} seconds")
            
            return full_response
            
        except Exception as e:
            logger.error(f"Error generating answer with Replicate: {e}")
            return f"Error generating answer: {str(e)}"
    
    def _create_prompt(self, query: str, context: str) -> str:
        """
        Create a prompt for Llama 2
        
        Args:
            query: The query text
            context: The retrieved context
            
        Returns:
            Formatted prompt
        """
        # Llama 2 chat format requires a specific template
        system_prompt = """You are a helpful AI assistant that answers questions based on the provided document contexts. 
Your task is to provide accurate, concise answers based solely on the information in the documents.
If the answer cannot be found in the documents, acknowledge that you don't have enough information."""
        
        # Format the prompt according to Llama 2 chat template
        prompt = f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\nI need information from the following documents:\n\n{context}\n\nBased on these documents, please answer the following question: {query} [/INST]"
        
        return prompt

# ====== Document Summarization Component ======

class DocumentSummarizer:
    def __init__(self):
        """
        Initialize the document summarizer using Replicate API
        """
        logger.info("Initializing document summarizer using Replicate API")
        
        # Check if API token is set
        if "REPLICATE_API_TOKEN" not in os.environ or not os.environ["REPLICATE_API_TOKEN"] or os.environ["REPLICATE_API_TOKEN"] == "YOUR_REPLICATE_API_TOKEN":
            logger.warning("REPLICATE_API_TOKEN not set or using default value.")
            logger.warning("Please set your Replicate API token to use the summarizer.")
            self.api_available = False
        else:
            self.api_available = True
        
        # Define the model (Using BART-large-CNN for summarization)
        self.model = "facebook/bart-large-cnn:c850aa8c6320ac07afbd3076afaccfb4c4bac0b4c30f4c4df3392de8f28adc7e"
        
        logger.info("Document summarizer initialized successfully")
    
    def summarize_text(self, text: str, max_length: int = 150, min_length: int = 40) -> str:
        """
        Generate a summary of the provided text
        
        Args:
            text: The text to summarize
            max_length: Maximum length of the summary
            min_length: Minimum length of the summary
            
        Returns:
            Generated summary
        """
        logger.info(f"Generating summary with max_length={max_length}, min_length={min_length}")
        
        if not self.api_available:
            return ("To use the summarization feature, please set your Replicate API token.\n\n"
                   "1. Sign up at https://replicate.com/\n"
                   "2. Get your API token from your account settings\n"
                   "3. Set it in the code with: os.environ[\"REPLICATE_API_TOKEN\"] = \"your_token\"")
        
        # Limit text length to prevent API errors
        max_input_length = 1024  # Model limit
        if len(text) > max_input_length:
            text = text[:max_input_length]
        
        try:
            # Generate summary with Replicate
            output = replicate.run(
                self.model,
                input={
                    "inputs": text,
                    "min_length": min_length,
                    "max_length": max_length,
                    "temperature": 1.0,
                    "beam_size": 4
                }
            )
            
            # Output is a single string
            return output
            
        except Exception as e:
            logger.error(f"Error generating summary: {e}")
            return f"Error generating summary: {str(e)}"
    
    def summarize_documents(self, documents: List[Dict[str, Any]], max_length: int = 200) -> str:
        """
        Summarize a list of retrieved documents
        
        Args:
            documents: List of document dictionaries from retrieval
            max_length: Maximum length of the summary
            
        Returns:
            Generated summary of all documents
        """
        # Combine document texts
        combined_text = ""
        for doc in documents:
            combined_text += f"{doc['content']}\n\n"
        
        # Generate summary
        return self.summarize_text(combined_text, max_length=max_length)

# ====== RAG QA System ======

class RAGQASystem:
    def __init__(self):
        """Initialize the RAG QA system with Llama 2 via Replicate"""
        self.retriever = RAGRetriever()
        self.generator = Llama2Generator()
    
    def process_query(self, query_text: str, n_results: int = 5) -> Dict[str, Any]:
        """
        Process a query through the RAG QA pipeline
        
        Args:
            query_text: The query text
            n_results: Number of results to retrieve
            
        Returns:
            Dictionary with processed results
        """
        # Retrieve relevant documents
        retrieval_results = self.retriever.retrieve(query_text, n_results)
        
        # If no results were retrieved, return early
        if not retrieval_results["results"]:
            return {
                "query": query_text,
                "retrieved_docs": [],
                "context": "",
                "answer": "No relevant documents found for your query."
            }
        
        # Create context string from retrieved documents
        context = self.retriever.get_context_string(retrieval_results)
        
        # Generate answer
        try:
            answer = self.generator.generate_answer(query_text, context)
        except Exception as e:
            logger.error(f"Error generating answer: {e}")
            answer = f"Error generating answer: {str(e)}"
        
        # Return all results
        return {
            "query": query_text,
            "retrieved_docs": retrieval_results["results"],
            "context": context,
            "answer": answer
        }

# ====== Voice-Driven RAG System ======

class VoiceRAGSystem:
    def __init__(self, asr_model_size: str = "small"):
        """
        Initialize the Voice RAG system
        
        Args:
            asr_model_size: Size of the Whisper ASR model
        """
        self.asr_processor = ASRProcessor(model_size=asr_model_size)
        self.rag_qa_system = RAGQASystem()
        logger.info("Voice RAG system initialized")
    
    def process_audio_query(self, audio_path: str, n_results: int = 5) -> Dict[str, Any]:
        """
        Process an audio query through the ASR and RAG QA pipeline
        
        Args:
            audio_path: Path to the audio file
            n_results: Number of results to retrieve
            
        Returns:
            Dictionary with processing results
        """
        # Transcribe audio to text
        transcription = self.asr_processor.transcribe_audio(audio_path)
        query_text = transcription["text"]
        
        # Process the transcribed query through RAG QA
        qa_results = self.rag_qa_system.process_query(query_text, n_results)
        
        # Combine results
        return {
            "transcription": query_text,
            "retrieved_docs": qa_results["retrieved_docs"],
            "answer": qa_results["answer"]
        }

# ====== Gradio Interface Functions ======

# Global system instance (to avoid reloading models for each query)
voice_rag_system = None
summarizer = None

def initialize_systems():
    """Initialize the Voice RAG system and Summarizer once and cache them"""
    global voice_rag_system, summarizer
    if voice_rag_system is None:
        try:
            voice_rag_system = VoiceRAGSystem(asr_model_size="small")
        except Exception as e:
            logger.error(f"Failed to initialize Voice RAG system: {e}")
            raise ValueError(f"Failed to initialize Voice RAG system: {e}")
    
    if summarizer is None:
        try:
            summarizer = DocumentSummarizer()
        except Exception as e:
            logger.error(f"Failed to initialize Document Summarizer: {e}")
            summarizer = None
    
    return voice_rag_system, summarizer

def process_audio(audio_file, num_results):
    """Process audio file through the Voice RAG system"""
    try:
        # Initialize the systems if not already done
        system, _ = initialize_systems()
        
        # Process the audio query
        results = system.process_audio_query(audio_file, n_results=num_results)
        
        # Get transcription and answer
        transcription = results["transcription"]
        answer = results["answer"]
        
        # Format retrieved documents
        retrieved_docs = ""
        for i, doc in enumerate(results["retrieved_docs"]):
            retrieved_docs += f"**Document {i+1}:** Source: {doc['source']}\n\n"
            retrieved_docs += f"{doc['content'][:300]}...\n\n"
            retrieved_docs += "---\n\n"
        
        return transcription, answer, retrieved_docs
    
    except Exception as e:
        logger.error(f"Error processing audio: {e}")
        return f"Error: {str(e)}", "Failed to generate answer", "No documents retrieved"

def process_text_query(query, num_results):
    """Process a text query through the RAG QA system"""
    try:
        # Initialize the systems if not already done
        system, _ = initialize_systems()
        
        # Process the query directly through the RAG QA system
        results = system.rag_qa_system.process_query(query, n_results=num_results)
        
        # Get answer
        answer = results["answer"]
        
        # Format retrieved documents
        retrieved_docs = ""
        for i, doc in enumerate(results["retrieved_docs"]):
            retrieved_docs += f"**Document {i+1}:** Source: {doc['source']}\n\n"
            retrieved_docs += f"{doc['content'][:300]}...\n\n"
            retrieved_docs += "---\n\n"
        
        return answer, retrieved_docs
    
    except Exception as e:
        logger.error(f"Error processing query: {e}")
        return f"Error: {str(e)}", "No documents retrieved"

def generate_summary(retrieved_docs, max_length):
    """Generate a summary of the retrieved documents"""
    try:
        # Initialize the systems if not already done
        _, sum_tool = initialize_systems()
        
        if not sum_tool:
            return "Summarization tool not available."
        
        # Parse retrieved documents from markdown
        doc_texts = []
        lines = retrieved_docs.split('\n')
        current_doc = {"source": "", "content": ""}
        
        for line in lines:
            if line.startswith("**Document"):
                if current_doc["content"]:
                    doc_texts.append(current_doc)
                    current_doc = {"source": "", "content": ""}
                
                # Extract source
                source_match = re.search(r"Source: (.*?)$", line)
                if source_match:
                    current_doc["source"] = source_match.group(1)
            elif "---" not in line and current_doc["source"]:
                current_doc["content"] += line + " "
        
        # Add the last document
        if current_doc["content"]:
            doc_texts.append(current_doc)
        
        # Generate summary
        if doc_texts:
            summary = sum_tool.summarize_documents(doc_texts, max_length=int(max_length))
            return summary
        else:
            return "No documents to summarize."
    
    except Exception as e:
        logger.error(f"Error generating summary: {e}")
        return f"Error generating summary: {str(e)}"

# ====== Create Gradio Interface ======

def create_interface():
    """
    Create a Gradio interface for the Voice RAG system
    
    Returns:
        Gradio interface
    """
    # Define the interface
    with gr.Blocks(title="Voice-Driven RAG System") as demo:
        gr.Markdown("# Voice-Interactive RAG System with Llama 2")
        gr.Markdown("""
        Ask questions about the PDFs using voice or text input.
        
        **Important**: To use Llama 2 for answer generation and summarization,
        you need to set your Replicate API token in the code.
        """)
        
        with gr.Tab("Voice Input"):
            audio_input = gr.Audio(type="filepath", label="Record or Upload Audio")
            num_results_slider = gr.Slider(minimum=1, maximum=10, value=5, step=1, label="Number of Results")
            
            with gr.Row():
                submit_btn = gr.Button("Submit")
                clear_btn = gr.Button("Clear")
            
            transcription_output = gr.Textbox(label="Transcribed Query")
            answer_output = gr.Textbox(label="Generated Answer", lines=10)
            docs_output = gr.Markdown(label="Retrieved Documents")
            
            submit_btn.click(
                process_audio, 
                inputs=[audio_input, num_results_slider],
                outputs=[transcription_output, answer_output, docs_output]
            )
            
            clear_btn.click(
                lambda: (None, "", "", ""),
                inputs=None,
                outputs=[audio_input, transcription_output, answer_output, docs_output]
            )
        
        with gr.Tab("Text Input"):
            text_input = gr.Textbox(label="Enter your question", lines=2)
            num_results_text = gr.Slider(minimum=1, maximum=10, value=5, step=1, label="Number of Results")
            
            with gr.Row():
                text_submit_btn = gr.Button("Submit")
                text_clear_btn = gr.Button("Clear")
            
            text_answer_output = gr.Textbox(label="Generated Answer", lines=10)
            text_docs_output = gr.Markdown(label="Retrieved Documents")
            
            text_submit_btn.click(
                process_text_query,
                inputs=[text_input, num_results_text],
                outputs=[text_answer_output, text_docs_output]
            )
            
            text_clear_btn.click(
                lambda: ("", "", ""),
                inputs=None,
                outputs=[text_input, text_answer_output, text_docs_output]
            )
        
        with gr.Tab("Document Summary"):
            with gr.Row():
                summary_docs_input = gr.Markdown(label="Documents to Summarize")
                summary_length = gr.Slider(minimum=50, maximum=500, value=200, step=50, label="Summary Length (max tokens)")
            
            with gr.Row():
                summary_btn = gr.Button("Generate Summary")
                summary_clear_btn = gr.Button("Clear")
            
            summary_output = gr.Textbox(label="Generated Summary", lines=10)
            
            summary_btn.click(
                generate_summary,
                inputs=[summary_docs_input, summary_length],
                outputs=[summary_output]
            )
            
            summary_clear_btn.click(
                lambda: ("", ""),
                inputs=None,
                outputs=[summary_docs_input, summary_output]
            )
        
        gr.Markdown("### About This System")
        gr.Markdown("""
        This system combines:
        1. **Speech Recognition** (Whisper) for transcribing voice queries
        2. **Vector Search** (ChromaDB) for retrieving relevant document chunks
        3. **Text Generation** (Llama 2 via Replicate API) for producing grounded answers
        4. **Document Summarization** for creating abstractive summaries of documents
        
        The system only answers based on information found in the documents.
        """)
    
    # Download Whisper model proactively
    logger.info("Pre-downloading Whisper model...")
    whisper.load_model("small")
    
    return demo

# Main execution
if __name__ == "__main__":
    # Create and launch the interface
    demo = create_interface()
    demo.launch(share=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h

  checkpoint = torch.load(fp, map_location=device)


* Running on local URL:  http://127.0.0.1:7862
* Running on public URL: https://4ca8d49fe5d02b4715.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


  checkpoint = torch.load(fp, map_location=device)
