In [None]:
# Install Dependencies
# Installs the packages imported by the app written to app.py further down (streamlit,
# pymupdf/fitz, sentence-transformers, faiss-cpu, transformers, bitsandbytes, tiktoken,
# rank-bm25, pandas, plotly) and by the notebook cells themselves (pyngrok, python-dotenv).
# NOTE(review): accelerate is not imported directly; presumably it is needed for
# device_map="auto" when loading the model — confirm.
# NOTE(review): versions are unpinned, so a fresh run may resolve different releases;
# consider pinning once a known-good set is established.
%pip install streamlit pymupdf sentence-transformers faiss-cpu transformers accelerate bitsandbytes pyngrok python-dotenv tiktoken rank-bm25 pandas plotly


In [None]:
# Setup Authentication
import os
from getpass import getpass
from dotenv import load_dotenv
from pyngrok import ngrok
import huggingface_hub

# Load environment variables from a local .env file (if one exists)
load_dotenv()

# Prompt for the Hugging Face token without echoing it. It is exported through the
# process environment because the Streamlit app (started as a child process in the
# last cell) reads HUGGINGFACE_HUB_TOKEN from there.
hf_token = getpass("Enter your Hugging Face token: ").strip()
if not hf_token:
    # Fail here rather than later inside the app, where the cause is harder to spot.
    raise ValueError("A Hugging Face token is required to download the model.")
os.environ["HUGGINGFACE_HUB_TOKEN"] = hf_token

# Login to Hugging Face
huggingface_hub.login(token=hf_token)

# Register the ngrok token through pyngrok's API instead of shelling out with
# `!ngrok authtoken $token`: that form interpolates the secret into a shell command
# line and requires an `ngrok` executable on PATH, which pyngrok does not guarantee.
ngrok_token = os.getenv("NGROK_AUTH_TOKEN")
if ngrok_token:
    ngrok.set_auth_token(ngrok_token)
    print("Authentication setup complete!")
else:
    print("Please set NGROK_AUTH_TOKEN in .env file")


In [None]:
%%writefile app.py
import streamlit as st
import fitz
import re
import os
import tiktoken
import numpy as np
import faiss
from typing import List, Dict
from sentence_transformers import SentenceTransformer
from sentence_transformers import CrossEncoder
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
from rank_bm25 import BM25Okapi
import json

st.set_page_config(page_title="PDF Chat", layout="wide")

# CSS Styling
# The stylesheet is kept in a named constant so the injection call below stays short.
# It defines the dark page background, the chat bubble classes (.message.user /
# .message.bot), the pulsing .typing indicator and the fixed bottom .input-area.
PAGE_CSS = """
<style>
    header[data-testid="stHeader"] { display: none; }
    .stApp > div:first-child { padding-top: 0; }
    .main-container { max-width: 800px; margin: 0 auto; padding: 20px; padding-bottom: 100px; }
    .header { text-align: center; color: #ffffff; font-size: 24px; font-weight: 600; margin-bottom: 20px; }
    .chat-area { padding: 20px; margin-bottom: 20px; }
    .message { margin-bottom: 15px; display: flex; }
    .message.user { justify-content: flex-end; }
    .message.bot { justify-content: flex-start; }
    .message-content { max-width: 70%; padding: 12px 16px; border-radius: 18px; font-size: 14px; line-height: 1.4; word-wrap: break-word; }
    .message.user .message-content { background: #007bff; color: white; border-bottom-right-radius: 4px; }
    .message.bot .message-content { background: #f8f9fa; color: #333; border: 1px solid #e9ecef; border-bottom-left-radius: 4px; }
    .typing { background: #f8f9fa; color: #6c757d; padding: 12px 16px; border-radius: 18px; border-bottom-left-radius: 4px; font-style: italic; max-width: 70%; border: 1px solid #e9ecef; animation: pulse 1.5s infinite; }
    @keyframes pulse { 0%, 100% { opacity: 0.6; } 50% { opacity: 1; } }
    .input-area { position: fixed; bottom: 0; left: 0; right: 0; background: #000000; border-top: 1px solid #333333; padding: 15px; box-shadow: 0 -2px 4px rgba(0,0,0,0.3); z-index: 1000; }
    .stTextInput > div > div > input { border: 1px solid #ffffff; border-radius: 25px; padding: 12px 20px; font-size: 14px; background: #333333; color: #ffffff; height: 48px; transition: border-color 0.15s ease-in-out; }
    .stTextInput > div > div > input:focus { border-color: #ffffff; box-shadow: 0 0 0 0.2rem rgba(255,255,255,0.25); outline: none; }
    .stTextInput > div > div > input::placeholder { color: #cccccc; }
    .stTextInput { width: 100%; }
    .stApp { background: #000000; }
    .stSuccess { background: #d4edda; border: 1px solid #c3e6cb; color: #155724; border-radius: 8px; padding: 12px; margin-bottom: 20px; }
    .stInfo { background: #d1ecf1; border: 1px solid #bee5eb; color: #0c5460; border-radius: 8px; padding: 12px; margin-bottom: 10px; }
</style>
"""
st.markdown(PAGE_CSS, unsafe_allow_html=True)

# Enhanced RAG Classes
class HybridRetriever:
    """Hybrid dense (FAISS) + sparse (BM25) retriever with optional reranking.

    Attributes:
        embedding_model: object exposing ``encode(list_of_str)``.
        faiss_index: object exposing ``search(query_matrix, n)`` returning
            ``(distances, indices)``.
        bm25_index: object exposing ``get_scores(tokens)``, one score per chunk.
        metadata: dict whose ``"chunks"`` entry is a list of chunk dicts, each
            carrying at least a ``"text"`` key.
        reranker: optional object exposing ``predict(list_of_(query, text))``.
    """

    def __init__(self, embedding_model, faiss_index, bm25_index, metadata, reranker=None):
        self.embedding_model = embedding_model
        self.faiss_index = faiss_index
        self.bm25_index = bm25_index
        self.metadata = metadata
        self.reranker = reranker

    def retrieve(self, query: str, k: int = 5) -> List[Dict]:
        """Return the top ``k`` chunks for ``query``.

        Each returned chunk is a copy of the stored chunk dict, extended with
        ``dense_score``, ``sparse_score``, ``combined_score``, ``final_score``,
        ``rank`` and, when it was reranked, ``rerank_score``.

        Args:
            query: the user question.
            k: number of chunks to return.

        Returns:
            Up to ``k`` chunk dicts ordered best-first.
        """
        chunks = self.metadata["chunks"]
        pool_size = k * 2

        # Dense retrieval (semantic)
        query_embedding = self.embedding_model.encode([query])[0]
        distances, dense_indices = self.faiss_index.search(
            query_embedding.reshape(1, -1).astype('float32'), pool_size
        )
        # FAISS pads the result with index -1 when the index holds fewer than
        # pool_size vectors. A bare `idx < len(chunks)` check lets -1 through
        # and silently aliases the last chunk, so negative ids are dropped.
        dense_distance = {
            int(idx): float(dist)
            for idx, dist in zip(dense_indices[0], distances[0])
            if 0 <= idx < len(chunks)
        }

        # Sparse retrieval (BM25)
        bm25_scores = np.asarray(self.bm25_index.get_scores(query.split()), dtype=float)
        sparse_indices = np.argsort(bm25_scores)[::-1][:pool_size]
        max_bm25 = float(np.max(bm25_scores)) if bm25_scores.size else 0.0

        # Combine and deduplicate (sorted so that ties are broken deterministically)
        candidate_ids = sorted(
            set(dense_distance) | {int(i) for i in sparse_indices if 0 <= i < len(chunks)}
        )

        scored = []
        for idx in candidate_ids:
            chunk = chunks[idx].copy()

            # Map L2 distance to (0, 1]; chunks not returned by FAISS get 0.
            dense_score = 1 / (1 + dense_distance[idx]) if idx in dense_distance else 0

            # Normalise by the best BM25 score. A non-positive maximum means no
            # usable lexical signal, and dividing by it would distort the scale.
            if max_bm25 > 0 and idx < bm25_scores.size:
                sparse_score = float(bm25_scores[idx]) / (max_bm25 + 1e-8)
            else:
                sparse_score = 0.0

            combined_score = 0.6 * dense_score + 0.4 * sparse_score
            chunk["dense_score"] = dense_score
            chunk["sparse_score"] = sparse_score
            chunk["combined_score"] = combined_score
            # Default; overwritten below for chunks that are reranked.
            chunk["final_score"] = combined_score
            scored.append(chunk)

        # Sort by combined score
        scored.sort(key=lambda c: c["combined_score"], reverse=True)

        # Rerank the head of the list if a reranker is available. The previous
        # code attached an `else:` to the `for` loop, which always ran and reset
        # every final_score to combined_score, cancelling the reranking.
        if self.reranker is not None and len(scored) > k:
            head, tail = scored[:pool_size], scored[pool_size:]
            rerank_scores = self.reranker.predict([(query, c["text"]) for c in head])
            # NOTE(review): reranker scores are blended as returned by predict();
            # if they are unbounded logits they dominate the blend — confirm scaling.
            for chunk, score in zip(head, rerank_scores):
                chunk["rerank_score"] = float(score)
                chunk["final_score"] = 0.7 * chunk["combined_score"] + 0.3 * float(score)
            head.sort(key=lambda c: c["final_score"], reverse=True)
            # Chunks that were not reranked stay behind the reranked head: their
            # final_score is on a different scale and must not be compared to it.
            scored = head + tail

        # Return top k with ranking
        top_chunks = scored[:k]
        for position, chunk in enumerate(top_chunks, start=1):
            chunk["rank"] = position
        return top_chunks

class EnhancedRAGSystem:
    """Answers questions by retrieving chunks and prompting a causal language model.

    Attributes:
        retriever: object exposing ``retrieve(question, k=...)`` returning chunk dicts.
        model: causal LM exposing ``parameters()`` and ``generate(...)``.
        tokenizer: tokenizer matching ``model`` (callable, with ``decode`` and
            ``eos_token_id``).
    """

    def __init__(self, retriever, model, tokenizer):
        self.retriever = retriever
        self.model = model
        self.tokenizer = tokenizer

    def answer_question(self, question: str, k: int = 5) -> Dict:
        """Retrieve ``k`` chunks for ``question`` and generate a cited answer.

        Returns:
            Dict with keys ``question``, ``answer``, ``retrieved_chunks``,
            ``context`` (numbered chunk texts joined by blank lines) and
            ``retrieval_scores`` (per-chunk score breakdown keyed ``chunk_<n>``).
        """
        retrieved_chunks = self.retriever.retrieve(question, k=k)

        # Number each chunk so the model can cite it as [1], [2], ...
        context = "\n\n".join(
            f"[{position}] {chunk['text']}"
            for position, chunk in enumerate(retrieved_chunks, start=1)
        )

        # Enhanced prompt with citation instructions
        # NOTE(review): the literal "<s>" may duplicate the BOS token if the
        # tokenizer also prepends one — confirm against the model's chat format.
        rag_prompt = f"""<s>[INST] You are an expert AI assistant. Answer the question based ONLY on the provided context. Use citations [1], [2], etc. to reference specific sources. If the answer is not in the context, say "I cannot find this information in the provided context."

CONTEXT:
{context}

QUESTION: {question}

Please provide a comprehensive answer with proper citations based on the context: [/INST]"""

        response = self.generate_response(rag_prompt)

        retrieval_scores = {}
        for position, chunk in enumerate(retrieved_chunks, start=1):
            retrieval_scores[f"chunk_{position}"] = {
                "dense_score": chunk.get("dense_score", 0),
                "sparse_score": chunk.get("sparse_score", 0),
                "combined_score": chunk.get("combined_score", 0),
                "final_score": chunk.get("final_score", chunk.get("combined_score", 0)),
            }

        return {
            "question": question,
            "answer": response,
            "retrieved_chunks": retrieved_chunks,
            "context": context,
            "retrieval_scores": retrieval_scores,
        }

    def generate_response(self, prompt: str, max_length: int = 512) -> str:
        """Generate a completion for ``prompt`` and return only the new text.

        Args:
            prompt: full prompt text; truncated to 2048 tokens by the tokenizer call.
            max_length: maximum number of newly generated tokens.

        Returns:
            The decoded completion with surrounding whitespace stripped.
        """
        inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)

        # Move inputs to the same device as the model
        device = next(self.model.parameters()).device
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        prompt_token_count = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_length,
                temperature=0.2,
                top_p=0.85,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                eos_token_id=self.tokenizer.eos_token_id,
            )

        # Decode only the tokens produced after the prompt. Decoding the whole
        # sequence and then searching for the raw prompt string fails whenever
        # decoding does not round-trip the prompt (special tokens such as "<s>"
        # are stripped, or the prompt was truncated), which leaked the entire
        # prompt and context into the answer.
        generated_tokens = outputs[0][prompt_token_count:]
        return self.tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

# Utility Functions
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at ``pdf_path``.

    The document is opened in a ``with`` block so it is closed even when
    reading a page raises.
    """
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

def clean_text(text):
    """Normalise extracted PDF text into a single whitespace-collapsed string.

    Lines that contain only digits (typically page numbers) are removed first,
    then every run of whitespace, newlines included, becomes one space.

    The order matters: collapsing whitespace first removes all newlines, after
    which the line-anchored page-number rule can never match a line inside the
    text.
    """
    # Remove digit-only lines while line boundaries still exist.
    text = re.sub(r'^[ \t]*\d+[ \t]*$', '', text, flags=re.MULTILINE)
    # Collapse all whitespace (spaces, tabs, newlines) to single spaces.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def count_tokens(text: str) -> int:
    """Count the tokens in ``text`` using the tiktoken encoding for "gpt-3.5-turbo"."""
    tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
    token_ids = tokenizer.encode(text)
    return len(token_ids)

def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[Dict]:
    """Split ``text`` into sentence-aligned chunks of roughly ``chunk_size`` tokens.

    Sentences are appended to the current chunk until adding one more would
    exceed ``chunk_size`` tokens (as measured by ``count_tokens``). The chunk is
    then emitted and the next one starts with the trailing ``overlap``
    characters of the previous chunk followed by the new sentence.

    Args:
        text: input text.
        chunk_size: token budget per chunk. A single sentence longer than this
            still becomes its own chunk.
        overlap: number of trailing *characters* carried into the next chunk;
            values <= 0 disable overlap.

    Returns:
        List of dicts with keys ``id``, ``text``, ``token_count``, ``char_count``.
    """
    # Split after sentence-ending punctuation followed by whitespace. This keeps
    # the punctuation in the chunk text and does not break inside tokens such as
    # "3.14" or "v0.1", which a plain split on [.!?]+ would do.
    sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
    chunks: List[Dict] = []
    current_chunk = ""

    def _emit(chunk_body: str) -> None:
        """Append ``chunk_body`` to ``chunks`` with the next sequential id."""
        chunks.append({
            "id": len(chunks),
            "text": chunk_body.strip(),
            "token_count": count_tokens(chunk_body),
            "char_count": len(chunk_body),
        })

    for sentence in sentences:
        test_chunk = current_chunk + " " + sentence if current_chunk else sentence
        if count_tokens(test_chunk) > chunk_size and current_chunk:
            _emit(current_chunk)
            # `s[-0:]` is the whole string, so overlap=0 used to carry the entire
            # previous chunk forward and chunks grew without bound.
            overlap_text = current_chunk[-overlap:] if overlap > 0 else ""
            current_chunk = overlap_text + " " + sentence if overlap_text else sentence
        else:
            current_chunk = test_chunk

    if current_chunk.strip():
        _emit(current_chunk)
    return chunks

# Load Models
@st.cache_resource
def load_embedding_model():
    """Build the "all-MiniLM-L6-v2" SentenceTransformer used for chunk and query embeddings."""
    sentence_encoder = SentenceTransformer("all-MiniLM-L6-v2")
    return sentence_encoder

@st.cache_resource
def load_reranker():
    """Build the MS MARCO MiniLM cross-encoder used to rerank retrieved chunks."""
    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
    return cross_encoder

@st.cache_resource
def load_mistral_model():
    """Load the 4-bit quantised Mistral-7B-Instruct model and its tokenizer.

    Returns:
        Tuple ``(model, tokenizer)``.

    Raises:
        ValueError: if HUGGINGFACE_HUB_TOKEN is not set in the environment.
    """
    model_id = "mistralai/Mistral-7B-Instruct-v0.1"

    # Check the token first so a missing credential fails before any model work.
    hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")
    if not hf_token:
        raise ValueError("Hugging Face token not found. Please run the authentication cell first.")

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
    if tokenizer.pad_token is None:
        # generate() needs a pad token; reuse EOS when the tokenizer defines none.
        tokenizer.pad_token = tokenizer.eos_token

    # trust_remote_code is intentionally not enabled: it lets code shipped in the
    # model repository run locally, and this architecture is loaded through the
    # standard AutoModelForCausalLM path.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
        token=hf_token,
    )
    return model, tokenizer

# Main App
st.markdown('<div class="main-container">', unsafe_allow_html=True)
st.markdown('<div class="header">RAG PDF Chat System</div>', unsafe_allow_html=True)

# Seed session state on the first run only; later reruns keep the stored values.
for state_key, initial_value in (
    ("conversation", []),
    ("processing", False),
    ("rag_system", None),
):
    if state_key not in st.session_state:
        st.session_state[state_key] = initial_value

# Upload PDFs
import html
import tempfile

uploaded_files = st.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)


def _build_rag_system(pdf_files):
    """Extract, chunk and index the uploaded PDFs and assemble the RAG pipeline.

    Returns:
        Tuple ``(rag_system, chunk_count)``. ``rag_system`` is None when no text
        could be extracted, so there is nothing to index.
    """
    # Extract text from all PDFs
    all_pdf_text = ""
    for pdf_file in pdf_files:
        # Use a uniquely named temp file instead of a path built from the
        # client-supplied file name, and remove it even if extraction fails.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
            tmp_file.write(pdf_file.getbuffer())
            tmp_path = tmp_file.name
        try:
            raw_text = extract_text_from_pdf(tmp_path)
        finally:
            os.remove(tmp_path)
        all_pdf_text += f" {clean_text(raw_text)}"

    # Chunk the text
    chunks = chunk_text(all_pdf_text, chunk_size=500, overlap=50)
    if not chunks:
        # No text was extracted; building the index would fail on an empty
        # embedding matrix.
        return None, 0

    # Generate embeddings
    embedding_model = load_embedding_model()
    texts = [chunk["text"] for chunk in chunks]
    embeddings = embedding_model.encode(texts)

    # Add embeddings to chunks
    chunks_with_embeddings = [
        dict(chunk, embedding=embedding) for chunk, embedding in zip(chunks, embeddings)
    ]

    # Create FAISS index
    embeddings_array = np.asarray(embeddings, dtype='float32')
    faiss_index = faiss.IndexFlatL2(embeddings_array.shape[1])
    faiss_index.add(embeddings_array)

    # Create BM25 index
    bm25_index = BM25Okapi([text.split() for text in texts])

    # Create hybrid retriever
    retriever = HybridRetriever(
        embedding_model, faiss_index, bm25_index, {"chunks": chunks_with_embeddings}, load_reranker()
    )

    # Load Mistral model and create the RAG system
    mistral_model, mistral_tokenizer = load_mistral_model()
    return EnhancedRAGSystem(retriever, mistral_model, mistral_tokenizer), len(chunks)


def _render_message(role, text):
    """Render one chat bubble; ``text`` is HTML-escaped before being embedded."""
    css_role = "user" if role == "user" else "bot"
    # The bubble goes through unsafe_allow_html, so user input and model output
    # must be escaped. Newlines become <br> so blank lines neither end the HTML
    # block nor get collapsed.
    safe_text = html.escape(str(text)).replace("\n", "<br>")
    st.markdown(
        f'''<div class="message {css_role}"><div class="message-content">{safe_text}</div></div>''',
        unsafe_allow_html=True,
    )


def _render_retrieval_analysis(result):
    """Show per-chunk retrieval scores and a grouped bar chart for ``result``."""
    with st.expander("🔍 Retrieval Analysis", expanded=False):
        st.write("**Retrieved Chunks with Scores:**")

        score_data = []
        for i, chunk in enumerate(result["retrieved_chunks"]):
            scores = result["retrieval_scores"][f"chunk_{i+1}"]
            col1, col2 = st.columns([3, 1])

            with col1:
                st.write(f"**Chunk {i+1}:** {chunk['text'][:200]}...")

            with col2:
                st.metric("Dense Score", f"{scores['dense_score']:.3f}")
                st.metric("Sparse Score", f"{scores['sparse_score']:.3f}")
                st.metric("Final Score", f"{scores['final_score']:.3f}")

            st.divider()
            score_data.append({
                "Chunk": f"Chunk {i+1}",
                "Dense": scores['dense_score'],
                "Sparse": scores['sparse_score'],
                "Final": scores['final_score']
            })

        # Show score distribution
        st.write("**Score Distribution:**")
        import pandas as pd
        import plotly.express as px

        df = pd.DataFrame(score_data)
        fig = px.bar(df, x="Chunk", y=["Dense", "Sparse", "Final"],
                     title="Retrieval Scores by Chunk", barmode='group')
        st.plotly_chart(fig, use_container_width=True)


def _run_chat():
    """Render the conversation, accept a new question and answer the pending one."""
    # Chat interface
    st.markdown('<div class="chat-area">', unsafe_allow_html=True)
    for turn in st.session_state.conversation:
        _render_message(turn["role"], turn["text"])

    # Show typing indicator
    if st.session_state.processing:
        st.markdown('<div class="typing">Bot is thinking...</div>', unsafe_allow_html=True)
    st.markdown('</div>', unsafe_allow_html=True)

    # Input area
    st.markdown('<div class="input-area">', unsafe_allow_html=True)

    # A new widget key gives a fresh, empty text box after each submission.
    if "input_key" not in st.session_state:
        st.session_state.input_key = 0
    if st.session_state.get("input_cleared", False):
        st.session_state.input_key += 1
        st.session_state.input_cleared = False

    # A non-empty label with collapsed visibility replaces the empty label.
    user_input = st.text_input(
        "Question",
        placeholder="Ask a question about your PDFs...",
        key=f"user_input_{st.session_state.input_key}",
        disabled=st.session_state.processing,
        label_visibility="collapsed",
    )
    st.markdown('</div>', unsafe_allow_html=True)

    # Handle input submission
    if user_input and not st.session_state.processing:
        if "last_input" not in st.session_state or st.session_state.last_input != user_input:
            st.session_state.conversation.append({"role": "user", "text": user_input})
            st.session_state.processing = True
            st.session_state.last_input = user_input
            st.session_state.input_cleared = True
            st.rerun()

    # Process bot response
    conversation = st.session_state.conversation
    if st.session_state.processing and conversation and conversation[-1]["role"] == "user":
        user_question = conversation[-1]["text"]
        try:
            with st.spinner("Generating response..."):
                result = st.session_state.rag_system.answer_question(user_question, k=5)
            # Store result for visualization
            st.session_state.last_result = result
            answer = result["answer"]
        except Exception as exc:
            # Without this, a failure left `processing` set and the input
            # disabled for the rest of the session.
            answer = f"Sorry, an error occurred while generating the answer: {exc}"
        conversation.append({"role": "bot", "text": answer})
        st.session_state.processing = False
        # Kept outside the try block so Streamlit's rerun signal is never caught.
        st.rerun()

    # Show retrieval visualization if available
    if "last_result" in st.session_state and st.session_state.last_result:
        _render_retrieval_analysis(st.session_state.last_result)


if uploaded_files:
    # Rebuild the index whenever the set of uploaded files changes, not only on
    # the first upload; otherwise PDFs added later are silently ignored.
    files_signature = tuple((f.name, f.size) for f in uploaded_files)
    if st.session_state.get("rag_files") != files_signature:
        with st.spinner("Processing PDFs and building RAG system..."):
            rag_system, chunk_count = _build_rag_system(uploaded_files)
        st.session_state.rag_system = rag_system
        # Kept in session state: script locals do not survive a rerun.
        st.session_state.chunk_count = chunk_count
        st.session_state.rag_files = files_signature
        st.session_state.last_result = None

    if st.session_state.rag_system is None:
        st.error("No extractable text was found in the uploaded PDFs.")
    else:
        # Display file information
        st.success("PDFs processed successfully!")
        st.info(f"Total chunks: {st.session_state.get('chunk_count', 'N/A')}")
        _run_chat()

st.markdown('</div>', unsafe_allow_html=True)


In [None]:
# Run the Application
import time
import subprocess
import os

# Stop any Streamlit server or ngrok tunnel left over from a previous run
os.system("pkill -f streamlit")
ngrok.kill()
time.sleep(2)

# Launch the Streamlit server in the background and give it a few seconds to start
streamlit_command = ["streamlit", "run", "app.py", "--server.port=8501", "--server.headless=true"]
subprocess.Popen(streamlit_command)
time.sleep(5)

# Open an ngrok tunnel to the local port and print where the app can be reached
public_url = ngrok.connect(8501)
print("RAG PDF Chat System URL:", public_url)