In [2]:
# =================== INSTALLS =====================
# Notebook shell commands (leading "!") that pip-install the third-party
# packages this cell depends on before the imports below run.
!pip install pymupdf
!pip install langchain_community
!pip install faiss-cpu
# NOTE(review): faiss-cpu and faiss-gpu presumably both provide the same
# `faiss` module, so installing both may be redundant or conflict — confirm
# which build this environment actually needs.
!pip install faiss-gpu
!pip install rank_bm25

# =================== IMPORTS =====================
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain.retrievers import BM25Retriever, EnsembleRetriever

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import textwrap
import warnings
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import time
import pandas as pd  # <== CSV export

# Suppress every warning for the rest of the session to keep the output clean.
# NOTE(review): this presumably also hides deprecation warnings raised by the
# libraries imported above — confirm that is intended.
warnings.filterwarnings("ignore")

# =================== LOAD & SPLIT DOCS =====================
# Load the PDF and chunk it exactly once.
# `load()` is used rather than `load_and_split()`: the latter already runs a
# default text splitter, so following it with the splitter below chunked the
# text twice with two different configurations.
loader = PyMuPDFLoader("/kaggle/input/booooook/Harry Potter Book.pdf")
pages = loader.load()
# 1000-character chunks; consecutive chunks may share up to 750 characters.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=750)
chunks = text_splitter.split_documents(pages)

# =================== EMBEDDINGS & VECTOR STORE =====================
# Embed every chunk with the "thenlper/gte-large" model and index the vectors
# in FAISS. `embedding_model` is also reused by the scoring and MMR helpers
# further down.
embedding_model = HuggingFaceEmbeddings(model_name="thenlper/gte-large")
vectordb = FAISS.from_documents(chunks, embedding_model)
# Persist the index under ./faiss_index. Nothing visible in this cell loads it
# back; the in-memory `vectordb` is what the retrievers use.
vectordb.save_local("faiss_index")

# =================== RETRIEVERS =====================
# Keyword retriever built over the same chunks, returning 8 documents per query.
bm25 = BM25Retriever.from_documents(chunks)
bm25.k = 8

# Dense retriever over the FAISS index, also returning 8 documents per query.
semantic_retriever = vectordb.as_retriever(search_kwargs={"k": 8})
# Equal-weight combination of the dense and keyword retrievers.
# NOTE(review): the ensemble may return more than 8 documents after merging
# both result lists — confirm; build_prompt only places 8 in the prompt.
hybrid_retriever = EnsembleRetriever(retrievers=[semantic_retriever, bm25], weights=[0.5, 0.5])

# =================== MODEL =====================
# Load the generator model and tokenizer and wrap them in a text-generation
# pipeline used by the benchmark loop below.
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
# NOTE(review): trust_remote_code=True presumably allows code shipped with the
# model repository to run locally — confirm it is required for this model.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Sampling is disabled and each answer is capped at 500 new tokens.
# NOTE(review): the captured output looks like step-by-step reasoning; the
# 500-token cap may cut answers short — confirm.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, return_full_text=False, max_new_tokens=500, do_sample=False)

# =================== HELPERS =====================
def format_response(doc):
    """Render one retrieved document as a single ``Page <n>: <text>`` line.

    The page label is read from ``doc.metadata['page']`` and falls back to
    ``'Unknown'`` when that key is absent. The text is ``doc.page_content``
    with leading and trailing whitespace removed.
    """
    page_label = doc.metadata.get('page', 'Unknown')
    body = doc.page_content.strip()
    return f"Page {page_label}: {body}"

def build_prompt(docs, question, max_docs=8):
    """Build the LLM prompt from retrieved documents and the user question.

    Args:
        docs: Retrieved documents, most relevant first. Only the first
            ``max_docs`` are placed in the prompt; any further ones are ignored.
        question: The user's question, inserted verbatim.
        max_docs: Number of context lines in the prompt. Defaults to 8, which
            reproduces the original fixed eight-slot layout.

    Returns:
        The prompt string. When fewer than ``max_docs`` documents are given,
        the remaining slots are filled with "No relevant information." so the
        prompt always has the same number of context lines.
    """
    retrieved_responses = [format_response(doc) for doc in list(docs)[:max_docs]]
    # A negative repeat count yields an empty list, so no padding is added
    # when enough documents were retrieved.
    missing = max_docs - len(retrieved_responses)
    retrieved_responses.extend(["No relevant information."] * missing)
    retrieved_block = "\n".join(retrieved_responses)
    return f"""
You are an AI assistant tasked with answering questions based on retrieved knowledge.

### Retrieved Information:
{retrieved_block}

### Question:
{question}

### Instructions:
- Integrate the key points from all retrieved responses into a cohesive, well-structured answer.
- If responses are contradictory, mention the different perspectives.
- If no relevant info, say: "I couldn't find a good response to your query in the database."
"""

def get_similarity_scores(docs, response_text):
    """Score a generated answer against the documents it was built from.

    Both scores are cosine similarities between the embedding of
    ``response_text`` and the embeddings of each document's ``page_content``,
    computed with the module-level ``embedding_model``.

    Args:
        docs: Retrieved documents exposing ``page_content``.
        response_text: The generated answer.

    Returns:
        A ``(faithfulness_score, avg_relevancy_score)`` tuple: the highest and
        the mean similarity over ``docs``. Returns ``(0.0, 0.0)`` when ``docs``
        is empty, where there is nothing to compare against.
    """
    if not docs:
        # max() of an empty sequence would raise; report "no similarity" instead.
        return 0.0, 0.0

    retrieved_embeddings = embedding_model.embed_documents([doc.page_content for doc in docs])
    generated_text_embedding = embedding_model.embed_query(response_text)
    # One batched call yields an (n_docs, 1) matrix; flatten it to a vector.
    similarities = cosine_similarity(retrieved_embeddings, [generated_text_embedding]).ravel()
    faithfulness_score = similarities.max()
    avg_relevancy_score = similarities.mean()
    return faithfulness_score, avg_relevancy_score

# =================== CUSTOM RETRIEVERS =====================
def mmr_manual(query, documents, top_k=8, lambda_param=0.5):
    """Select up to ``top_k`` documents by Maximal Marginal Relevance.

    Each round picks the not-yet-selected document that maximises
    ``lambda_param * sim(doc, query) - (1 - lambda_param) * max_sim(doc, selected)``,
    where similarities are cosine similarities of ``embedding_model``
    embeddings and the second term is 0 while nothing has been selected.

    Args:
        query: Query text.
        documents: Candidate documents exposing ``page_content``. Every
            candidate is embedded on each call.
        top_k: Maximum number of documents to return.
        lambda_param: Weight of query relevance versus redundancy.

    Returns:
        The selected documents in selection order; an empty list when there
        are no candidates or ``top_k`` is not positive.
    """
    if not documents or top_k <= 0:
        return []

    query_emb = embedding_model.embed_query(query)
    doc_embs = np.asarray(
        embedding_model.embed_documents([doc.page_content for doc in documents])
    )
    n_docs = len(documents)

    # Relevance to the query is the same in every round, so compute it once.
    sim_query = cosine_similarity(doc_embs, [query_emb]).ravel()
    # Highest similarity of each candidate to any already-selected document.
    redundancy = np.zeros(n_docs)
    available = np.ones(n_docs, dtype=bool)

    selected = []
    while len(selected) < top_k and len(selected) < n_docs:
        if selected:
            last = selected[-1]
            sim_to_last = cosine_similarity(doc_embs, doc_embs[last:last + 1]).ravel()
            # After the first pick the penalty is the true maximum over the
            # selected set (which may be negative), not a floor of 0.
            if len(selected) == 1:
                redundancy = sim_to_last
            else:
                redundancy = np.maximum(redundancy, sim_to_last)

        mmr_scores = lambda_param * sim_query - (1 - lambda_param) * redundancy
        # Exclude documents that were already chosen.
        mmr_scores[~available] = -np.inf
        best_idx = int(np.argmax(mmr_scores))
        selected.append(best_idx)
        available[best_idx] = False
    return [documents[i] for i in selected]

def rrf_manual(query, docs1, docs2, k=8, k_factor=60):
    """Fuse two ranked document lists with Reciprocal Rank Fusion.

    A document at zero-based position ``i`` in a list contributes
    ``1 / (k_factor + i + 1)`` to its score. Documents are identified by
    ``page_content``, so the same text appearing in both lists accumulates
    both contributions. ``query`` is accepted but not used.

    Returns:
        Up to ``k`` unique documents, highest fused score first. For text seen
        more than once, the last document object encountered is returned.
    """
    fused_scores = {}
    doc_by_content = {}
    for ranking in (docs1, docs2):
        for position, doc in enumerate(ranking):
            content = doc.page_content
            contribution = 1 / (k_factor + position + 1)
            fused_scores[content] = fused_scores.get(content, 0) + contribution
            # Later occurrences replace the stored object but keep the
            # first-seen ordering of the key.
            doc_by_content[content] = doc

    ranked = sorted(
        doc_by_content.values(),
        key=lambda candidate: fused_scores[candidate.page_content],
        reverse=True,
    )
    return ranked[:k]

# =================== MAIN =====================
# Single benchmark question and the per-method results collected for it.
question = "Who was Hagrid?"
results = {}

# Display name -> callable taking the query string and returning documents.
# Insertion order is the order the methods are run and reported in.
retrieval_methods = {
    "Semantic Search": lambda q: semantic_retriever.get_relevant_documents(q),
    "Keyword Search (BM25)": lambda q: bm25.get_relevant_documents(q),
    "Hybrid Search": lambda q: hybrid_retriever.get_relevant_documents(q),
    # Runs MMR over the full `chunks` list, so mmr_manual embeds every chunk
    # on each call.
    "MMR Search": lambda q: mmr_manual(q, chunks, top_k=8),
    # Fuses the dense and BM25 rankings; both retrievers are queried here.
    "Reciprocal Rank Fusion": lambda q: rrf_manual(q, semantic_retriever.get_relevant_documents(q), bm25.get_relevant_documents(q), k=8)
}

# Run every retrieval method on the question, generate an answer from the
# retrieved context, then record similarity scores and timings.
# Intervals use time.perf_counter(), a monotonic clock meant for measuring
# durations, rather than wall-clock time.time(), which can jump.
for name, retrieval_func in retrieval_methods.items():
    print(f"\n=== {name} ===")

    start_total = time.perf_counter()
    start_retrieve = time.perf_counter()
    docs = retrieval_func(question)
    end_retrieve = time.perf_counter()

    prompt = build_prompt(docs, question)
    start_infer = time.perf_counter()
    output = generator([{"role": "user", "content": prompt}])
    end_infer = time.perf_counter()

    # Total covers retrieval, prompt construction and generation; the scoring
    # below is deliberately outside the timed span.
    end_total = time.perf_counter()

    # Compute each duration once so the stored and printed values always agree.
    retrieval_time = end_retrieve - start_retrieve
    inference_time = end_infer - start_infer
    total_time = end_total - start_total

    response_text = output[0]["generated_text"]
    print("Answer:\n", textwrap.fill(response_text, width=80))

    faith, rel = get_similarity_scores(docs, response_text)
    results[name] = {
        "faithfulness": faith,
        "relevance": rel,
        "retrieval_time": retrieval_time,
        "inference_time": inference_time,
        "total_time": total_time
    }

    print(f"Faithfulness Score: {faith:.4f}")
    print(f"Relevancy Score: {rel:.4f}")
    print(f"Retrieval Time: {retrieval_time:.2f}s")
    print(f"Inference Time: {inference_time:.2f}s")
    print(f"Total Time: {total_time:.2f}s")

# =================== SUMMARY =====================
# Print one fixed-width row per retrieval method, in the order they were run.
print("\n=== Summary Table ===")
# Column widths in the header match the widths used for the data rows below.
print(f"{'Method':35} | {'Faith':>7} | {'Relevancy':>9} | {'Retrieval(s)':>13} | {'Inference(s)':>13} | {'Total(s)':>9}")
print("-" * 95)
for method, scores in results.items():
    # Scores are shown with 4 decimals, timings (seconds) with 2.
    print(f"{method:35} | {scores['faithfulness']:7.4f} | {scores['relevance']:9.4f} | {scores['retrieval_time']:13.2f} | {scores['inference_time']:13.2f} | {scores['total_time']:9.2f}")

# =================== SAVE TO CSV =====================
# Flatten `results` into one row per retrieval method with human-readable
# column names, then write it to retrieval_results.csv in the working
# directory without the DataFrame index column.
df = pd.DataFrame([
    {
        "Method": method,
        "Faithfulness Score": res["faithfulness"],
        "Relevance Score": res["relevance"],
        "Retrieval Time (s)": res["retrieval_time"],
        "Inference Time (s)": res["inference_time"],
        "Total Time (s)": res["total_time"],
    }
    for method, res in results.items()
])
df.to_csv("retrieval_results.csv", index=False)
print("\nResults saved to 'retrieval_results.csv'")




Device set to use cuda:0



=== Semantic Search ===
Answer:
 Okay, so I need to figure out who Hagrid is based on the information provided.
Let me start by reading through all the retrieved information carefully.  First,
I see that Hagrid is a character from Harry Potter and the Sorcerer's Stone.
He's the wizard who is being discussed in the context of being a wizard, but
he's also a bit of a troublemaker. There are several quotes from him that show
his personality.  Looking at the first paragraph, Hagrid says, "I think you must
have made a mistake. I don’t think I can be a wizard." Then he says, "I don’t
think I can be a wizard." This seems to confirm that he's not a wizard but a
wizard who's been trying to be one.  In the second paragraph, Hagrid says, "I
think you must have made a mistake. I don’t think I can be a wizard." He also
mentions that he tried to defeat the greatest sorcerer in the world but was
unsuccessful. This suggests that he's been trying to be a wizard but hasn't
succeeded.  The third paragra