In [1]:
# =================== INSTALLS =====================
# Notebook shell escapes: each line below runs pip from the notebook session.
!pip install pymupdf
!pip install langchain_community
# NOTE(review): both faiss-cpu and faiss-gpu are installed; they presumably
# provide the same `faiss` module, so one of the two is likely redundant.
# Confirm which build the runtime is meant to use.
!pip install faiss-cpu
!pip install faiss-gpu
!pip install rank_bm25


# =================== IMPORTS =====================
import textwrap
import warnings
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

from langchain_community.document_loaders import PyMuPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain.retrievers import BM25Retriever, EnsembleRetriever

# Suppress every warning for the rest of the session. This also hides any
# deprecation notices emitted by the libraries imported above.
warnings.filterwarnings("ignore")

# =================== LOAD & SPLIT DOCS =====================
# Read the source PDF from the Kaggle input directory.
loader = PyMuPDFLoader("/kaggle/input/boooooook/Harry Potter Book.pdf")
# NOTE(review): load_and_split() presumably already applies a default text
# splitter, and its result is split again just below. Confirm whether a plain
# loader.load() was intended here.
pages = loader.load_and_split()
# chunk_size=1000 with chunk_overlap=750 (characters by default, TODO confirm):
# neighbouring chunks share most of their text, which enlarges the index and
# makes near-duplicate retrieval hits likely.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=750)
chunks = text_splitter.split_documents(pages)

# =================== EMBEDDINGS & VECTOR STORE =====================
# One embedding model is shared by the FAISS index and by the scoring and MMR
# helpers defined further down in this cell.
embedding_model = HuggingFaceEmbeddings(model_name="thenlper/gte-large")
# Embed every chunk and build a FAISS index over the resulting vectors.
vectordb = FAISS.from_documents(chunks, embedding_model)
# Persist the index under the local path "faiss_index"; nothing in this cell
# loads it back.
vectordb.save_local("faiss_index")

# =================== RETRIEVERS =====================
# Keyword (BM25) retriever built over the same chunks, configured for 8 hits.
bm25 = BM25Retriever.from_documents(chunks)
bm25.k = 8
# Dense retriever backed by the FAISS index, also configured for 8 hits.
semantic_retriever = vectordb.as_retriever(search_kwargs={"k": 8})
# Equal-weight ensemble of the dense and keyword retrievers.
# NOTE(review): the merged result may hold more than 8 documents (up to 8 from
# each retriever), while build_prompt only has 8 context slots. Confirm this
# is the intended behaviour.
hybrid_retriever = EnsembleRetriever(retrievers=[semantic_retriever, bm25], weights=[0.5, 0.5])

# =================== MODEL =====================
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
# device_map="auto" and torch_dtype="auto" leave device placement and numeric
# precision to the library's defaults for this checkpoint.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Text-generation pipeline: return_full_text=False asks for the completion
# without the prompt, generation is capped at 500 new tokens, and sampling is
# disabled.
# NOTE(review): the captured output opens with step-by-step reasoning
# ("Okay, so I need to figure out ..."); with a 500-token cap the final answer
# may be cut off. Confirm the cap is large enough for this model.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, return_full_text=False, max_new_tokens=500, do_sample=False)

# =================== HELPERS =====================
def format_response(doc):
    """Render one retrieved document as a ``Page <n>: <text>`` line.

    The page label is read from ``doc.metadata['page']`` and falls back to
    ``'Unknown'`` when that key is missing. The text is ``doc.page_content``
    with leading and trailing whitespace stripped.
    """
    page_label = doc.metadata.get("page", "Unknown")
    body = doc.page_content.strip()
    return "Page {}: {}".format(page_label, body)

def build_prompt(docs, question, max_docs=8):
    """Assemble the retrieval-augmented prompt sent to the generator.

    Args:
        docs: Retrieved documents; each must be accepted by
            ``format_response`` and expose ``page_content``.
        question: User question placed under the ``### Question:`` header.
        max_docs: Number of context lines in the prompt. Defaults to 8, which
            reproduces the original fixed-size layout. When more documents
            are supplied, only the ``max_docs`` longest ones are kept; when
            fewer are supplied, the remaining lines are filled with
            ``"No relevant information."``.

    Returns:
        The complete prompt string.

    Raises:
        ValueError: If ``max_docs`` is negative.
    """
    if max_docs < 0:
        raise ValueError("max_docs must be non-negative")

    # Longest passages first. sorted() is stable, so documents of equal length
    # keep the order in which the retriever returned them.
    by_length = sorted(docs, key=lambda d: len(d.page_content), reverse=True)
    context_lines = [format_response(doc) for doc in by_length[:max_docs]]

    # Pad so the prompt always shows exactly `max_docs` context lines.
    context_lines.extend(["No relevant information."] * (max_docs - len(context_lines)))
    context = "\n".join(context_lines)

    return f"""
You are an AI assistant tasked with answering questions based on retrieved knowledge.

### Retrieved Information:
{context}

### Question:
{question}

### Instructions:
- Integrate the key points from all retrieved responses into a cohesive, well-structured answer.
- If responses are contradictory, mention the different perspectives.
- If no relevant info, say: "I couldn't find a good response to your query in the database."
"""

def summarize_answer(answer):
    """Return a short summary made of the first one or two sentences.

    Sentences are delimited by ``". "``. The kept sentences are re-joined with
    that same delimiter so the period between them is preserved, and a closing
    period is appended only when the summary does not already end in sentence
    punctuation.

    Args:
        answer: Full generated answer text.

    Returns:
        The summary string, or ``""`` when ``answer`` is empty or whitespace.
    """
    text = answer.strip()
    if not text:
        return ""

    # Re-join with ". " (not " ") so the first sentence keeps its full stop.
    summary = ". ".join(text.split(". ")[:2]).rstrip()

    # Avoid doubling the terminator when the text already ends a sentence.
    if not summary.endswith((".", "!", "?")):
        summary += "."
    return summary

def get_similarity_scores(docs, response_text):
    """Score a generated answer against the documents it was based on.

    Every document's ``page_content`` and the answer are embedded with the
    module-level ``embedding_model``. The cosine similarity between the answer
    and each document is then computed in one batched call.

    Args:
        docs: Retrieved documents exposing ``page_content``.
        response_text: The generated answer.

    Returns:
        A ``(max_similarity, mean_similarity)`` tuple of floats. When ``docs``
        is empty there is nothing to compare against, so ``(0.0, 0.0)`` is
        returned instead of raising.
    """
    if not docs:
        return 0.0, 0.0

    doc_texts = [doc.page_content for doc in docs]
    doc_vectors = np.asarray(embedding_model.embed_documents(doc_texts), dtype=float)
    answer_vector = np.asarray(embedding_model.embed_query(response_text), dtype=float)

    # One (n_docs, 1) similarity matrix instead of n_docs separate calls.
    similarities = cosine_similarity(doc_vectors, answer_vector.reshape(1, -1)).ravel()
    return float(similarities.max()), float(similarities.mean())

# =================== CUSTOM RETRIEVERS =====================
def mmr_manual(query, documents, top_k=8, lambda_param=0.5):
    query_emb = embedding_model.embed_query(query)
    doc_embs = embedding_model.embed_documents([doc.page_content for doc in documents])
    selected, remaining = [], list(range(len(documents)))
    while len(selected) < top_k and remaining:
        mmr_scores = []
        for i in remaining:
            sim_query = cosine_similarity([doc_embs[i]], [query_emb])[0][0]
            sim_selected = max([cosine_similarity([doc_embs[i]], [doc_embs[j]])[0][0] for j in selected], default=0)
            mmr_score = lambda_param * sim_query - (1 - lambda_param) * sim_selected
            mmr_scores.append((i, mmr_score))
        best_idx = max(mmr_scores, key=lambda x: x[1])[0]
        selected.append(best_idx)
        remaining.remove(best_idx)
    return [documents[i] for i in selected]

def rrf_manual(query, docs1, docs2, k=8, k_factor=60):
    """Fuse two ranked document lists with Reciprocal Rank Fusion.

    A document at zero-based position ``i`` in a list contributes
    ``1 / (k_factor + i + 1)`` to its score. Documents are identified by
    ``page_content`` and contributions from both lists add up. ``query`` is
    accepted but not used.

    Returns:
        The ``k`` highest-scoring unique documents, best first. Equal scores
        keep first-seen order. When the same text appears more than once, the
        last occurrence's document object is the one returned.
    """
    # Later occurrences overwrite the stored object but keep the first-seen slot.
    by_text = {}
    for doc in docs1 + docs2:
        by_text[doc.page_content] = doc

    fused = {}
    for ranking in (docs1, docs2):
        for position, doc in enumerate(ranking):
            text = doc.page_content
            fused[text] = fused.get(text, 0) + 1 / (k_factor + position + 1)

    ordered = sorted(
        by_text.values(),
        key=lambda candidate: fused[candidate.page_content],
        reverse=True,
    )
    return ordered[:k]

# =================== MAIN =====================
question = "Who was Hagrid?"
results = []

# Query the two base retrievers once and reuse their hits for both the
# standalone entries and the rank fusion, rather than running each twice.
semantic_docs = semantic_retriever.get_relevant_documents(question)
bm25_docs = bm25.get_relevant_documents(question)

# Display name of each retrieval strategy -> documents it returned.
retrieval_methods = {
    "Semantic Search": semantic_docs,
    "Keyword Search (BM25)": bm25_docs,
    "Hybrid Search": hybrid_retriever.get_relevant_documents(question),
    # MMR is run over the full chunk list, not over a pre-filtered candidate set.
    "MMR Search": mmr_manual(question, chunks, top_k=8),
    "Reciprocal Rank Fusion": rrf_manual(question, semantic_docs, bm25_docs, k=8),
}

# For every retrieval strategy: build the prompt, generate an answer, summarise
# it, score it against the retrieved documents, and record the outcome.
for name, docs in retrieval_methods.items():
    print(f"\n=== {name} ===")
    prompt = build_prompt(docs, question)
    # The prompt is sent as a single user message; the generated text is read
    # from the first result.
    output = generator([{"role": "user", "content": prompt}])
    response_text = output[0]["generated_text"]
    summary = summarize_answer(response_text)
    # faith = highest, rel = mean cosine similarity between the answer and the
    # retrieved documents (see get_similarity_scores).
    # NOTE(review): the scores use every document in `docs`, while build_prompt
    # shows at most 8 of them; for a retriever that returns more, the scored
    # set may differ from what the model actually saw. Confirm this is intended.
    faith, rel = get_similarity_scores(docs, response_text)
    
    print("Answer:\n", textwrap.fill(response_text, width=80))
    print("Summary:\n", summary)
    print(f"Faithfulness Score: {faith:.4f}")
    print(f"Relevancy Score: {rel:.4f}")
    
    # One row per strategy; consumed by the CSV export and summary table below.
    results.append({
        "Method": name,
        "Faithfulness": faith,
        "Relevancy": rel,
        "Summary": summary
    })

# =================== CSV LOGGING =====================
# Write one row per retrieval strategy to a CSV file in the working directory
# (index column omitted).
df = pd.DataFrame(results)
df.to_csv("retrieval_comparison_results.csv", index=False)
print("\nResults logged to retrieval_comparison_results.csv")

# Display summary table
# Fixed-width columns: method name padded to 35 characters, scores
# right-aligned and printed with four decimals.
print("\n=== Summary Table ===")
print(f"{'Method':35} | {'Faithfulness':>13} | {'Relevancy':>10}")
print("-" * 65)
for row in results:
    print(f"{row['Method']:35} | {row['Faithfulness']:13.4f} | {row['Relevancy']:10.4f}")


Collecting pymupdf
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m97.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.5
Collecting langchain_community
  Downloading langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-core<1.0.0,>=0.3.51 (from langchain_community)
  Downloading langchain_core-0.3.54-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain<1.0.0,>=0.3.23 (from langchain_community)
  Downloading langchain-0.3.23-py3-none-any.whl.metadata (7.8 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.9.0-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from lan

2025-04-18 12:17:03.453949: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744978623.697545      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744978623.776064      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


modules.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/679 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Device set to use cuda:0



=== Semantic Search ===
Answer:
 Okay, so I need to figure out who Hagrid is based on the provided information.
Let me start by reading through all the retrieved paragraphs carefully.  First,
I see that Hagrid is a wizard, and he's been around for a long time. He's been
in various houses, like the Dursleys' and the Prewetts' houses. He's been
involved in magical events, like the train ride to Hogwarts and the train
pulling out of the station. There's also a mention of him being a key figure in
the Forbidden Forest, where he lives in a small wooden house.  Looking at the
different passages, Hagrid is described as a kind and wise man. He's been around
for a long time, and he's been involved in various magical events. He's been
involved in the train ride to Hogwarts, which is a significant event for him.
He's also been involved in the Forbidden Forest, where he lives in a small
house.  There's also a part where Hagrid is upset because he was involved in a
situation where he was supposed 