**FOR FASTER COMPUTATION**

In [3]:
!pip install pymupdf
!pip install langchain_community
!pip install faiss-gpu
!pip install faiss-cpu
import os
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import warnings, textwrap
warnings.filterwarnings("ignore")

# Load document
# Use `load()` rather than `load_and_split()`: the latter runs LangChain's
# default text splitter over the pages first, so every configuration in the
# experiment below would be measured on already-split text instead of on the
# raw pages.
loader = PyMuPDFLoader("/kaggle/input/booooook/Harry Potter Book.pdf")
pages = loader.load()

# Embedding model
# NOTE(review): flan-t5-small is a seq2seq language model rather than a
# checkpoint trained for sentence similarity; confirm it is the intended
# embedder, since it drives both retrieval and the scores computed later.
embedding_model = HuggingFaceEmbeddings(model_name="google/flan-t5-small")

# Language model
# Qwen2.5 is supported natively by transformers, so `trust_remote_code` is not
# needed; leaving it off avoids executing code fetched from the model repo.
model_name = "Qwen/Qwen2.5-3B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Greedy decoding (do_sample=False) keeps runs comparable across configurations;
# return_full_text=False makes the pipeline return only the newly generated text.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, return_full_text=False, max_new_tokens=5000, do_sample=False)

# Embedding utility functions
def get_embeddings_for_documents(documents, model):
    """Embed a batch of texts with the given embedding model.

    Delegates to ``model.embed_documents`` and returns its result unchanged.
    """
    document_vectors = model.embed_documents(documents)
    return document_vectors

def get_embedding_for_query(query, model):
    """Embed a single text with the given embedding model.

    Delegates to ``model.embed_query`` and returns its result unchanged.
    """
    query_vector = model.embed_query(query)
    return query_vector

# Chunking & evaluation loop
chunk_sizes = [250, 500, 750, 1000, 1500, 2000, 3000, 4000, 5000]
chunk_overlaps = [50, 100, 200, 300, 500, 750, 1000, 1500, 2000]
question = "Who was Hagrid?"

# Number of chunks retrieved per configuration. The prompt template below has
# exactly five numbered slots, so this must stay in sync with it.
TOP_K = 5

results = []


def format_response(doc):
    """Render a retrieved chunk as ``Page <n>: <text>`` for the prompt."""
    return f"Page {doc.metadata.get('page', 'Unknown')}: {doc.page_content.strip()}"


def build_rag_prompt(retrieved_responses, question):
    """Build the RAG prompt from five formatted chunks and the question.

    ``retrieved_responses`` must contain at least five entries; the caller pads
    it with a placeholder when fewer chunks were retrieved.
    """
    return f"""
You are an AI assistant tasked with answering questions based on retrieved knowledge.

### **Retrieved Information**:
1️⃣ {retrieved_responses[0]}

2️⃣ {retrieved_responses[1]}

3️⃣ {retrieved_responses[2]}
4️⃣ {retrieved_responses[3]}
5️⃣ {retrieved_responses[4]}

### **Question**:
{question}

### **Instructions**:
- Integrate the key points from all retrieved responses into a **cohesive, well-structured answer**.
- If the responses are **contradictory**, mention the different perspectives.
- If none of the retrieved responses contain relevant information, reply:
  **"I couldn't find a good response to your query in the database."**
"""


for chunk_size in chunk_sizes:
    for chunk_overlap in chunk_overlaps:
        if chunk_overlap >= chunk_size:
            continue  # Skip invalid combinations
        print(f"\n=== Testing chunk_size={chunk_size}, chunk_overlap={chunk_overlap} ===")

        # Chunking
        # NOTE(review): split_documents appears to split each input Document
        # on its own, so chunk sizes above the longest page presumably produce
        # identical chunks -- confirm before comparing the larger sizes.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        chunks = text_splitter.split_documents(pages)

        # Vector DB (rebuilt for every configuration because the chunks change)
        vectordb = FAISS.from_documents(chunks, embedding_model)
        retriever = vectordb.as_retriever(search_kwargs={"k": TOP_K})
        docs = retriever.get_relevant_documents(question)[:TOP_K]

        if not docs:
            # Nothing to score against: record NaN instead of crashing on
            # max() of an empty sequence, and skip the costly generation.
            results.append({
                "chunk_size": chunk_size,
                "chunk_overlap": chunk_overlap,
                "faithfulness": float("nan"),
                "relevancy": float("nan"),
            })
            continue

        # Pad so that every slot in the prompt template is filled.
        retrieved_responses = [format_response(doc) for doc in docs]
        retrieved_responses += ["No relevant information."] * (TOP_K - len(retrieved_responses))

        # RAG prompt + generation
        messages = [{"role": "user", "content": build_rag_prompt(retrieved_responses, question)}]
        output = generator(messages)

        # Embeddings of the raw retrieved chunks and of the generated answer
        retrieved_documents = [doc.page_content for doc in docs]
        retrieved_embeddings = get_embeddings_for_documents(retrieved_documents, embedding_model)
        generated_text_embedding = get_embedding_for_query(output[0]["generated_text"], embedding_model)

        # One vectorized call: column 0 holds the similarity of each chunk to
        # the generated answer.
        similarities = cosine_similarity(retrieved_embeddings, [generated_text_embedding])[:, 0]

        # Both scores compare the answer with the retrieved chunks:
        # "faithfulness" is the best match, "relevancy" the mean over chunks.
        # NOTE(review): neither score involves the question itself -- confirm
        # that this is the intended definition of relevancy.
        faithfulness_score = float(similarities.max())
        avg_relevancy_score = float(similarities.mean())

        results.append({
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
            "faithfulness": round(faithfulness_score, 4),
            "relevancy": round(avg_relevancy_score, 4),
        })

# Print results
# Fixed-width, left-aligned table: one header line, then one line per
# evaluated (chunk_size, chunk_overlap) configuration.
header = f"{'Chunk Size':<12} {'Overlap':<10} {'Faithfulness':<15} {'Relevancy':<15}"
print("\n=== Summary of Results ===")
print(header)
for row in results:
    size, overlap = row["chunk_size"], row["chunk_overlap"]
    faith, relev = row["faithfulness"], row["relevancy"]
    print(f"{size:<12} {overlap:<10} {faith:<15} {relev:<15}")




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0



=== Testing chunk_size=250, chunk_overlap=50 ===

=== Testing chunk_size=250, chunk_overlap=100 ===

=== Testing chunk_size=250, chunk_overlap=200 ===

=== Testing chunk_size=500, chunk_overlap=50 ===

=== Testing chunk_size=500, chunk_overlap=100 ===

=== Testing chunk_size=500, chunk_overlap=200 ===

=== Testing chunk_size=500, chunk_overlap=300 ===

=== Testing chunk_size=750, chunk_overlap=50 ===

=== Testing chunk_size=750, chunk_overlap=100 ===

=== Testing chunk_size=750, chunk_overlap=200 ===

=== Testing chunk_size=750, chunk_overlap=300 ===


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



=== Testing chunk_size=750, chunk_overlap=500 ===

=== Testing chunk_size=1000, chunk_overlap=50 ===

=== Testing chunk_size=1000, chunk_overlap=100 ===

=== Testing chunk_size=1000, chunk_overlap=200 ===

=== Testing chunk_size=1000, chunk_overlap=300 ===

=== Testing chunk_size=1000, chunk_overlap=500 ===

=== Testing chunk_size=1000, chunk_overlap=750 ===

=== Testing chunk_size=1500, chunk_overlap=50 ===

=== Testing chunk_size=1500, chunk_overlap=100 ===

=== Testing chunk_size=1500, chunk_overlap=200 ===

=== Testing chunk_size=1500, chunk_overlap=300 ===

=== Testing chunk_size=1500, chunk_overlap=500 ===

=== Testing chunk_size=1500, chunk_overlap=750 ===

=== Testing chunk_size=1500, chunk_overlap=1000 ===

=== Testing chunk_size=2000, chunk_overlap=50 ===

=== Testing chunk_size=2000, chunk_overlap=100 ===

=== Testing chunk_size=2000, chunk_overlap=200 ===

=== Testing chunk_size=2000, chunk_overlap=300 ===

=== Testing chunk_size=2000, chunk_overlap=500 ===

=== Testing ch