In [1]:
!pip install pymupdf
!pip install langchain_community
!pip install faiss-gpu
!pip install faiss-cpu
!pip install ragas

import os
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema.document import Document

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import pipeline
import warnings
warnings.filterwarnings("ignore")
import textwrap

from ragas.metrics import faithfulness, answer_relevancy
from ragas.evaluation import evaluate

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity




In [2]:
# Load and split PDF document
loader = PyMuPDFLoader("/kaggle/input/booooook/Harry Potter Book.pdf")
pages = loader.load_and_split()
print(len(pages))

221


In [3]:
# Initialize embedding model
embedding_model = HuggingFaceEmbeddings(model_name="thenlper/gte-large")

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # Adjust chunk size as needed
    chunk_overlap=750  # Ensures context continuity
)

# Apply chunking to pages
chunks = text_splitter.split_documents(pages)
len(chunks)

# Create FAISS vector database
#vectordb = FAISS.from_documents(pages, embedding_model)
vectordb = FAISS.from_documents(chunks, embedding_model)


# Save FAISS index to disk for later use
vectordb.save_local("faiss_index")

# Check the number of stored documents
print(f"Number of documents in the vector store: {vectordb.index.ntotal}")


Number of documents in the vector store: 1269


In [4]:
# Query processing
question = "Who was Hagrid?"

# === SEMANTIC RETRIEVAL USING COSINE SIMILARITY ===

# Embed the query
query_embedding = embedding_model.embed_query(question)

# Extract all chunk texts
all_chunks = [doc.page_content for doc in chunks]

# Embed all chunks
all_chunk_embeddings = embedding_model.embed_documents(all_chunks)

# Compute cosine similarities
cos_similarities = cosine_similarity([query_embedding], all_chunk_embeddings)[0]


In [5]:
# Get top-k relevant chunks based on semantic similarity
top_k = 7
top_k_indices = cos_similarities.argsort()[::-1][:top_k]
docs = [chunks[i] for i in top_k_indices]  # Replace docs from FAISS with semantic retrieval

# Print results
for i, doc in enumerate(docs, 1):
    page_number = doc.metadata.get('page', 'Unknown')
    print(f"Document {i} - Page {page_number} - Score: {doc.metadata.get('score', 'N/A')}")
    print(doc.page_content[:500])  # Print first 500 characters of each result
    print("-" * 80)


Document 1 - Page 45 - Score: N/A
could he possibly be? He’d spent his life being clouted by Dudley 
and bullied by Aunt Petunia and Uncle Vernon; if he was really a 
wizard, why hadn’t they been turned into warty toads every time 
they’d tried to lock him in his cupboard? If he’d once defeated the 
greatest sorcerer in the world, how come Dudley had always been 
able to kick him around like a football? 
‘Hagrid,’ he said quietly, ‘I think you must have made a mistake. 
I don’t think I can be a wizard.’ 
To his surprise, Hagrid 
--------------------------------------------------------------------------------
Document 2 - Page 64 - Score: N/A
birthday of his life – and yet – he chewed his hamburger, trying to 
find the words. 
‘Everyone thinks I’m special,’ he said at last. ‘All those people in 
the Leaky Cauldron, Professor Quirrell, Mr Ollivander ... but I 
don’t know anything about magic at all. How can they expect 
great things? I’m famous and I can’t even remember what I’m 
famous 

In [6]:
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto", #device_map='cuda'
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(model.dtype)
total_params = sum(p.numel() for p in model.parameters())
print(f"Total Parameters: {total_params / 1e6} million")
memory_footprint = total_params * 2 / (1024 ** 2)  # Convert to MB
print(f"Estimated Memory Footprint: {memory_footprint:.2f} MB")

# Create a pipeline
generator = pipeline(
"text-generation",
model=model,
tokenizer=tokenizer,
return_full_text=False,
max_new_tokens=5000,
do_sample=False
)


Device set to use cuda:0


torch.bfloat16
Total Parameters: 1777.088 million
Estimated Memory Footprint: 3389.53 MB


In [7]:
# Extract page content and metadata properly
def format_response(doc):
    return f"Page {doc.metadata.get('page', 'Unknown')}: {doc.page_content.strip()}"

# Handle cases where fewer than 3 results are returned
retrieved_responses = [format_response(doc) for doc in docs[:7]]
while len(retrieved_responses) < 7:
    retrieved_responses.append("No relevant information.")  # Fill missing slots

# Construct the RAG prompt
prompt = f"""
You are an AI assistant tasked with answering questions based on retrieved knowledge.

### **Retrieved Information**:
{retrieved_responses[0]}
{retrieved_responses[1]}
{retrieved_responses[2]}
{retrieved_responses[3]}
{retrieved_responses[4]}
{retrieved_responses[5]}
{retrieved_responses[6]}

### **Question**:
{question}

### **Instructions**:
- Integrate the key points from all retrieved responses into a **cohesive, well-structured answer**.
- If the responses are **contradictory**, mention the different perspectives.
- If none of the retrieved responses contain relevant information, reply:
  **"I couldn't find a good response to your query in the database."**
"""



In [8]:

# Use Qwen2.5 3B with the correct message format
messages = [
    {"role": "user", "content": prompt}
]

# Generate output using the model
output = generator(messages)

# Print formatted response
print(textwrap.fill(output[0]["generated_text"], width=80))


Okay, so I need to figure out who Hagrid is based on the provided information.
Let me go through each retrieved response to see what they have to say about
him.  First, in page 45, it says that Hagrid was clued in by Dudley and was
bullied by Aunt Petunia and Uncle Vernon. He was a wizard, but they hadn't been
turned into warty toads because they tried to lock him in his cupboard. If he
had defeated the greatest sorcerer, Dudley couldn't kick him around like a
football. Hagrid was confused and said, "I think you must have made a mistake. I
don't think I can be a wizard." He then chuckled and admitted he didn't know why
he couldn't be a wizard. So Hagrid was a wizard but had some personal issues.
Next, in page 64, Hagrid says he was born on the day his parents died and that
he chewed his hamburger to find out what he was famous for. He claims he's
famous and can't remember what he was famous for. He mentions that he was famous
for being a wizard but doesn't know the specifics. So Hagrid

In [9]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Function to get embeddings for a list of texts (documents)
def get_embeddings_for_documents(documents, model):
    # Use embed_documents to get embeddings for a list of documents
    return model.embed_documents(documents)

# Function to get embedding for a single text (generated text)
def get_embedding_for_query(query, model):
    # Use embed_query to get embedding for a single query
    return model.embed_query(query)

# Get embeddings for the retrieved documents and the generated text
retrieved_documents = [doc.page_content for doc in docs[:8]]
retrieved_embeddings = get_embeddings_for_documents(retrieved_documents, embedding_model)
generated_text_embedding = get_embedding_for_query(output[0]["generated_text"], embedding_model)

# Calculate cosine similarity between each retrieved document and the generated text
similarities = []
for doc_embedding in retrieved_embeddings:
    similarity = cosine_similarity([doc_embedding], [generated_text_embedding])
    similarities.append(similarity[0][0])

# Print out cosine similarity scores for faithfulness and relevancy
print("Cosine Similarities (Relevancy Scores):")
for i, similarity in enumerate(similarities, 1):
    print(f"Document {i}: Similarity = {similarity:.4f}")

# Calculate the Faithfulness Score
# Faithfulness Score: This is the maximum cosine similarity between the generated response and any document
faithfulness_score = max(similarities)
print(f"\nFaithfulness Score: {faithfulness_score:.4f}")

# Calculate the average relevancy score (mean cosine similarity)
average_relevancy_score = np.mean(similarities)
print(f"Average Relevancy Score: {average_relevancy_score:.4f}")

print(f"Average Faithfulness Score")

Cosine Similarities (Relevancy Scores):
Document 1: Similarity = 0.9404
Document 2: Similarity = 0.8933
Document 3: Similarity = 0.9052
Document 4: Similarity = 0.8810
Document 5: Similarity = 0.8914
Document 6: Similarity = 0.8721
Document 7: Similarity = 0.8851

Faithfulness Score: 0.9404
Average Relevancy Score: 0.8955
Average Faithfulness Score
