In [29]:
import os
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema.document import Document

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import pipeline
import warnings
warnings.filterwarnings("ignore")
import textwrap

from ragas.metrics import faithfulness, answer_relevancy
from ragas.evaluation import evaluate



In [30]:
# Load and split PDF document
loader = PyMuPDFLoader("/kaggle/input/booooook/Harry Potter Book.pdf")
pages = loader.load_and_split()
print(len(pages))

221


In [31]:
# Initialize embedding model
embedding_model = HuggingFaceEmbeddings(model_name="google/flan-t5-small")


In [32]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # Adjust chunk size as needed
    chunk_overlap=750  # Ensures context continuity
)

# Apply chunking to pages
chunks = text_splitter.split_documents(pages)
len(chunks)


1269

In [33]:
# Create FAISS vector database
#vectordb = FAISS.from_documents(pages, embedding_model)
vectordb = FAISS.from_documents(chunks, embedding_model)


# Save FAISS index to disk for later use
vectordb.save_local("faiss_index")

# Check the number of stored documents
print(f"Number of documents in the vector store: {vectordb.index.ntotal}")

Number of documents in the vector store: 1269


In [34]:
x=9

In [35]:
# Query processing
question = "Who was Hagrid?"
retriever = vectordb.as_retriever(search_kwargs={"k": x})
docs = retriever.get_relevant_documents(question)

# Print results
for i, doc in enumerate(docs, x):
    page_number = doc.metadata.get('page', 'Unknown')
    print(f"Document {i} - Page {page_number} - Score: {doc.metadata.get('score', 'N/A')}")
    print(doc.page_content[:500])  # Print first 500 characters of each result
    print("-" * 80)

Document 9 - Page 57 - Score: N/A
Galleons there were to a pound to know that he was holding more 
money than he’d had in his whole life – more money than even 
Dudley had ever had. 
‘Might as well get yer uniform,’ said Hagrid, nodding towards 
Madam Malkin’s Robes for All Occasions. ‘Listen, Harry would yeh 
mind if I slipped off fer a pick-me-up in the Leaky Cauldron? I 
hate them Gringotts carts.’ He did still look a bit sick, so Harry 
entered Madam Malkin’s shop alone, feeling nervous. 
Madam Malkin was a squat, smiling wi
--------------------------------------------------------------------------------
Document 10 - Page 169 - Score: N/A
NORBERT  THE  NORWEGIAN  RIDGEBACK 
171 
 
 
He looked very pleased with himself, but Hermione didn’t. 
‘Hagrid, you live in a wooden house,’ she said. 
But Hagrid wasn’t listening. He was humming merrily as he 
stoked the fire. 
* 
So now they had something else to worry about: what might hap-
pen to Hagrid if anyone found out he was hiding an i

In [36]:
model_name = "Qwen/Qwen2.5-3B-Instruct"
# model_name = "meta-llama/Llama-3.2-3B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto", #device_map='cuda'
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [37]:
print(model.dtype)
total_params = sum(p.numel() for p in model.parameters())
print(f"Total Parameters: {total_params / 1e6} million")
memory_footprint = total_params * 2 / (1024 ** 2)  # Convert to MB
print(f"Estimated Memory Footprint: {memory_footprint:.2f} MB")

torch.bfloat16
Total Parameters: 3085.938688 million
Estimated Memory Footprint: 5885.96 MB


In [38]:
# Create a pipeline
generator = pipeline(
"text-generation",
model=model,
tokenizer=tokenizer,
return_full_text=False,
max_new_tokens=5000,
do_sample=False
)

Device set to use cuda:0


In [39]:
# Extract page content and metadata properly
def format_response(doc):
    return f"Page {doc.metadata.get('page', 'Unknown')}: {doc.page_content.strip()}"

# Handle cases where fewer than 3 results are returned
retrieved_responses = [format_response(doc) for doc in docs[:x]]
while len(retrieved_responses) < x:
    retrieved_responses.append("No relevant information.")  # Fill missing slots


In [40]:
# Construct the RAG prompt
prompt = f"""
You are an AI assistant tasked with answering questions based on retrieved knowledge.

### **Retrieved Information**:
{retrieved_responses[0]}
{retrieved_responses[1]}
{retrieved_responses[2]}
{retrieved_responses[3]}
{retrieved_responses[4]}
{retrieved_responses[5]}
{retrieved_responses[6]}
{retrieved_responses[7]}
{retrieved_responses[8]}









### **Question**:
{question}

### **Instructions**:
- Integrate the key points from all retrieved responses into a **cohesive, well-structured answer**.
- If the responses are **contradictory**, mention the different perspectives.
- If none of the retrieved responses contain relevant information, reply:
  **"I couldn't find a good response to your query in the database."**
"""



In [41]:
# Use Qwen2.5 3B with the correct message format
messages = [
    {"role": "user", "content": prompt}
]

# Generate output using the model
output = generator(messages)

# Print formatted response
print(textwrap.fill(output[0]["generated_text"], width=80))


Based on the information provided in the retrieved pages, Hagrid is a character
in the Harry Potter series. Here are some key points about Hagrid:  1. **Job and
Responsibilities**: Hagrid works as a gamekeeper at Hogwarts School of
Witchcraft and Wizardry. He is responsible for taking care of the school's
magical creatures, including the giant squid, the Hungarian Horntail dragon, and
the three-headed dog, Fluffy.  2. **Physical Description**: Hagrid is described
as a large, burly man with a round face and bushy eyebrows. He has a deep voice
and often wears a large hat.  3. **Background**: Hagrid comes from a working-
class background and has a difficult childhood. He is not a pure-blood wizard
and is often teased for his lack of magical ability compared to other students.
4. **Relationships**: Hagrid has a close relationship with Harry Potter, having
taken him in as a foster child after Harry's parents were killed. He is also
friends with Ron Weasley and Hermione Granger.  5. **Charac

In [42]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Function to get embeddings for a list of texts (documents)
def get_embeddings_for_documents(documents, model):
    # Use embed_documents to get embeddings for a list of documents
    return model.embed_documents(documents)

# Function to get embedding for a single text (generated text)
def get_embedding_for_query(query, model):
    # Use embed_query to get embedding for a single query
    return model.embed_query(query)

# Get embeddings for the retrieved documents and the generated text
retrieved_documents = [doc.page_content for doc in docs[:x]]
retrieved_embeddings = get_embeddings_for_documents(retrieved_documents, embedding_model)
generated_text_embedding = get_embedding_for_query(output[0]["generated_text"], embedding_model)

# Calculate cosine similarity between each retrieved document and the generated text
similarities = []
for doc_embedding in retrieved_embeddings:
    similarity = cosine_similarity([doc_embedding], [generated_text_embedding])
    similarities.append(similarity[0][0])

# Print out cosine similarity scores for faithfulness and relevancy
print("Cosine Similarities (Relevancy Scores):")
for i, similarity in enumerate(similarities, 1):
    print(f"Document {i}: Similarity = {similarity:.4f}")

# Calculate the Faithfulness Score
# Faithfulness Score: This is the maximum cosine similarity between the generated response and any document
faithfulness_score = max(similarities)
print(f"\nFaithfulness Score: {faithfulness_score:.4f}")

# Calculate the average relevancy score (mean cosine similarity)
average_relevancy_score = np.mean(similarities)
print(f"Average Relevancy Score: {average_relevancy_score:.4f}")


Cosine Similarities (Relevancy Scores):
Document 1: Similarity = 0.7528
Document 2: Similarity = 0.7546
Document 3: Similarity = 0.7645
Document 4: Similarity = 0.7890
Document 5: Similarity = 0.7662
Document 6: Similarity = 0.7314
Document 7: Similarity = 0.7873
Document 8: Similarity = 0.7485
Document 9: Similarity = 0.6970

Faithfulness Score: 0.7890
Average Relevancy Score: 0.7546
