In [8]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
from typing import List, Dict, Any, Tuple
import numpy as np

In [9]:

### Read all the PDFs inside the directory
def process_all_pdfs(pdf_directory: str) -> List[Any]:
    """Load every PDF found under a directory into LangChain documents.

    The directory is searched recursively for ``*.pdf`` files. Each file is
    read with ``PyMuPDFLoader`` and every document it yields is tagged with
    ``source`` (the file path as a string) and ``file_type`` (``'pdf'``).

    Args:
        pdf_directory: Path of the directory to scan.

    Returns:
        A flat list holding the documents of all PDFs, grouped by file in
        sorted path order. Empty when the directory does not exist or
        contains no PDF files.
    """
    all_documents = []
    pdf_dir = Path(pdf_directory)
    print("Pdf directory : ", pdf_dir)

    # A mistyped path would otherwise look exactly like an empty directory.
    if not pdf_dir.is_dir():
        print(f"Directory not found: {pdf_dir}")
        return all_documents

    # Find all PDF files recursively; glob() follows filesystem order, so
    # sort to keep document order stable between runs.
    pdf_files = sorted(pdf_dir.glob("**/*.pdf"))
    print("List of PDF files found: ", pdf_files)

    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        ## Load the pdf file.
        documents = PyMuPDFLoader(str(pdf_file)).load()

        # Overwrite/insert provenance fields on every loaded document.
        for doc in documents:
            doc.metadata['source'] = str(pdf_file)
            doc.metadata['file_type'] = 'pdf'

        print(f"Loaded {len(documents)} documents from file - {pdf_file}")

        all_documents.extend(documents)

    return all_documents

# Process all PDFs in the data directory
all_pdf_documents = process_all_pdfs("../data")

# Preview a single document only: echoing the whole list writes every page
# of every PDF into the cell output.
all_pdf_documents[:1]

Pdf directory :  ..\data
List of PDF files found:  [WindowsPath('../data/pdf/harry-potter-and-the-philosophers-stone-by-jk-rowling.pdf')]
Found 1 PDF files to process
Loaded 228 documentsfrom file - ..\data\pdf\harry-potter-and-the-philosophers-stone-by-jk-rowling.pdf
Loaded 228 documentsfrom file - ..\data\pdf\harry-potter-and-the-philosophers-stone-by-jk-rowling.pdf


[Document(metadata={'producer': 'Acrobat Distiller 7.0.5 (Windows)', 'creator': 'PScript5.dll Version 5.2', 'creationdate': '2012-05-01T23:57:27+10:00', 'source': '..\\data\\pdf\\harry-potter-and-the-philosophers-stone-by-jk-rowling.pdf', 'file_path': '..\\data\\pdf\\harry-potter-and-the-philosophers-stone-by-jk-rowling.pdf', 'total_pages': 228, 'format': 'PDF 1.4', 'title': 'Harry Potter and the Philosopher’s Stone', 'author': 'J.K. Rowling', 'subject': "When a letter arrives for unhappy but ordinary Harry Potter, a decade-old secret is revealed to him. His parents were wizards, killed by a Dark Lord's curse when Harry was just a baby, and which he somehow survived. Escaping from his unbearable Muggle guardians to Hogwarts, a wizarding school brimming with ghosts and enchantments, Harry stumbles into a sinister adventure when he finds a threeheaded dog guarding a room on the third floor. Then he hears of a missing stone with astonishing powers which could be valuable, dangerous, or bo

In [10]:
def split_documents(documents, chunk_size=100, chunk_overlap=50) -> list:
    """Split documents into chunks with a RecursiveCharacterTextSplitter.

    Args:
        documents: Documents handed to the splitter's ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The chunks produced by the splitter.
    """
    # Lengths are measured with len(); the separator list is fixed.
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        "separators": ["\n\n", "\n", " ", ""],
    }
    chunks = RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)

    print (f"Split {len(documents)} documents into {len(chunks)} chunks")
    return chunks

In [11]:
split_pdf_documents = split_documents(all_pdf_documents)

# Show a small sample instead of echoing every chunk into the cell output.
split_pdf_documents[:3]

Split 228 documents into 6868 chunks



[Document(metadata={'producer': 'Acrobat Distiller 7.0.5 (Windows)', 'creator': 'PScript5.dll Version 5.2', 'creationdate': '2012-05-01T23:57:27+10:00', 'source': '..\\data\\pdf\\harry-potter-and-the-philosophers-stone-by-jk-rowling.pdf', 'file_path': '..\\data\\pdf\\harry-potter-and-the-philosophers-stone-by-jk-rowling.pdf', 'total_pages': 228, 'format': 'PDF 1.4', 'title': 'Harry Potter and the Philosopher’s Stone', 'author': 'J.K. Rowling', 'subject': "When a letter arrives for unhappy but ordinary Harry Potter, a decade-old secret is revealed to him. His parents were wizards, killed by a Dark Lord's curse when Harry was just a baby, and which he somehow survived. Escaping from his unbearable Muggle guardians to Hogwarts, a wizarding school brimming with ghosts and enchantments, Harry stumbles into a sinister adventure when he finds a threeheaded dog guarding a room on the third floor. Then he hears of a missing stone with astonishing powers which could be valuable, dangerous, or bo

In [12]:
### Create Embeddings 
from sentence_transformers import SentenceTransformer

class EmbeddingManager:
    """Loads a SentenceTransformer model and encodes texts with it.

    Attributes:
        model: Name of the model passed to ``SentenceTransformer``.
        embeddingModel: The loaded model used by ``createEmbeddings``.
        embeddingsModel: Second name for the same loaded model; kept in sync
            with ``embeddingModel`` because both names have been assigned
            by this class historically.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Store the model name and load the model immediately.

        Args:
            model_name: Name handed to ``SentenceTransformer``.
        """
        self.model = model_name
        self.embeddingModel = None
        self.embeddingsModel = None
        self.loadModel()

    ### Load model.
    def loadModel(self):
        """Instantiate the model named by ``self.model``.

        Both ``embeddingModel`` and ``embeddingsModel`` are updated so that a
        later reload cannot leave them pointing at different objects.

        Returns:
            The loaded model.

        Raises:
            Exception: Whatever ``SentenceTransformer`` raised, re-raised
                unchanged after logging.
        """
        try:
            print("Loading model : ", self.model)
            loaded = SentenceTransformer(self.model)
        except Exception as e:
            print("Exception in loading model : ", e)
            print("Failed to load model.")
            raise  # bare raise keeps the original traceback

        self.embeddingModel = loaded
        self.embeddingsModel = loaded
        return loaded

    ### Create embeddings.
    def createEmbeddings(self, texts:List[str], chunk_size=100, chunk_overlap=50) -> np.ndarray:
        """Encode ``texts`` with the loaded model.

        Args:
            texts: Strings to encode.
            chunk_size: Not used by this method; kept so existing calls that
                pass it keep working.
            chunk_overlap: Not used by this method; kept for the same reason.

        Returns:
            The result of ``model.encode(texts, show_progress_bar=True)``.

        Raises:
            RuntimeError: If no model has been loaded.
        """
        # Compare against None explicitly: a truthiness test would consult
        # the model object's __len__/__bool__ and could reject a loaded model.
        if self.embeddingModel is None:
            print("Embeddings model is not loaded. Please load the model first.")
            raise RuntimeError("Embeddings model is not loaded. Please load the model first.")

        embeddings = self.embeddingModel.encode(texts, show_progress_bar=True)
        print("Generated Embeddings with shape: ", embeddings.shape)
        return embeddings

In [13]:
# Embed the text of every chunk with the default sentence-transformers model.
embeddingManager = EmbeddingManager(model_name="all-MiniLM-L6-v2")
texts = [chunk.page_content for chunk in split_pdf_documents]
embeddings = embeddingManager.createEmbeddings(texts=texts)


Loading model :  all-MiniLM-L6-v2


Batches: 100%|██████████| 215/215 [00:18<00:00, 11.50it/s]

Generated Embeddings with shape:  (6868, 384)





In [14]:
# One embedding row per chunk text; fail loudly if the two ever drift apart.
assert len(embeddings) == len(texts)
embeddings.shape

(6868, 384)

In [15]:
import chromadb
import uuid

class VectorStoreDB:
    """
    Manages a persistent ChromaDB collection: it writes documents together
    with their embeddings into the vector db. Reads are done by callers
    through the ``collection`` attribute.
    """

    # Class-level defaults; __init__ replaces each of them on the instance.
    persist_directory = None
    collection_name = None
    vectordbclient = None
    collection = None

    ## Initialize the class.
    def __init__(self, persist_directory: str, collection_name: str):
        """Remember the storage location and open (or create) the collection.

        Args:
            persist_directory: Directory handed to ``chromadb.PersistentClient``.
            collection_name: Name of the collection to get or create.
        """
        print("VectorStoreDB init called")
        self.persist_directory = persist_directory
        self.collection_name = collection_name
        self.vectordbclient = None
        self.collection = None

        self.createCollection()

    ## Create a collection.
    def createCollection(self):
        """Create the persist directory, the client and the collection.

        Raises:
            Exception: Any error raised while creating the directory, client
                or collection is logged and re-raised unchanged.
        """
        try:
            ## Create the persistent directory if it does not exist.
            os.makedirs(self.persist_directory, exist_ok=True)
            self.vectordbclient = chromadb.PersistentClient(path=self.persist_directory)

            ## Create the collection if it does not exist.
            self.collection = self.vectordbclient.get_or_create_collection(
                name=self.collection_name,
                metadata={"description":"RAG store for pdf documents"}
            )

            print("List of available collections in vector DB :", self.vectordbclient.list_collections())
            print("Created collection created with name: ", self.collection_name)
            print("Available documents in the collection : ", self.collection.count())

        except Exception as e:
            print("Exception in creating collection : ", e)
            raise  # bare raise keeps the original traceback

    ## Add embeddings to the collection with batch processing
    def add_documents(self, documents: List[Any], embeddings: np.ndarray, batch_size: int = 5000):
        """
        Add documents and their embeddings to the vector store with batch processing.

        Ids are derived from each document's source, position and text, and
        batches are written with ``upsert``, so running this again with the
        same input replaces the existing rows instead of duplicating them.

        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents
            batch_size: Maximum number of documents to process in each batch

        Raises:
            ValueError: If ``batch_size`` is smaller than 1 or the number of
                documents differs from the number of embeddings.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer.")

        print(f"Adding {len(documents)} documents to the collection")

        if len(documents) != len(embeddings):
            raise ValueError("Number of documents and embeddings must be same.")

        # Process documents in batches
        total_documents = len(documents)
        batches_count = (total_documents + batch_size - 1) // batch_size  # Ceiling division

        print(f"Processing {total_documents} documents in {batches_count} batches of max {batch_size} documents each")

        for batch_idx in range(batches_count):
            start_idx = batch_idx * batch_size
            end_idx = min(start_idx + batch_size, total_documents)

            print(f"Processing batch {batch_idx + 1}/{batches_count}: documents {start_idx} to {end_idx-1}")

            # start_idx doubles as the offset for the per-document index.
            self._add_batch(documents[start_idx:end_idx], embeddings[start_idx:end_idx], start_idx)

        print(f"Successfully added all documents. Total documents in collection: {self.collection.count()}")

    @staticmethod
    def _make_doc_id(source: Any, global_index: int, text: str) -> str:
        """Build a reproducible id of the form ``doc_<8 hex chars>_<index>``."""
        fingerprint = f"{source}|{global_index}|{text}"
        return f"doc_{uuid.uuid5(uuid.NAMESPACE_URL, fingerprint).hex[:8]}_{global_index}"

    def _add_batch(self, documents: List[Any], embeddings: np.ndarray, start_index: int):
        """
        Write a single batch of documents to the collection.

        Args:
            documents: Batch of LangChain documents
            embeddings: Corresponding embeddings for the batch
            start_index: Position of the batch's first document in the full
                list; used for ``doc_index`` and for the generated ids

        Raises:
            Exception: Any error raised while preparing or writing the batch
                is logged and re-raised unchanged.
        """
        metadatas = []
        doc_ids = []
        documents_text = []
        embeddings_list = []

        try:
            for offset, (doc, embedding) in enumerate(zip(documents, embeddings)):
                global_index = start_index + offset  # Global index across all batches

                # Copy so the caller's document metadata is not modified.
                metadata = dict(doc.metadata)
                doc_id = self._make_doc_id(metadata.get("source", ""), global_index, doc.page_content)

                metadata["doc_id"] = doc_id
                metadata["doc_index"] = global_index
                metadata["content_length"] = len(doc.page_content)

                doc_ids.append(doc_id)
                metadatas.append(metadata)
                documents_text.append(doc.page_content)
                # Plain Python lists are passed on instead of numpy rows.
                embeddings_list.append(np.asarray(embedding).tolist())

            # upsert + deterministic ids: a re-run overwrites, never duplicates.
            self.collection.upsert(
                ids=doc_ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )

            print(f"  ✓ Added batch of {len(documents)} documents")

        except Exception as e:
            print(f"Exception in adding batch: {e}")
            raise  # bare raise keeps the original traceback

In [16]:
# Open the persistent collection and store every chunk with its embedding.
vectorStoreDB = VectorStoreDB(persist_directory="../data/vectorstore", collection_name="pdf_rag2")
vectorStoreDB.add_documents(documents=split_pdf_documents, embeddings=embeddings)




VectorStoreDB init called
List of available collections in vector DB : [Collection(name=pdf_rag), Collection(name=pdf_rag2)]
Created collection created with name:  pdf_rag2
Available documents in the collection :  13736
Adding 6868 documents to the collection
Processing 6868 documents in 2 batches of max 5000 documents each
Processing batch 1/2: documents 0 to 4999
  ✓ Added batch of 5000 documents
Processing batch 2/2: documents 5000 to 6867
  ✓ Added batch of 5000 documents
Processing batch 2/2: documents 5000 to 6867
  ✓ Added batch of 1868 documents
Successfully added all documents. Total documents in collection: 20604
  ✓ Added batch of 1868 documents
Successfully added all documents. Total documents in collection: 20604


In [17]:
class RAGRetriever:
    """Handles query-based retrieval from the vector store"""

    def __init__(self, vector_store: "VectorStoreDB", embedding_manager: "EmbeddingManager"):
        """
        Initialize the retriever

        Args:
            vector_store: Vector store whose ``collection`` is queried
            embedding_manager: Manager whose ``createEmbeddings`` embeds the query
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query

        Args:
            query: The search query
            top_k: Number of results requested from the collection
            score_threshold: Minimum ``1 - distance`` score a result needs

        Returns:
            List of dictionaries with the keys ``id``, ``content``,
            ``metadata``, ``similarity_score``, ``distance`` and ``rank``.
            ``rank`` is the 1-based position in the raw query result, i.e.
            before threshold filtering. An empty list is returned when
            ``top_k`` is below 1, nothing matches, or the query fails.
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")

        # Reject an unusable top_k up front instead of embedding the query
        # and then relying on the collection to raise.
        if top_k < 1:
            print(f"Error during retrieval: top_k must be at least 1 (got {top_k})")
            return []

        # Generate query embedding
        query_embedding = self.embedding_manager.createEmbeddings([query])[0]

        # Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )

            retrieved_docs = []

            # Results are nested one list per query; only one query is sent.
            if results['documents'] and results['documents'][0]:
                rows = zip(
                    results['ids'][0],
                    results['documents'][0],
                    results['metadatas'][0],
                    results['distances'][0],
                )

                for rank, (doc_id, document, metadata, distance) in enumerate(rows, start=1):
                    # score = 1 - distance. That is a cosine similarity only
                    # when the collection uses a cosine distance.
                    # NOTE(review): VectorStoreDB.createCollection does not
                    # request a distance space, so the collection's default
                    # metric applies — confirm it before interpreting scores
                    # or tuning score_threshold.
                    similarity_score = 1 - distance

                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': rank
                        })

                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")

            return retrieved_docs

        except Exception as e:
            # Best effort: a failed query is reported and yields no results.
            print(f"Error during retrieval: {e}")
            return []

# Wire the persistent store and the embedding model into one retriever.
rag_retriever = RAGRetriever(vector_store=vectorStoreDB, embedding_manager=embeddingManager)

In [18]:
# Smoke test: a short phrase that appears verbatim in the book.
rag_retriever.retrieve(query="Dead flies")

Retrieving documents for query: 'Dead flies'
Top K: 5, Score threshold: 0.0


Batches: 100%|██████████| 1/1 [00:00<00:00, 81.11it/s]

Generated Embeddings with shape:  (1, 384)
Retrieved 5 documents (after filtering)





[{'id': 'doc_d8396262_2868',
  'content': 'With some interesting stuff, \nFor now they’re bare and full of air, \nDead flies and bits of fluff,',
  'metadata': {'content_length': 98,
   'file_type': 'pdf',
   'subject': "When a letter arrives for unhappy but ordinary Harry Potter, a decade-old secret is revealed to him. His parents were wizards, killed by a Dark Lord's curse when Harry was just a baby, and which he somehow survived. Escaping from his unbearable Muggle guardians to Hogwarts, a wizarding school brimming with ghosts and enchantments, Harry stumbles into a sinister adventure when he finds a threeheaded dog guarding a room on the third floor. Then he hears of a missing stone with astonishing powers which could be valuable, dangerous, or both.",
   'doc_index': 2868,
   'file_path': '..\\data\\pdf\\harry-potter-and-the-philosophers-stone-by-jk-rowling.pdf',
   'creationdate': '2012-05-01T23:57:27+10:00',
   'source': '..\\data\\pdf\\harry-potter-and-the-philosophers-stone-by

In [19]:
import os
from dotenv import load_dotenv
load_dotenv()

# Report only whether the key is configured: printing the value itself would
# store the secret in the notebook's saved output.
print("GROQ_API_KEY is set" if os.getenv("GROQ_API_KEY") else "GROQ_API_KEY is not set")


[REDACTED: Groq API key removed from the saved output — revoke and rotate this key]


In [20]:
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
from langchain.schema import HumanMessage, SystemMessage

In [21]:
class GroqLLM:
    """Small wrapper around ``ChatGroq`` that answers questions from a context string."""

    def __init__(self, model_name: str = "gemma2-9b-it", api_key: str =None):
        """
        Create the underlying ``ChatGroq`` client.

        Args:
            model_name: Groq model name passed to ``ChatGroq``
            api_key: Groq API key; when empty, the ``GROQ_API_KEY``
                environment variable is used instead

        Raises:
            ValueError: If no API key is available from either source
        """
        resolved_key = api_key if api_key else os.environ.get("GROQ_API_KEY")
        self.model_name = model_name
        self.api_key = resolved_key

        if not resolved_key:
            raise ValueError("Groq API key is required. Set GROQ_API_KEY environment variable or pass api_key parameter.")

        # Fixed generation settings: temperature 0.1, at most 1024 tokens.
        client_options = dict(
            groq_api_key=self.api_key,
            model_name=self.model_name,
            temperature=0.1,
            max_tokens=1024,
        )
        self.llm = ChatGroq(**client_options)

        print(f"Initialized Groq LLM with model: {self.model_name}")

    def _ask(self, prompt_text: str, error_prefix: str) -> str:
        """Send one human message; return the reply text or a prefixed error string."""
        try:
            reply = self.llm.invoke([HumanMessage(content=prompt_text)])
            return reply.content
        except Exception as exc:
            # Failures are reported as text rather than raised.
            return f"{error_prefix}{str(exc)}"

    def generate_response(self, query: str, context: str, max_length: int = 500) -> str:
        """
        Answer ``query`` from ``context`` using the instruction-style prompt.

        Args:
            query: User question
            context: Retrieved document context
            max_length: Accepted for interface compatibility; this method
                does not read it

        Returns:
            The model's reply text, or ``"Error generating response: ..."``
            when the model call fails
        """
        template_text = (
            "You are a helpful AI assistant. Use the following context to answer the question accurately and concisely.\n"
            "\n"
            "Context:\n"
            "{context}\n"
            "\n"
            "Question: {question}\n"
            "\n"
            "Answer: Provide a clear and informative answer based on the context above. If the context doesn't contain enough information to answer the question, say so."
        )
        prompt_template = PromptTemplate(
            input_variables=["context", "question"],
            template=template_text,
        )

        # Template construction and formatting stay outside the error
        # handling, so problems there still raise.
        formatted_prompt = prompt_template.format(context=context, question=query)
        return self._ask(formatted_prompt, "Error generating response: ")

    def generate_response_simple(self, query: str, context: str) -> str:
        """
        Answer ``query`` from ``context`` using a minimal prompt.

        Args:
            query: User question
            context: Retrieved context

        Returns:
            The model's reply text, or ``"Error: ..."`` when the model call fails
        """
        simple_prompt = f"Based on this context: {context}\n\nQuestion: {query}\n\nAnswer:"
        return self._ask(simple_prompt, "Error: ")
    

In [22]:
# Initialize Groq LLM (you'll need to set GROQ_API_KEY environment variable)
try:
    groq_llm = GroqLLM(api_key=os.getenv("GROQ_API_KEY"))
except ValueError as missing_key_error:
    # No key available: keep the notebook running without an LLM.
    groq_llm = None
    print(f"Warning: {missing_key_error}")
    print("Please set your GROQ_API_KEY environment variable to use the LLM.")
else:
    print("Groq LLM initialized successfully!")

Initialized Groq LLM with model: gemma2-9b-it
Groq LLM initialized successfully!


In [23]:
# Inspect the raw retrieval results before handing them to an LLM.
rag_retriever.retrieve(query="Explain story part of the Sorting Hat in Harry Potter")

Retrieving documents for query: 'Explain story part of the Sorting Hat in Harry Potter'
Top K: 5, Score threshold: 0.0


Batches: 100%|██████████| 1/1 [00:00<00:00, 72.12it/s]

Generated Embeddings with shape:  (1, 384)
Retrieved 5 documents (after filtering)





[{'id': 'doc_ea62276f_2651',
  'content': 'The Sorting Hat \n89 \n \n‘So we’ve just got to try on the hat!’ Ron whispered to Harry.',
  'metadata': {'doc_id': 'doc_ea62276f_2651',
   'modDate': "D:20120701142039+10'00'",
   'author': 'J.K. Rowling',
   'format': 'PDF 1.4',
   'producer': 'Acrobat Distiller 7.0.5 (Windows)',
   'page': 90,
   'file_path': '..\\data\\pdf\\harry-potter-and-the-philosophers-stone-by-jk-rowling.pdf',
   'file_type': 'pdf',
   'creator': 'PScript5.dll Version 5.2',
   'content_length': 85,
   'subject': "When a letter arrives for unhappy but ordinary Harry Potter, a decade-old secret is revealed to him. His parents were wizards, killed by a Dark Lord's curse when Harry was just a baby, and which he somehow survived. Escaping from his unbearable Muggle guardians to Hogwarts, a wizarding school brimming with ghosts and enchantments, Harry stumbles into a sinister adventure when he finds a threeheaded dog guarding a room on the third floor. Then he hears of a m

In [26]:
### Simple RAG pipeline with Groq LLM
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
load_dotenv()

### Build the Groq chat model; the key is read from GROQ_API_KEY
groq_api_key = os.getenv("GROQ_API_KEY")

llm = ChatGroq(
    groq_api_key=groq_api_key,
    model_name="llama-3.1-8b-instant",
    temperature=0.1,
    max_tokens=1024,
)

## 2. Simple RAG function: retrieve context + generate response
def rag_simple(query,retriever,llm,top_k=3):
    """Answer ``query`` with the LLM, using retrieved chunks as context.

    Args:
        query: The user question.
        retriever: Object whose ``retrieve(query, top_k=...)`` returns a list
            of dicts that each carry a ``'content'`` entry.
        llm: Object whose ``invoke(...)`` returns a reply with ``.content``.
        top_k: Number of chunks requested from the retriever.

    Returns:
        The reply's ``content``, or a fixed notice when no context was found.
    """
    ## retrieve the context
    results = retriever.retrieve(query, top_k=top_k)
    context = "\n\n".join(doc['content'] for doc in results) if results else ""
    if not context:
        return "No relevant context found to answer the question."

    ## generate the answer using GROQ LLM
    # The f-string already holds the final prompt. It must not be passed
    # through str.format() again: any "{" or "}" in the retrieved text or in
    # the question would be parsed as a placeholder and raise or be altered.
    prompt=f"""Use the following context to answer the question concisely.
        Context:
        {context}

        Question: {query}

        Answer:"""

    response = llm.invoke([prompt])
    return response.content

In [None]:
answer = rag_simple(
    query="Explain story part of the Sorting Hat in Harry Potter.",
    retriever=rag_retriever,
    llm=llm,
)
print(answer)



Retrieving documents for query: 'Explain story part of the Sorting Hat in Harry Potter.'
Top K: 3, Score threshold: 0.0


Batches: 100%|██████████| 1/1 [00:00<00:00, 158.86it/s]

Generated Embeddings with shape:  (1, 384)
Retrieved 3 documents (after filtering)





In the Harry Potter series, the Sorting Hat is a magical hat that sorts first-year students into four houses: Gryffindor, Slytherin, Ravenclaw, and Hufflepuff. The hat is placed on a student's head, and it reads their thoughts and personality to determine which house they would best fit into.


In [28]:
answer = rag_simple(
    query="How Harry joined School?.",
    retriever=rag_retriever,
    llm=llm,
)
print(answer)


Retrieving documents for query: 'How Harry joined School?.'
Top K: 3, Score threshold: 0.0


Batches: 100%|██████████| 1/1 [00:00<00:00, 90.89it/s]

Generated Embeddings with shape:  (1, 384)
Retrieved 3 documents (after filtering)





There is no information in the given context about Harry joining school. The context only mentions Harry wondering what a wizard does after finishing school.


In [31]:
answer = rag_simple(
    query="How was Harry's childhood?",
    retriever=rag_retriever,
    llm=llm,
)
print(answer)


Retrieving documents for query: 'How was Harry's childhood?'
Top K: 3, Score threshold: 0.0


Batches: 100%|██████████| 1/1 [00:00<00:00, 153.50it/s]

Generated Embeddings with shape:  (1, 384)
Retrieved 3 documents (after filtering)





Harry's childhood was marked by a traumatic event, as he survived an attempt on his life as a baby, and then escaped from the person responsible.


In [33]:
answer = rag_simple(
    query="Explain Halloween story of Harry?",
    retriever=rag_retriever,
    llm=llm,
)
print(answer)

Retrieving documents for query: 'Explain Halloween story of Harry?'
Top K: 3, Score threshold: 0.0


Batches: 100%|██████████| 1/1 [00:00<00:00, 93.57it/s]

Generated Embeddings with shape:  (1, 384)
Retrieved 3 documents (after filtering)





The Halloween story of Harry refers to the events in the book "Harry Potter and the Philosopher's Stone" (published as "Harry Potter and the Sorcerer's Stone" in the United States) by J.K. Rowling. 

On three consecutive nights, Harry sneaks out of his dormitory to explore the school, Hogwarts, and discovers a mysterious three-headed dog guarding a trapdoor that leads to a hidden passageway.
