## Data Ingestion   

In [12]:
from langchain.schema import Document
from langchain_core.documents import Document
from typing import List, Dict, Any


In [2]:
# These metadata fields are optional and can be omitted. This is helpful for the RAG pipeline to track the source of the document,
# Also for filtering the documents
document = Document(
    page_content="Hello, world!", 
    metadata={
        "source": "example.pdf",
        "pages": 1,
        "author": "Sunil J",
        "date": "2021-01-01"  
        }
    )

print(document)

page_content='Hello, world!' metadata={'source': 'example.pdf', 'pages': 1, 'author': 'Sunil J', 'date': '2021-01-01'}


In [3]:
### Using text loader to load the data from the file
from langchain.document_loaders import TextLoader

loader = TextLoader("../data/koh_samet.txt",encoding="utf-8")
documents = loader.load()

print(documents)

[Document(metadata={'source': '../data/koh_samet.txt'}, page_content='Koh Samet is a beautiful tropical island located about three hours from Bangkok, known for its stunning white sandy beaches, clear turquoise waters, and peaceful atmosphere. This small island offers a perfect escape with its lush jungles, rocky outcrops, fresh seafood, and a mix of relaxing bars and resorts, making it an ideal destination for both adventure and relaxation.\n\nIntroduction to Koh Samet\n\nKoh Samet is a charming island situated in the Gulf of Thailand, within Rayong province. It lies approximately 220 km from Bangkok and just 75 km from Pattaya, making it a popular weekend getaway for locals and travelers seeking a break from city life. Despite its proximity to major cities, Koh Samet remains a hidden gem, known for its idyllic beaches, fresh seafood, and laid-back vibe.\n\nGetting There\n\nFrom Bangkok to Ban Phe Pier: The journey to Koh Samet begins with a 2-3 hour trip from Bangkok to Ban Phe Pier 

In [4]:
### Using directory loader to load the text files from the directory
from langchain_community.document_loaders import DirectoryLoader

loader = DirectoryLoader(
    "../data",
    glob="*.txt",
    loader_cls=TextLoader,
    show_progress=True,
    )
documents = loader.load()

print(documents)

100%|██████████| 2/2 [00:00<00:00, 648.47it/s]

[Document(metadata={'source': '../data/koh_samet.txt'}, page_content='Koh Samet is a beautiful tropical island located about three hours from Bangkok, known for its stunning white sandy beaches, clear turquoise waters, and peaceful atmosphere. This small island offers a perfect escape with its lush jungles, rocky outcrops, fresh seafood, and a mix of relaxing bars and resorts, making it an ideal destination for both adventure and relaxation.\n\nIntroduction to Koh Samet\n\nKoh Samet is a charming island situated in the Gulf of Thailand, within Rayong province. It lies approximately 220 km from Bangkok and just 75 km from Pattaya, making it a popular weekend getaway for locals and travelers seeking a break from city life. Despite its proximity to major cities, Koh Samet remains a hidden gem, known for its idyllic beaches, fresh seafood, and laid-back vibe.\n\nGetting There\n\nFrom Bangkok to Ban Phe Pier: The journey to Koh Samet begins with a 2-3 hour trip from Bangkok to Ban Phe Pier 




In [5]:
# read my excel file from the data/excel folder using langchain excel loader
from langchain_community.document_loaders import UnstructuredExcelLoader    

loader = UnstructuredExcelLoader("../data/excel/top_10_countries_population.xlsx")
documents = loader.load()
print(len(documents))
documents

1


[Document(metadata={'source': '../data/excel/top_10_countries_population.xlsx'}, page_content='Rank Country Population (millions) 1 China 1425 2 India 1417 3 United States 339 4 Indonesia 277 5 Pakistan 240 6 Nigeria 223 7 Brazil 216 8 Bangladesh 174 9 Russia 144 10 Mexico 130')]

In [6]:
# need pdf loader to load the pdf files from the data/pdf folder
from langchain_community.document_loaders import PyMuPDFLoader

dir_loader = DirectoryLoader(
    "../data/pdf_documents",glob="*.pdf",
    loader_cls=PyMuPDFLoader,
    show_progress=True,
    )
pdf_documents = dir_loader.load()
pdf_documents


100%|██████████| 1/1 [00:02<00:00,  2.23s/it]


[Document(metadata={'producer': 'Acrobat Pro DC 20.6.20042', 'creator': 'Word', 'creationdate': '2020-03-26T13:14:28-05:00', 'source': '../data/pdf_documents/Physics.pdf', 'file_path': '../data/pdf_documents/Physics.pdf', 'total_pages': 850, 'format': 'PDF 1.6', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2020-08-05T14:54:05-05:00', 'trapped': '', 'modDate': "D:20200805145405-05'00'", 'creationDate': "D:20200326131428-05'00'", 'page': 0}, page_content='HIGH SCHOOL'),
 Document(metadata={'producer': 'Acrobat Pro DC 20.6.20042', 'creator': 'Word', 'creationdate': '2020-03-26T13:14:28-05:00', 'source': '../data/pdf_documents/Physics.pdf', 'file_path': '../data/pdf_documents/Physics.pdf', 'total_pages': 850, 'format': 'PDF 1.6', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2020-08-05T14:54:05-05:00', 'trapped': '', 'modDate': "D:20200805145405-05'00'", 'creationDate': "D:20200326131428-05'00'", 'page': 1}, page_content=''),
 Document(metad

In [7]:
### Text splitting get into chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter  
def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """Split documents into smaller chunks for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    
    # Show example of a chunk
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    
    return split_docs

In [8]:
chunks=split_documents(pdf_documents)
chunks

Split 850 documents into 2799 chunks

Example chunk:
Content: HIGH SCHOOL...
Metadata: {'producer': 'Acrobat Pro DC 20.6.20042', 'creator': 'Word', 'creationdate': '2020-03-26T13:14:28-05:00', 'source': '../data/pdf_documents/Physics.pdf', 'file_path': '../data/pdf_documents/Physics.pdf', 'total_pages': 850, 'format': 'PDF 1.6', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2020-08-05T14:54:05-05:00', 'trapped': '', 'modDate': "D:20200805145405-05'00'", 'creationDate': "D:20200326131428-05'00'", 'page': 0}


[Document(metadata={'producer': 'Acrobat Pro DC 20.6.20042', 'creator': 'Word', 'creationdate': '2020-03-26T13:14:28-05:00', 'source': '../data/pdf_documents/Physics.pdf', 'file_path': '../data/pdf_documents/Physics.pdf', 'total_pages': 850, 'format': 'PDF 1.6', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2020-08-05T14:54:05-05:00', 'trapped': '', 'modDate': "D:20200805145405-05'00'", 'creationDate': "D:20200326131428-05'00'", 'page': 0}, page_content='HIGH SCHOOL'),
 Document(metadata={'producer': 'Acrobat Pro DC 20.6.20042', 'creator': 'Word', 'creationdate': '2020-03-26T13:14:28-05:00', 'source': '../data/pdf_documents/Physics.pdf', 'file_path': '../data/pdf_documents/Physics.pdf', 'total_pages': 850, 'format': 'PDF 1.6', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2020-08-05T14:54:05-05:00', 'trapped': '', 'modDate': "D:20200805145405-05'00'", 'creationDate': "D:20200326131428-05'00'", 'page': 2}, page_content='Physics \n \n \n \n

In [15]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity


  from .autonotebook import tqdm as notebook_tqdm


In [16]:
import numpy as np
class EmbeddingManager:
    """
    This class is used to manage the embeddings for the documents.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = None
        self.load_model()
    
    def load_model(self):
        try:
            print(f"Loading model {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model {self.model_name} loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, documents: List[str]) -> np.ndarray:
        """
        Generate embeddings for the documents.
        np.ndarray of shape (n_documents, n_dimensions)
        """
        print(f"Generating embeddings for {len(documents)} documents")
        embeddings = self.model.encode(documents, show_progress_bar=True)
        print(f"Embeddings generated for {len(documents)} documents with shape {embeddings.shape}")
        return embeddings

# Initialize embeddings manager
embeddings_manager = EmbeddingManager()
embeddings_manager
    

Loading model all-MiniLM-L6-v2
Model all-MiniLM-L6-v2 loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x13722ac90>

In [17]:
## Vector Store
import os
class VectorStoreManager:
    """
    This class is used to manage the vector store.
    """

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str | None = None):
        self.collection_name = collection_name
        # Resolve absolute path to avoid readonly issues from relative CWDs
        if persist_directory is None:
            self.persist_directory = os.path.abspath(os.path.join(os.getcwd(), "..", "data", "vector_store"))
        else:
            self.persist_directory = os.path.abspath(persist_directory)
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """
        Initialize the store if it doesn't exist.
        """
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            # Ensure directory is writable (rwx for owner, rx for group/others)
            try:
                os.chmod(self.persist_directory, 0o755)
            except Exception:
                pass
            # Use PersistentClient for proper persistence
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            self.collection = self.client.get_or_create_collection(
                self.collection_name,
                metadata={
                    "description": "This is a collection of documents for the PDF files"
                }
            )
            print(f"Collection {self.collection_name} created successfully at {self.persist_directory}.")
            print(f"Collection {self.collection_name} has {self.collection.count()} documents.")
        except Exception as e:
            print(f"Error creating directory {self.persist_directory}: {e}")
            raise

    def add_documents(self, documents: List[Document], embeddings: np.ndarray):
        """
        Add documents to the collection.
        """
        ids = [str(uuid.uuid4()) for _ in range(len(documents))]
        texts = [doc.page_content for doc in documents]
        metadatas = [doc.metadata for doc in documents]
        self.collection.add(
            ids=ids,
            documents=texts,
            metadatas=metadatas,
            embeddings=embeddings
        )
        print(f"Added {len(documents)} documents to the collection {self.collection_name}.")
        print(f"Collection {self.collection_name} has {self.collection.count()} documents.")

vector_store_manager = VectorStoreManager()
vector_store_manager



Collection pdf_documents created successfully at /Users/suniljacob/01-gitDownloads/02-build-rag-pipeline/data/vector_store.
Collection pdf_documents has 850 documents.


<__main__.VectorStoreManager at 0x138202f50>

In [18]:
chunks

[Document(metadata={'producer': 'Acrobat Pro DC 20.6.20042', 'creator': 'Word', 'creationdate': '2020-03-26T13:14:28-05:00', 'source': '../data/pdf_documents/Physics.pdf', 'file_path': '../data/pdf_documents/Physics.pdf', 'total_pages': 850, 'format': 'PDF 1.6', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2020-08-05T14:54:05-05:00', 'trapped': '', 'modDate': "D:20200805145405-05'00'", 'creationDate': "D:20200326131428-05'00'", 'page': 0}, page_content='HIGH SCHOOL'),
 Document(metadata={'producer': 'Acrobat Pro DC 20.6.20042', 'creator': 'Word', 'creationdate': '2020-03-26T13:14:28-05:00', 'source': '../data/pdf_documents/Physics.pdf', 'file_path': '../data/pdf_documents/Physics.pdf', 'total_pages': 850, 'format': 'PDF 1.6', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2020-08-05T14:54:05-05:00', 'trapped': '', 'modDate': "D:20200805145405-05'00'", 'creationDate': "D:20200326131428-05'00'", 'page': 2}, page_content='Physics \n \n \n \n

In [19]:
texts = [doc.page_content for doc in pdf_documents]

# Generate embeddings for the texts

embeddings = embeddings_manager.generate_embeddings(texts)

# store in vector store
vector_store_manager.add_documents(pdf_documents,embeddings)


Generating embeddings for 850 documents


Batches: 100%|██████████| 27/27 [01:17<00:00,  2.88s/it]


Embeddings generated for 850 documents with shape (850, 384)
Added 850 documents to the collection pdf_documents.
Collection pdf_documents has 1700 documents.


In [20]:
# Verify vector store contents
print(f"Collection count: {vector_store_manager.collection.count()}")

# Get all documents from the collection
all_docs = vector_store_manager.collection.get()
print(f"Retrieved {len(all_docs['ids'])} documents")
print("Document IDs:", all_docs['ids'])
#print("First document preview:", all_docs['documents'][0][:200] + "..." if all_docs['documents'] else "No documents")

# Test a simple query
query_text = "What is Electric Field?"
query_embedding = embeddings_manager.generate_embeddings([query_text])
results = vector_store_manager.collection.query(
    query_embeddings=query_embedding.tolist(),
    n_results=2
)
print(f"\nQuery results for '{query_text}':")
for i, (doc, dist) in enumerate(zip(results['documents'][0], results['distances'][0])):
    print(f"Result {i+1} (distance: {dist:.3f}): {doc[:10]}...")


Collection count: 1700
Retrieved 1700 documents
Document IDs: ['ec1479e4-6ea8-4c2f-b943-a7a7d6082fcf', 'cfe8a201-9680-4422-b506-b593302d3075', 'e1577b16-9319-4058-8276-931c6690706d', 'd3004e68-8144-4f90-b978-1584b4870dbc', 'dfee8757-eeb9-41fe-8cd1-5c7f81772ad3', 'e93c0b83-569e-47da-aea0-bfabad0877d8', '34c2094d-8022-4713-a463-aa1fe7af825c', '12b99e65-5f13-448f-9123-a547ae3736ee', 'babeee37-47d1-46dd-9e42-16c08af224f4', '9525d8c0-43fb-4936-b7cd-c118b85f44f9', 'cb5560fb-6f1d-4a5f-9983-b75a60603b40', '0e14a3a5-fa48-4341-834a-24726f8a35f1', 'cc47e01c-db2c-48ee-ace7-d3a34cf7837c', '88088315-3572-4c53-b795-792bdaca3295', 'e7c7afbf-e452-4978-8b01-445c076d983f', '03f6f025-4e3e-49ce-9580-ebc1bca06b7f', 'a7717c1a-f94e-410a-a4e7-0ba4ce501c32', '1f549166-d268-4e9c-87c0-c415a4fd5eb6', '11147918-d845-4f0a-80cd-e58c06464129', '44f4e093-530b-4d84-8384-6f4453e69dc9', 'dd9d1370-7191-4c34-9bfb-e9da2e394184', 'fcb2046d-9e6d-4c07-8405-92b38c5558c3', '8d2a59a4-37ff-4a6a-ae38-dba3a8c26476', 'df04e53a-779f-46

Batches: 100%|██████████| 1/1 [00:00<00:00, 77.40it/s]

Embeddings generated for 1 documents with shape (1, 384)

Query results for 'What is Electric Field?':
Result 1 (distance: 1.009): This equat...
Result 2 (distance: 1.009): This equat...





In [24]:
class RAGRetriever:
    """Handles query-based retrieval from the vector store"""
    
    def __init__(self, vector_store: VectorStoreManager, embedding_manager: EmbeddingManager):
        """
        Initialize the retriever
        
        Args:
            vector_store: Vector store containing document embeddings
            embedding_manager: Manager for generating query embeddings
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query
        
        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold
            
        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")
        
        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        
        # Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            
            # Process results
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance
                    
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            
            return retrieved_docs
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

rag_retriever=RAGRetriever(vector_store_manager,embeddings_manager)
rag_retriever

<__main__.RAGRetriever at 0x139040790>

In [26]:
rag_retriever.retrieve("What is Physics?")

Retrieving documents for query: 'What is Physics?'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 documents


Batches: 100%|██████████| 1/1 [00:00<00:00, 18.64it/s]

Embeddings generated for 1 documents with shape (1, 384)
Retrieved 5 documents (after filtering)





[{'id': '814620db-77b4-4a0a-bb7c-a0deb6c363ea',
  'content': 'for different observers in relative motion\nuncertainty\na quantitative measure of how much measured\nvalues deviate from a standard or expected value\nuniversal\napplies throughout the known universe\ny-intercept\nthe point where a plot line intersects the y-axis\nSECTION SUMMARY\n1.1 Physics: Definitions and\nApplications\n•\nPhysics is the most fundamental of the sciences,\nconcerning itself with energy, matter, space and time,\nand their interactions.\n•\nModern physics involves the theory of relativity, which\ndescribes how time, space and gravity are not constant\nin our universe can be different for different observers,\nand quantum mechanics, which describes the behavior\nof subatomic particles.\n•\nPhysics is the basis for all other sciences, such as\nchemistry, biology and geology, because physics\ndescribes the fundamental way in which the universe\nfunctions.\n1.2 The Scientific Methods\n•\nScience seeks to disco

In [27]:
rag_retriever.retrieve("What are the properties of Maxwell's equations?")

Retrieving documents for query: 'What are the properties of Maxwell's equations?'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 documents


Batches: 100%|██████████| 1/1 [00:00<00:00, 28.94it/s]

Embeddings generated for 1 documents with shape (1, 384)
Retrieved 2 documents (after filtering)





[{'id': 'e4331f0b-baca-4602-9e4e-bf90d0684efc',
  'content': 'nineteenth century. Although he died young, Maxwell not only formulated a complete electromagnetic theory, represented by\nMaxwell’s equations, he also developed the kinetic theory of gases, and made significant contributions to the understanding of\ncolor vision and the nature of Saturn’s rings.\nMaxwell brought together all the work that had been done by brilliant physicists, such as Ørsted, Coulomb, Ampere, Gauss, and\nFaraday, and added his own insights to develop the overarching theory of electromagnetism. Maxwell’s equations are\nparaphrased here in words because their mathematical content is beyond the level of this text. However, the equations illustrate\nhow apparently simple mathematical statements can elegantly unite and express a multitude of concepts—why mathematics is\nthe language of science.\nMaxwell’s Equations\n1.\nElectric field lines originate on positive charges and terminate on negative charges. The ele

In [28]:
# Augumented Generation
## Simple RAG Pipeline with Groq LLM
from dotenv import load_dotenv
from langchain_groq import ChatGroq
import os

load_dotenv()
GROQ_API_KEY=os.getenv("GROQ_API_KEY")


groq_llm = ChatGroq(
    model_name="gemma2-9b-it",
    api_key=os.getenv(GROQ_API_KEY),
    temperature=0.1,
    max_tokens=1024,
)


In [30]:
# retrieve context + generate response

def rag_chain(question: str, retriever,llm,top_k:int=2):
    results = retriever.retrieve(question,top_k=top_k)
    print(results)
    context = "\n".join([doc["content"] for doc in results]) if results else ""
    if not context:
        return "No context found"
    rag_prompt = """
    You are a helpful assistant that can answer questions about the context provided.
    Context: {context}
    Question: {question}
    """
    response = llm.invoke(rag_prompt.format(context=context,question=question))
    return response.content

rag_chain("What is potential energy?",rag_retriever,groq_llm)
    


Retrieving documents for query: 'What is potential energy?'
Top K: 2, Score threshold: 0.0
Generating embeddings for 1 documents


Batches: 100%|██████████| 1/1 [00:00<00:00, 36.97it/s]

Embeddings generated for 1 documents with shape (1, 384)
Retrieved 2 documents (after filtering)
[{'id': '8ab3538a-2944-4a14-8dd1-c79bf9b8dd5c', 'content': 'Figure 18.23 The potential energy depends on the sign of the charges and their separation. The arrows on the charges indicate the\ndirection in which the charges would move if released. When charges with the same sign are far apart, their potential energy is low, as\nshown in the top panel for two positive charges. The situation is the reverse for charges of opposite signs, as shown in the bottom panel.\nElectric Potential\nRecall that to find the force applied by a fixed charge Q on any arbitrary test charge q, it was convenient to define the electric\nfield, which is the force per unit charge applied by Q on any test charge that we place in its electric field. The same strategy is\nused here with electric potential energy: We now define the electric potential V, which is the electric potential energy per unit\ncharge.\nNormally, 




"The provided text defines potential energy in the context of electric charges. \n\nHere's a breakdown:\n\n* **Potential energy depends on:**\n    * The **sign** of the charges (whether they are positive or negative).\n    * Their **separation** (how far apart they are).\n\n* **When charges have the same sign:**\n    * They repel each other.\n    * Their potential energy is **high** when they are close together.\n    * Their potential energy is **low** when they are far apart.\n\n* **When charges have opposite signs:**\n    * They attract each other.\n    * Their potential energy is **low** when they are close together.\n    * Their potential energy is **high** when they are far apart.\n\n\nLet me know if you have any other questions about potential energy or the provided text! \n"