## Data Ingestion   

In [56]:
from langchain.schema import Document
from langchain_core.documents import Document


In [57]:
# Minimal Document example: the text lives in `page_content`, extra fields in `metadata`.
# The metadata keys below are optional and can be omitted; they help a RAG pipeline
# track where a document came from and filter documents later.
example_metadata = {
    "source": "example.pdf",
    "pages": 1,
    "author": "Sunil J",
    "date": "2021-01-01",
}
document = Document(page_content="Hello, world!", metadata=example_metadata)

print(document)

page_content='Hello, world!' metadata={'source': 'example.pdf', 'pages': 1, 'author': 'Sunil J', 'date': '2021-01-01'}


In [58]:
### Load a single text file with TextLoader
# Import from langchain_community, matching the other loader cells in this notebook;
# the `langchain.document_loaders` path is a deprecated alias of this module.
from langchain_community.document_loaders import TextLoader

# Read the file explicitly as UTF-8 so decoding does not depend on the platform default.
loader = TextLoader("../data/koh_samet.txt", encoding="utf-8")
documents = loader.load()

print(documents)

[Document(metadata={'source': '../data/koh_samet.txt'}, page_content='Koh Samet is a beautiful tropical island located about three hours from Bangkok, known for its stunning white sandy beaches, clear turquoise waters, and peaceful atmosphere. This small island offers a perfect escape with its lush jungles, rocky outcrops, fresh seafood, and a mix of relaxing bars and resorts, making it an ideal destination for both adventure and relaxation.\n\nIntroduction to Koh Samet\n\nKoh Samet is a charming island situated in the Gulf of Thailand, within Rayong province. It lies approximately 220 km from Bangkok and just 75 km from Pattaya, making it a popular weekend getaway for locals and travelers seeking a break from city life. Despite its proximity to major cities, Koh Samet remains a hidden gem, known for its idyllic beaches, fresh seafood, and laid-back vibe.\n\nGetting There\n\nFrom Bangkok to Ban Phe Pier: The journey to Koh Samet begins with a 2-3 hour trip from Bangkok to Ban Phe Pier 

In [59]:
### Load every .txt file directly under ../data with DirectoryLoader
from langchain_community.document_loaders import DirectoryLoader

loader = DirectoryLoader(
    "../data",
    glob="*.txt",
    loader_cls=TextLoader,
    # Forwarded to each TextLoader: pin UTF-8 (as in the single-file example) so
    # decoding does not fall back to the platform's default encoding.
    loader_kwargs={"encoding": "utf-8"},
    show_progress=True,
)
documents = loader.load()

print(documents)

100%|██████████| 2/2 [00:00<00:00, 1422.52it/s]

[Document(metadata={'source': '../data/koh_samet.txt'}, page_content='Koh Samet is a beautiful tropical island located about three hours from Bangkok, known for its stunning white sandy beaches, clear turquoise waters, and peaceful atmosphere. This small island offers a perfect escape with its lush jungles, rocky outcrops, fresh seafood, and a mix of relaxing bars and resorts, making it an ideal destination for both adventure and relaxation.\n\nIntroduction to Koh Samet\n\nKoh Samet is a charming island situated in the Gulf of Thailand, within Rayong province. It lies approximately 220 km from Bangkok and just 75 km from Pattaya, making it a popular weekend getaway for locals and travelers seeking a break from city life. Despite its proximity to major cities, Koh Samet remains a hidden gem, known for its idyllic beaches, fresh seafood, and laid-back vibe.\n\nGetting There\n\nFrom Bangkok to Ban Phe Pier: The journey to Koh Samet begins with a 2-3 hour trip from Bangkok to Ban Phe Pier 




In [60]:
# Load the Excel workbook under ../data/excel with LangChain's UnstructuredExcelLoader,
# then show how many Document objects came back followed by the documents themselves.
from langchain_community.document_loaders import UnstructuredExcelLoader

excel_path = "../data/excel/top_10_countries_population.xlsx"
loader = UnstructuredExcelLoader(excel_path)
documents = loader.load()
print(len(documents))
documents

1


[Document(metadata={'source': '../data/excel/top_10_countries_population.xlsx'}, page_content='Rank Country Population (millions) 1 China 1425 2 India 1417 3 United States 339 4 Indonesia 277 5 Pakistan 240 6 Nigeria 223 7 Brazil 216 8 Bangladesh 174 9 Russia 144 10 Mexico 130')]

In [61]:
# Load every PDF directly under ../data/pdf_documents, parsing each file with PyMuPDFLoader.
# (DirectoryLoader is imported in an earlier cell.)
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader

pdf_dir = "../data/pdf_documents"
dir_loader = DirectoryLoader(
    pdf_dir,
    glob="*.pdf",
    loader_cls=PyMuPDFLoader,
    show_progress=True,
)
pdf_documents = dir_loader.load()
# NOTE(review): displaying the full list writes the raw PDF text into the saved notebook
# output; the recorded output includes personal details, so clear it before sharing.
pdf_documents


100%|██████████| 3/3 [00:00<00:00, 114.64it/s]


[Document(metadata={'producer': 'OpenPDF 1.3.32', 'creator': '', 'creationdate': '2024-11-14T10:56:16+01:00', 'source': '../data/pdf_documents/swiss-pass-sunil.pdf', 'file_path': '../data/pdf_documents/swiss-pass-sunil.pdf', 'total_pages': 1, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': "D:20241114105616+01'00'", 'page': 0}, page_content='ĠġĢģ\nTicket-ID 496508225499\nSwiss Travel Pass 3 days\nJacob Sunil\nValid: 2025\n14.05.1980\nħ\nĨ\nvon/de/da/from\nì nach/a/a/to\nħ\nĨ\nKL.\nCL.\n28.04\n00:00 *\nì *\n01.05\n05:00\n2\n*\n*\n*\nì *\n*\n*\n*\nArea of validity and benefits see www.swisstravelpass.com. Valid as Swiss Museum Pass.\nOnly valid with your Passport / ID.\nAdults\nCHF 244.00\nB2P\n(2.)(GA)(8)\nArticle no.: 6704\nOrder no.: 115477989376\nincl. 8.10% VAT\nThe current tariff of Swiss transport companies, in particular the common ancillary tariff regulations for direct\nservice and regio

In [62]:
# Inspect the container type returned by dir_loader.load().
type(pdf_documents)

list

In [63]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity


In [64]:
class EmbeddingManager:
    """
    Wrap a SentenceTransformer model and turn texts into embedding vectors.

    The model is loaded in the constructor, so a successfully created instance
    is ready to encode.

    Args:
        model_name: Name of the sentence-transformers model to load.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = None
        self.load_model()

    def load_model(self):
        """
        Load the model named by ``self.model_name`` into ``self.model``.

        Raises:
            Exception: Any error raised while constructing the model; it is
                printed and then re-raised unchanged.
        """
        try:
            print(f"Loading model {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model {self.model_name} loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, documents: List[str]) -> np.ndarray:
        """
        Generate embeddings for the documents.

        Args:
            documents: List[str] of texts to embed. May be empty.
        Returns:
            np.ndarray of shape (n_documents, n_dimensions). For an empty input
            this is an array of shape (0, n_dimensions).
        Raises:
            RuntimeError: If no model is loaded.
        """
        if self.model is None:
            raise RuntimeError(f"Model {self.model_name} is not loaded; call load_model() first")

        print(f"Generating embeddings for {len(documents)} documents")
        if len(documents) == 0:
            # Skip the encoder for an empty batch: encode() is not guaranteed to
            # return a 2-D array in that case, so build the documented shape here.
            dimension = self.model.get_sentence_embedding_dimension() or 0
            embeddings = np.empty((0, dimension), dtype=np.float32)
        else:
            embeddings = self.model.encode(documents, show_progress_bar=True)
        print(f"Embeddings generated for {len(documents)} documents with shape {embeddings.shape}")
        return embeddings

# Initialize embeddings manager with the default model name (the constructor loads the model).
# The bare name on the last line makes the instance the cell's displayed output.
embeddings_manager = EmbeddingManager()
embeddings_manager
    

Loading model all-MiniLM-L6-v2
Model all-MiniLM-L6-v2 loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x141582290>

In [65]:
## Vector Store
import os
class VectorStoreManager:
    """
    This class is used to manage the vector store.
    """

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str | None = None):
        self.collection_name = collection_name
        # Resolve absolute path to avoid readonly issues from relative CWDs
        if persist_directory is None:
            self.persist_directory = os.path.abspath(os.path.join(os.getcwd(), "..", "data", "vector_store"))
        else:
            self.persist_directory = os.path.abspath(persist_directory)
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """
        Initialize the store if it doesn't exist.
        """
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            # Ensure directory is writable (rwx for owner, rx for group/others)
            try:
                os.chmod(self.persist_directory, 0o755)
            except Exception:
                pass
            # Use PersistentClient for proper persistence
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            self.collection = self.client.get_or_create_collection(
                self.collection_name,
                metadata={
                    "description": "This is a collection of documents for the PDF files"
                }
            )
            print(f"Collection {self.collection_name} created successfully at {self.persist_directory}.")
            print(f"Collection {self.collection_name} has {self.collection.count()} documents.")
        except Exception as e:
            print(f"Error creating directory {self.persist_directory}: {e}")
            raise

    def add_documents(self, documents: List[Document], embeddings: np.ndarray):
        """
        Add documents to the collection.
        """
        ids = [str(uuid.uuid4()) for _ in range(len(documents))]
        texts = [doc.page_content for doc in documents]
        metadatas = [doc.metadata for doc in documents]
        self.collection.add(
            ids=ids,
            documents=texts,
            metadatas=metadatas,
            embeddings=embeddings
        )
        print(f"Added {len(documents)} documents to the collection {self.collection_name}.")
        print(f"Collection {self.collection_name} has {self.collection.count()} documents.")

# Build the store with its defaults (collection "pdf_documents", default persist directory);
# the bare name on the last line makes the instance the cell's displayed output.
vector_store_manager = VectorStoreManager()
vector_store_manager



Collection pdf_documents created successfully at /Users/suniljacob/01-gitDownloads/02-build-rag-pipeline/data/vector_store.
Collection pdf_documents has 0 documents.


<__main__.VectorStoreManager at 0x14144bb10>

In [66]:
# Collect the raw text of every loaded PDF document.
texts = [pdf_doc.page_content for pdf_doc in pdf_documents]

# Embed the texts: one vector per document, in the same order.
embeddings = embeddings_manager.generate_embeddings(documents=texts)

# Persist documents together with their vectors.
# NOTE: add_documents assigns new random IDs on every call, so re-running this cell
# stores the same documents in the persisted collection again.
vector_store_manager.add_documents(documents=pdf_documents, embeddings=embeddings)

# TODO: add a query helper — VectorStoreManager defines no `query_collection` method,
# so for now queries have to go through `vector_store_manager.collection.query(...)`.

Generating embeddings for 3 documents


Batches: 100%|██████████| 1/1 [00:00<00:00,  2.95it/s]

Embeddings generated for 3 documents with shape (3, 384)
Added 3 documents to the collection pdf_documents.
Collection pdf_documents has 3 documents.





In [69]:
# Sanity-check what the vector store currently holds.
collection = vector_store_manager.collection
print(f"Collection count: {collection.count()}")

# get() without arguments returns the stored records; list their IDs.
all_docs = collection.get()
stored_ids = all_docs['ids']
print(f"Retrieved {len(stored_ids)} documents")
print("Document IDs:", stored_ids)

# Simple similarity query: embed the question, then ask for the 2 closest documents.
query_text = "What is this document about?"
query_embedding = embeddings_manager.generate_embeddings([query_text])
results = collection.query(
    query_embeddings=query_embedding.tolist(),
    n_results=2,
)
print(f"\nQuery results for '{query_text}':")
# Results are nested per query; index 0 picks the hits for the single query sent above.
top_documents = results['documents'][0]
top_distances = results['distances'][0]
for i, (doc, dist) in enumerate(zip(top_documents, top_distances)):
    print(f"Result {i+1} (distance: {dist:.3f}): {doc[:10]}...")


Collection count: 3
Retrieved 3 documents
Document IDs: ['fa731226-4840-4f0a-b6ec-25e47a95abce', 'be6bdc2e-bd3c-4539-89ab-fc7dd377573e', 'b065ed2a-1b64-411b-bf19-86611c1e45ab']
Generating embeddings for 1 documents


Batches: 100%|██████████| 1/1 [00:00<00:00, 36.01it/s]

Embeddings generated for 1 documents with shape (1, 384)

Query results for 'What is this document about?':
Result 1 (distance: 1.707): FORM NO. 1...
Result 2 (distance: 1.716): FORM NO. 1...



