### RAG Pipeline- DATa ingestion to vector db

In [5]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

In [6]:
def process_all_pdfs(pdf_directory):
    """process all pdf files"""
    all_docs=[]
    pdf_dir=Path(pdf_directory)

    pdf_files=list(pdf_dir.glob("**/*.pdf"))

    print(f"found {len(pdf_files)} PDf files to process")

    for pdf_file in pdf_files:
        print(f"processing:{pdf_file.name}")

        try:
            loader=PyPDFLoader(str(pdf_file))
            documents=loader.load()

            for doc in documents:
                doc.metadata['source_file']=pdf_file.name
                doc.metadata['file_type']='pdf'

            all_docs.extend(documents)
            print(f"Loaded {len(documents)} pages")

        except Exception as e:
            print("error")

    print(f"total docs loaded: {len(documents)}")
    return all_docs

all_pdf_docs=process_all_pdfs("../data")


found 2 PDf files to process
processing:Jasmine-Resume.pdf
Loaded 1 pages
processing:Project - Manufacturing.pdf
Loaded 5 pages
total docs loaded: 5


In [7]:
def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=chunk_size,chunk_overlap=chunk_overlap
    ,separators=["\n\n","\n"," ."," .","!","?"], length_function=len)
    split_documents=text_splitter.split_documents(documents)

    print(f"Number of documents: {len(documents)}")
    print(f"Number of split documents: {len(split_documents)}")


    if split_documents:
        print("Split documents:")
        print(f"Content of first split document: {split_documents[0].page_content}")
        print(f"Metadata of first split document: {split_documents[0].metadata}")
    return split_documents


In [8]:
split_documents(all_pdf_docs,chunk_size=100,chunk_overlap=20)

Number of documents: 6
Number of split documents: 147
Split documents:
Content of first split document: Jasmine Kaur 
+1 5195338833|  jasminkaur5858@gmail.com | LinkedIn | Website  | GitHub 
EDUCATION
Metadata of first split document: {'producer': 'Adobe PDF Library 25.1.5', 'creator': 'Acrobat PDFMaker 25 for Word', 'creationdate': '2026-01-15T11:17:28-05:00', 'author': 'JakesResume', 'comments': '', 'company': '', 'keywords': '', 'moddate': '2026-01-15T11:17:35-05:00', 'sourcemodified': 'D:20260113213020', 'subject': '', 'title': '', 'source': '..\\data\\pdf_files\\Jasmine-Resume.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1', 'source_file': 'Jasmine-Resume.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'Adobe PDF Library 25.1.5', 'creator': 'Acrobat PDFMaker 25 for Word', 'creationdate': '2026-01-15T11:17:28-05:00', 'author': 'JakesResume', 'comments': '', 'company': '', 'keywords': '', 'moddate': '2026-01-15T11:17:35-05:00', 'sourcemodified': 'D:20260113213020', 'subject': '', 'title': '', 'source': '..\\data\\pdf_files\\Jasmine-Resume.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1', 'source_file': 'Jasmine-Resume.pdf', 'file_type': 'pdf'}, page_content='Jasmine Kaur \n+1 5195338833|  jasminkaur5858@gmail.com | LinkedIn | Website  | GitHub \nEDUCATION'),
 Document(metadata={'producer': 'Adobe PDF Library 25.1.5', 'creator': 'Acrobat PDFMaker 25 for Word', 'creationdate': '2026-01-15T11:17:28-05:00', 'author': 'JakesResume', 'comments': '', 'company': '', 'keywords': '', 'moddate': '2026-01-15T11:17:35-05:00', 'sourcemodified': 'D:20260113213020', 'subject': '', 'title': '', 'source': '..\\data\\pdf_files\\Jasmine-Resume.pdf', 'total_pages': 1, 'p

In [1]:
import numpy as np 
from sentence_transformers import SentenceTransformer 
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class EmbeddingManager:
    """Handles doc embedding generation using sentencetransformer"""

    def __init__(self, model_name:str="all-MiniLM-L6-v2"):
        """Initialize the mebedding manager
        Args:
            model_name: huggingface model name for sentence embeddings
        """

        self.model_name=model_name
        self.model=None
        self._load_model()

    def _load_model(self):
        """Load the SentenceTransformer"""
        try:
            print(f"Loading embedded model: {self.model_name}")
            self.model=SentenceTransformer(self.model_name)
            print(f"model loaded successfully. embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {str(e)}")
            raise

    def generate_embeddings(self,texts:List[str])->np.ndarray:
        """Generate embeddings for a list of texts

        Args:
            texts: List of text strings to embed

        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim)
        """
        if not self.model:
            raise ValueError("Model not loaded. Please call _load_model() first")

        print(f"Generating embeddings for {len(texts)} texts")
        embeddings=self.model.encode(texts, show_progress_bar=True)
        print(f"generated embeddings with shape: {embeddings.shape}")
        return embeddings



In [3]:
embedding_manager = EmbeddingManager()


Loading embedded model: all-MiniLM-L6-v2


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model loaded successfully. embedding dimension: 384


In [None]:
class VectorStore:
    """Manages document embedding in a chromadb vector store"""
    def __init__(self,collection_name:str="pdf_dcouments",persist_directory:str="../data/vector_store"):
        """Intialize the vector store"""
        self.collection_name=collection_name
        self.persist_directory=persist_directory
        self.client=None
        self.collection=None
        self._initalize_store()

    def _intialize_store(self):
        """Initialize the chromadb client and collection"""
        try:
            os.makedirs(self.persist_directory,exist_ok=True)
            self.client=chromadb.PersistentClient(path=self.persist_directory)
            self.collection=self.client.get_or_create_collection(name=self.collection_name,metadata={"description":"pdf document loading for RAG"})
            print(f"Vector store initialized successfully. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
        except Exception as e:
            print(f"Error initializing vector store: {str(e)}")
            raise
            