RAG Piplelines - Data Ingestion to Vector DB Pipleline

In [18]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path


In [19]:

def process_all_pdfs(pdf_directory):
    """ Process all the pdf files in a directory"""

    all_document = [ ]
    pdf_dir = Path(pdf_directory)

    # Find all PDF files recursively
    pdf_files = list (pdf_dir.glob('**/*.pdf'))

    print(f"Found {len(pdf_files)} PDF files to process.")

    for pdf_file in pdf_files:
        print(f"\nProcessing {pdf_file.name}")

        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            #Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_document.extend(documents)
            print(f" Loaded {len(documents)} pages")
        except Exception as e:
            print(f" Error: {e}")

    print(f"\n Total documents loaded:{len(all_document)}")
    return all_document

# Process all PDF's in the data directory
all_pdf_documents = process_all_pdfs("../data")


Found 5 PDF files to process.

Processing Research_Paper_on_Artificial_Intelligence.pdf
 Loaded 3 pages

Processing AI_ITS_Application.pdf


Ignoring wrong pointing object 7 0 (offset 0)
Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)


 Loaded 5 pages

Processing USA_CHINA_AI.pdf
 Loaded 1 pages

Processing start_hamburg.pdf
 Loaded 4 pages

Processing AI_Applications_in_CyberSecurity.pdf
 Loaded 11 pages

 Total documents loaded:24


In [20]:
# Checking the all pdfs data.
all_pdf_documents

[Document(metadata={'producer': 'www.ilovepdf.com', 'creator': 'Microsoft¬Æ Word 2016', 'creationdate': '2023-02-20T14:13:44+00:00', 'title': 'CASE STUDIES JOURNAL VOL-2, ISSUE 6  ISSN (2305-509X)', 'author': 'any', 'moddate': '2023-02-20T14:13:44+00:00', 'source': '../data/pdf/Research_Paper_on_Artificial_Intelligence.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1', 'source_file': 'Research_Paper_on_Artificial_Intelligence.pdf', 'file_type': 'pdf'}, page_content='www.ijecs.in \nInternational Journal Of Engineering And Computer Science \nVolume 12 Issue 02, February 2023, Page No.25654-25656 \nISSN: 2319-7242 DOI: 10.18535/ijecs/v11i02.4671 \n \n \n \n7  Ijecs 02 February Rajiv Gupta Research Paper on Artificial Intelligence \n \nPage | \n25654 \nResearch Paper on Artificial Intelligence \n \nRajiv Gupta \nChandigarh University, Chandigarh,  Haryana \n \n \nAbstract: This branch of computer science is concerned with making computers behave like humans. \nArtificial intelligence in

In [21]:
# Text splitting get into chunks

def split_document(documents, chunk_size=100, chunk_overlap=20):
    """Split documents into smaller chunks for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]

    )

    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(split_docs)} documents into {len(split_docs)} chunks")

    # Show example of a chunk

    if split_docs:
        print(f"\n Example chunks:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata:\n{split_docs[0].metadata}")

    return split_docs


In [22]:
# checking the chunking there

chunks = split_document(all_pdf_documents)

Split 747 documents into 747 chunks

 Example chunks:
Content: www.ijecs.in 
International Journal Of Engineering And Computer Science...
Metadata:
{'producer': 'www.ilovepdf.com', 'creator': 'Microsoft¬Æ Word 2016', 'creationdate': '2023-02-20T14:13:44+00:00', 'title': 'CASE STUDIES JOURNAL VOL-2, ISSUE 6  ISSN (2305-509X)', 'author': 'any', 'moddate': '2023-02-20T14:13:44+00:00', 'source': '../data/pdf/Research_Paper_on_Artificial_Intelligence.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1', 'source_file': 'Research_Paper_on_Artificial_Intelligence.pdf', 'file_type': 'pdf'}


### Embeddin and VectoreStoreDB

In [23]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import  uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
# Class Embedding Manager

class EmbeddingManager:
    """ Handles document embedding generation using sentenceTransformer """
    def __init__(self, model_name: str = "all-miniLm-L6-V2"):
        """
        Initialize the embedding manager
        Args:
            model_name: Huggingface model name for sentence embeddings
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the SentenceTransformer model"""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")

        except Exception as e:
            print(f" Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str])-> np.ndarray:
        """
        Generate embedding for a list of texts
        Args:
            texts: List of text strings to embed

        Returns:
            numpy array of embedding with a shape (len(texts), embedding_dimension)
        """

        print(f"Generating embedding for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embedding with a shape: {embeddings.shape}")
        return embeddings

    def get_embedding_dimension(self) -> int:
        """Get the embedding dimension of the model"""
        if not self.model:
            raise ValueError("Model not loaded")
        return self.model.get_sentence_embedding_dimension()


## Initialize the embedding manager
embedding_manager = EmbeddingManager()
embedding_manager

Loading embedding model: all-miniLm-L6-V2
Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x1201a0c20>

In [29]:
## VectorStore
class VectorStore:
    """Manager document embedding in a ChromaDB vector store"""
    def __init__(self,collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """
        Initialize the vector store

        Args:
             collection_name: name of the ChromaDB collection
             persist_directory: Directory to persist the vectore store
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection"""
        try:
            # Create persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok = True)
            self.client = chromadb.PersistentClient(path = self.persist_directory)

            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name = self.collection_name,
                metadata= {"description" : "PDF document embeddings for RAG"}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collections:{self.collection.count()}")

        except Exception as e:
            print(f" Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):

        """
        Add documents and their embeddings to the vector store

        Args:
           documents: List of LanChain documents
           embeddings: Corresponding embeddings for the documents

        """

        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        print(f"Adding {len(documents)} documents to vector store...")

        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_texts = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            # Prepare metadata

            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            # Document content
            documents_texts.append(doc.page_content)

            # Embedding
            embeddings_list.append(embedding.tolist())

            # Add to collection
            try:
                self.collection.add(
                    ids = ids,
                    embeddings = embeddings_list,
                    metadatas = metadatas,
                    documents = documents_texts
                )
                print(f"Successfully added {len(documents)} documents to vector store.")
                print(f"Total documents in collection: {self.collection.count()}")

            except Exception as e:
                print(f" Error adding documents to vector store: {e}")

vectorestore = VectorStore()
vectorestore

Vector store initialized. Collection: pdf_documents
Existing documents in collections:0


<__main__.VectorStore at 0x1201a1160>

In [26]:
chunks

[Document(metadata={'producer': 'www.ilovepdf.com', 'creator': 'Microsoft¬Æ Word 2016', 'creationdate': '2023-02-20T14:13:44+00:00', 'title': 'CASE STUDIES JOURNAL VOL-2, ISSUE 6  ISSN (2305-509X)', 'author': 'any', 'moddate': '2023-02-20T14:13:44+00:00', 'source': '../data/pdf/Research_Paper_on_Artificial_Intelligence.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1', 'source_file': 'Research_Paper_on_Artificial_Intelligence.pdf', 'file_type': 'pdf'}, page_content='www.ijecs.in \nInternational Journal Of Engineering And Computer Science'),
 Document(metadata={'producer': 'www.ilovepdf.com', 'creator': 'Microsoft¬Æ Word 2016', 'creationdate': '2023-02-20T14:13:44+00:00', 'title': 'CASE STUDIES JOURNAL VOL-2, ISSUE 6  ISSN (2305-509X)', 'author': 'any', 'moddate': '2023-02-20T14:13:44+00:00', 'source': '../data/pdf/Research_Paper_on_Artificial_Intelligence.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1', 'source_file': 'Research_Paper_on_Artificial_Intelligence.pdf', 'file_type'

In [30]:
## Convert the text to embeddings
texts = [doc.page_content for doc in chunks]

## Generate the Embedding

embeddings = embedding_manager.generate_embeddings(texts)

## store in the vectore database

vectorestore.add_documents(chunks, embeddings)


Generating embedding for 747 texts...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 24/24 [00:04<00:00,  5.60it/s]


Generated embedding with a shape: (747, 384)
Adding 747 documents to vector store...
Successfully added 747 documents to vector store.
Total documents in collection: 1
Successfully added 747 documents to vector store.
Total documents in collection: 2
Successfully added 747 documents to vector store.
Total documents in collection: 3
Successfully added 747 documents to vector store.
Total documents in collection: 4
Successfully added 747 documents to vector store.
Total documents in collection: 5
Successfully added 747 documents to vector store.
Total documents in collection: 6
Successfully added 747 documents to vector store.
Total documents in collection: 7
Successfully added 747 documents to vector store.
Total documents in collection: 8
Successfully added 747 documents to vector store.
Total documents in collection: 9
Successfully added 747 documents to vector store.
Total documents in collection: 10
Successfully added 747 documents to vector store.
Total documents in collection: 11
