In [2]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path

In [3]:


### Read all the pdf's inside the directory
def process_all_pdfs(pdf_directory):
    """Process all PDF files in a directory"""
    all_documents = []
    pdf_dir = Path(pdf_directory)
    
    # Find all PDF files recursively
    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    
    print(f"Found {len(pdf_files)} PDF files to process")
    
    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()
            
            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'
            
            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")
            
        except Exception as e:
            print(f"  ✗ Error: {e}")
    
    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Process all PDFs in the data directory
all_pdf_documents = process_all_pdfs("../data/pdf")



Found 1 PDF files to process

Processing: NPCK-2021-CATALOGUE-17.pdf
  ✓ Loaded 94 pages

Total documents loaded: 94


In [4]:


### Text splitting get into chunks

def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """Split documents into smaller chunks for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    
    # Show example of a chunk
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    
    return split_docs



In [5]:
chunks=split_documents(all_pdf_documents)
chunks

Split 94 documents into 143 chunks

Example chunk:
Content: 2
POTATO VARIETY CATALOGUE 2021...
Metadata: {'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign CC 13.0 (Windows)', 'creationdate': '2021-10-19T23:58:06+03:00', 'moddate': '2021-10-20T00:02:46+03:00', 'trapped': '/False', 'source': '..\\data\\pdf\\NPCK-2021-CATALOGUE-17.pdf', 'total_pages': 94, 'page': 1, 'page_label': '2', 'source_file': 'NPCK-2021-CATALOGUE-17.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign CC 13.0 (Windows)', 'creationdate': '2021-10-19T23:58:06+03:00', 'moddate': '2021-10-20T00:02:46+03:00', 'trapped': '/False', 'source': '..\\data\\pdf\\NPCK-2021-CATALOGUE-17.pdf', 'total_pages': 94, 'page': 1, 'page_label': '2', 'source_file': 'NPCK-2021-CATALOGUE-17.pdf', 'file_type': 'pdf'}, page_content='2\nPOTATO VARIETY CATALOGUE 2021'),
 Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign CC 13.0 (Windows)', 'creationdate': '2021-10-19T23:58:06+03:00', 'moddate': '2021-10-20T00:02:46+03:00', 'trapped': '/False', 'source': '..\\data\\pdf\\NPCK-2021-CATALOGUE-17.pdf', 'total_pages': 94, 'page': 2, 'page_label': '3', 'source_file': 'NPCK-2021-CATALOGUE-17.pdf', 'file_type': 'pdf'}, page_content='3\nPOTATO VARIETY CATALOGUE 2021\nFOREWORD                    6\nKEY PARTNERS                                                                                                 

In [6]:
### embedding And Vector Store DB
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

class EmbeddingManager:
    """Handles document embedding generation using SenteceTransformers"""

    def __init__(self,model_name: str = "all-MiniLM-L6-v2"):
        """Initialize the EmbeddingManager with a specified model. 
        Args:
            model_name Huggingface Model name for sentece Embeddings
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the SentenceTransformer model."""
        try:
            print(f"Loading model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Emmbedding Dimension:, {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise
    
    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Generate embeddings for a list of texts.
        Args:
            texts List of texts to generate embeddings for
        Returns:
            np.ndarray Array of embeddings with shape (len(texts), embedding_dimension)
        """
        if not self.model:
            raise ValueError("Model not loaded. Call _load_model() first.")
        
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated {len(embeddings)} embeddings of shape {embeddings.shape}")
        return embeddings
    

    def get_sentence_embedding_dimension(self) -> int:
        """Get the dimension of the sentence embeddings."""
        if not self.model:
            raise ValueError("Model not loaded. Call _load_model() first.")
        return self.model.get_sentence_embedding_dimension()
    

embedding_manager = EmbeddingManager()
embedding_manager 


  from .autonotebook import tqdm as notebook_tqdm


Loading model: all-MiniLM-L6-v2
Model loaded successfully. Emmbedding Dimension:, 384


<__main__.EmbeddingManager at 0x21481e74050>


### VECTOR STORE


In [7]:
class VectorStore:
    """Manages document embedding in a ChromaDB vector store."""

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """Initialize the VectorStore .
        Args:
            Collection name: Name of the vector database:
            persist_directory: Directory to persist the vector database
        """
        self.collection_name =collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize the ChromaDB client and collection."""
        try:
            print(f"Initializing ChromaDB client with persist directory: {self.persist_directory}")
            os.makedirs(self.persist_directory, exist_ok=True)

            self.client = chromadb.PersistentClient(path = self.persist_directory)

            ## Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata= {"description": "Collection of PDF document embeddings for RAG"}
                )
            print(f"Vector store collection '{self.collection_name}' initialized successfully.")
            print(f"Existing collections: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error initializing ChromaDB: {e}")
            raise
    
    def add_documents(self, documents: List[Any],embeddings:np.ndarray) -> List[str]:
        """Add documents to the ChromaDB collection after generating embeddings.

        Args:
            documents List of Document objects to add
            embeddings: Corresponding embeddings for the documents
            
        """
        if len(documents) != len (embeddings):
            raise ValueError("Number of documents and embeddings must match.")
        
        ids = []
        metadatas = []
        documents_text = []
        embeddings_lists = []

        for i, (doc,embedding) in enumerate(zip(documents,embeddings)):
            #Generate a unique ID for each document
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            ## Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length']  = len(doc.page_content)
            metadatas.append(metadata)

            # Document content
            documents_text.append(doc.page_content)
            
            # Embedding
            embeddings_lists.append(embedding.tolist())
            
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_lists,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store.")
            print(f"Total documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error adding document {doc_id} to collection: {e}")
            raise
            
            
vector_store = VectorStore()

Initializing ChromaDB client with persist directory: ../data/vector_store
Vector store collection 'pdf_documents' initialized successfully.
Existing collections: 0


In [8]:
### Convert the text to embeddings and add to vector store
texts = [doc.page_content for doc in chunks]

## Generate the Embeddings

embeddings = embedding_manager.generate_embeddings(texts)

## Add to vector store
vector_store.add_documents(chunks,embeddings)

Batches: 100%|██████████| 5/5 [00:01<00:00,  2.73it/s]

Generated 143 embeddings of shape (143, 384)
Successfully added 143 documents to vector store.
Total documents in collection: 143



