### Data Ingestion

In [1]:
from langchain_core.documents import Document

In [None]:
# Minimal example of a LangChain Document: text content plus a metadata dict.
doc = Document(
    metadata={"source": "my_source"},
    page_content="Hello world",
)

In [2]:
### Create the directory that will hold the sample text files
import os

# The following cells write to and read from "../data/text_files/..." (relative to
# the notebook's working directory), so that exact directory must exist first.
# exist_ok=True keeps the cell idempotent when it is re-run.
os.makedirs("../data/text_files", exist_ok=True)

In [5]:
# Sample files for the ingestion demo: destination path -> text written to that file.
# Built from (path, body) pairs; insertion order is the order the files get written.
sample_txt = dict(
    [
        (
            "../data/text_files/langchain_intro.txt",
            "LangChain is a framework for developing applications powered by language models.\n"
            "    It can be used for chatbots, Generative Question-Answering (GQA), summarization, and much more.",
        ),
        (
            "../data/text_files/errors.txt",
            "ArithmeticError is the base class for those built-in exceptions that are raised for various arithmetic errors, such as OverflowError, ZeroDivisionError, and FloatingPointError.",
        ),
    ]
)

In [6]:
# Write each sample text to its destination path, echoing the path as it goes.
for filepath in sample_txt:
    content = sample_txt[filepath]
    print(filepath)
    with open(filepath, mode="w", encoding="utf-8") as f:
        f.write(content)

../data/text_files/langchain_intro.txt
../data/text_files/errors.txt


In [8]:
### Text loader: read one of the sample files into LangChain Document objects
from langchain.document_loaders import TextLoader

# NOTE(review): the same loader is presumably also importable via
# `from langchain_community.document_loaders import TextLoader` — confirm which
# import path the installed LangChain version expects.

loader = TextLoader(
    file_path="../data/text_files/langchain_intro.txt",
    encoding="utf-8",
)
documents = loader.load()

In [9]:
# Print the list returned by loader.load() to inspect the loaded content and metadata.
print(documents)

[Document(metadata={'source': '../data/text_files/langchain_intro.txt'}, page_content='LangChain is a framework for developing applications powered by language models.\n    It can be used for chatbots, Generative Question-Answering (GQA), summarization, and much more.')]


### PDF Documents

In [17]:
### Directory loader: load the PDFs matching **/*.pdf under ../data/pdf
from langchain.document_loaders import DirectoryLoader, PyMuPDFLoader

# loader_kwargs is intentionally left unset here.
dir_loader = DirectoryLoader(
    path="../data/pdf",
    glob="**/*.pdf",           # pattern used to match files
    loader_cls=PyMuPDFLoader,  # loader class applied to each matched file
    show_progress=False,       # do not show a progress bar
)

pdf_documents = dir_loader.load()
print(pdf_documents)

[Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign CC 13.0 (Windows)', 'creationdate': '2021-10-19T23:58:06+03:00', 'source': '..\\data\\pdf\\NPCK-2021-CATALOGUE-17.pdf', 'file_path': '..\\data\\pdf\\NPCK-2021-CATALOGUE-17.pdf', 'total_pages': 94, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2021-10-20T00:02:46+03:00', 'trapped': '', 'modDate': "D:20211020000246+03'00'", 'creationDate': "D:20211019235806+03'00'", 'page': 0}, page_content=''), Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign CC 13.0 (Windows)', 'creationdate': '2021-10-19T23:58:06+03:00', 'source': '..\\data\\pdf\\NPCK-2021-CATALOGUE-17.pdf', 'file_path': '..\\data\\pdf\\NPCK-2021-CATALOGUE-17.pdf', 'total_pages': 94, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2021-10-20T00:02:46+03:00', 'trapped': '', 'modDate': "D:20211020000246+03'00'", 'creationDate': "D:2

### Embedding and Vector Store DB

In [13]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity


  from .autonotebook import tqdm as notebook_tqdm


In [14]:
class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformers.

    The model is loaded eagerly in ``__init__``; afterwards
    ``generate_embeddings`` encodes lists of strings with it.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Initialize the EmbeddingManager with a specified model.

        Args:
            model_name: Hugging Face model name used for sentence embeddings.

        Raises:
            Exception: Whatever ``_load_model`` re-raises if loading fails.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the SentenceTransformer model named by ``self.model_name``.

        Raises:
            Exception: Any error raised while constructing the model is
                printed and then re-raised unchanged.
        """
        try:
            print(f"Loading model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Emmbedding Dimension:, {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def _require_model(self):
        """Return the loaded model, raising ``ValueError`` if none is set."""
        # Compare against None explicitly: a model object that defines
        # __len__/__bool__ may evaluate as falsy even though it is loaded.
        if self.model is None:
            raise ValueError("Model not loaded. Call _load_model() first.")
        return self.model

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Generate embeddings for a list of texts.

        Args:
            texts: List of texts to generate embeddings for.

        Returns:
            np.ndarray: Array of embeddings with shape
            (len(texts), embedding_dimension).

        Raises:
            ValueError: If no model is loaded.
        """
        model = self._require_model()

        embeddings = model.encode(texts, show_progress_bar=True)
        print(f"Generated {len(embeddings)} embeddings of shape {embeddings.shape}")
        return embeddings

    def get_sentence_embedding_dimension(self) -> int:
        """Get the dimension of the sentence embeddings.

        Raises:
            ValueError: If no model is loaded.
        """
        return self._require_model().get_sentence_embedding_dimension()
    

# Instantiate with the default model name; the constructor loads the model immediately.
embedding_manager = EmbeddingManager()
# Bare expression so the notebook shows the object's repr as the cell output.
embedding_manager 


Loading model: all-MiniLM-L6-v2
Model loaded successfully. Emmbedding Dimension:, 384


<__main__.EmbeddingManager at 0x1a0b74afcb0>

### VECTOR STORE

In [None]:
class VectorStore:
    """Manages document embeddings in a persistent ChromaDB collection."""

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """Initialize the VectorStore.

        Args:
            collection_name: Name of the ChromaDB collection to get or create.
            persist_directory: Directory in which the vector database is persisted.

        Raises:
            Exception: Whatever ``_initialize_store`` re-raises if setup fails.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Create the persist directory, the ChromaDB client and the collection.

        Raises:
            Exception: Any error during setup is printed and re-raised unchanged.
        """
        try:
            print(f"Initializing ChromaDB client with persist directory: {self.persist_directory}")
            os.makedirs(self.persist_directory, exist_ok=True)

            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "Collection of PDF document embeddings for RAG"},
            )
            print(f"Vector store collection '{self.collection_name}' initialized successfully.")
            # Report how many records the (possibly pre-existing) collection holds.
            print(f"Existing documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing ChromaDB: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray) -> List[str]:
        """Add documents and their precomputed embeddings to the collection.

        All documents are written with a single ``collection.add`` call after
        the ids, metadata, texts and embedding lists have been assembled.

        Args:
            documents: Objects exposing ``page_content`` (str) and ``metadata``
                (mapping), e.g. LangChain Document objects.
            embeddings: One embedding per document, in the same order.

        Returns:
            List[str]: The generated ids, in the same order as ``documents``.
            Empty when ``documents`` is empty.

        Raises:
            ValueError: If the number of documents and embeddings differ.
            Exception: Any error raised by the collection while adding is
                printed and re-raised unchanged.
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents and embeddings must match.")

        # Nothing to store: skip the database call entirely instead of sending
        # an empty batch.
        if len(documents) == 0:
            print("No documents to add to vector store.")
            return []

        ids = []
        metadatas = []
        documents_text = []
        embeddings_lists = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Random prefix plus the position keeps ids unique across calls.
            ids.append(f"doc_{uuid.uuid4().hex[:8]}_{i}")

            # Copy the metadata so the caller's document is not mutated.
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            documents_text.append(doc.page_content)

            # Plain Python lists; np.asarray also accepts rows that are
            # already lists.
            embeddings_lists.append(np.asarray(embedding).tolist())

        # One batched write AFTER the loop: adding inside the loop would
        # re-submit the growing batch (and its earlier ids) on every iteration.
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_lists,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store.")
            print(f"Total documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

        return ids


In [16]:
# NOTE(review): `chunks` is not assigned in any cell shown above, so this cell raises
# NameError when run. Define `chunks` (presumably the split documents — confirm) in an
# earlier cell, or remove this cell.
chunks 

NameError: name 'chunks' is not defined