In [1]:
from langchain_core.documents import Document

In [2]:
doc=Document(
    page_content="this is a content of the document for testing purposes of RandomDocumentGenerator",
    metadata={"source": "example_source.txt", "author": "test_author", "page": 1,"date_created": "2024-06-01"}
)


In [3]:
doc

Document(metadata={'source': 'example_source.txt', 'author': 'test_author', 'page': 1, 'date_created': '2024-06-01'}, page_content='this is a content of the document for testing purposes of RandomDocumentGenerator')

In [4]:
import os
os.makedirs("../data/example_source", exist_ok=True)

In [5]:
from langchain_community.document_loaders import TextLoader

In [6]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List,Dict,Any,Tuple
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
class EmbeddingManager:
    
    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        self.model_name = model_name
        self.model = None
        self._load_model()
        
    def _load_model(self):
        try:
            self.model = SentenceTransformer(self.model_name)
            print(f"Model {self.model_name} loaded successfully.")
            print(f"Embedding dimension: {self.get_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise e
    def generate_embedding(self, texts:List[str]) -> np.ndarray:
        if not self.model:
            raise ValueError("Model not loaded.")
        embedding = self.model.encode(texts, show_progress_bar=True)
        print(f"embedding shapes: {np.array(embedding).shape}")
        return embedding
    
    def get_embedding_dimension(self) -> int:
        if not self.model:
            raise ValueError("Model not loaded.")
        return self.model.get_sentence_embedding_dimension()
    
embedding_manager = EmbeddingManager()
embedding_manager

'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 2f2f69dd-c9aa-4c74-8977-4f3fcb56d776)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].


Model all-MiniLM-L6-v2 loaded successfully.
Embedding dimension: 384


<__main__.EmbeddingManager at 0x1f1c30fa6d0>

In [9]:
class VectorStore:
    """Manages document embeddings in a ChromaDB vector store"""
    
    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """
        Initialize the vector store
        
        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection"""
        try:
            # Create persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            
            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store
        
        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        
        print(f"Adding {len(documents)} documents to vector store...")
        
        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            
            # Document content
            documents_text.append(doc.page_content)
            
            # Embedding
            embeddings_list.append(embedding.tolist())
        
        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

vectorstore=VectorStore()
vectorstore
    

Vector store initialized. Collection: pdf_documents
Existing documents in collection: 0


<__main__.VectorStore at 0x1f1c4bb1890>

In [13]:
### Directory Loader
from langchain_community.document_loaders import DirectoryLoader

## load all the text files from the directory
dir_loader=DirectoryLoader(
    "../data/example_source", ## Directory path
    glob="**/*.txt", ## Pattern to match files  
    loader_cls= TextLoader, ##loader class to use
    loader_kwargs={'encoding': 'utf-8'},
    show_progress=False

)

documents=dir_loader.load()
documents

[Document(metadata={'source': '..\\data\\example_source\\sample_document.txt'}, page_content='The Evolution of Artificial Intelligence and Machine Learning\n\nIntroduction\n\nArtificial Intelligence (AI) has transformed from a theoretical concept into a fundamental technology that shapes our daily lives. From virtual assistants to autonomous vehicles, AI systems are becoming increasingly sophisticated and ubiquitous. This document explores the history, current state, and future prospects of artificial intelligence and machine learning.\n\nHistorical Background\n\nThe concept of artificial intelligence dates back to ancient times, with myths and stories of artificial beings endowed with intelligence. However, the formal foundation of AI as an academic discipline began in the 1950s. In 1956, the Dartmouth Conference marked the birth of AI as a field of study, where pioneers like John McCarthy, Marvin Minsky, and Claude Shannon gathered to discuss the possibilities of creating intelligent

In [20]:
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader

## load all the text files from the directory
dir_loader=DirectoryLoader(
    "../data/pdf",
    glob="**/*.pdf", ## Pattern to match files  
    loader_cls= PyMuPDFLoader, ##loader class to use
    show_progress=False

)

pdf_documents=dir_loader.load()
pdf_documents

[Document(metadata={'producer': 'WeasyPrint 66.0', 'creator': '', 'creationdate': '', 'source': '..\\data\\pdf\\NET_Core_Interview_Questions_Complete_V2.pdf', 'file_path': '..\\data\\pdf\\NET_Core_Interview_Questions_Complete_V2.pdf', 'total_pages': 38, 'format': 'PDF 1.7', 'title': '.NET Core Interview Questions', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0}, page_content='Complete .NET Core Interview Questions\n& Answers Guide\nFor 2+ Years Experienced Developers\nTable of Contents\nC# & OOP Fundamentals\n.NET Core & Framework\nASP.NET MVC / Web API\nEntity Framework\nAsync & Multithreading\nDesign Patterns\nMicroservices\nPerformance & Security\nTesting\nSQL & Database\nAdvanced Topics\nC# & OOP Fundamentals\n1. What is Language and Framework?\nAnswer:\n- Language (C#): A programming language with syntax and rules for writing code\n- Framework (.NET): A platform providing libraries, runtime, and tools for bu

In [33]:
from langchain_text_splitters import RecursiveCharacterTextSplitter


def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """Split documents into smaller chunks for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    
    # Show example of a chunk
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    
    return split_docs

In [27]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from pathlib import Path

In [31]:
### Read all the pdf's inside the directory
def process_all_pdfs(pdf_directory):
    """Process all PDF files in a directory"""
    all_documents = []
    pdf_dir = Path(pdf_directory)
    
    # Find all PDF files recursively
    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    
    print(f"Found {len(pdf_files)} PDF files to process")
    
    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()
            
            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'
            
            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")
            
        except Exception as e:
            print(f"  ✗ Error: {e}")
    
    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Process all PDFs in the data directory
all_pdf_documents = process_all_pdfs("../data/pdf")

Found 1 PDF files to process

Processing: NET_Core_Interview_Questions_Complete_V2.pdf
  ✓ Loaded 38 pages

Total documents loaded: 38


In [36]:
chunks = split_documents(all_pdf_documents)

Split 38 documents into 39 chunks

Example chunk:
Content: Complete .NET Core Interview Questions
& Answers Guide
For 2+ Years Experienced Developers
Table of Contents
C# & OOP Fundamentals
.NET Core & Framework
ASP.NET MVC / Web API
Entity Framework
Async & ...
Metadata: {'producer': 'WeasyPrint 66.0', 'creator': 'PyPDF', 'creationdate': '', 'title': '.NET Core Interview Questions', 'source': '..\\data\\pdf\\NET_Core_Interview_Questions_Complete_V2.pdf', 'total_pages': 38, 'page': 0, 'page_label': '1', 'source_file': 'NET_Core_Interview_Questions_Complete_V2.pdf', 'file_type': 'pdf'}


In [37]:
chunks

[Document(metadata={'producer': 'WeasyPrint 66.0', 'creator': 'PyPDF', 'creationdate': '', 'title': '.NET Core Interview Questions', 'source': '..\\data\\pdf\\NET_Core_Interview_Questions_Complete_V2.pdf', 'total_pages': 38, 'page': 0, 'page_label': '1', 'source_file': 'NET_Core_Interview_Questions_Complete_V2.pdf', 'file_type': 'pdf'}, page_content='Complete .NET Core Interview Questions\n& Answers Guide\nFor 2+ Years Experienced Developers\nTable of Contents\nC# & OOP Fundamentals\n.NET Core & Framework\nASP.NET MVC / Web API\nEntity Framework\nAsync & Multithreading\nDesign Patterns\nMicroservices\nPerformance & Security\nTesting\nSQL & Database\nAdvanced Topics\nC# & OOP Fundamentals\n1. What is Language and Framework?\nAnswer:\n- Language (C#): A programming language with syntax and rules for writing code\n- Framework (.NET): A platform providing libraries, runtime, and tools for building\n1. \n2. \n3. \n4. \n5. \n6. \n7. \n8. \n9. \n10. \n11.'),
 Document(metadata={'producer': 'W

In [38]:
texts = [doc.page_content for doc in chunks]
texts

['Complete .NET Core Interview Questions\n& Answers Guide\nFor 2+ Years Experienced Developers\nTable of Contents\nC# & OOP Fundamentals\n.NET Core & Framework\nASP.NET MVC / Web API\nEntity Framework\nAsync & Multithreading\nDesign Patterns\nMicroservices\nPerformance & Security\nTesting\nSQL & Database\nAdvanced Topics\nC# & OOP Fundamentals\n1. What is Language and Framework?\nAnswer:\n- Language (C#): A programming language with syntax and rules for writing code\n- Framework (.NET): A platform providing libraries, runtime, and tools for building\n1. \n2. \n3. \n4. \n5. \n6. \n7. \n8. \n9. \n10. \n11.',
 'applications\n- C# is the language; .NET is the framework that executes C# code\n2. What is the use of public static void Main() / how does a .NET\napplication run?\nAnswer:\n- Entry point of any .NET application\n- public: Accessible from CLR\n- static: No instance needed to invoke\n- void: Returns nothing\n- CLR looks for Main() method to start execution\n3. What is Garbage Colle

In [39]:
emb = embedding_manager.generate_embedding(texts)

Batches: 100%|██████████| 2/2 [00:03<00:00,  1.95s/it]

embedding shapes: (39, 384)





In [40]:
emb

array([[-0.03768422, -0.00744384, -0.0230459 , ...,  0.03413118,
         0.03071013,  0.03184688],
       [-0.04077204, -0.0125097 , -0.07544576, ...,  0.06462511,
        -0.02743407,  0.00470379],
       [-0.08670689,  0.01621116,  0.02904435, ...,  0.03657807,
         0.11026521,  0.0598942 ],
       ...,
       [-0.04869787,  0.08772961, -0.03972964, ...,  0.08758782,
         0.00198828, -0.00253422],
       [-0.0989548 ,  0.00971667,  0.00707669, ...,  0.07179293,
        -0.06456099,  0.02506586],
       [-0.03922788,  0.0351142 ,  0.0421135 , ...,  0.05239733,
        -0.03956318,  0.03683998]], shape=(39, 384), dtype=float32)

In [41]:
vectorstore.add_documents(chunks, emb)

Adding 39 documents to vector store...
Successfully added 39 documents to vector store
Total documents in collection: 39


In [50]:
class RAGRetriever:
    """Handles query-based retrieval from the vector store"""
    
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """
        Initialize the retriever
        
        Args:
            vector_store: Vector store containing document embeddings
            embedding_manager: Manager for generating query embeddings
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query
        
        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold
            
        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")
        
        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embedding([query])[0]
        
        # Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            
            # Process results
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance
                    
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            
            return retrieved_docs
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

rag_retriever=RAGRetriever(vectorstore,embedding_manager)



In [51]:
rag_retriever.retrieve("What is Garbage Collector and how does GC work")

Retrieving documents for query: 'What is Garbage Collector and how does GC work'
Top K: 5, Score threshold: 0.0


Batches: 100%|██████████| 1/1 [00:00<00:00, 56.65it/s]

embedding shapes: (1, 384)
Retrieved 1 documents (after filtering)





[{'id': 'doc_d42476ef_1',
  'content': 'applications\n- C# is the language; .NET is the framework that executes C# code\n2. What is the use of public static void Main() / how does a .NET\napplication run?\nAnswer:\n- Entry point of any .NET application\n- public: Accessible from CLR\n- static: No instance needed to invoke\n- void: Returns nothing\n- CLR looks for Main() method to start execution\n3. What is Garbage Collector and how does GC work?\nAnswer:\n- Automatic memory management system\n- Generations: Gen 0 (short-lived), Gen 1 (medium), Gen 2 (long-lived)\n- Process: Mark → Sweep → Compact\n- Runs automatically when Gen 0 is full or memory pressure exists\n- Can be triggered manually using GC.Collect()\n4. What are Access Modifiers?\nAnswer:\n- public: Accessible everywhere\n- private: Only within the same class\n- protected: Within class and derived classes\n- internal: Within same assembly\n- protected internal: Same assembly OR derived classes\n- private protected: Same asse