### RAG pipeline - data ingestion and Vector DB

In [None]:
import os
from langchain_community.document_loaders import  TextLoader,DirectoryLoader, PyMuPDFLoader, PyPDFLoader

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path



### read all the pdf's in the directory


In [None]:

def read_all_pdfs(pdf_directory):

    all_documents = []
    pdf_dir = Path(pdf_directory)

    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    print(f"Found {len(pdf_files)} PDF files in {pdf_directory}"
        )
    for pdf_file in pdf_files:
        print(f"Processing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

    ### adding the source information for the metadata

            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'


                all_documents.extend(documents)
                print(f"Loaded{len(documents)} pages")
        except Exception as e:
            print(" Error:{e}")
    
    print(f"Total docuents loaded: {len(all_documents)}")
    return all_documents
  
      
### process all the pdf in data directory
all_pdf_files = read_all_pdfs("../data/pdf")

    

In [52]:
all_pdf_files

[Document(metadata={'producer': 'macOS Version 15.7.1 (Build 24G231) Quartz PDFContext, AppendMode 1.1', 'creator': 'Microsoft¬Æ Word 2016', 'creationdate': "D:20160929004132Z00'00'", 'author': 'IEEE', 'moddate': "D:20251217092537Z00'00'", 'title': 'Paper Title (use style: paper title)', 'source': '../data/pdf/Matrix Profile_I.pdf', 'total_pages': 10, 'page': 0, 'page_label': '1', 'source_file': 'Matrix Profile_I.pdf', 'file_type': 'pdf'}, page_content='Matrix Profile I: All Pairs Similarity Joins for Time Series: \nA Unifying View that Includes Motifs, Discords and Shapelets \nChin-Chia Michael Yeh, Yan Zhu, Liudmila Ulanova, Nurjahan Begum, Yifei Ding,  \nHoang Anh Dau, ‚Ä†Diego Furtado Silva, ‚Ä°Abdullah Mueen, and Eamonn Keogh \nUniversity of California, Riverside, ‚Ä†Universidade de S√£o Paulo, ‚Ä°University of New Mexico \n{myeh003, yzhu015, lulan001, nbegu001, yding007, hdau001}@ucr.edu, diegofsilva@icmc.usp.br, mueen@unm.edu, eamonn@cs.ucr.edu \n \nAbstract‚Äî The all-pairs-sim

In [None]:
### Text splitting get into chunks

def split_documents(documents, chunk_size= 2000, chunk_overlap=200):
    """splitting the documents into smaller chunk for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap= chunk_overlap,
        length_function = len,
        separators=[" ", "", ]
    ) 
    split_doc = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_doc)} chunks")

    # show example chunk 
    if split_doc:
        print("example chunk")
        print(f"Content: {split_doc[0].page_content[:200]}")
        print(f"metadata: {split_doc[0].metadata}")
    return split_doc

In [None]:
chunks = split_documents(all_pdf_files)
chunks

### Embedding and vectorstoreDB

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity



In [None]:
class embeddingManager:
    """Handling document gerneration using using sentence transformer"""

    def __init__(self, model_name:str = "all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the sentence Transformer"""
        try: 
            print(f"Loading the embedding model:  {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:

            print(f"Error loading model {self.model_name}: {e}")
            raise
    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """generate embeddings for a list of text"""

        if not self.model:
            raise ValueError("model not loaded")
        print(f"generating embeddings for the {len(texts)} texts.....")
        embeddings = self.model.encode(texts,show_progress_bar=True)
        print(f"Generated embeddings with shape : {embeddings.shape}")
        return embeddings

## initializing the embedding manager 

embedding_manager = embeddingManager()
embedding_manager

### Vectorstore

In [None]:
class VectorStore:
    """Manages document embeddings in a chromadb vector store"""

    def __init__(self, collection_name: str = "pdf_documents", persist_directory : str = "../data/vector_store"):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Intializes the chromadb client and collection"""
        try:
            # create persistent Chromadb
            os.makedirs(self.persist_directory, exist_ok = True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # get or create collection
            self.collection = self.client.get_or_create_collection(name=self.collection_name,
                                                                   metadata={"description": "PDF document embeddings for RAG"})
            
            print(f"Vector store initalized Collection: {self.collection_name}")
            print(f"Vector store initalized Collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing the vector store: {e}")
            raise

    def add_documents(self, documents : List[any], embeddings: np.ndarray):
        """Add documents and embeddings to the vector store"""

        if len(documents) != len(embeddings):
            raise ValueError("Number documents must match number embeddings")
        
        print(f"Adding {len(documents)} documents to vector store...")

        # prepare the data for chromadb
        ids= []
        metadatas = []
        documents_text = []
        embedding_list = []

        for i ,(doc, embedding) in enumerate(zip(documents,embeddings)):
            # generate unique id 
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            # prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            # Document content
            documents_text.append(doc.page_content)

            # embeddings 
            embedding_list.append(embedding.tolist())

            # Add to collection
            try:
                self.collection.add(
                    ids=ids,
                    embeddings= embedding_list,
                    metadatas=metadatas,
                    documents=documents_text
                )
                print(f"successfully added {len(documents)} documents to vector store")
                print(f"total documents in collection: {self.collection.count()}")

            except Exception as e:
                print(f"Error adding the documents to vector store {e}")
                raise

vectorstore = VectorStore()
vectorstore





In [None]:
chunks

In [None]:
# # convert the text to embeddings 
texts = [doc.page_content for doc in chunks]
texts

In [None]:
### passing texts to embeddings manager 
embeddings = embedding_manager.generate_embeddings(texts)

### store in the vectordb

vectorstore.add_documents(chunks,embeddings)

### RAG retriever pipeline from VectorStore

In [None]:
class RAGRetriever:
    """Handles query based retrieval from the vector store"""

    def __init__(self, vector_store: VectorStore, embedding_manager: embeddingManager):
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retriever(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """Retrieves relevant documents for the query"""
        print(f"retrieving documents for query : '{query}'")
        print(f"top k : {top_k}, score threshold: {score_threshold}")

        # generate the query into embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]

        # search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()], n_results=top_k
            )

            # process results
            retrieved_docs = []

            if results.get('documents') and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]

                for i, (doc_id, document, metadata, distance) in enumerate(
                    zip(ids, documents, metadatas, distances)
                ):
                    # convert distance to similarity score (for chroma db cosine distance is used)
                    similarity_score = 1 - distance

                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })

                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            return retrieved_docs

        except Exception as e:
            print(f"Error querying vector store: {e}")
            return []
rag_retriever = RAGRetriever(vectorstore, embedding_manager)
rag_retriever


In [None]:
rag_retriever.retriever("similarity search")

### Integration of vectordb context pipelinewith LLM output

In [50]:
### simple RAG pipeline with groq LLM
from multiprocessing import context
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
load_dotenv()

### initialize the groq LLM along with the API_KEY
groq_api_key = "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"
llm = ChatGroq(api_key=groq_api_key, model="groq/compound-mini", temperature=0.1, max_tokens= 512)

## simple rag  function retrieval and generation

def rag_simple(query, retriever, llm , top_k = 5):

    ## retrieve the context
    results = retriever.retriever(query, top_k=top_k)
    context = "\n\n".join([doc['content'] for doc in results]) if results else ""
    if not context:
        return "No relevant documents found."
    
   ## generate the answer using the GROQ LLM

    prompt = f"""Use the following context to answer the question.
       Context: {context}
       Question: {query}
       Answer:"""
    response = llm.invoke([prompt.format(context=context, query=query)])
    return response.content

In [44]:
answer = rag_simple("What is similarity search?", rag_retriever, llm)
print(answer)

retrieving documents for query : 'What is similarity search?'
top k : 5, score threshold: 0.0
generating embeddings for the 1 texts.....


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  1.99it/s]


Generated embeddings with shape : (1, 384)
Retrieved 5 documents (after filtering)
**Similarity search** is the computational task of finding, for a given query object, the data objects that are most alike to it according to some similarity (or distance) measure. In practice it usually means retrieving the nearest‚Äëneighbor(s) of each object in a collection‚Äîe.g., the items whose Euclidean (or z‚Äënormalized Euclidean) distance to the query is smallest. This operation underlies many applications such as clustering, duplicate detection, motif discovery, and, in the time‚Äëseries domain, all‚Äëpairs‚Äësimilarity joins.


### Enhanced RAG Pipeline Features

In [51]:

# --- Enhanced RAG Pipeline Features ---
def rag_advanced(query, retriever, llm, top_k=5, min_score=0.2, return_context=False):
    """
    RAG pipeline with extra features:
    - Returns answer, sources, confidence score, and optionally full context.
    """
    results = retriever.retriever(query, top_k=top_k, score_threshold=min_score)
    if not results:
        return {'answer': 'No relevant context found.', 'sources': [], 'confidence': 0.0, 'context': ''}
    
    # Prepare context and sources
    context = "\n\n".join([doc['content'] for doc in results])
    sources = [{
        'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
        'page': doc['metadata'].get('page', 'unknown'),
        'score': doc['similarity_score'],
        'preview': doc['content'][:300] + '...'
    } for doc in results]
    confidence = max([doc['similarity_score'] for doc in results])
    
    # Generate answer
    prompt = f"""Use the following context to answer the question concisely.\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"""
    response = llm.invoke([prompt.format(context=context, query=query)])
    
    output = {
        'answer': response.content,
        'sources': sources,
        'confidence': confidence
    }
    if return_context:
        output['context'] = context
    return output

# Example usage:
result = rag_advanced(" what is similarity search", rag_retriever, llm, top_k=5, min_score=0.1, return_context=True)
print("Answer:", result['answer'])
print("Sources:", result['sources'])
print("Confidence:", result['confidence'])
print("Context Preview:", result['context'][:300])

retrieving documents for query : ' what is similarity search'
top k : 5, score threshold: 0.1
generating embeddings for the 1 texts.....


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  4.26it/s]

Generated embeddings with shape : (1, 384)
Retrieved 0 documents (after filtering)
Answer: No relevant context found.
Sources: []
Confidence: 0.0
Context Preview: 



