In [1]:
!pip install -r ../requirements.txt -q

# Data Ingestion to Vector DB Pipeline

In [2]:
DATA_FILE_PATH = "../data"

In [3]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path

In [4]:
### Read all the pdf's inside the directory
def process_all_pdfs(pdf_directory):
    """ Process all PDFs """
    all_documents = []
    pdf_dir = Path(pdf_directory)

    # Find all PDFs recursively
    pdf_files = list(
        pdf_dir.glob("**/*.pdf") # Match any PDF, no matter now far in the subdirectories
    )

    print(f"Found {len(pdf_files)} PDFs.")

    # So each pdf_file is a path to that PDF file
    for pdf_file in pdf_files:
        print(f"Processing {pdf_file.name}")
        try:
            # Load that pdf file
            # PyPDFLoader creates chunks from each PDF
            # These chunks are Document objects
            # By default PyPDFLoader will split the PDF as a single text flow
            # This means that they are chunked by one continouos page
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()
            
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)
            print(f"Loaded {len(documents)} pages added")
            
        except Exception as e:
            print(f"Error {e}")

    print(f"\nTotal documents loaded {len(all_documents)}")
    return all_documents

all_pdf_documents = process_all_pdfs(DATA_FILE_PATH)

Found 2 PDFs.
Processing AttentionIsAllYouNeed.pdf
Loaded 15 pages added
Processing RadGraph.pdf
Loaded 12 pages added

Total documents loaded 27


In [5]:
all_pdf_documents

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': '../data/pdf_files/AttentionIsAllYouNeed.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1', 'source_file': 'AttentionIsAllYouNeed.pdf', 'file_type': 'pdf'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N

# Chunking

## Text Splitting

In [6]:
def split_documents(documents, chunk_size = 1000, chunk_overlap=200):
    """ Split the documents into chunks """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap,
        length_function = len,
        # Paragraph, New Line, Space, Nothing
        separators = ["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")

    if split_docs:
        print(f"\nExample_chunk")
        print(f"Content 1: {split_docs[0].page_content[300:]}...")
        print(f"Content 2: {split_docs[1].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}...")

    return split_docs

In [7]:
chunks = split_documents(all_pdf_documents)

Split 27 documents into 111 chunks

Example_chunk
Content 1: mar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@google.com
Aidan N. Gomez∗ †
University of Toronto
aidan@cs.toronto.edu
Łukasz Kaiser∗
Google Brain
lukaszkaiser@google.com
Illia Polosukhin∗ ‡
illia.polosukhin@gmail.com
Abstract
The dominant sequence transduction models are based on complex recurrent or
convolutional neural networks that include an encoder and a decoder. The best
performing models also connect the encoder and decoder through an attention
mechanism. We propose a new simple network architecture, the Transformer,
based solely on attention mechanisms, dispensing with recurrence and convolutions...
Content 2: mechanism. We propose a new simple network architecture, the Transformer,
based solely on attention mechanisms, dispensing with recurrence and convolutions
entirely. Experiments on two machine transla...
Metadata: {'producer': 'pdfTeX-1.4

In [8]:
chunks[0]

Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': '../data/pdf_files/AttentionIsAllYouNeed.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1', 'source_file': 'AttentionIsAllYouNeed.pdf', 'file_type': 'pdf'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N.

#### As we can see the document got split into chunks. The length of each chunk is 500 and the first 200 words of the second chunk = last 200 words of the first chunk. 

# Embedding Generation

In [9]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Tuple, Any, Dict
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
class EmbeddingManager:
    """ Handels document embedding generation using SentenceTransformer """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
            Init the Embedding Manager

            Args:
                Name of the pre-trained embedding model.
        """
        self.model_name = model_name
        self.model = None
        # Having this here automatically runs the function as soon as we init a EmbeddingsManager
        self._load_model()

    def _load_model(self):
        """ Loads the SentenceTransformer model """
        try: 
            print(f"Loading model {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model succesfully loaded with embedding dim: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error: {e}")
            raise
    
    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        if not self.model:
            raise ValueError("Model not loaded")
        
        print(f"Loading embeddings for texts")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Loaded embeddings for texts with dim: {embeddings.shape}")
        return embeddings

    def get_embeddings_dimension(self) -> int:
        if not self.model:
            raise ValueError("Model not loaded")

        print(f"Model embedding dimension is: {self.model.get_sentence_embedding_dimension()}")
        return self.model.get_sentence_embedding_dimension()

In [11]:
texts = [doc.page_content for doc in chunks]

In [12]:
embm1 = EmbeddingManager()
embeddings = embm1.generate_embeddings(texts)
embm1.get_embeddings_dimension()

Loading model all-MiniLM-L6-v2
Model succesfully loaded with embedding dim: 384
Loading embeddings for texts


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Loaded embeddings for texts with dim: (111, 384)
Model embedding dimension is: 384


384

# Vector Database

In [13]:
class VectorStore: 

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()
    
    def _initialize_store(self):
        try: 
            # If the directory exists, keep it, if not make one
            os.makedirs(self.persist_directory, exist_ok=True)
            # Client that has reference to the chromadb vectorstore
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            # Collection, what is in the collection?
            # Collection is where we are storing the vectors inside of the vector store.
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name, 
                metadata = {"description": "PDF document embeddings for RAG"}
            )
        except Exception as e:
            print(f"Error occured with initialization: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")

        ids = []
        metadatas = []
        document_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            document_text.append(doc.page_content)

            embeddings_list.append(embedding.tolist())

        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=document_text
            )
        except Exception as e:
            print(f"Error with adding to vector store: {e}")
            raise

In [14]:
vectorstore = VectorStore()
vectorstore

<__main__.VectorStore at 0x17bcbff50>

In [15]:
# Add everything to the vector database
vectorstore.add_documents(documents=chunks, embeddings=embeddings)

# Retreival

In [16]:
class RAGRetriever:
    """Handles query-based retrieval from the vector store"""
    
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """
        Initialize the retriever
        
        Args:
            vector_store: Vector store containing document embeddings
            embedding_manager: Manager for generating query embeddings
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query
        
        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold
            
        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")
        
        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        
        # Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            
            # Process results
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance
                    
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            
            return retrieved_docs
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

rag_retriever=RAGRetriever(vectorstore,embm1)

# Generation

In [17]:
output = rag_retriever.retrieve("What is the mathematical formula for self attention?")
output

Retrieving documents for query: 'What is the mathematical formula for self attention?'
Top K: 5, Score threshold: 0.0
Loading embeddings for texts


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Loaded embeddings for texts with dim: (1, 384)
Retrieved 5 documents (after filtering)


[{'id': 'doc_61138e88_12',
  'content': '3.2 Attention\nAn attention function can be described as mapping a query and a set of key-value pairs to an output,\nwhere the query, keys, values, and output are all vectors. The output is computed as a weighted sum\n3',
  'metadata': {'author': '',
   'content_length': 216,
   'creationdate': '2024-04-10T21:11:43+00:00',
   'creator': 'LaTeX with hyperref',
   'doc_index': 12,
   'file_type': 'pdf',
   'keywords': '',
   'moddate': '2024-04-10T21:11:43+00:00',
   'page': 2,
   'page_label': '3',
   'producer': 'pdfTeX-1.40.25',
   'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5',
   'source': '../data/pdf_files/AttentionIsAllYouNeed.pdf',
   'source_file': 'AttentionIsAllYouNeed.pdf',
   'subject': '',
   'title': '',
   'total_pages': 15,
   'trapped': '/False'},
  'similarity_score': 0.17520487308502197,
  'distance': 0.824795126914978,
  'rank': 1},
 {'id': 'doc_59f9d02d_12',
  'con

# LLM output

In [18]:
from transformers import pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_huggingface.llms import HuggingFacePipeline

checkpoint = "HuggingFaceTB/SmolLM-135M"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

gen_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    temperature=0.7,
    device=-1
)
hf_llm = HuggingFacePipeline(pipeline=gen_pipe)

Device set to use cpu


In [34]:
def rag_simple(query, retreiver, llm, device="cpu", top_k=3):
    results = retreiver.retrieve(query=query, top_k=top_k)
    context = '\n\n'.join([r['content'] for r in results]) if results else ""
    if len(context) == 0:
        print("Context not long enough for this query")
        return

    # Generate an answer using our prompt
    simple_prompt = f"""
        You are a math and deep learning expert. 
        Use the provided context to answer clearly and include equations when relevant.
        
        Context:
        {context}
        
        Question: {query}
        
        Answer:
        """
        
    llm_response = llm.invoke([simple_prompt.format(context=context, query=query)])
    if "Answer:" in llm_response:
        answer = llm_response.split("Answer:", 1)[1]
    else:
        answer = llm_response
    return " ".join(answer.split())  # Cleans up unwanted characters from the model output

In [35]:
query = "What is attention?"
answer = rag_simple(query, rag_retriever, hf_llm)
answer

Retrieving documents for query: 'What is attention?'
Top K: 3, Score threshold: 0.0
Loading embeddings for texts


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Loaded embeddings for texts with dim: (1, 384)
Retrieved 3 documents (after filtering)


'3.2 Attention Attention is a function, which is used to compute a weighted sum of two vectors, where the dot product is a function of the scores of both vectors. 3 An attention function is an important part of neural networks, and it is used to process large amounts of data. It is used to recognize important information or patterns in data. For example, an attention function might be used to recognize if two sentences are similar or different, or whether two words belong together or not. The output of an attention function is a weighted sum of the scores of both vectors, which is then used to produce a prediction. The input and output of an attention function are a set of two vectors that are concatenated. This can be seen as a set of weights, which are used to compute the output of the attention function. The output of an attention function is a vector, which is the weighted sum of the scores of both vectors. An example of an attention function is the following code: ```python import