<a href="https://colab.research.google.com/github/khajum/ai-playground/blob/main/rag/rag-application-101/EmbeddingManager.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RAG Pipelines - Data Ingestion to Vector DB Pipeline

In [None]:
# Install LangChain and the PDF-loading packages (pypdf, pymupdf) if not already installed
!pip install langchain
!pip install langchain-core
!pip install langchain-community
!pip install pypdf
!pip install pymupdf

In [8]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
#from langchain.text_splitter import RecursiveCharacterTextSplitter


In [9]:
from pathlib import Path

# Load every PDF found (recursively) under a given directory
def process_all_pdfs(pdf_directory):
  """
  Load every PDF under a directory (searched recursively) with PyPDFLoader.

  Each loaded document's metadata gains two keys: 'source_file' (the PDF's file
  name) and 'file_type' ("pdf"). A file that fails to load is reported and
  skipped, so one bad PDF does not abort the whole run.

  Args:
    pdf_directory: Directory (str or Path) to search for '*.pdf' files.

  Returns:
    list: Every document returned by the loader, grouped by file in sorted
      path order. Empty when no PDF is found or none could be loaded.
  """
  all_documents = []
  pdf_dir = Path(pdf_directory)

  # glob() yields files in filesystem-dependent order; sorting makes the
  # document order (and everything derived from it) reproducible across runs.
  pdf_files = sorted(pdf_dir.glob('**/*.pdf'))

  print(f"Found {len(pdf_files)} pdf files in {pdf_directory}")

  for pdf_file in pdf_files:
    print(f"\nProcessing: {pdf_file.name}")
    try:
      loader = PyPDFLoader(str(pdf_file))
      documents = loader.load()

      # Tag every document with its origin so it can be traced back later
      for doc in documents:
        doc.metadata['source_file'] = str(pdf_file.name)
        doc.metadata['file_type'] = "pdf"

      all_documents.extend(documents)
      print(f" Processed {pdf_file.name} with {len(documents)} pages")
    except Exception as e:
      # Deliberate best-effort: report the failure and continue with the next file
      print(f" Error processing {pdf_file.name}: {e}")

  print(f" Total documents processed: {len(all_documents)}")
  return all_documents

# Ingest every PDF found under ./data (process_all_pdfs searches it recursively)
# and keep the loaded documents in all_pdf_document for the later cells
all_pdf_document = process_all_pdfs("./data")
print(f"Successfully processed and collected {len(all_pdf_document)} documents.")


Found 1 pdf files in ./data

Processing: Attention-in-ML.pdf
 Processed Attention-in-ML.pdf with 11 pages
 Total documents processed: 11
Successfully processed and collected 11 documents.


In [None]:
!pip install langchain

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_document(documents, chunk_size=1000, chunk_overlap=200):
  """
  Split documents into smaller, overlapping chunks for better RAG retrieval.

  Args:
    documents: Documents to split, as accepted by
      RecursiveCharacterTextSplitter.split_documents.
    chunk_size (int): Maximum chunk length, measured with len().
    chunk_overlap (int): Length shared between consecutive chunks, measured with len().

  Returns:
    The chunks produced by the splitter's split_documents().
  """
  # chunk_size / chunk_overlap must be passed by keyword: the splitter's leading
  # positional parameters are `separators` and `keep_separator`, so passing them
  # positionally collides with the `separators=` keyword and raises TypeError.
  text_splitter = RecursiveCharacterTextSplitter(
      chunk_size=chunk_size,
      chunk_overlap=chunk_overlap,
      length_function=len,
      # Coarsest separator first: paragraphs, then lines, then words, then characters
      separators=["\n\n", "\n", " ", ""])
  return text_splitter.split_documents(documents)

In [None]:
# Install all embedding- and vector-DB-related packages if not already installed
!pip install sentence-transformers faiss-cpu chromadb

In [16]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
class EmbeddingManager:
  """Wraps a SentenceTransformer model and turns lists of text into embeddings."""

  def __init__(self, embedding_model_name:str="all-MiniLM-L6-v2"):
    """
    Store the model name and load the model immediately.

    Args:
      embedding_model_name (str): Model name handed to SentenceTransformer.
    """
    self.model = None
    self.embedding_model_name = embedding_model_name
    self._load_model()

  def _load_model(self):
    """
    Instantiate the SentenceTransformer named by self.embedding_model_name
    and store it on self.model.

    Any failure is printed and then re-raised to the caller.
    """
    try:
      print(f"Loading SentenceTransformer model: {self.embedding_model_name}")
      self.model = SentenceTransformer(self.embedding_model_name)
      print(f"Loaded SentenceTransformer model successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
    except Exception as e:
      print(f"Error loading embedding model: {e}")
      raise

  def generate_embeddings(self, texts:List[str]) -> np.ndarray:
    """
    Encode the given texts with the loaded model.

    Args:
      texts (List[str]): Texts to embed.

    Returns:
      np.ndarray: The result of self.model.encode(texts).

    Any failure while encoding is printed and then re-raised.
    """
    # Reload on demand in case the model reference was cleared
    if self.model is None:
      self._load_model()

    try:
      print(f"Generating embeddings for {len(texts)} texts...")
      embeddings = self.model.encode(texts, show_progress_bar=True)
      print(f"Successfully generated embeddings for {len(texts)} texts with shape: {embeddings.shape}")
    except Exception as e:
      print(f"Error generating embeddings: {e}")
      raise
    else:
      return embeddings


# Build an embedding manager with the default model, then embed the text of
# every document loaded from the PDFs
embedding_manager = EmbeddingManager()
page_texts = [doc.page_content for doc in all_pdf_document]
embeddings = embedding_manager.generate_embeddings(page_texts)
