<a href="https://colab.research.google.com/github/khajum/ai-playground/blob/main/rag/rag-application-101/pdf_loader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RAG Pipelines - Data Ingestion to Vector DB Pipeline

In [None]:
# Install LangChain and the PDF parsing libraries (pypdf, pymupdf) if not already installed
!pip install langchain
!pip install langchain-core
!pip install langchain-community
!pip install pypdf
!pip install pymupdf

In [6]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
#from langchain.text_splitter import RecursiveCharacterTextSplitter


In [None]:
from pathlib import Path

# Load every PDF found beneath a directory into a single list of documents
def process_all_pdfs(pdf_directory):
  """Load every PDF under ``pdf_directory`` and return the combined documents.

  The directory tree is searched recursively. Only regular files whose
  extension is ``.pdf`` (compared case-insensitively) are loaded, and they
  are visited in sorted path order so repeated runs yield the documents in
  the same order. Progress is reported with ``print``.

  Args:
    pdf_directory: Path (str or path-like) of the directory to search.

  Returns:
    A list with the documents produced by ``PyPDFLoader(...).load()`` for
    each file that loaded successfully. Each document's metadata gains
    ``source_file`` (the PDF's file name) and ``file_type`` (``"pdf"``).
    Files that fail to load are reported and skipped, so the list may be
    empty.
  """
  all_documents = []
  pdf_dir = Path(pdf_directory)

  # Walk the tree once; skip directories that merely end in ".pdf" and
  # accept upper-case extensions, which a case-sensitive '*.pdf' glob misses.
  # Sorting makes the processing order independent of the filesystem.
  pdf_files = sorted(
      path for path in pdf_dir.rglob('*')
      if path.is_file() and path.suffix.lower() == '.pdf'
  )

  print(f"Found {len(pdf_files)} pdf files in {pdf_directory}")

  for pdf_file in pdf_files:
    print(f"\nProcessing: {pdf_file.name}")
    try:
      loader = PyPDFLoader(str(pdf_file))
      documents = loader.load()

      # Record where each document came from so it can be traced later
      for doc in documents:
        doc.metadata['source_file'] = pdf_file.name
        doc.metadata['file_type'] = "pdf"

      all_documents.extend(documents)
      print(f" Processed {pdf_file.name} with {len(documents)} pages")
    except Exception as e:
      # Best effort: one unreadable PDF must not abort the whole ingestion
      print(f" Error processing {pdf_file.name}: {e}")

  print(f" Total documents processed: {len(all_documents)}")
  return all_documents

# Ingest every PDF found beneath ./data and keep the loaded documents
# in `all_pdf_document`, then report how many were collected.
all_pdf_document = process_all_pdfs("./data")
print(
    f"Successfully processed and collected {len(all_pdf_document)} documents."
)


In [None]:
!pip install langchain-text-splitters

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_document(documents, chunk_size=1000, chunk_overlap=200):
  """Split documents into smaller, overlapping chunks for retrieval.

  Args:
    documents: Documents to split; passed to
      ``RecursiveCharacterTextSplitter.split_documents``.
    chunk_size: Maximum chunk length, measured with ``len`` (characters).
    chunk_overlap: Number of characters shared between neighbouring chunks.

  Returns:
    The list of chunked documents produced by the splitter.
  """
  # chunk_size / chunk_overlap must be passed by keyword: the splitter's
  # leading positional parameters are `separators` and `keep_separator`,
  # so positional values collide with the `separators=` argument below.
  text_splitter = RecursiveCharacterTextSplitter(
      chunk_size=chunk_size,
      chunk_overlap=chunk_overlap,
      length_function=len,
      # Ordered coarse to fine: paragraph breaks first, then lines, then
      # words, then individual characters as the last resort.
      separators=["\n\n", "\n", " ", ""],
  )
  return text_splitter.split_documents(documents)