RAG Piplelines - Data Ingestion to Vector DB Pipleline

In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path


In [2]:

def process_all_pdfs(pdf_directory):
    """ Process all the pdf files in a directory"""

    all_document = [ ]
    pdf_dir = Path(pdf_directory)

    # Find all PDF files recursively
    pdf_files = list (pdf_dir.glob('**/*.pdf'))

    print(f"Found {len(pdf_files)} PDF files to process.")

    for pdf_file in pdf_files:
        print(f"\nProcessing {pdf_file.name}")

        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            #Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_document.extend(documents)
            print(f" Loaded {len(documents)} pages")
        except Exception as e:
            print(f" Error: {e}")

    print(f"\n Total documents loaded:{len(all_document)}")
    return all_document

# Process all PDF's in the data directory
all_pdf_documents = process_all_pdfs("../data")


Found 3 PDF files to process.

Processing 9_Counterintuitive_Investment_Principles_from_Mohnish_Pabrai.pdf


Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 29 0 (offset 0)
Ignoring wrong pointing object 41 0 (offset 0)
Ignoring wrong pointing object 49 0 (offset 0)
Ignoring wrong pointing object 58 0 (offset 0)
Ignoring wrong pointing object 65 0 (offset 0)
Ignoring wrong pointing object 72 0 (offset 0)
Ignoring wrong pointing object 79 0 (offset 0)
Ignoring wrong pointing object 89 0 (offset 0)
Ignoring wrong pointing object 97 0 (offset 0)
Ignoring wrong pointing object 107 0 (offset 0)
Ignoring wrong pointing object 137 0 (offset 0)
Ignoring wrong pointing object 7 0 (offset 0)
Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)


 Loaded 13 pages

Processing USA_CHINA_AI.pdf
 Loaded 1 pages

Processing start_hamburg.pdf
 Loaded 4 pages

 Total documents loaded:18


In [3]:
# Checking the all pdfs data.
#all_pdf_documents

[Document(metadata={'producer': 'macOS Version 15.1.1 (Build 24B91) Quartz PDFContext', 'creator': 'PyPDF', 'creationdate': "D:20250425210232Z00'00'", 'moddate': "D:20250425210232Z00'00'", 'source': '../data/pdf/9_Counterintuitive_Investment_Principles_from_Mohnish_Pabrai.pdf', 'total_pages': 13, 'page': 0, 'page_label': '1', 'source_file': '9_Counterintuitive_Investment_Principles_from_Mohnish_Pabrai.pdf', 'file_type': 'pdf'}, page_content=''),
 Document(metadata={'producer': 'macOS Version 15.1.1 (Build 24B91) Quartz PDFContext', 'creator': 'PyPDF', 'creationdate': "D:20250425210232Z00'00'", 'moddate': "D:20250425210232Z00'00'", 'source': '../data/pdf/9_Counterintuitive_Investment_Principles_from_Mohnish_Pabrai.pdf', 'total_pages': 13, 'page': 1, 'page_label': '2', 'source_file': '9_Counterintuitive_Investment_Principles_from_Mohnish_Pabrai.pdf', 'file_type': 'pdf'}, page_content=''),
 Document(metadata={'producer': 'macOS Version 15.1.1 (Build 24B91) Quartz PDFContext', 'creator': '

In [9]:
# Text splitting get into chunks

def split_document(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into smaller chunks for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=[
                    "\n\n",      # Paragraph breaks (best)
                    "\n‚Ä¢ ",      # Bullet point separators (good for USA_CHINA_AI.pdf)
                    "\n",        # Line breaks
                    ". ",        # Sentence breaks (optional)
                    " ",         # Word breaks
                    ""           # Character breaks (last resort)
        ]

    )

    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(split_docs)} documents into {len(split_docs)} chunks")

    # Show example of a chunk

    if split_docs:
        print(f"\n Example chunks:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata:\n{split_docs[0].metadata}")

    return split_docs


In [10]:
chunks = split_document(all_pdf_documents)

Split 8 documents into 8 chunks

 Example chunks:
Content: üá®üá≥ China AI News (Last 7 Days) ‚Ä¢ AI-Driven Cyber Espionage: A major story this week involves a report from US-based AI firm Anthropic, which stated it had stopped a cyber-espionage campaign by a Chine...
Metadata:
{'producer': 'macOS Version 15.0 (Build 24A335) Quartz PDFContext', 'creator': 'PyPDF', 'creationdate': "D:20251117142112Z00'00'", 'moddate': "D:20251117142112Z00'00'", 'source': '../data/pdf/USA_CHINA_AI.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1', 'source_file': 'USA_CHINA_AI.pdf', 'file_type': 'pdf'}
