## Advanced RAG Concept - Data Ingestion Pipeline for PageRAG
### Page-wise Document Processing with Filename-based Metadata Extraction

Learning Objectives:
- Extract text from PDFs page by page
- Extract metadata (company, document type, fiscal year and quarter) from the filename
- Store in ChromaDB with rich metadata

#### Real-World Use Cases:
1. Financial Analysis: Process SEC filings (10-K, 10-Q)
2. Legal: Organize contracts and case documents
3. Research: Index academic papers with smart metadata
4. Enterprise: Build searchable document repositories
5. Compliance: Track regulatory documents

In [None]:
# Advanced RAG - Data Ingestion Pipeline
# Page-wise Document Processing with Filename-based Metadata Extraction
from dotenv import load_dotenv
load_dotenv()

import hashlib
from pathlib import Path

from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain_core.documents import Document

from docling.document_converter import DocumentConverter

In [None]:
# Configuration: paths, collection name, and embedding-service settings
# used by the cells below.
DATA_DIR = "data"  # root folder searched recursively for PDF files
CHROMA_DIR = "./chroma_financial_db"  # passed to Chroma as persist_directory
COLLECTION_NAME = "financial_docs"  # passed to Chroma as collection_name
EMBEDDING_MODEL = "nomic-embed-text"  # passed to OllamaEmbeddings as model
BASE_URL = "http://localhost:11434"  # passed to OllamaEmbeddings as base_url

In [None]:
# Initialize embeddings: an OllamaEmbeddings client configured with the
# model name and server URL from the configuration cell.
embeddings = OllamaEmbeddings(
    model=EMBEDDING_MODEL,
    base_url=BASE_URL
)

# Initialize vector store: a Chroma collection that uses `embeddings` as its
# embedding function and CHROMA_DIR as its persist directory.
vector_store = Chroma(
    collection_name=COLLECTION_NAME,
    embedding_function=embeddings,
    persist_directory=CHROMA_DIR
)

In [None]:
# Extract metadata from filename
def extract_metadata_from_filename(filename):
    """
    Extract metadata from a filename.

    Expected format: {company} {doc_type} [{quarter}] {year}.pdf
    Examples:
    - amazon 10-k 2024.pdf
    - amazon 10-q q2 2024.pdf
    - apple 8-k q1 2024.pdf

    The ``.pdf`` extension is matched case-insensitively and removed only
    from the end of the name, so ``amazon 10-k 2024.PDF`` parses the same
    way as ``amazon 10-k 2024.pdf``.

    Args:
        filename: File name without directory components.

    Returns:
        dict with company_name, doc_type, fiscal_year (int) and
        fiscal_quarter (str, or None when the name carries no quarter
        token). Names with fewer than three whitespace-separated tokens
        yield the fallback values 'unknown' / 'other' / 1970 / None.

    Raises:
        ValueError: if the token in the year position is not an integer.
    """
    extension = '.pdf'
    name = filename
    # Strip only a trailing extension; str.replace would also remove ".pdf"
    # from the middle of a name and would miss upper-case ".PDF".
    if name[-len(extension):].lower() == extension:
        name = name[:-len(extension)]
    parts = name.split()

    if len(parts) < 3:
        # Fallback for unexpected format
        return {
            'company_name': 'unknown',
            'doc_type': 'other',
            'fiscal_year': 1970,
            'fiscal_quarter': None,
        }

    if len(parts) == 4:
        # Format: company 10-q q2 2024
        fiscal_quarter, year_token = parts[2], parts[3]
    else:
        # Format: company 10-k 2024
        fiscal_quarter, year_token = None, parts[2]

    try:
        fiscal_year = int(year_token)
    except ValueError as err:
        raise ValueError(
            f"Cannot parse fiscal year from filename {filename!r}: "
            f"{year_token!r} is not an integer"
        ) from err

    return {
        'company_name': parts[0],  # amazon, apple, google
        'doc_type': parts[1],      # 10-k, 10-q, 8-k
        'fiscal_quarter': fiscal_quarter,
        'fiscal_year': fiscal_year,
    }

In [None]:
def extract_pdf_pages(pdf_path):
    """Convert a PDF with Docling and return its Markdown split per page.

    The document is exported to Markdown with a page-break placeholder, and
    the exported text is then split on that same placeholder.

    Args:
        pdf_path: Location of the PDF, handed to DocumentConverter.convert.

    Returns:
        List of Markdown strings, one per segment between page breaks.
    """
    # Single marker used both for the export and for the split.
    page_marker = "<!-- page break -->"

    conversion = DocumentConverter().convert(pdf_path)
    full_markdown = conversion.document.export_to_markdown(
        page_break_placeholder=page_marker
    )
    return full_markdown.split(page_marker)

# pages = extract_pdf_pages("finance/Amazon 10-Q Aug 2025.pdf")


In [None]:
def compute_file_hash(pdf_path):
    """Return the hex-encoded SHA-256 digest of the file's bytes.

    The file is read in 4096-byte chunks so it never has to be held in
    memory as a whole.
    """
    digest = hashlib.sha256()
    with open(pdf_path, "rb") as stream:
        while True:
            chunk = stream.read(4096)
            if not chunk:
                # Empty read means end of file.
                break
            digest.update(chunk)
    return digest.hexdigest()


In [None]:
# Track processed files (deduplication).
# Collect the file hashes already stored in the collection so PDFs that were
# ingested earlier are skipped by process_pdf.
try:
    existing_docs = vector_store.get(where={"file_hash": {"$ne": ""}}, include=["metadatas"])
    processed_hashes = {
        m['file_hash']
        for m in (existing_docs['metadatas'] or [])
        if m and m.get('file_hash')
    }
except Exception as exc:
    # Best effort: a failed lookup must not block ingestion, but report it,
    # because with an empty set nothing is recognised as a duplicate.
    # `except Exception` (not a bare except) lets Ctrl-C / SystemExit through.
    print(f"[WARN] Could not load processed file hashes ({exc}); starting with an empty set")
    processed_hashes = set()

print(f"Found {len(processed_hashes)} already processed files")

In [None]:
def process_pdf(pdf_path):
    """Process a single PDF file page by page and add it to the vector store.

    The file is hashed and skipped if its hash is already in the
    module-level ``processed_hashes`` set. Otherwise its pages are extracted,
    filename metadata is attached to every page together with the page
    number, file hash and source file name, and the resulting documents are
    added to ``vector_store``.

    Args:
        pdf_path: pathlib.Path of the PDF; ``pdf_path.name`` is used for
            logging and for metadata extraction.

    Returns:
        dict with a "status" key:
        - "success": also "file", "pages" (page count) and "file_hash"
        - "skipped": also "reason" ("duplicate") and "file"
        - "error": also "file" and "error" (the exception message)
    """
    print(f"\n{'='*80}")
    print(f"Processing: {pdf_path.name}")
    print(f"{'='*80}")

    try:
        # Check if already processed (deduplication). Hashing is inside the
        # try block so an unreadable path is reported as an error for this
        # file instead of aborting the whole batch.
        file_hash = compute_file_hash(pdf_path)
        if file_hash in processed_hashes:
            print("[SKIP] Already processed")
            return {"status": "skipped", "reason": "duplicate", "file": pdf_path.name}

        # Step 1: Extract pages from PDF
        print("\n[1/4] Extracting pages from PDF...")
        pages = extract_pdf_pages(pdf_path)
        print(f"[OK] Extracted {len(pages)} pages")

        # Step 2: Extract metadata from filename
        print("\n[2/4] Extracting metadata from filename...")
        file_metadata = extract_metadata_from_filename(pdf_path.name)
        print(f"[OK] Company: {file_metadata['company_name']}, Doc Type: {file_metadata['doc_type']}, Year: {file_metadata['fiscal_year']}")

        # Step 3: Build one Document per page; each gets its own metadata
        # dict (file-level fields plus page number, hash and source name).
        print("\n[3/4] Processing pages...")
        documents = []
        for page_num, page_text in enumerate(pages, start=1):
            page_metadata = {
                **file_metadata,
                'page': page_num,
                'file_hash': file_hash,
                'source_file': pdf_path.name,
            }
            documents.append(Document(page_content=page_text, metadata=page_metadata))

        # Step 4: Add to vector store
        print(f"\n[4/4] Adding {len(documents)} pages to vector store...")
        vector_store.add_documents(documents=documents)

        print(f"[OK] Successfully ingested {len(documents)} pages")
        # Record the hash only after a successful add so a failed file can
        # be retried in the same session.
        processed_hashes.add(file_hash)

        return {
            "status": "success",
            "file": pdf_path.name,
            "pages": len(documents),
            "file_hash": file_hash
        }

    except Exception as e:
        # Per-file boundary: report the failure and let the caller continue
        # with the next file.
        print(f"\n[ERROR] Processing {pdf_path.name}: {str(e)}")
        return {"status": "error", "file": pdf_path.name, "error": str(e)}

In [None]:
# Collect PDF files from data directory (recursively).
# The suffix is compared case-insensitively so "report.PDF" is also found on
# case-sensitive filesystems, directories are excluded, and the list is
# sorted so every run processes the files in the same order.
data_path = Path(DATA_DIR)
pdf_files = sorted(
    path
    for path in data_path.rglob("*")  # rglob searches recursively
    if path.is_file() and path.suffix.lower() == ".pdf"
)
print(f"Found {len(pdf_files)} PDF files")
pdf_files[:5]  # Show first 5

In [None]:
# Ingest every discovered PDF in turn and keep the status dict that
# process_pdf returns for each file.
results = []
for pdf_path in pdf_files:
    result = process_pdf(pdf_path)
    results.append(result)

In [None]:
# Tally the per-file outcomes by their 'status' value.
statuses = [entry['status'] for entry in results]
success_count = statuses.count('success')
skipped_count = statuses.count('skipped')
error_count = statuses.count('error')

print(f"[OK] Successfully processed: {success_count}")
print(f"[SKIP] Skipped (duplicates): {skipped_count}")
print(f"[ERROR] Errors: {error_count}")

In [None]:
# Display the per-file status dicts collected during ingestion.
results

In [None]:
def create_retriever(filter_dict=None, k=5):
    """
    Create a retriever from the vector store with optional metadata filters.

    Args:
        filter_dict: Dictionary of equality filters to apply, or None/empty
                    for no filtering.
                    Example: {"company_name": "amazon", "doc_type": "10-q"}
        k: Number of results to retrieve (default: 5)

    Returns:
        Retriever object performing similarity search on ``vector_store``.

    Example:
        # Single filter
        retriever = create_retriever(filter_dict={"company_name": "amazon"})

        # Multiple filters
        retriever = create_retriever(filter_dict={
            "company_name": "amazon",
            "doc_type": "10-q",
            "fiscal_year": 2024
        })

        # No filters
        retriever = create_retriever()
    """
    search_kwargs = {"k": k}

    if filter_dict:
        # One {key: value} expression per filter entry.
        conditions = [{key: value} for key, value in filter_dict.items()]
        if len(conditions) == 1:
            # Chroma's $and needs at least two expressions, so a single
            # condition is passed on its own.
            search_kwargs["filter"] = conditions[0]
        else:
            # Multiple conditions - wrap in $and
            search_kwargs["filter"] = {"$and": conditions}

    return vector_store.as_retriever(
        search_type="similarity",
        search_kwargs=search_kwargs
    )

In [None]:
# Example: filter by fiscal year, document type, and company
retriever = create_retriever(filter_dict={'fiscal_year': 2024, 'doc_type': '10-k', 'company_name': 'amazon'})

# Run the query and print the metadata plus the first 150 characters of
# each retrieved page.
result = retriever.invoke("revenue and financial results")
for i, doc in enumerate(result, 1):
    print(f"\nResult {i}:")
    print(f"Company: {doc.metadata.get('company_name', 'N/A')}")
    print(f"Doc Type: {doc.metadata.get('doc_type', 'N/A')}")
    print(f"Year: {doc.metadata.get('fiscal_year', 'N/A')}")
    print(f"Page: {doc.metadata.get('page', 'N/A')}")
    print(f"Content: {doc.page_content[:150]}...")

In [None]:
# Display the raw objects returned by retriever.invoke.
result