## Advanced RAG - Data Ingestion Pipeline for PageRAG
### Page-wise Document Processing with Gemini Embeddings and Qdrant

**Learning Objectives:**
- Extract text from PDFs page by page
- Extract metadata from filename
- Store in Qdrant with rich metadata
- Use Gemini embeddings

**Use Cases:**
1. Financial Analysis: Process SEC filings (10-K, 10-Q)
2. Legal: Organize contracts and case documents
3. Research: Index academic papers
4. Enterprise: Searchable document repositories

![image.png](attachment:image.png)

### Setup and Configuration

In [1]:
from dotenv import load_dotenv
load_dotenv()

import hashlib
from pathlib import Path
from typing import List

from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_qdrant import QdrantVectorStore
from langchain_core.documents import Document
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams

from docling.document_converter import DocumentConverter

In [2]:
# Configuration
# Root folder that is searched recursively for PDF files to ingest.
DATA_DIR = "data"
# Filesystem path handed to Qdrant as its local storage location.
QDRANT_PATH = "./qdrant_financial_db"
# Name of the Qdrant collection the page-level documents are stored in.
COLLECTION_NAME = "financial_docs"
# Model identifier passed to GoogleGenerativeAIEmbeddings.
EMBEDDING_MODEL = "models/gemini-embedding-001"

### Initialize Gemini Embeddings and Qdrant

In [4]:
# Embedding client for the configured Gemini model.
# NOTE(review): credentials are presumably picked up from the environment
# populated by load_dotenv() -- confirm which variable is expected.
embeddings = GoogleGenerativeAIEmbeddings(model=EMBEDDING_MODEL)

# Attach to a collection that is expected to exist already in the local
# storage folder; nothing in this cell creates it.
# NOTE: path-based (embedded) storage is held by a single client, so running
# this cell again while an earlier client is still alive raises the
# RuntimeError shown below. Restart the kernel, or switch to a Qdrant server
# if concurrent access is required.
vector_store = QdrantVectorStore.from_existing_collection(
    embedding=embeddings,
    collection_name=COLLECTION_NAME,
    path=QDRANT_PATH
)

RuntimeError: Storage folder ./qdrant_financial_db is already accessed by another instance of Qdrant client. If you require concurrent access, use Qdrant server instead.

### Metadata Extraction from Filename

In [None]:
def extract_metadata_from_filename(filename: str) -> dict:
    """
    Extract metadata from filename.

    Expected format: {company} {doc_type} [{quarter}] {year}.pdf
    Examples:
    - amazon 10-k 2024.pdf
    - amazon 10-q q1 2024.pdf

    A name with exactly four whitespace-separated parts is read as
    company / doc type / quarter / year. Any other name with at least three
    parts is read as company / doc type / year, with no quarter.

    Args:
        filename: File name (not a full path). A trailing ".pdf" extension
            is optional and matched case-insensitively.

    Returns:
        dict with company_name, doc_type, fiscal_year (int) and
        fiscal_quarter (None when the name carries no quarter part)

    Raises:
        ValueError: If the name has fewer than three parts or the year part
            is not an integer.
    """
    name = filename.strip()
    # Drop only a trailing extension. str.replace would also delete ".pdf"
    # from the middle of a name and would miss an upper-case ".PDF".
    if name.lower().endswith('.pdf'):
        name = name[:-len('.pdf')]
    parts = name.split()

    if len(parts) < 3:
        raise ValueError(
            f"Cannot parse metadata from filename {filename!r}: expected "
            "'{company} {doc_type} [{quarter}] {year}.pdf'"
        )

    if len(parts) == 4:
        fiscal_quarter, year_text = parts[2], parts[3]
    else:
        fiscal_quarter, year_text = None, parts[2]

    try:
        fiscal_year = int(year_text)
    except ValueError as err:
        raise ValueError(
            f"Cannot parse fiscal year {year_text!r} from filename {filename!r}"
        ) from err

    return {
        'fiscal_quarter': fiscal_quarter,
        'fiscal_year': fiscal_year,
        'company_name': parts[0],
        'doc_type': parts[1],
    }

In [None]:
# Three-part name (no quarter): fiscal_quarter comes back as None.
extract_metadata_from_filename('amazon 10-k 2023.pdf')

In [None]:
# Four-part name: the third part is returned as fiscal_quarter.
extract_metadata_from_filename('amazon 10-q q1 2024.pdf')

### Extract Text from PDF Pages

In [None]:
def extract_pdf_pages(pdf_path: str) -> List[str]:
    """
    Convert a PDF to markdown and cut it into one string per page.

    Args:
        pdf_path: Location of the PDF to convert.

    Returns:
        List of page texts, in document order
    """
    page_marker = "<!-- page break -->"

    conversion = DocumentConverter().convert(pdf_path)
    # The marker is handed to the exporter as its page-break placeholder, so
    # splitting on the same string afterwards yields the per-page chunks.
    full_markdown = conversion.document.export_to_markdown(
        page_break_placeholder=page_marker
    )

    return full_markdown.split(page_marker)

In [None]:
# Try the extractor on one sample filing and report how many page chunks
# came back.
pages = extract_pdf_pages('data/amazon/amazon 10-q q1 2024.pdf')
print(f"Total pages: {len(pages)}")

### File Hash for Duplicate Detection

In [None]:
def compute_file_hash(file_path: str) -> str:
    """Return the hex-encoded SHA-256 digest of the file's bytes."""
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        # Feed the hash in fixed-size chunks so the whole file is never held
        # in memory at once.
        while True:
            chunk = stream.read(4096)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()

In [None]:
# Sample run: hex SHA-256 digest of one filing's content.
compute_file_hash('data/amazon/amazon 10-q q1 2024.pdf')

### Track Processed Files

In [None]:
# Get already processed files from Qdrant
client = vector_store.client

all_points = client.scroll(
    collection_name=COLLECTION_NAME,
    limit=10000,
    with_payload=True
)

processed_hashes = set(
    point.payload.get('file_hash') 
    for point in all_points[0] 
    if point.payload.get('file_hash')
)

print(f"Already processed: {len(processed_hashes)} files")

### Document Ingestion Pipeline

In [None]:
def ingest_docs_in_vectordb(pdf_path: Path) -> None:
    """
    Process and ingest PDF into Qdrant vector store.

    The file is skipped when its content hash is already in
    ``processed_hashes``. Otherwise every non-blank page becomes one Document
    carrying the filename metadata plus ``page``, ``file_hash`` and
    ``source_file``, and the hash is recorded once the pages are stored.

    Args:
        pdf_path: Path of the PDF to ingest; its name must follow the format
            expected by ``extract_metadata_from_filename``.

    Raises:
        Whatever the hashing, filename parsing, PDF conversion or vector
        store calls raise; nothing is caught here.
    """
    import uuid  # local import: only needed for the point ids below

    print(f"Processing: {pdf_path.name}")

    file_hash = compute_file_hash(pdf_path)
    if file_hash in processed_hashes:
        print(f"[SKIP] Already processed: {pdf_path.name}")
        return

    # Parse the filename before the expensive PDF conversion so a badly named
    # file fails fast.
    file_metadata = extract_metadata_from_filename(pdf_path.name)
    pages = extract_pdf_pages(str(pdf_path))

    documents = []
    point_ids = []

    for page_num, page_text in enumerate(pages, start=1):
        # Blank pages carry nothing to retrieve, so they are not embedded.
        # They are skipped without renumbering, which keeps `page` aligned
        # with the position in the PDF.
        if not page_text.strip():
            continue

        metadata = {
            **file_metadata,
            'page': page_num,
            'file_hash': file_hash,
            'source_file': pdf_path.name,
        }
        documents.append(Document(page_content=page_text, metadata=metadata))

        # Deterministic id per (file content, page): ingesting the same file
        # again overwrites its points instead of adding duplicates.
        point_ids.append(
            str(uuid.uuid5(uuid.NAMESPACE_OID, f"{file_hash}:{page_num}"))
        )

    if not documents:
        print(f"[SKIP] No extractable text: {pdf_path.name}")
        return

    vector_store.add_documents(documents=documents, ids=point_ids)
    processed_hashes.add(file_hash)

    print(f"[DONE] Ingested {len(documents)} pages from {pdf_path.name}")

### Process All PDFs

In [None]:
# Collect every PDF below DATA_DIR, including sub-folders.
data_path = Path(DATA_DIR)
# Sort the matches: glob order depends on the filesystem, and a stable order
# keeps the preview below and the ingestion sequence identical across runs.
pdf_files = sorted(data_path.rglob("*.pdf"))
print(f"Found {len(pdf_files)} PDF files")
pdf_files[:3]

In [None]:
# Ingest every discovered PDF. A failure in one file (unparseable name,
# unreadable PDF, embedding error, ...) is recorded and reported, so it does
# not abort the files that come after it.
failed_files = []

for pdf_path in pdf_files:
    try:
        ingest_docs_in_vectordb(pdf_path)
    except Exception as exc:  # batch boundary: report and move on
        failed_files.append((pdf_path, exc))
        print(f"[FAIL] {pdf_path.name}: {type(exc).__name__}: {exc}")

if failed_files:
    print(f"{len(failed_files)} of {len(pdf_files)} files failed")

### Verify Ingestion

In [None]:
# Read the collection's point count to confirm that ingestion wrote data.
# The pipeline stores one document per PDF page, so this counts pages, not
# files.
collection_info = vector_store.client.get_collection(COLLECTION_NAME)
print(f"Total documents in Qdrant: {collection_info.points_count}")

In [None]:
# Search example
results = vector_store.similarity_search(
    "What is Tesla's revenue for Q1 2024?",
    k=3
)

results