## Data Extraction with Docling

In this notebook, we'll extract content from PDFs into structured formats:

- **Markdown**: Full document text with page breaks for chunking
- **Images**: Save full-page images of pages that contain a large chart/diagram (picture wider and taller than 500 pixels)
- **Tables**: Extract each table together with the 2 preceding paragraphs of context and its page number as metadata

**Pipeline Overview:**
1. **This Notebook (06-01)**: Extract PDFs → Markdown, Images, Tables
2. **Next Notebook (06-02)**: Load into vector database with embeddings
3. **Notebook 07**: Intelligent search with filters and reranking

**Output Structure:**
```
data/rag-data/rag-markdown/{company}/{document}.md
data/rag-data/rag-images/{company}/{document}/page_5.png
data/rag-data/rag-tables/{company}/{document}/table_1_page_5.md
```

### 1. Setup and Configuration

In [None]:
from pathlib import Path
from typing import List, Tuple

from docling_core.types.doc import PictureItem
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

In [None]:
# Directory paths (relative to the notebook's working directory)
# Input root: searched recursively for *.pdf files
DATA_DIR = "data/rag-data/rag-pdf"
# Output roots: extract_pdf_content() creates a per-company subfolder under each
OUTPUT_MD_DIR = "data/rag-data/rag-markdown"  # {company}/{document}.md
OUTPUT_IMAGES_DIR = "data/rag-data/rag-images"  # {company}/{document}/page_<n>.png
OUTPUT_TABLES_DIR = "data/rag-data/rag-tables"  # {company}/{document}/table_<k>_page_<n>.md

### 2. Helper Functions

In [None]:
def extract_metadata_from_filename(filename: str) -> dict:
    """
    Extract metadata from filename.

    Expected format: CompanyName DocType [Quarter] Year.pdf
    Examples:
        - Amazon 10-Q Q1 2024.pdf
        - Microsoft 10-K 2023.pdf

    The name is parsed from the right (year, optional quarter, document
    type), so a company name made of several words, e.g.
    "Meta Platforms 10-K 2023.pdf", is kept intact instead of being
    mistaken for the document type / quarter.

    Args:
        filename: File name (not a full path), with or without ".pdf".

    Returns:
        dict with keys 'company_name' (str), 'doc_type' (str),
        'fiscal_quarter' (str such as "Q1", or None) and 'fiscal_year' (int).

    Raises:
        ValueError: If the name has fewer than three parts or the last
            part is not an integer year.
    """
    name = filename.strip()
    # Remove only a trailing extension, case-insensitively; str.replace()
    # would also delete ".pdf" occurring inside the name and miss ".PDF".
    if name.lower().endswith('.pdf'):
        name = name[:-len('.pdf')]
    parts = name.split()

    if len(parts) < 3:
        raise ValueError(
            f"Cannot parse {filename!r}: expected "
            "'CompanyName DocType [Quarter] Year.pdf'"
        )

    try:
        fiscal_year = int(parts[-1])
    except ValueError:
        raise ValueError(
            f"Cannot parse {filename!r}: {parts[-1]!r} is not a year"
        ) from None

    remaining = parts[:-1]

    # The quarter is optional: accept it only when it looks like Q1..Q4 and
    # a company name and a document type are still left in front of it.
    fiscal_quarter = None
    candidate = remaining[-1]
    looks_like_quarter = (
        len(candidate) == 2 and candidate[0] in 'Qq' and candidate[1] in '1234'
    )
    if looks_like_quarter and len(remaining) >= 3:
        fiscal_quarter = remaining.pop()

    doc_type = remaining.pop()

    return {
        'company_name': ' '.join(remaining),
        'doc_type': doc_type,
        'fiscal_quarter': fiscal_quarter,
        'fiscal_year': fiscal_year
    }

In [None]:
def _context_paragraphs_before(lines: List[str], start: int, marker: str,
                               max_paragraphs: int = 2) -> List[List[str]]:
    """
    Collect up to max_paragraphs paragraphs located above lines[start].

    A paragraph is a run of consecutive non-blank lines. Lines containing
    the page-break marker are ignored. Paragraphs are returned in document
    order, each as a list of its lines.
    """
    collected: List[List[str]] = []  # nearest paragraph first
    current: List[str] = []          # paragraph being read, bottom-up
    j = start - 1

    while j >= 0 and len(collected) < max_paragraphs:
        text = lines[j]
        if not text.strip():
            # A blank line closes the paragraph being read. Repeated blank
            # lines (or the blank lines around a page-break marker) must not
            # be counted as additional paragraphs.
            if current:
                collected.append(current[::-1])
                current = []
        elif marker not in text:
            current.append(text)
        j -= 1

    # Reached the top of the document while still inside a paragraph
    if current and len(collected) < max_paragraphs:
        collected.append(current[::-1])

    collected.reverse()
    return collected


def extract_tables_with_context(markdown_text: str) -> List[Tuple[str, str, int]]:
    """
    Extract markdown tables with 2 paragraphs of context and page number.

    A table starts at a line that begins with '|' and contains at least two
    '|' characters, and runs until the next non-blank line that does not
    begin with '|' (blank lines inside that span do not end the table).
    Pages are counted from 1 and advance at every line containing
    '<!-- page break -->'; a table is attributed to the page it starts on.

    Args:
        markdown_text: Markdown exported with '<!-- page break -->' as the
            page-break placeholder.

    Returns:
        List of (context + table, table_name, page_number) tuples. Context
        paragraphs and the table are separated by blank lines; when no
        context precedes the table the content is the table alone.
    """
    page_break = '<!-- page break -->'
    lines = markdown_text.split('\n')
    tables = []
    i = 0
    current_page = 1  # Track current page number

    while i < len(lines):
        line = lines[i]

        # Update page number when we hit a page break
        if page_break in line:
            current_page += 1
            i += 1
            continue

        # Anything that is not a table row is skipped
        if not (line.strip().startswith('|') and line.count('|') >= 2):
            i += 1
            continue

        # Step 1: Extract 2 paragraphs before the table
        paragraphs = _context_paragraphs_before(lines, i, page_break)

        # Step 2: Extract the full table (stop at the first non-table line)
        table_lines = []
        while i < len(lines):
            row = lines[i].strip()
            if row and not row.startswith('|'):
                break
            if row:
                table_lines.append(lines[i])
            i += 1

        # Step 3: Combine context + table, keeping paragraph boundaries
        blocks = ['\n'.join(paragraph) for paragraph in paragraphs]
        blocks.append('\n'.join(table_lines))
        tables.append(('\n\n'.join(blocks), f"table_{len(tables) + 1}", current_page))

    return tables

### 3. Main Extraction Function

In [None]:
def extract_pdf_content(pdf_path: Path, min_image_size: int = 500) -> None:
    """
    Extract PDF to markdown, images, and tables.

    Writes, under the configured output roots:
      - {OUTPUT_MD_DIR}/{company}/{stem}.md: full markdown with page breaks
      - {OUTPUT_IMAGES_DIR}/{company}/{stem}/page_<n>.png: full-page image
        of every page holding a picture larger than min_image_size in both
        dimensions
      - {OUTPUT_TABLES_DIR}/{company}/{stem}/table_<k>_page_<n>.md: each
        table with its context and a "**Page:**" header

    Args:
        pdf_path: Path of the PDF; its file name must follow the format
            understood by extract_metadata_from_filename().
        min_image_size: Minimum width and height (pixels, exclusive) for a
            picture to count as "large".
    """
    print(f"Processing: {pdf_path.name}")

    # Step 1: Get metadata and create output directories
    metadata = extract_metadata_from_filename(pdf_path.name)
    company = metadata['company_name']
    filename_stem = pdf_path.stem

    md_dir = Path(OUTPUT_MD_DIR) / company
    images_dir = Path(OUTPUT_IMAGES_DIR) / company / filename_stem
    tables_dir = Path(OUTPUT_TABLES_DIR) / company / filename_stem

    for dir_path in (md_dir, images_dir, tables_dir):
        dir_path.mkdir(parents=True, exist_ok=True)

    # Step 2: Configure Docling converter
    pipeline_options = PdfPipelineOptions()
    pipeline_options.generate_picture_images = True
    pipeline_options.generate_page_images = True

    converter = DocumentConverter(
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
    )
    result = converter.convert(str(pdf_path))
    document = result.document

    # Step 3: Save markdown with page breaks
    markdown_text = document.export_to_markdown(page_break_placeholder="<!-- page break -->")
    (md_dir / f"{filename_stem}.md").write_text(markdown_text, encoding='utf-8')
    print("  ✓ Markdown saved")

    # Step 4: Find and save pages with large images
    pages_to_save = set()

    for item in document.iterate_items():
        element = item[0]  # Extract element from tuple

        if not isinstance(element, PictureItem):
            continue

        # get_image() may return None when no image data is available
        image = element.get_image(document)
        if image is None:
            continue

        width, height = image.size
        if width > min_image_size and height > min_image_size:
            page_no = element.prov[0].page_no if element.prov else None
            if page_no:
                pages_to_save.add(page_no)

    # Save the full page images (sorted for a stable, readable order)
    saved_pages = 0
    for page_no in sorted(pages_to_save):
        page = document.pages.get(page_no)
        page_image = None
        if page is not None and page.image is not None:
            page_image = page.image.pil_image
        if page_image is None:
            # The page was not rendered; skip it rather than abort the document
            print(f"  ! Page {page_no}: no page image available, skipped")
            continue
        page_image.save(images_dir / f"page_{page_no}.png", "PNG")
        saved_pages += 1

    if saved_pages:
        print(f"  ✓ Saved {saved_pages} page images")

    # Step 5: Extract and save tables with context and page numbers
    tables = extract_tables_with_context(markdown_text)
    for table_content, table_name, page_num in tables:
        # Add page number metadata at the top
        content_with_page = f"**Page:** {page_num}\n\n{table_content}"
        # Save with page number in filename
        (tables_dir / f"{table_name}_page_{page_num}.md").write_text(content_with_page, encoding='utf-8')

    if tables:
        print(f"  ✓ Saved {len(tables)} tables")

    print("  Done!\n")

### 4. Process a Single PDF (Example)

In [None]:
# Find all PDF files. rglob() yields entries in filesystem order, so sort the
# result to make the example below and the batch order reproducible.
data_path = Path(DATA_DIR)
pdf_files = sorted(data_path.rglob("*.pdf"))
print(f"Found {len(pdf_files)} PDF files\n")

# Process one example first to see the output
if pdf_files:
    print("=== Processing Example PDF ===")
    extract_pdf_content(pdf_files[0])
    print("\nCheck the output folders to see extracted files!")
    print(f"- Markdown: {OUTPUT_MD_DIR}")
    print(f"- Images: {OUTPUT_IMAGES_DIR}")
    print(f"- Tables: {OUTPUT_TABLES_DIR}")
else:
    # A missing or empty input directory would otherwise do nothing silently
    print(f"No PDFs found under {data_path.resolve()} - check DATA_DIR.")

### 5. Process All PDFs

In [None]:
# Process all PDFs
print(f"\n=== Processing All {len(pdf_files)} PDFs ===\n")

failed = []  # (path, error) for every document that could not be processed

for idx, pdf_path in enumerate(pdf_files, 1):
    print(f"[{idx}/{len(pdf_files)}]", end=" ")
    try:
        extract_pdf_content(pdf_path)
    except Exception as exc:
        # Batch boundary: one unparsable file name or broken PDF must not
        # abort the remaining documents. Record it and report at the end.
        failed.append((pdf_path, exc))
        print(f"  ✗ Failed: {pdf_path.name} ({type(exc).__name__}: {exc})\n")

print("\n=== Extraction Complete ===")

if failed:
    print(f"{len(failed)} of {len(pdf_files)} PDFs could not be processed:")
    for failed_path, error in failed:
        print(f"  - {failed_path}: {type(error).__name__}: {error}")