## Advanced RAG - Data Extraction Pipeline
### Extract PDFs to Markdown, Images, and Tables

**Learning Objectives:**
- Extract PDF content to markdown format
- Save figures larger than 500x500 pixels as PNG images (smaller ones are skipped)
- Extract tables with context (2 paragraphs before)
- Organize extracted content systematically

**Output Structure:**
- Markdown files (full document text): `data/rag-markdown/{company}/{filename}.md`
- Images: `data/rag-images/{company}/{filename}/image_1.png`
- Tables: `data/rag-tables/{company}/{filename}/table_1.md`

### Setup and Imports

In [None]:
from pathlib import Path
from typing import List, Tuple

from docling_core.types.doc import PictureItem
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

### Configuration

In [None]:
# Paths
# Input root: searched recursively for source PDF files.
DATA_DIR = "data/rag-data"
# Output roots, one per kind of extracted artifact.
OUTPUT_MD_DIR = "data/rag-markdown"  # full-document markdown, grouped by company
OUTPUT_IMAGES_DIR = "data/rag-images"  # PNG figures, grouped by company and document
OUTPUT_TABLES_DIR = "data/rag-tables"  # tables with preceding context, grouped by company and document

### Extract Metadata from Filename

In [None]:
def extract_metadata_from_filename(filename: str) -> dict:
    """
    Extract metadata from filename.

    Expected format: {company} {doc_type} {quarter} {year}.pdf
    where the quarter field is optional. Fields are separated by whitespace.
    Examples:
    - amazon 10-k 2024.pdf
    - amazon 10-q q1 2024.pdf

    Args:
        filename: Name of the PDF file (not a full path).

    Returns:
        Dict with keys ``company_name`` (str), ``doc_type`` (str),
        ``fiscal_quarter`` (str, or None when the name has no quarter field)
        and ``fiscal_year`` (int).

    Raises:
        ValueError: If the name has fewer than three fields or the year
            field is not an integer.
    """
    # Remove only a trailing extension, case-insensitively. str.replace would
    # also delete ".pdf" occurring elsewhere in the name and miss ".PDF".
    name = filename[:-4] if filename[-4:].lower() == '.pdf' else filename
    parts = name.split()

    if len(parts) < 3:
        raise ValueError(
            f"Cannot parse metadata from filename {filename!r}: expected "
            "'{company} {doc_type} [{quarter}] {year}.pdf'"
        )

    # Exactly four fields means a quarter is present; otherwise the third
    # field is taken as the year.
    has_quarter = len(parts) == 4
    year_token = parts[3] if has_quarter else parts[2]
    try:
        fiscal_year = int(year_token)
    except ValueError as err:
        raise ValueError(
            f"Cannot parse fiscal year {year_token!r} from filename {filename!r}"
        ) from err

    return {
        'company_name': parts[0],
        'doc_type': parts[1],
        'fiscal_quarter': parts[2] if has_quarter else None,
        'fiscal_year': fiscal_year,
    }

### Extract Tables with Context

In [None]:
def extract_tables_with_context(markdown_text: str) -> List[Tuple[str, str]]:
    """
    Extract tables with 2 paragraphs of context before each table.

    A table starts at a line whose stripped text begins with ``|`` and that
    contains at least two ``|`` characters. It continues over following lines
    that begin with ``|`` or are blank, so table rows separated only by blank
    lines are collected as a single table. Blank lines are dropped from the
    saved table.

    Context is gathered by walking backwards from the table start and
    collecting non-empty lines until two paragraphs have been passed. A run
    of consecutive blank lines counts as one paragraph break.

    Args:
        markdown_text: Markdown document text.

    Returns:
        List of (context + table, table_name) tuples, where table_name is
        "table_1", "table_2", ... in document order. Context lines are joined
        with newlines and separated from the table by one blank line.
    """
    lines = markdown_text.split('\n')

    tables = []
    i = 0
    table_num = 1

    while i < len(lines):
        line = lines[i]

        # Not a table start (needs a leading | and at least two | in total)
        if not (line.strip().startswith('|') and line.count('|') >= 2):
            i += 1
            continue

        # Find 2 paragraphs before, scanning upwards from the table
        context_lines = []
        para_count = 0
        in_paragraph = False
        j = i - 1
        while j >= 0 and para_count < 2:
            if lines[j].strip():  # Non-empty line belongs to the current paragraph
                context_lines.append(lines[j])
                in_paragraph = True
            elif in_paragraph:
                # Only the first blank line after text ends a paragraph;
                # further blank lines in the same gap must not be counted.
                para_count += 1
                in_paragraph = False
            j -= 1
        # Lines were collected bottom-up; restore document order
        context_lines.reverse()

        # Extract full table: consume rows and blank lines until other text
        table_lines = []
        while i < len(lines):
            stripped = lines[i].strip()
            if stripped and not stripped.startswith('|'):
                break
            if stripped:  # Skip empty lines within table
                table_lines.append(lines[i])
            i += 1

        # Combine context + table
        full_content = '\n'.join(context_lines) + '\n\n' + '\n'.join(table_lines)
        tables.append((full_content, f"table_{table_num}"))
        table_num += 1

    return tables

### Extract PDF Content

In [None]:
def extract_pdf_content(pdf_path: Path):
    """Extract PDF to markdown, images, and tables.

    Converts the PDF with a docling DocumentConverter and writes:
    - the full markdown to OUTPUT_MD_DIR/{company}/{stem}.md
    - each picture larger than 500x500 to
      OUTPUT_IMAGES_DIR/{company}/{stem}/image_N.png
    - each table with its preceding context to
      OUTPUT_TABLES_DIR/{company}/{stem}/table_N.md

    Progress is reported with print(). Nothing is returned.

    Args:
        pdf_path: Path to the PDF file. The file name must follow the
            format expected by extract_metadata_from_filename, since the
            company name is taken from it.
    """
    print(f"Processing: {pdf_path.name}")

    # Get metadata (only the company name is used, for the output layout)
    metadata = extract_metadata_from_filename(pdf_path.name)
    company = metadata['company_name']
    filename_stem = pdf_path.stem

    # Create output directories
    md_dir = Path(OUTPUT_MD_DIR) / company
    images_dir = Path(OUTPUT_IMAGES_DIR) / company / filename_stem
    tables_dir = Path(OUTPUT_TABLES_DIR) / company / filename_stem

    for directory in (md_dir, images_dir, tables_dir):
        directory.mkdir(parents=True, exist_ok=True)

    # Configure pipeline for image extraction
    pipeline_options = PdfPipelineOptions()
    pipeline_options.generate_picture_images = True

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    # Convert PDF
    result = converter.convert(str(pdf_path))

    # Save markdown with page breaks
    page_break = "<!-- page break -->"
    markdown_text = result.document.export_to_markdown(page_break_placeholder=page_break)
    md_file = md_dir / f"{filename_stem}.md"
    with open(md_file, 'w', encoding='utf-8') as f:
        f.write(markdown_text)
    print(f"  ✓ Saved markdown: {md_file}")

    # Extract images only (not tables)
    picture_counter = 0
    skipped_images = 0
    unavailable_images = 0

    for element, _level in result.document.iterate_items():
        if not isinstance(element, PictureItem):
            continue

        image = element.get_image(result.document)
        if image is None:
            # get_image is declared to return an optional image; without this
            # guard a picture with no image data would crash on `.size`.
            unavailable_images += 1
            continue

        width, height = image.size

        # Only save images larger than 500x500
        if width > 500 and height > 500:
            picture_counter += 1
            image_file = images_dir / f"image_{picture_counter}.png"
            with image_file.open("wb") as fp:
                image.save(fp, "PNG")
        else:
            skipped_images += 1

    if picture_counter > 0:
        print(f"  ✓ Saved {picture_counter} images to: {images_dir}")
    if skipped_images > 0:
        print(f"  ⊘ Skipped {skipped_images} small images (< 500x500)")
    if unavailable_images > 0:
        print(f"  ⊘ Skipped {unavailable_images} pictures with no image data")

    # Extract tables with context from markdown
    tables = extract_tables_with_context(markdown_text)
    for table_content, table_name in tables:
        table_file = tables_dir / f"{table_name}.md"
        with open(table_file, 'w', encoding='utf-8') as f:
            f.write(table_content)

    if tables:
        print(f"  ✓ Saved {len(tables)} tables (text) to: {tables_dir}")

    print(f"  [DONE] {pdf_path.name}\n")

### Process All PDFs

In [None]:
# Recursively collect every PDF below the input directory
data_path = Path(DATA_DIR)
pdf_files = [*data_path.rglob("*.pdf")]
print(f"Found {len(pdf_files)} PDF files\n")

# Run the full extraction on each document, in discovery order
for pdf_path in pdf_files:
    extract_pdf_content(pdf_path)

### Verify Extraction

In [None]:
# Gather the artifacts found under each output root (recursive search)
md_files = [*Path(OUTPUT_MD_DIR).rglob("*.md")]
image_files = [*Path(OUTPUT_IMAGES_DIR).rglob("*.png")]
table_files = [*Path(OUTPUT_TABLES_DIR).rglob("*.md")]

# Report how many files of each kind exist
print(
    "Extraction Summary:\n"
    f"  Markdown files: {len(md_files)}\n"
    f"  Images: {len(image_files)}\n"
    f"  Tables: {len(table_files)}"
)

### Example: View Extracted Content

In [None]:
# Preview the first 500 characters of the first markdown file, if any
if md_files:
    sample_md = md_files[0]
    print(f"Sample Markdown: {sample_md}\n")
    print(sample_md.read_text(encoding='utf-8')[:500])

# Print the first extracted table file in full, if any
if table_files:
    sample_table = table_files[0]
    print(f"\n\nSample Table: {sample_table}\n")
    print(sample_table.read_text(encoding='utf-8'))