## Advanced RAG - Data Extraction Pipeline
### Extract PDFs to Markdown, Images, and Tables

**Learning Objectives:**
- Extract PDF content to markdown format
- Save all figures as PNG images
- Extract tables with context (2 paragraphs before)
- Organize extracted content systematically

**Output Structure:**
- Markdown files: Full document text
- Images: `data/rag-images/{company}/{filename}/image_1.png`
- Tables: `data/rag-tables/{company}/{filename}/table_1.md`

### Setup and Imports

In [1]:
from pathlib import Path
from typing import List, Tuple

from docling_core.types.doc import PictureItem
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

### Configuration

In [2]:
# Paths
DATA_DIR = "data/rag-data/rag-pdf"
OUTPUT_MD_DIR = "data/rag-data/rag-markdown"
OUTPUT_IMAGES_DIR = "data/rag-data/rag-images"
OUTPUT_TABLES_DIR = "data/rag-data/rag-tables"

### Extract Metadata from Filename

In [3]:
def extract_metadata_from_filename(filename: str) -> dict:
    """
    Extract metadata from filename.
    
    Expected format: {company} {doc_type} {quarter} {year}.pdf
    Examples:
    - amazon 10-k 2024.pdf
    - amazon 10-q q1 2024.pdf
    """
    name = filename.replace('.pdf', '')
    parts = name.split()
    
    metadata = {}
    metadata['company_name'] = parts[0]
    metadata['doc_type'] = parts[1]
    
    if len(parts) == 4:
        metadata['fiscal_quarter'] = parts[2]
        metadata['fiscal_year'] = int(parts[3])
    else:
        metadata['fiscal_quarter'] = None
        metadata['fiscal_year'] = int(parts[2])
    
    return metadata

### Extract Tables with Context

In [4]:
def extract_tables_with_context(markdown_text: str) -> List[Tuple[str, str]]:
    """
    Extract tables with 2 paragraphs of context before each table.
    
    Returns:
        List of (context + table, table_number) tuples
    """
    # Split by table pattern (markdown tables start with |)
    lines = markdown_text.split('\n')
    
    tables = []
    i = 0
    table_num = 1
    
    while i < len(lines):
        line = lines[i]
        
        # Detect table start (line with multiple |)
        if line.strip().startswith('|') and line.count('|') >= 2:
            # Find 2 paragraphs before
            context_lines = []
            para_count = 0
            j = i - 1
            
            while j >= 0 and para_count < 2:
                if lines[j].strip():  # Non-empty line
                    context_lines.insert(0, lines[j])
                elif context_lines:  # Empty line marks paragraph break
                    para_count += 1
                j -= 1
            
            # Extract full table
            table_lines = []
            while i < len(lines) and (lines[i].strip().startswith('|') or not lines[i].strip()):
                if lines[i].strip():  # Skip empty lines within table
                    table_lines.append(lines[i])
                i += 1
                if i < len(lines) and lines[i].strip() and not lines[i].strip().startswith('|'):
                    break
            
            # Combine context + table
            full_content = '\n'.join(context_lines) + '\n\n' + '\n'.join(table_lines)
            tables.append((full_content, f"table_{table_num}"))
            table_num += 1
        else:
            i += 1
    
    return tables

### Extract PDF Content

In [5]:
def extract_pdf_content(pdf_path: Path):
    """Extract PDF to markdown, images, and tables."""
    print(f"Processing: {pdf_path.name}")
    
    # Get metadata and create directories
    metadata = extract_metadata_from_filename(pdf_path.name)
    company = metadata['company_name']
    filename_stem = pdf_path.stem
    
    md_dir = Path(OUTPUT_MD_DIR) / company
    images_dir = Path(OUTPUT_IMAGES_DIR) / company / filename_stem
    tables_dir = Path(OUTPUT_TABLES_DIR) / company / filename_stem
    
    for dir_path in [md_dir, images_dir, tables_dir]:
        dir_path.mkdir(parents=True, exist_ok=True)
    
    # Configure and convert
    pipeline_options = PdfPipelineOptions()
    pipeline_options.generate_picture_images = True
    pipeline_options.generate_page_images = True
    
    converter = DocumentConverter(
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
    )
    result = converter.convert(str(pdf_path))
    
    # Save markdown
    page_break = "<!-- page break -->"
    markdown_text = result.document.export_to_markdown(page_break_placeholder=page_break)
    (md_dir / f"{filename_stem}.md").write_text(markdown_text, encoding='utf-8')
    print(f"  ✓ Markdown saved")
    
    # Find pages with large images and save them
    pages_to_save = set()
    for element, _ in result.document.iterate_items():
        if isinstance(element, PictureItem):
            image = element.get_image(result.document)
            if image.size[0] > 500 and image.size[1] > 500:
                page_no = element.prov[0].page_no if element.prov else None
                if page_no:
                    pages_to_save.add(page_no)
    
    # Save page images
    for page_no in pages_to_save:
        page = result.document.pages[page_no]
        page.image.pil_image.save(images_dir / f"page_{page_no}.png", "PNG")
    
    if pages_to_save:
        print(f"  ✓ Saved {len(pages_to_save)} page images")
    
    # Save tables with context
    tables = extract_tables_with_context(markdown_text)
    for table_content, table_name in tables:
        (tables_dir / f"{table_name}.md").write_text(table_content, encoding='utf-8')
    
    if tables:
        print(f"  ✓ Saved {len(tables)} tables")
    
    print(f"  [DONE]\n")

### Process All PDFs

In [6]:
# Find all PDF files
data_path = Path(DATA_DIR)
pdf_files = list(data_path.rglob("*.pdf"))
print(f"Found {len(pdf_files)} PDF files\n")

# Process each PDF
for pdf_path in pdf_files:
    extract_pdf_content(pdf_path)

2025-12-12 21:14:37,790 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 21:14:37,817 - INFO - Going to convert document batch...
2025-12-12 21:14:37,818 - INFO - Initializing pipeline for StandardPdfPipeline with options hash afb4d61b52d512d984736b9faa45e3e9
2025-12-12 21:14:37,844 - INFO - Loading plugin 'docling_defaults'
2025-12-12 21:14:37,846 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-12-12 21:14:37,868 - INFO - Loading plugin 'docling_defaults'
2025-12-12 21:14:37,872 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']


Found 28 PDF files

Processing: amazon 10-k 2023.pdf


2025-12-12 21:14:38,037 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 21:14:38,047 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 21:14:38,052 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-12 21:14:38,052 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-12 21:14:38,099 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 21:14:38,100 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_ppocr_mobile_v2.0_cls_infer.onnx[0m
[32m[INFO] 2025-12-12 21:14:38,101 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_ppocr_mobile_v2.0_cls_infer.onnx[0m
[32m[INFO] 2025-12-12

  ✓ Markdown saved
  ✓ Saved 61 tables
  [DONE]

Processing: amazon 10-k 2024.pdf


2025-12-12 21:15:11,227 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 21:15:11,227 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:15:12,313 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:15:12,606 - INFO - Processing document amazon 10-k 2024.pdf
2025-12-12 21:15:45,147 - INFO - Finished converting document amazon 10-k 2024.pdf in 34.16 sec.
2025-12-12 21:15:45,548 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 21:15:45,553 - INFO - Going to convert document batch...
2025-12-12 21:15:45,554 - INFO - Initializing pipeline for StandardPdfPipeline with options hash afb4d61b52d512d984736b9faa45e3e9
2025-12-12 21:15:45,555 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 21:15:45,564 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 21:15:45,568 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[

  ✓ Markdown saved
  ✓ Saved 62 tables
  [DONE]

Processing: amazon 10-q q1 2024.pdf


2025-12-12 21:15:45,814 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 21:15:45,815 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:15:46,805 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:15:47,119 - INFO - Processing document amazon 10-q q1 2024.pdf
2025-12-12 21:16:02,420 - INFO - Finished converting document amazon 10-q q1 2024.pdf in 16.87 sec.
2025-12-12 21:16:02,520 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 21:16:02,522 - INFO - Going to convert document batch...
2025-12-12 21:16:02,523 - INFO - Initializing pipeline for StandardPdfPipeline with options hash afb4d61b52d512d984736b9faa45e3e9
2025-12-12 21:16:02,524 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 21:16:02,533 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 21:16:02,537 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.

  ✓ Markdown saved
  ✓ Saved 35 tables
  [DONE]

Processing: amazon 10-q q1 2025.pdf


2025-12-12 21:16:02,729 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 21:16:02,730 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:16:03,745 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:16:04,017 - INFO - Processing document amazon 10-q q1 2025.pdf
2025-12-12 21:16:18,004 - INFO - Finished converting document amazon 10-q q1 2025.pdf in 15.49 sec.
2025-12-12 21:16:18,111 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 21:16:18,113 - INFO - Going to convert document batch...
2025-12-12 21:16:18,114 - INFO - Initializing pipeline for StandardPdfPipeline with options hash afb4d61b52d512d984736b9faa45e3e9
2025-12-12 21:16:18,115 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 21:16:18,126 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 21:16:18,130 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.

  ✓ Markdown saved
  ✓ Saved 35 tables
  [DONE]

Processing: amazon 10-q q2 2024.pdf


2025-12-12 21:16:18,276 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 21:16:18,277 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:16:19,564 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:16:19,869 - INFO - Processing document amazon 10-q q2 2024.pdf
2025-12-12 21:16:37,598 - INFO - Finished converting document amazon 10-q q2 2024.pdf in 19.49 sec.
2025-12-12 21:16:37,719 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 21:16:37,721 - INFO - Going to convert document batch...
2025-12-12 21:16:37,722 - INFO - Initializing pipeline for StandardPdfPipeline with options hash afb4d61b52d512d984736b9faa45e3e9
2025-12-12 21:16:37,722 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 21:16:37,731 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 21:16:37,735 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.

  ✓ Markdown saved
  ✓ Saved 34 tables
  [DONE]

Processing: amazon 10-q q2 2025.pdf


2025-12-12 21:16:37,964 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 21:16:37,965 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:16:39,059 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:16:39,375 - INFO - Processing document amazon 10-q q2 2025.pdf
2025-12-12 21:16:56,144 - INFO - Finished converting document amazon 10-q q2 2025.pdf in 18.43 sec.
2025-12-12 21:16:56,267 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 21:16:56,271 - INFO - Going to convert document batch...
2025-12-12 21:16:56,273 - INFO - Initializing pipeline for StandardPdfPipeline with options hash afb4d61b52d512d984736b9faa45e3e9
2025-12-12 21:16:56,276 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 21:16:56,287 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 21:16:56,290 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.

  ✓ Markdown saved
  ✓ Saved 34 tables
  [DONE]

Processing: amazon 10-q q3 2024.pdf


2025-12-12 21:16:56,510 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 21:16:56,510 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:16:57,521 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:16:57,782 - INFO - Processing document amazon 10-q q3 2024.pdf
2025-12-12 21:17:23,519 - INFO - Finished converting document amazon 10-q q3 2024.pdf in 27.25 sec.
2025-12-12 21:17:23,791 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 21:17:23,794 - INFO - Going to convert document batch...
2025-12-12 21:17:23,795 - INFO - Initializing pipeline for StandardPdfPipeline with options hash afb4d61b52d512d984736b9faa45e3e9
2025-12-12 21:17:23,795 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 21:17:23,808 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 21:17:23,812 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.

  ✓ Markdown saved
  ✓ Saved 38 tables
  [DONE]

Processing: apple 10-k 2023.pdf


2025-12-12 21:17:23,978 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 21:17:23,979 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:17:25,054 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:17:25,383 - INFO - Processing document apple 10-k 2023.pdf
2025-12-12 21:17:55,281 - INFO - Finished converting document apple 10-k 2023.pdf in 31.49 sec.
2025-12-12 21:17:55,454 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 21:17:55,457 - INFO - Going to convert document batch...
2025-12-12 21:17:55,457 - INFO - Initializing pipeline for StandardPdfPipeline with options hash afb4d61b52d512d984736b9faa45e3e9
2025-12-12 21:17:55,458 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 21:17:55,467 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 21:17:55,471 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m

  ✓ Markdown saved
  ✓ Saved 53 tables
  [DONE]

Processing: apple 10-k 2024.pdf


2025-12-12 21:17:55,698 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 21:17:55,699 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:17:56,743 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:17:57,053 - INFO - Processing document apple 10-k 2024.pdf
2025-12-12 21:18:31,362 - INFO - Finished converting document apple 10-k 2024.pdf in 35.91 sec.
2025-12-12 21:18:31,604 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 21:18:31,607 - INFO - Going to convert document batch...
2025-12-12 21:18:31,608 - INFO - Initializing pipeline for StandardPdfPipeline with options hash afb4d61b52d512d984736b9faa45e3e9
2025-12-12 21:18:31,608 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 21:18:31,620 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 21:18:31,625 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m

  ✓ Markdown saved
  ✓ Saved 50 tables
  [DONE]

Processing: apple 10-q q1 2024.pdf


[32m[INFO] 2025-12-12 21:18:31,704 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_rec_infer.onnx[0m
[32m[INFO] 2025-12-12 21:18:31,705 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_rec_infer.onnx[0m
2025-12-12 21:18:31,793 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 21:18:31,793 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:18:33,151 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:18:33,459 - INFO - Processing document apple 10-q q1 2024.pdf
2025-12-12 21:18:45,388 - INFO - Finished converting document apple 10-q q1 2024.pdf in 13.78 sec.
2025-12-12 21:18:45,496 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 21:18:45,499 - INFO - Going to convert document batch...
2025-12-12 21:18:45,500 - INFO - Initializing pipeline for StandardPdfPipeline with options hash afb4d61b52d512d9847

  ✓ Markdown saved
  ✓ Saved 26 tables
  [DONE]

Processing: apple 10-q q2 2024.pdf


2025-12-12 21:18:45,697 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 21:18:45,698 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:18:46,754 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:18:47,044 - INFO - Processing document apple 10-q q2 2024.pdf
2025-12-12 21:18:58,570 - INFO - Finished converting document apple 10-q q2 2024.pdf in 13.08 sec.
2025-12-12 21:18:58,642 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 21:18:58,644 - INFO - Going to convert document batch...
2025-12-12 21:18:58,644 - INFO - Initializing pipeline for StandardPdfPipeline with options hash afb4d61b52d512d984736b9faa45e3e9
2025-12-12 21:18:58,645 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 21:18:58,654 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 21:18:58,658 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.on

  ✓ Markdown saved
  ✓ Saved 27 tables
  [DONE]

Processing: apple 10-q q4 2023.pdf


2025-12-12 21:18:58,859 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 21:18:58,860 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:18:59,933 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:19:00,201 - INFO - Processing document apple 10-q q4 2023.pdf
2025-12-12 21:19:09,786 - INFO - Finished converting document apple 10-q q4 2023.pdf in 11.14 sec.
2025-12-12 21:19:09,864 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 21:19:09,865 - INFO - Going to convert document batch...
2025-12-12 21:19:09,866 - INFO - Initializing pipeline for StandardPdfPipeline with options hash afb4d61b52d512d984736b9faa45e3e9
2025-12-12 21:19:09,867 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 21:19:09,876 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 21:19:09,880 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.on

  ✓ Markdown saved
  ✓ Saved 26 tables
  [DONE]

Processing: apple 8-k q4 2023.pdf


2025-12-12 21:19:10,105 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 21:19:10,105 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:19:11,069 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:19:11,338 - INFO - Processing document apple 8-k q4 2023.pdf
2025-12-12 21:19:14,325 - INFO - Finished converting document apple 8-k q4 2023.pdf in 4.46 sec.
2025-12-12 21:19:14,356 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 21:19:14,359 - INFO - Going to convert document batch...
2025-12-12 21:19:14,359 - INFO - Initializing pipeline for StandardPdfPipeline with options hash afb4d61b52d512d984736b9faa45e3e9
2025-12-12 21:19:14,360 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 21:19:14,369 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 21:19:14,373 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx

  ✓ Markdown saved
  ✓ Saved 5 tables
  [DONE]

Processing: google 10-k 2023.pdf


2025-12-12 21:19:15,738 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:19:16,007 - INFO - Processing document google 10-k 2023.pdf
2025-12-12 21:19:50,743 - INFO - Finished converting document google 10-k 2023.pdf in 36.39 sec.
2025-12-12 21:19:50,946 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 21:19:50,949 - INFO - Going to convert document batch...
2025-12-12 21:19:50,950 - INFO - Initializing pipeline for StandardPdfPipeline with options hash afb4d61b52d512d984736b9faa45e3e9
2025-12-12 21:19:50,950 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 21:19:50,959 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 21:19:50,962 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-12 21:19:50,963 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4

  ✓ Markdown saved
  ✓ Saved 70 tables
  [DONE]

Processing: google 10-k 2024.pdf


2025-12-12 21:19:51,139 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 21:19:51,140 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:19:52,246 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:19:52,541 - INFO - Processing document google 10-k 2024.pdf
2025-12-12 21:20:28,453 - INFO - Finished converting document google 10-k 2024.pdf in 37.51 sec.
2025-12-12 21:20:28,659 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 21:20:28,661 - INFO - Going to convert document batch...
2025-12-12 21:20:28,662 - INFO - Initializing pipeline for StandardPdfPipeline with options hash afb4d61b52d512d984736b9faa45e3e9
2025-12-12 21:20:28,662 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 21:20:28,671 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 21:20:28,674 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[

  ✓ Markdown saved
  ✓ Saved 71 tables
  [DONE]

Processing: google 10-q q1 2025.pdf


2025-12-12 21:20:28,857 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:20:29,925 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:20:30,224 - INFO - Processing document google 10-q q1 2025.pdf
2025-12-12 21:20:50,518 - INFO - Finished converting document google 10-q q1 2025.pdf in 21.86 sec.
2025-12-12 21:20:50,636 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 21:20:50,638 - INFO - Going to convert document batch...
2025-12-12 21:20:50,639 - INFO - Initializing pipeline for StandardPdfPipeline with options hash afb4d61b52d512d984736b9faa45e3e9
2025-12-12 21:20:50,639 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 21:20:50,648 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 21:20:50,652 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-12 21:20:50,652 [RapidOCR] main.py:53: Using C:\Users\l

  ✓ Markdown saved
  ✓ Saved 57 tables
  [DONE]

Processing: google 10-q q2 2024.pdf


2025-12-12 21:20:50,825 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 21:20:50,825 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:20:52,033 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:20:52,331 - INFO - Processing document google 10-q q2 2024.pdf
2025-12-12 21:21:16,598 - INFO - Finished converting document google 10-q q2 2024.pdf in 25.96 sec.
2025-12-12 21:21:16,739 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 21:21:16,741 - INFO - Going to convert document batch...
2025-12-12 21:21:16,742 - INFO - Initializing pipeline for StandardPdfPipeline with options hash afb4d61b52d512d984736b9faa45e3e9
2025-12-12 21:21:16,742 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 21:21:16,751 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 21:21:16,754 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.

  ✓ Markdown saved
  ✓ Saved 56 tables
  [DONE]

Processing: google 10-q q2 2025.pdf


2025-12-12 21:21:16,892 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 21:21:16,893 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:21:18,254 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:21:18,549 - INFO - Processing document google 10-q q2 2025.pdf
2025-12-12 21:21:45,309 - INFO - Finished converting document google 10-q q2 2025.pdf in 28.57 sec.
2025-12-12 21:21:45,454 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 21:21:45,456 - INFO - Going to convert document batch...
2025-12-12 21:21:45,456 - INFO - Initializing pipeline for StandardPdfPipeline with options hash afb4d61b52d512d984736b9faa45e3e9
2025-12-12 21:21:45,457 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 21:21:45,466 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 21:21:45,469 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.

  ✓ Markdown saved
  ✓ Saved 63 tables
  [DONE]

Processing: google 10-q q3 2024.pdf


2025-12-12 21:21:45,608 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 21:21:45,609 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:21:46,764 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:21:47,060 - INFO - Processing document google 10-q q3 2024.pdf
2025-12-12 21:22:11,499 - INFO - Finished converting document google 10-q q3 2024.pdf in 26.05 sec.
2025-12-12 21:22:11,629 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 21:22:11,633 - INFO - Going to convert document batch...
2025-12-12 21:22:11,634 - INFO - Initializing pipeline for StandardPdfPipeline with options hash afb4d61b52d512d984736b9faa45e3e9
2025-12-12 21:22:11,634 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 21:22:11,643 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 21:22:11,646 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.

  ✓ Markdown saved
  ✓ Saved 56 tables
  [DONE]

Processing: meta 10-k 2024.pdf


2025-12-12 21:22:11,843 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 21:22:11,844 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:22:12,860 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:22:13,158 - INFO - Processing document meta 10-k 2024.pdf
2025-12-12 21:23:05,415 - INFO - Finished converting document meta 10-k 2024.pdf in 53.79 sec.
2025-12-12 21:23:05,681 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 21:23:05,682 - INFO - Going to convert document batch...
2025-12-12 21:23:05,683 - INFO - Initializing pipeline for StandardPdfPipeline with options hash afb4d61b52d512d984736b9faa45e3e9
2025-12-12 21:23:05,684 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 21:23:05,692 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 21:23:05,696 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m


  ✓ Markdown saved
  ✓ Saved 64 tables
  [DONE]

Processing: meta 10-q q1 2024.pdf


2025-12-12 21:23:05,905 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 21:23:05,906 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:23:06,932 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:23:07,230 - INFO - Processing document meta 10-q q1 2024.pdf
2025-12-12 21:23:12,824 - INFO - Finished converting document meta 10-q q1 2024.pdf in 7.14 sec.


  ✓ Markdown saved
  ✓ Saved 8 page images
  ✓ Saved 3 tables
  [DONE]

Processing: meta 10-q q1 2025.pdf


2025-12-12 21:23:13,043 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 21:23:13,044 - INFO - Going to convert document batch...
2025-12-12 21:23:13,045 - INFO - Initializing pipeline for StandardPdfPipeline with options hash afb4d61b52d512d984736b9faa45e3e9
2025-12-12 21:23:13,045 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 21:23:13,054 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 21:23:13,057 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-12 21:23:13,058 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-12 21:23:13,104 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 21:23:13,105 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\

  ✓ Markdown saved
  ✓ Saved 9 page images
  ✓ Saved 3 tables
  [DONE]



2025-12-12 21:23:20,391 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 21:23:20,394 - INFO - Going to convert document batch...
2025-12-12 21:23:20,394 - INFO - Initializing pipeline for StandardPdfPipeline with options hash afb4d61b52d512d984736b9faa45e3e9
2025-12-12 21:23:20,395 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 21:23:20,404 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 21:23:20,408 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-12 21:23:20,409 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-12 21:23:20,458 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 21:23:20,460 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\

Processing: meta 10-q q2 2024.pdf


2025-12-12 21:23:21,582 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:23:21,835 - INFO - Processing document meta 10-q q2 2024.pdf
2025-12-12 21:23:27,590 - INFO - Finished converting document meta 10-q q2 2024.pdf in 7.20 sec.


  ✓ Markdown saved


2025-12-12 21:23:27,837 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 21:23:27,839 - INFO - Going to convert document batch...
2025-12-12 21:23:27,839 - INFO - Initializing pipeline for StandardPdfPipeline with options hash afb4d61b52d512d984736b9faa45e3e9
2025-12-12 21:23:27,840 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 21:23:27,849 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 21:23:27,852 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-12 21:23:27,853 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-12 21:23:27,894 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 21:23:27,895 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\

  ✓ Saved 9 page images
  ✓ Saved 3 tables
  [DONE]

Processing: meta 10-q q2 2025.pdf


2025-12-12 21:23:28,062 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 21:23:28,062 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:23:29,434 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:23:29,732 - INFO - Processing document meta 10-q q2 2025.pdf
2025-12-12 21:23:35,542 - INFO - Finished converting document meta 10-q q2 2025.pdf in 7.71 sec.


  ✓ Markdown saved
  ✓ Saved 8 page images
  ✓ Saved 3 tables
  [DONE]

Processing: meta 10-q q3 2024.pdf


2025-12-12 21:23:35,754 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 21:23:35,756 - INFO - Going to convert document batch...
2025-12-12 21:23:35,756 - INFO - Initializing pipeline for StandardPdfPipeline with options hash afb4d61b52d512d984736b9faa45e3e9
2025-12-12 21:23:35,756 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 21:23:35,765 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 21:23:35,769 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-12 21:23:35,769 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-12 21:23:35,811 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 21:23:35,812 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\

  ✓ Markdown saved
  ✓ Saved 9 page images
  ✓ Saved 3 tables
  [DONE]



2025-12-12 21:23:43,099 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 21:23:43,101 - INFO - Going to convert document batch...
2025-12-12 21:23:43,103 - INFO - Initializing pipeline for StandardPdfPipeline with options hash afb4d61b52d512d984736b9faa45e3e9
2025-12-12 21:23:43,104 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 21:23:43,113 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 21:23:43,116 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-12 21:23:43,117 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-12 21:23:43,163 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 21:23:43,165 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\

Processing: meta 10-q q3 2025.pdf


2025-12-12 21:23:43,344 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 21:23:43,345 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:23:44,266 - INFO - Accelerator device: 'cuda:0'
2025-12-12 21:23:44,556 - INFO - Processing document meta 10-q q3 2025.pdf
2025-12-12 21:23:50,172 - INFO - Finished converting document meta 10-q q3 2025.pdf in 7.07 sec.


  ✓ Markdown saved
  ✓ Saved 8 page images
  ✓ Saved 3 tables
  [DONE]

Processing: meta 10-q q4 2024.pdf


2025-12-12 21:23:50,386 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 21:23:50,388 - INFO - Going to convert document batch...
2025-12-12 21:23:50,389 - INFO - Initializing pipeline for StandardPdfPipeline with options hash afb4d61b52d512d984736b9faa45e3e9
2025-12-12 21:23:50,390 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 21:23:50,399 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 21:23:50,402 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-12 21:23:50,403 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-12 21:23:50,446 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 21:23:50,448 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\

  ✓ Markdown saved
  ✓ Saved 9 page images
  ✓ Saved 3 tables
  [DONE]

Processing: meta10-k 2023.pdf


IndexError: list index out of range

### Verify Extraction

In [None]:
# Count extracted files
md_files = list(Path(OUTPUT_MD_DIR).rglob("*.md"))
image_files = list(Path(OUTPUT_IMAGES_DIR).rglob("*.png"))
table_files = list(Path(OUTPUT_TABLES_DIR).rglob("*.md"))

print(f"Extraction Summary:")
print(f"  Markdown files: {len(md_files)}")
print(f"  Images: {len(image_files)}")
print(f"  Tables: {len(table_files)}")

### Example: View Extracted Content

In [None]:
# Show sample markdown
if md_files:
    sample_md = md_files[0]
    print(f"Sample Markdown: {sample_md}\n")
    with open(sample_md, 'r', encoding='utf-8') as f:
        print(f.read()[:500])

# Show sample table
if table_files:
    sample_table = table_files[0]
    print(f"\n\nSample Table: {sample_table}\n")
    with open(sample_table, 'r', encoding='utf-8') as f:
        print(f.read())