## Advanced RAG - Data Extraction Pipeline
### Extract PDFs to Markdown, Images, and Tables

**Learning Objectives:**
- Extract PDF content to markdown format
- Save all figures as PNG images
- Extract tables with context (2 paragraphs before)
- Organize extracted content systematically

**Output Structure:**
- Markdown files: `data/rag-markdown/{company}/{filename}.md` (full document text)
- Images: `data/rag-images/{company}/{filename}/image_1.png`
- Tables: `data/rag-tables/{company}/{filename}/table_1.md`

### Setup and Imports

In [1]:
from pathlib import Path
from typing import List, Tuple

from docling_core.types.doc import PictureItem
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

### Configuration

In [2]:
# Paths
# Root folder that is searched recursively for input PDFs.
DATA_DIR = "data/rag-data"
# Root for the full-document markdown exports: {company}/{pdf stem}.md
OUTPUT_MD_DIR = "data/rag-markdown"
# Root for extracted figures: {company}/{pdf stem}/image_N.png
OUTPUT_IMAGES_DIR = "data/rag-images"
# Root for extracted tables (with context): {company}/{pdf stem}/table_N.md
OUTPUT_TABLES_DIR = "data/rag-tables"

### Extract Metadata from Filename

In [3]:
def extract_metadata_from_filename(filename: str) -> dict:
    """
    Extract metadata from a PDF filename.

    Expected format: {company} {doc_type} [{quarter}] {year}.pdf
    Examples:
    - amazon 10-k 2024.pdf
    - amazon 10-q q1 2024.pdf

    Args:
        filename: Name of the PDF file (tokens separated by whitespace).

    Returns:
        dict with keys 'company_name', 'doc_type', 'fiscal_quarter'
        (None when the name has no quarter token) and 'fiscal_year' (int).

    Raises:
        ValueError: If the name has fewer than three tokens, or the token
            in the year position is not an integer.
    """
    # Strip only a trailing extension, ignoring case. A plain
    # str.replace('.pdf', '') would also eat '.pdf' inside the name and
    # would leave an upper-case '.PDF' attached to the year token.
    if filename.lower().endswith('.pdf'):
        name = filename[:-len('.pdf')]
    else:
        name = filename
    parts = name.split()

    if len(parts) < 3:
        raise ValueError(
            f"Cannot parse metadata from {filename!r}: expected "
            "'{company} {doc_type} [{quarter}] {year}.pdf'"
        )

    metadata = {
        'company_name': parts[0],
        'doc_type': parts[1],
    }

    if len(parts) == 4:
        # Four tokens: the third is the quarter, the fourth the year.
        metadata['fiscal_quarter'] = parts[2]
        metadata['fiscal_year'] = int(parts[3])
    else:
        # Otherwise the third token is taken as the year (annual filing).
        metadata['fiscal_quarter'] = None
        metadata['fiscal_year'] = int(parts[2])

    return metadata

### Extract Tables with Context

In [4]:
def extract_tables_with_context(markdown_text: str) -> List[Tuple[str, str]]:
    """
    Extract markdown tables together with up to 2 paragraphs of preceding text.

    A table starts at a line whose stripped form begins with '|' and that
    contains at least two '|' characters. It continues over following lines
    that start with '|' or are blank; blank lines are dropped, so tables
    separated only by blank lines end up in the same entry.

    Context is gathered by walking backwards from the table start. Any
    non-empty line counts as context (this can include rows of an earlier
    table), and the context lines are joined with single newlines.

    Args:
        markdown_text: Markdown document text to scan.

    Returns:
        List of (context + table, table_name) tuples, where table_name is
        "table_1", "table_2", ... in document order.
    """
    lines = markdown_text.split('\n')

    tables = []
    i = 0
    table_num = 1

    while i < len(lines):
        line = lines[i]

        # Not a table start: move on.
        if not (line.strip().startswith('|') and line.count('|') >= 2):
            i += 1
            continue

        # Walk backwards collecting up to 2 paragraphs of context.
        context_lines = []
        para_count = 0
        in_paragraph = False
        j = i - 1

        while j >= 0 and para_count < 2:
            if lines[j].strip():
                context_lines.append(lines[j])
                in_paragraph = True
            elif in_paragraph:
                # Count a break only on the first blank line after text, so
                # a run of several blank lines is still a single break.
                para_count += 1
                in_paragraph = False
            j -= 1

        # Lines were collected bottom-up; restore document order.
        context_lines.reverse()

        # Consume the table rows; the loop condition itself stops at the
        # first non-empty line that is not a table row.
        table_lines = []
        while i < len(lines) and (lines[i].strip().startswith('|') or not lines[i].strip()):
            if lines[i].strip():  # Skip empty lines within table
                table_lines.append(lines[i])
            i += 1

        # Combine context + table
        full_content = '\n'.join(context_lines) + '\n\n' + '\n'.join(table_lines)
        tables.append((full_content, f"table_{table_num}"))
        table_num += 1

    return tables

### Extract PDF Content

In [5]:
# Converter shared by every extract_pdf_content() call; built on first use
# so the pipeline is not set up again for each PDF.
_PDF_CONVERTER = None


def _get_pdf_converter():
    """Return the shared DocumentConverter, creating it on first use."""
    global _PDF_CONVERTER
    if _PDF_CONVERTER is None:
        # Configure pipeline for image extraction
        pipeline_options = PdfPipelineOptions()
        pipeline_options.generate_picture_images = True
        _PDF_CONVERTER = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
            }
        )
    return _PDF_CONVERTER


def extract_pdf_content(pdf_path: Path) -> None:
    """
    Extract one PDF to markdown, images, and tables.

    Writes, under the configured output roots:
    - {OUTPUT_MD_DIR}/{company}/{stem}.md            full markdown export
    - {OUTPUT_IMAGES_DIR}/{company}/{stem}/image_N.png  one file per picture
    - {OUTPUT_TABLES_DIR}/{company}/{stem}/table_N.md   table plus context

    Args:
        pdf_path: Path to the PDF. Its file name must follow the pattern
            understood by extract_metadata_from_filename().
    """
    print(f"Processing: {pdf_path.name}")

    # Get metadata
    metadata = extract_metadata_from_filename(pdf_path.name)
    company = metadata['company_name']
    filename_stem = pdf_path.stem

    # Create output directories
    md_dir = Path(OUTPUT_MD_DIR) / company
    images_dir = Path(OUTPUT_IMAGES_DIR) / company / filename_stem
    tables_dir = Path(OUTPUT_TABLES_DIR) / company / filename_stem

    for out_dir in (md_dir, images_dir, tables_dir):
        out_dir.mkdir(parents=True, exist_ok=True)

    # Convert PDF
    result = _get_pdf_converter().convert(str(pdf_path))

    # Save markdown with page breaks
    page_break = "<!-- page break -->"
    markdown_text = result.document.export_to_markdown(page_break_placeholder=page_break)
    md_file = md_dir / f"{filename_stem}.md"
    with open(md_file, 'w', encoding='utf-8') as f:
        f.write(markdown_text)
    print(f"  ✓ Saved markdown: {md_file}")

    # Extract images only (not tables)
    picture_counter = 0

    for element, _level in result.document.iterate_items():
        if not isinstance(element, PictureItem):
            continue
        # Fetch the image before opening the output file: if no image is
        # available, skip the picture instead of crashing on None.save()
        # and leaving an empty PNG behind.
        image = element.get_image(result.document)
        if image is None:
            continue
        picture_counter += 1
        image_file = images_dir / f"image_{picture_counter}.png"
        with image_file.open("wb") as fp:
            image.save(fp, "PNG")

    if picture_counter > 0:
        print(f"  ✓ Saved {picture_counter} images to: {images_dir}")

    # Extract tables with context from markdown
    tables = extract_tables_with_context(markdown_text)
    for table_content, table_name in tables:
        table_file = tables_dir / f"{table_name}.md"
        with open(table_file, 'w', encoding='utf-8') as f:
            f.write(table_content)

    if tables:
        print(f"  ✓ Saved {len(tables)} tables (text) to: {tables_dir}")

    print(f"  [DONE] {pdf_path.name}\n")

### Process All PDFs

In [6]:
# Find all PDF files below DATA_DIR. rglob() gives no ordering guarantee, so
# sort the paths to process them in the same order on every run and platform.
data_path = Path(DATA_DIR)
pdf_files = sorted(data_path.rglob("*.pdf"))
print(f"Found {len(pdf_files)} PDF files\n")

# Process each PDF
for pdf_path in pdf_files:
    extract_pdf_content(pdf_path)

2025-12-12 19:03:10,909 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:03:10,932 - INFO - Going to convert document batch...
2025-12-12 19:03:10,933 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 19:03:10,958 - INFO - Loading plugin 'docling_defaults'
2025-12-12 19:03:10,968 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-12-12 19:03:10,987 - INFO - Loading plugin 'docling_defaults'
2025-12-12 19:03:11,014 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']


Found 19 PDF files

Processing: amazon 10-k 2023.pdf


2025-12-12 19:03:11,604 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 19:03:11,615 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:03:11,627 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-12 19:03:11,627 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-12 19:03:11,691 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:03:11,693 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_ppocr_mobile_v2.0_cls_infer.onnx[0m
[32m[INFO] 2025-12-12 19:03:11,694 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_ppocr_mobile_v2.0_cls_infer.onnx[0m
[32m[INFO] 2025-12-12

  ✓ Saved markdown: data\rag-markdown\amazon\amazon 10-k 2023.md
  ✓ Saved 61 tables (text) to: data\rag-tables\amazon\amazon 10-k 2023
  [DONE] amazon 10-k 2023.pdf

Processing: amazon 10-k 2024.pdf


2025-12-12 19:03:44,103 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 19:03:44,103 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:03:45,030 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:03:45,315 - INFO - Processing document amazon 10-k 2024.pdf
2025-12-12 19:04:15,231 - INFO - Finished converting document amazon 10-k 2024.pdf in 31.37 sec.
2025-12-12 19:04:15,392 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:04:15,394 - INFO - Going to convert document batch...
2025-12-12 19:04:15,395 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 19:04:15,396 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 19:04:15,404 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:04:15,407 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[

  ✓ Saved markdown: data\rag-markdown\amazon\amazon 10-k 2024.md
  ✓ Saved 62 tables (text) to: data\rag-tables\amazon\amazon 10-k 2024
  [DONE] amazon 10-k 2024.pdf

Processing: amazon 10-q q1 2024.pdf


2025-12-12 19:04:15,614 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:04:16,672 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:04:16,929 - INFO - Processing document amazon 10-q q1 2024.pdf
2025-12-12 19:04:30,902 - INFO - Finished converting document amazon 10-q q1 2024.pdf in 15.51 sec.
2025-12-12 19:04:31,011 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:04:31,013 - INFO - Going to convert document batch...
2025-12-12 19:04:31,013 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 19:04:31,014 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 19:04:31,023 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:04:31,026 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-12 19:04:31,027 [RapidOCR] main.py:53: Using C:\Users\l

  ✓ Saved markdown: data\rag-markdown\amazon\amazon 10-q q1 2024.md
  ✓ Saved 35 tables (text) to: data\rag-tables\amazon\amazon 10-q q1 2024
  [DONE] amazon 10-q q1 2024.pdf

Processing: amazon 10-q q1 2025.pdf


2025-12-12 19:04:31,244 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 19:04:31,245 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:04:32,304 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:04:32,603 - INFO - Processing document amazon 10-q q1 2025.pdf
2025-12-12 19:04:46,284 - INFO - Finished converting document amazon 10-q q1 2025.pdf in 15.27 sec.
2025-12-12 19:04:46,390 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:04:46,392 - INFO - Going to convert document batch...
2025-12-12 19:04:46,393 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 19:04:46,394 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 19:04:46,402 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:04:46,406 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.

  ✓ Saved markdown: data\rag-markdown\amazon\amazon 10-q q1 2025.md
  ✓ Saved 35 tables (text) to: data\rag-tables\amazon\amazon 10-q q1 2025
  [DONE] amazon 10-q q1 2025.pdf

Processing: amazon 10-q q2 2024.pdf


2025-12-12 19:04:46,614 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 19:04:46,615 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:04:47,541 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:04:47,823 - INFO - Processing document amazon 10-q q2 2024.pdf
2025-12-12 19:05:04,194 - INFO - Finished converting document amazon 10-q q2 2024.pdf in 17.80 sec.
2025-12-12 19:05:04,315 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:05:04,317 - INFO - Going to convert document batch...
2025-12-12 19:05:04,318 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 19:05:04,318 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 19:05:04,327 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:05:04,331 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.

  ✓ Saved markdown: data\rag-markdown\amazon\amazon 10-q q2 2024.md
  ✓ Saved 34 tables (text) to: data\rag-tables\amazon\amazon 10-q q2 2024
  [DONE] amazon 10-q q2 2024.pdf

Processing: amazon 10-q q2 2025.pdf


2025-12-12 19:05:04,580 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 19:05:04,580 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:05:05,623 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:05:05,932 - INFO - Processing document amazon 10-q q2 2025.pdf
2025-12-12 19:05:21,304 - INFO - Finished converting document amazon 10-q q2 2025.pdf in 16.99 sec.
2025-12-12 19:05:21,417 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:05:21,420 - INFO - Going to convert document batch...
2025-12-12 19:05:21,421 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 19:05:21,421 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 19:05:21,430 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:05:21,433 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.

  ✓ Saved markdown: data\rag-markdown\amazon\amazon 10-q q2 2025.md
  ✓ Saved 34 tables (text) to: data\rag-tables\amazon\amazon 10-q q2 2025
  [DONE] amazon 10-q q2 2025.pdf

Processing: amazon 10-q q3 2024.pdf


2025-12-12 19:05:21,584 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 19:05:21,586 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:05:22,711 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:05:22,962 - INFO - Processing document amazon 10-q q3 2024.pdf
2025-12-12 19:05:46,313 - INFO - Finished converting document amazon 10-q q3 2024.pdf in 24.90 sec.
2025-12-12 19:05:46,578 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:05:46,581 - INFO - Going to convert document batch...
2025-12-12 19:05:46,582 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 19:05:46,583 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 19:05:46,592 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:05:46,596 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.

  ✓ Saved markdown: data\rag-markdown\amazon\amazon 10-q q3 2024.md
  ✓ Saved 38 tables (text) to: data\rag-tables\amazon\amazon 10-q q3 2024
  [DONE] amazon 10-q q3 2024.pdf

Processing: apple 10-k 2023.pdf


[32m[INFO] 2025-12-12 19:05:46,694 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:05:46,705 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_rec_infer.onnx[0m
[32m[INFO] 2025-12-12 19:05:46,706 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_rec_infer.onnx[0m
2025-12-12 19:05:46,840 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 19:05:46,841 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:05:47,844 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:05:48,103 - INFO - Processing document apple 10-k 2023.pdf
2025-12-12 19:06:18,093 - INFO - Finished converting document apple 10-k 2023.pdf in 31.52 sec.
2025-12-12 19:06:18,274 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:06:18,276 - INFO - Going to convert document batch...
2025-12-12 19:06:18,277 - IN

  ✓ Saved markdown: data\rag-markdown\apple\apple 10-k 2023.md
  ✓ Saved 4 images to: data\rag-images\apple\apple 10-k 2023
  ✓ Saved 53 tables (text) to: data\rag-tables\apple\apple 10-k 2023
  [DONE] apple 10-k 2023.pdf

Processing: apple 10-k 2024.pdf


2025-12-12 19:06:18,513 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 19:06:18,513 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:06:19,492 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:06:19,790 - INFO - Processing document apple 10-k 2024.pdf
2025-12-12 19:06:53,693 - INFO - Finished converting document apple 10-k 2024.pdf in 35.42 sec.
2025-12-12 19:06:53,894 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:06:53,896 - INFO - Going to convert document batch...
2025-12-12 19:06:53,897 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 19:06:53,898 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 19:06:53,908 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:06:53,912 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m

  ✓ Saved markdown: data\rag-markdown\apple\apple 10-k 2024.md
  ✓ Saved 6 images to: data\rag-images\apple\apple 10-k 2024
  ✓ Saved 50 tables (text) to: data\rag-tables\apple\apple 10-k 2024
  [DONE] apple 10-k 2024.pdf

Processing: apple 10-q q1 2024.pdf


2025-12-12 19:06:54,063 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 19:06:54,064 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:06:55,159 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:06:55,423 - INFO - Processing document apple 10-q q1 2024.pdf
2025-12-12 19:07:06,904 - INFO - Finished converting document apple 10-q q1 2024.pdf in 13.01 sec.
2025-12-12 19:07:06,988 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:07:06,991 - INFO - Going to convert document batch...
2025-12-12 19:07:06,992 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 19:07:06,992 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 19:07:07,001 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:07:07,004 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.on

  ✓ Saved markdown: data\rag-markdown\apple\apple 10-q q1 2024.md
  ✓ Saved 2 images to: data\rag-images\apple\apple 10-q q1 2024
  ✓ Saved 26 tables (text) to: data\rag-tables\apple\apple 10-q q1 2024
  [DONE] apple 10-q q1 2024.pdf

Processing: apple 10-q q2 2024.pdf


2025-12-12 19:07:07,245 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 19:07:07,246 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:07:08,216 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:07:08,477 - INFO - Processing document apple 10-q q2 2024.pdf
2025-12-12 19:07:20,440 - INFO - Finished converting document apple 10-q q2 2024.pdf in 13.45 sec.
2025-12-12 19:07:20,522 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:07:20,524 - INFO - Going to convert document batch...
2025-12-12 19:07:20,524 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 19:07:20,525 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 19:07:20,537 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:07:20,541 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.on

  ✓ Saved markdown: data\rag-markdown\apple\apple 10-q q2 2024.md
  ✓ Saved 1 images to: data\rag-images\apple\apple 10-q q2 2024
  ✓ Saved 27 tables (text) to: data\rag-tables\apple\apple 10-q q2 2024
  [DONE] apple 10-q q2 2024.pdf

Processing: apple 10-q q4 2023.pdf


2025-12-12 19:07:20,743 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 19:07:20,744 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:07:21,831 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:07:22,093 - INFO - Processing document apple 10-q q4 2023.pdf
2025-12-12 19:07:31,469 - INFO - Finished converting document apple 10-q q4 2023.pdf in 10.95 sec.
2025-12-12 19:07:31,536 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:07:31,538 - INFO - Going to convert document batch...
2025-12-12 19:07:31,538 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 19:07:31,539 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 19:07:31,547 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:07:31,551 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.on

  ✓ Saved markdown: data\rag-markdown\apple\apple 10-q q4 2023.md
  ✓ Saved 3 images to: data\rag-images\apple\apple 10-q q4 2023
  ✓ Saved 26 tables (text) to: data\rag-tables\apple\apple 10-q q4 2023
  [DONE] apple 10-q q4 2023.pdf

Processing: apple 8-k q4 2023.pdf


2025-12-12 19:07:31,721 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 19:07:31,722 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:07:32,795 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:07:33,058 - INFO - Processing document apple 8-k q4 2023.pdf
2025-12-12 19:07:35,847 - INFO - Finished converting document apple 8-k q4 2023.pdf in 4.31 sec.
2025-12-12 19:07:35,879 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:07:35,882 - INFO - Going to convert document batch...
2025-12-12 19:07:35,883 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 19:07:35,884 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 19:07:35,893 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:07:35,896 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx

  ✓ Saved markdown: data\rag-markdown\apple\apple 8-k q4 2023.md
  ✓ Saved 1 images to: data\rag-images\apple\apple 8-k q4 2023
  ✓ Saved 5 tables (text) to: data\rag-tables\apple\apple 8-k q4 2023
  [DONE] apple 8-k q4 2023.pdf

Processing: google 10-k 2023.pdf


2025-12-12 19:07:36,074 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:07:37,191 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:07:37,448 - INFO - Processing document google 10-k 2023.pdf
2025-12-12 19:08:12,581 - INFO - Finished converting document google 10-k 2023.pdf in 36.70 sec.
2025-12-12 19:08:12,812 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:08:12,815 - INFO - Going to convert document batch...
2025-12-12 19:08:12,816 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 19:08:12,816 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 19:08:12,825 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:08:12,829 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-12 19:08:12,830 [RapidOCR] main.py:53: Using C:\Users\laxmi\a

  ✓ Saved markdown: data\rag-markdown\google\google 10-k 2023.md
  ✓ Saved 2 images to: data\rag-images\google\google 10-k 2023
  ✓ Saved 70 tables (text) to: data\rag-tables\google\google 10-k 2023
  [DONE] google 10-k 2023.pdf

Processing: google 10-k 2024.pdf


2025-12-12 19:08:13,030 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 19:08:13,030 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:08:14,139 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:08:14,439 - INFO - Processing document google 10-k 2024.pdf
2025-12-12 19:08:55,775 - INFO - Finished converting document google 10-k 2024.pdf in 42.96 sec.
2025-12-12 19:08:56,021 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:08:56,023 - INFO - Going to convert document batch...
2025-12-12 19:08:56,024 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 19:08:56,025 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 19:08:56,034 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:08:56,038 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[

  ✓ Saved markdown: data\rag-markdown\google\google 10-k 2024.md
  ✓ Saved 2 images to: data\rag-images\google\google 10-k 2024
  ✓ Saved 71 tables (text) to: data\rag-tables\google\google 10-k 2024
  [DONE] google 10-k 2024.pdf

Processing: google 10-q q1 2025.pdf


[32m[INFO] 2025-12-12 19:08:56,124 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:08:56,129 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_rec_infer.onnx[0m
[32m[INFO] 2025-12-12 19:08:56,134 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_rec_infer.onnx[0m
2025-12-12 19:08:56,208 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 19:08:56,209 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:08:57,391 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:08:57,736 - INFO - Processing document google 10-q q1 2025.pdf
2025-12-12 19:09:20,447 - INFO - Finished converting document google 10-q q1 2025.pdf in 24.43 sec.
2025-12-12 19:09:20,588 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:09:20,591 - INFO - Going to convert document batch...
2025-12-12 19:09:20,

  ✓ Saved markdown: data\rag-markdown\google\google 10-q q1 2025.md
  ✓ Saved 1 images to: data\rag-images\google\google 10-q q1 2025
  ✓ Saved 57 tables (text) to: data\rag-tables\google\google 10-q q1 2025
  [DONE] google 10-q q1 2025.pdf

Processing: google 10-q q2 2024.pdf


2025-12-12 19:09:20,864 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 19:09:20,865 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:09:21,842 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:09:22,112 - INFO - Processing document google 10-q q2 2024.pdf
2025-12-12 19:09:48,083 - INFO - Finished converting document google 10-q q2 2024.pdf in 27.50 sec.
2025-12-12 19:09:48,262 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:09:48,264 - INFO - Going to convert document batch...
2025-12-12 19:09:48,265 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 19:09:48,266 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 19:09:48,277 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:09:48,282 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.

  ✓ Saved markdown: data\rag-markdown\google\google 10-q q2 2024.md
  ✓ Saved 56 tables (text) to: data\rag-tables\google\google 10-q q2 2024
  [DONE] google 10-q q2 2024.pdf

Processing: google 10-q q2 2025.pdf


2025-12-12 19:09:48,471 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 19:09:48,472 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:09:49,512 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:09:49,785 - INFO - Processing document google 10-q q2 2025.pdf
2025-12-12 19:10:19,956 - INFO - Finished converting document google 10-q q2 2025.pdf in 31.69 sec.
2025-12-12 19:10:20,142 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:10:20,144 - INFO - Going to convert document batch...
2025-12-12 19:10:20,144 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 19:10:20,145 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 19:10:20,158 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:10:20,164 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.

  ✓ Saved markdown: data\rag-markdown\google\google 10-q q2 2025.md
  ✓ Saved 63 tables (text) to: data\rag-tables\google\google 10-q q2 2025
  [DONE] google 10-q q2 2025.pdf

Processing: google 10-q q3 2024.pdf


2025-12-12 19:10:20,313 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 19:10:20,313 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:10:21,527 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:10:21,820 - INFO - Processing document google 10-q q3 2024.pdf
2025-12-12 19:10:49,738 - INFO - Finished converting document google 10-q q3 2024.pdf in 29.60 sec.


  ✓ Saved markdown: data\rag-markdown\google\google 10-q q3 2024.md
  ✓ Saved 56 tables (text) to: data\rag-tables\google\google 10-q q3 2024
  [DONE] google 10-q q3 2024.pdf



### Verify Extraction

In [None]:
# Collect every file the pipeline wrote under each output root
md_files = [*Path(OUTPUT_MD_DIR).rglob("*.md")]
image_files = [*Path(OUTPUT_IMAGES_DIR).rglob("*.png")]
table_files = [*Path(OUTPUT_TABLES_DIR).rglob("*.md")]

# Report one count per output kind
print("Extraction Summary:")
for label, found in (
    ("Markdown files", md_files),
    ("Images", image_files),
    ("Tables", table_files),
):
    print(f"  {label}: {len(found)}")

### Example: View Extracted Content

In [None]:
# Preview the first 500 characters of the first markdown export, if any
if md_files:
    sample_md = md_files[0]
    print(f"Sample Markdown: {sample_md}\n")
    md_preview = sample_md.read_text(encoding='utf-8')[:500]
    print(md_preview)

# Print the first extracted table (with its context) in full, if any
if table_files:
    sample_table = table_files[0]
    print(f"\n\nSample Table: {sample_table}\n")
    table_text = sample_table.read_text(encoding='utf-8')
    print(table_text)