## Advanced RAG - Data Extraction Pipeline
### Extract PDFs to Markdown, Images, and Tables

**Learning Objectives:**
- Extract PDF content to markdown format
- Save all figures as PNG images
- Extract tables with context (2 paragraphs before)
- Organize extracted content systematically

**Output Structure:**
- Markdown files: Full document text
- Images: `data/rag-images/{company}/{filename}/image_1.png`
- Tables: `data/rag-tables/{company}/{filename}/table_1.md`

### Setup and Imports

In [1]:
from pathlib import Path
from typing import List, Tuple

from docling_core.types.doc import PictureItem
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

### Configuration

In [2]:
# Paths
DATA_DIR = "data/rag-data"
OUTPUT_MD_DIR = "data/rag-markdown"
OUTPUT_IMAGES_DIR = "data/rag-images"
OUTPUT_TABLES_DIR = "data/rag-tables"

### Extract Metadata from Filename

In [3]:
def extract_metadata_from_filename(filename: str) -> dict:
    """
    Extract metadata from filename.
    
    Expected format: {company} {doc_type} {quarter} {year}.pdf
    Examples:
    - amazon 10-k 2024.pdf
    - amazon 10-q q1 2024.pdf
    """
    name = filename.replace('.pdf', '')
    parts = name.split()
    
    metadata = {}
    metadata['company_name'] = parts[0]
    metadata['doc_type'] = parts[1]
    
    if len(parts) == 4:
        metadata['fiscal_quarter'] = parts[2]
        metadata['fiscal_year'] = int(parts[3])
    else:
        metadata['fiscal_quarter'] = None
        metadata['fiscal_year'] = int(parts[2])
    
    return metadata

### Extract Tables with Context

In [4]:
def extract_tables_with_context(markdown_text: str) -> List[Tuple[str, str]]:
    """
    Extract tables with 2 paragraphs of context before each table.
    
    Returns:
        List of (context + table, table_number) tuples
    """
    # Split by table pattern (markdown tables start with |)
    lines = markdown_text.split('\n')
    
    tables = []
    i = 0
    table_num = 1
    
    while i < len(lines):
        line = lines[i]
        
        # Detect table start (line with multiple |)
        if line.strip().startswith('|') and line.count('|') >= 2:
            # Find 2 paragraphs before
            context_lines = []
            para_count = 0
            j = i - 1
            
            while j >= 0 and para_count < 2:
                if lines[j].strip():  # Non-empty line
                    context_lines.insert(0, lines[j])
                elif context_lines:  # Empty line marks paragraph break
                    para_count += 1
                j -= 1
            
            # Extract full table
            table_lines = []
            while i < len(lines) and (lines[i].strip().startswith('|') or not lines[i].strip()):
                if lines[i].strip():  # Skip empty lines within table
                    table_lines.append(lines[i])
                i += 1
                if i < len(lines) and lines[i].strip() and not lines[i].strip().startswith('|'):
                    break
            
            # Combine context + table
            full_content = '\n'.join(context_lines) + '\n\n' + '\n'.join(table_lines)
            tables.append((full_content, f"table_{table_num}"))
            table_num += 1
        else:
            i += 1
    
    return tables

### Extract PDF Content

In [None]:
def extract_pdf_content(pdf_path: Path):
    """Extract PDF to markdown, images, and tables."""
    print(f"Processing: {pdf_path.name}")
    
    # Get metadata and create directories
    metadata = extract_metadata_from_filename(pdf_path.name)
    company = metadata['company_name']
    filename_stem = pdf_path.stem
    
    md_dir = Path(OUTPUT_MD_DIR) / company
    images_dir = Path(OUTPUT_IMAGES_DIR) / company / filename_stem
    tables_dir = Path(OUTPUT_TABLES_DIR) / company / filename_stem
    
    for dir_path in [md_dir, images_dir, tables_dir]:
        dir_path.mkdir(parents=True, exist_ok=True)
    
    # Configure and convert
    pipeline_options = PdfPipelineOptions()
    pipeline_options.generate_picture_images = True
    pipeline_options.generate_page_images = True
    
    converter = DocumentConverter(
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
    )
    result = converter.convert(str(pdf_path))
    
    # Save markdown
    page_break = "<!-- page break -->"
    markdown_text = result.document.export_to_markdown(page_break_placeholder=page_break)
    (md_dir / f"{filename_stem}.md").write_text(markdown_text, encoding='utf-8')
    print(f"  ✓ Markdown saved")
    
    # Find pages with large images and save them
    pages_to_save = set()
    for element, _ in result.document.iterate_items():
        if isinstance(element, PictureItem):
            image = element.get_image(result.document)
            if image.size[0] > 500 and image.size[1] > 500:
                page_no = element.prov[0].page_no if element.prov else None
                if page_no:
                    pages_to_save.add(page_no)
    
    # Save page images
    for page_no in pages_to_save:
        page = result.document.pages[page_no]
        page.image.pil_image.save(images_dir / f"page_{page_no}.png", "PNG")
    
    if pages_to_save:
        print(f"  ✓ Saved {len(pages_to_save)} page images")
    
    # Save tables with context
    tables = extract_tables_with_context(markdown_text)
    for table_content, table_name in tables:
        (tables_dir / f"{table_name}.md").write_text(table_content, encoding='utf-8')
    
    if tables:
        print(f"  ✓ Saved {len(tables)} tables")
    
    print(f"  [DONE]\n")

### Process All PDFs

In [6]:
# Find all PDF files
data_path = Path(DATA_DIR)
pdf_files = list(data_path.rglob("*.pdf"))
print(f"Found {len(pdf_files)} PDF files\n")

# Process each PDF
for pdf_path in pdf_files:
    extract_pdf_content(pdf_path)

2025-12-12 19:53:42,171 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:53:42,192 - INFO - Going to convert document batch...
2025-12-12 19:53:42,192 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 19:53:42,212 - INFO - Loading plugin 'docling_defaults'
2025-12-12 19:53:42,214 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-12-12 19:53:42,233 - INFO - Loading plugin 'docling_defaults'
2025-12-12 19:53:42,237 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']


Found 28 PDF files

Processing: amazon 10-k 2023.pdf


2025-12-12 19:53:42,400 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 19:53:42,409 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:53:42,413 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-12 19:53:42,414 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-12 19:53:42,463 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:53:42,464 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_ppocr_mobile_v2.0_cls_infer.onnx[0m
[32m[INFO] 2025-12-12 19:53:42,465 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_ppocr_mobile_v2.0_cls_infer.onnx[0m
[32m[INFO] 2025-12-12

  ✓ Saved markdown: data\rag-markdown\amazon\amazon 10-k 2023.md
  ✓ Saved 61 tables (text) to: data\rag-tables\amazon\amazon 10-k 2023
  [DONE] amazon 10-k 2023.pdf

Processing: amazon 10-k 2024.pdf


2025-12-12 19:54:13,859 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 19:54:13,860 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:54:14,962 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:54:15,270 - INFO - Processing document amazon 10-k 2024.pdf
2025-12-12 19:54:44,746 - INFO - Finished converting document amazon 10-k 2024.pdf in 31.06 sec.
2025-12-12 19:54:44,909 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:54:44,911 - INFO - Going to convert document batch...
2025-12-12 19:54:44,911 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 19:54:44,912 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 19:54:44,921 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:54:44,924 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[

  ✓ Saved markdown: data\rag-markdown\amazon\amazon 10-k 2024.md
  ✓ Saved 62 tables (text) to: data\rag-tables\amazon\amazon 10-k 2024
  [DONE] amazon 10-k 2024.pdf

Processing: amazon 10-q q1 2024.pdf


2025-12-12 19:54:45,148 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 19:54:45,148 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:54:46,120 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:54:46,394 - INFO - Processing document amazon 10-q q1 2024.pdf
2025-12-12 19:55:00,541 - INFO - Finished converting document amazon 10-q q1 2024.pdf in 15.63 sec.
2025-12-12 19:55:00,651 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:55:00,654 - INFO - Going to convert document batch...
2025-12-12 19:55:00,655 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 19:55:00,656 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 19:55:00,666 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:55:00,670 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.

  ✓ Saved markdown: data\rag-markdown\amazon\amazon 10-q q1 2024.md
  ✓ Saved 35 tables (text) to: data\rag-tables\amazon\amazon 10-q q1 2024
  [DONE] amazon 10-q q1 2024.pdf

Processing: amazon 10-q q1 2025.pdf


2025-12-12 19:55:00,835 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 19:55:00,836 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:55:01,913 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:55:02,195 - INFO - Processing document amazon 10-q q1 2025.pdf
2025-12-12 19:55:16,011 - INFO - Finished converting document amazon 10-q q1 2025.pdf in 15.36 sec.
2025-12-12 19:55:16,107 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:55:16,109 - INFO - Going to convert document batch...
2025-12-12 19:55:16,110 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 19:55:16,110 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 19:55:16,119 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:55:16,123 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.

  ✓ Saved markdown: data\rag-markdown\amazon\amazon 10-q q1 2025.md
  ✓ Saved 35 tables (text) to: data\rag-tables\amazon\amazon 10-q q1 2025
  [DONE] amazon 10-q q1 2025.pdf

Processing: amazon 10-q q2 2024.pdf


2025-12-12 19:55:16,310 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 19:55:16,311 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:55:17,329 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:55:17,635 - INFO - Processing document amazon 10-q q2 2024.pdf
2025-12-12 19:55:33,476 - INFO - Finished converting document amazon 10-q q2 2024.pdf in 17.37 sec.
2025-12-12 19:55:33,587 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:55:33,589 - INFO - Going to convert document batch...
2025-12-12 19:55:33,590 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 19:55:33,591 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 19:55:33,599 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:55:33,602 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.

  ✓ Saved markdown: data\rag-markdown\amazon\amazon 10-q q2 2024.md
  ✓ Saved 34 tables (text) to: data\rag-tables\amazon\amazon 10-q q2 2024
  [DONE] amazon 10-q q2 2024.pdf

Processing: amazon 10-q q2 2025.pdf


2025-12-12 19:55:33,826 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 19:55:33,827 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:55:34,835 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:55:35,097 - INFO - Processing document amazon 10-q q2 2025.pdf
2025-12-12 19:55:50,647 - INFO - Finished converting document amazon 10-q q2 2025.pdf in 17.06 sec.
2025-12-12 19:55:50,759 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:55:50,762 - INFO - Going to convert document batch...
2025-12-12 19:55:50,762 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 19:55:50,763 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 19:55:50,772 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:55:50,775 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.

  ✓ Saved markdown: data\rag-markdown\amazon\amazon 10-q q2 2025.md
  ✓ Saved 34 tables (text) to: data\rag-tables\amazon\amazon 10-q q2 2025
  [DONE] amazon 10-q q2 2025.pdf

Processing: amazon 10-q q3 2024.pdf


2025-12-12 19:55:50,962 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 19:55:50,963 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:55:51,994 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:55:52,247 - INFO - Processing document amazon 10-q q3 2024.pdf
2025-12-12 19:56:14,992 - INFO - Finished converting document amazon 10-q q3 2024.pdf in 24.23 sec.
2025-12-12 19:56:15,221 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:56:15,224 - INFO - Going to convert document batch...
2025-12-12 19:56:15,225 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 19:56:15,225 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 19:56:15,234 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:56:15,238 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.

  ✓ Saved markdown: data\rag-markdown\amazon\amazon 10-q q3 2024.md
  ✓ Saved 38 tables (text) to: data\rag-tables\amazon\amazon 10-q q3 2024
  [DONE] amazon 10-q q3 2024.pdf

Processing: apple 10-k 2023.pdf


2025-12-12 19:56:15,446 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:56:16,586 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:56:16,844 - INFO - Processing document apple 10-k 2023.pdf
2025-12-12 19:56:45,203 - INFO - Finished converting document apple 10-k 2023.pdf in 29.98 sec.
2025-12-12 19:56:45,353 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:56:45,355 - INFO - Going to convert document batch...
2025-12-12 19:56:45,356 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 19:56:45,356 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 19:56:45,365 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:56:45,369 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-12 19:56:45,370 [RapidOCR] main.py:53: Using C:\Users\laxmi\ana

  ✓ Saved markdown: data\rag-markdown\apple\apple 10-k 2023.md
  ⊘ Skipped 4 small images (< 500x500)
  ✓ Saved 53 tables (text) to: data\rag-tables\apple\apple 10-k 2023
  [DONE] apple 10-k 2023.pdf

Processing: apple 10-k 2024.pdf


2025-12-12 19:56:45,578 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 19:56:45,578 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:56:46,558 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:56:46,822 - INFO - Processing document apple 10-k 2024.pdf
2025-12-12 19:57:17,697 - INFO - Finished converting document apple 10-k 2024.pdf in 32.34 sec.
2025-12-12 19:57:17,887 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:57:17,889 - INFO - Going to convert document batch...
2025-12-12 19:57:17,890 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 19:57:17,890 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 19:57:17,899 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:57:17,902 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m

  ✓ Saved markdown: data\rag-markdown\apple\apple 10-k 2024.md
  ⊘ Skipped 6 small images (< 500x500)
  ✓ Saved 50 tables (text) to: data\rag-tables\apple\apple 10-k 2024
  [DONE] apple 10-k 2024.pdf

Processing: apple 10-q q1 2024.pdf


2025-12-12 19:57:18,107 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:57:19,159 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:57:19,423 - INFO - Processing document apple 10-q q1 2024.pdf
2025-12-12 19:57:30,273 - INFO - Finished converting document apple 10-q q1 2024.pdf in 12.39 sec.
2025-12-12 19:57:30,353 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:57:30,355 - INFO - Going to convert document batch...
2025-12-12 19:57:30,356 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 19:57:30,357 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 19:57:30,366 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:57:30,369 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-12 19:57:30,370 [RapidOCR] main.py:53: Using C:\Users\lax

  ✓ Saved markdown: data\rag-markdown\apple\apple 10-q q1 2024.md
  ⊘ Skipped 2 small images (< 500x500)
  ✓ Saved 26 tables (text) to: data\rag-tables\apple\apple 10-q q1 2024
  [DONE] apple 10-q q1 2024.pdf

Processing: apple 10-q q2 2024.pdf


2025-12-12 19:57:30,537 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:57:31,612 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:57:31,876 - INFO - Processing document apple 10-q q2 2024.pdf
2025-12-12 19:57:42,796 - INFO - Finished converting document apple 10-q q2 2024.pdf in 12.44 sec.
2025-12-12 19:57:42,866 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:57:42,868 - INFO - Going to convert document batch...
2025-12-12 19:57:42,868 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 19:57:42,869 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 19:57:42,878 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:57:42,881 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-12 19:57:42,882 [RapidOCR] main.py:53: Using C:\Users\lax

  ✓ Saved markdown: data\rag-markdown\apple\apple 10-q q2 2024.md
  ⊘ Skipped 1 small images (< 500x500)
  ✓ Saved 27 tables (text) to: data\rag-tables\apple\apple 10-q q2 2024
  [DONE] apple 10-q q2 2024.pdf

Processing: apple 10-q q4 2023.pdf


2025-12-12 19:57:44,160 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:57:44,425 - INFO - Processing document apple 10-q q4 2023.pdf
2025-12-12 19:57:53,710 - INFO - Finished converting document apple 10-q q4 2023.pdf in 10.84 sec.
2025-12-12 19:57:53,777 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:57:53,779 - INFO - Going to convert document batch...
2025-12-12 19:57:53,780 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 19:57:53,780 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 19:57:53,791 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:57:53,794 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-12 19:57:53,795 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-O

  ✓ Saved markdown: data\rag-markdown\apple\apple 10-q q4 2023.md
  ⊘ Skipped 3 small images (< 500x500)
  ✓ Saved 26 tables (text) to: data\rag-tables\apple\apple 10-q q4 2023
  [DONE] apple 10-q q4 2023.pdf

Processing: apple 8-k q4 2023.pdf


2025-12-12 19:57:54,937 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:57:55,210 - INFO - Processing document apple 8-k q4 2023.pdf
2025-12-12 19:57:58,020 - INFO - Finished converting document apple 8-k q4 2023.pdf in 4.24 sec.
2025-12-12 19:57:58,051 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:57:58,055 - INFO - Going to convert document batch...
2025-12-12 19:57:58,055 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 19:57:58,056 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 19:57:58,065 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:57:58,068 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-12 19:57:58,069 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv

  ✓ Saved markdown: data\rag-markdown\apple\apple 8-k q4 2023.md
  ⊘ Skipped 1 small images (< 500x500)
  ✓ Saved 5 tables (text) to: data\rag-tables\apple\apple 8-k q4 2023
  [DONE] apple 8-k q4 2023.pdf

Processing: google 10-k 2023.pdf


2025-12-12 19:57:59,234 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:57:59,500 - INFO - Processing document google 10-k 2023.pdf
2025-12-12 19:58:34,680 - INFO - Finished converting document google 10-k 2023.pdf in 36.63 sec.
2025-12-12 19:58:34,888 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:58:34,892 - INFO - Going to convert document batch...
2025-12-12 19:58:34,892 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 19:58:34,892 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 19:58:34,902 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:58:34,905 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-12 19:58:34,906 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4

  ✓ Saved markdown: data\rag-markdown\google\google 10-k 2023.md
  ⊘ Skipped 2 small images (< 500x500)
  ✓ Saved 70 tables (text) to: data\rag-tables\google\google 10-k 2023
  [DONE] google 10-k 2023.pdf

Processing: google 10-k 2024.pdf


2025-12-12 19:58:35,080 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 19:58:35,081 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:58:36,152 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:58:36,414 - INFO - Processing document google 10-k 2024.pdf
2025-12-12 19:59:13,618 - INFO - Finished converting document google 10-k 2024.pdf in 38.73 sec.
2025-12-12 19:59:13,826 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:59:13,829 - INFO - Going to convert document batch...
2025-12-12 19:59:13,829 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 19:59:13,829 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 19:59:13,839 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:59:13,842 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[

  ✓ Saved markdown: data\rag-markdown\google\google 10-k 2024.md
  ⊘ Skipped 2 small images (< 500x500)
  ✓ Saved 71 tables (text) to: data\rag-tables\google\google 10-k 2024
  [DONE] google 10-k 2024.pdf

Processing: google 10-q q1 2025.pdf


2025-12-12 19:59:14,025 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 19:59:14,026 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:59:15,156 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:59:15,468 - INFO - Processing document google 10-q q1 2025.pdf
2025-12-12 19:59:36,052 - INFO - Finished converting document google 10-q q1 2025.pdf in 22.23 sec.
2025-12-12 19:59:36,195 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 19:59:36,197 - INFO - Going to convert document batch...
2025-12-12 19:59:36,197 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 19:59:36,198 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 19:59:36,207 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 19:59:36,211 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.

  ✓ Saved markdown: data\rag-markdown\google\google 10-q q1 2025.md
  ⊘ Skipped 1 small images (< 500x500)
  ✓ Saved 57 tables (text) to: data\rag-tables\google\google 10-q q1 2025
  [DONE] google 10-q q1 2025.pdf

Processing: google 10-q q2 2024.pdf


2025-12-12 19:59:36,369 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 19:59:36,369 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:59:37,509 - INFO - Accelerator device: 'cuda:0'
2025-12-12 19:59:37,772 - INFO - Processing document google 10-q q2 2024.pdf
2025-12-12 20:00:02,282 - INFO - Finished converting document google 10-q q2 2024.pdf in 26.09 sec.
2025-12-12 20:00:02,421 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 20:00:02,423 - INFO - Going to convert document batch...
2025-12-12 20:00:02,424 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 20:00:02,425 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 20:00:02,434 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 20:00:02,437 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.

  ✓ Saved markdown: data\rag-markdown\google\google 10-q q2 2024.md
  ✓ Saved 56 tables (text) to: data\rag-tables\google\google 10-q q2 2024
  [DONE] google 10-q q2 2024.pdf

Processing: google 10-q q2 2025.pdf


2025-12-12 20:00:02,625 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 20:00:02,626 - INFO - Accelerator device: 'cuda:0'
2025-12-12 20:00:04,014 - INFO - Accelerator device: 'cuda:0'
2025-12-12 20:00:04,267 - INFO - Processing document google 10-q q2 2025.pdf
2025-12-12 20:00:31,969 - INFO - Finished converting document google 10-q q2 2025.pdf in 29.55 sec.
2025-12-12 20:00:32,113 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 20:00:32,115 - INFO - Going to convert document batch...
2025-12-12 20:00:32,116 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 20:00:32,116 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 20:00:32,125 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 20:00:32,128 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.

  ✓ Saved markdown: data\rag-markdown\google\google 10-q q2 2025.md
  ✓ Saved 63 tables (text) to: data\rag-tables\google\google 10-q q2 2025
  [DONE] google 10-q q2 2025.pdf

Processing: google 10-q q3 2024.pdf


2025-12-12 20:00:32,317 - INFO - Accelerator device: 'cuda:0'
2025-12-12 20:00:33,311 - INFO - Accelerator device: 'cuda:0'
2025-12-12 20:00:33,575 - INFO - Processing document google 10-q q3 2024.pdf
2025-12-12 20:00:58,390 - INFO - Finished converting document google 10-q q3 2024.pdf in 26.28 sec.
2025-12-12 20:00:58,522 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 20:00:58,527 - INFO - Going to convert document batch...
2025-12-12 20:00:58,527 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 20:00:58,528 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 20:00:58,536 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 20:00:58,540 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-12 20:00:58,541 [RapidOCR] main.py:53: Using C:\Users\l

  ✓ Saved markdown: data\rag-markdown\google\google 10-q q3 2024.md
  ✓ Saved 56 tables (text) to: data\rag-tables\google\google 10-q q3 2024
  [DONE] google 10-q q3 2024.pdf

Processing: meta 10-k 2024.pdf


2025-12-12 20:00:58,729 - INFO - Accelerator device: 'cuda:0'
2025-12-12 20:00:59,778 - INFO - Accelerator device: 'cuda:0'
2025-12-12 20:01:00,038 - INFO - Processing document meta 10-k 2024.pdf
2025-12-12 20:01:51,577 - INFO - Finished converting document meta 10-k 2024.pdf in 53.06 sec.
2025-12-12 20:01:51,834 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 20:01:51,836 - INFO - Going to convert document batch...
2025-12-12 20:01:51,837 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 20:01:51,837 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 20:01:51,846 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 20:01:51,850 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-12-12 20:01:51,851 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaco

  ✓ Saved markdown: data\rag-markdown\meta\meta 10-k 2024.md
  ⊘ Skipped 7 small images (< 500x500)
  ✓ Saved 64 tables (text) to: data\rag-tables\meta\meta 10-k 2024
  [DONE] meta 10-k 2024.pdf

Processing: meta 10-q q1 2024.pdf


2025-12-12 20:01:52,045 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 20:01:52,045 - INFO - Accelerator device: 'cuda:0'
2025-12-12 20:01:53,116 - INFO - Accelerator device: 'cuda:0'
2025-12-12 20:01:53,374 - INFO - Processing document meta 10-q q1 2024.pdf
2025-12-12 20:01:58,821 - INFO - Finished converting document meta 10-q q1 2024.pdf in 6.99 sec.
2025-12-12 20:01:58,953 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 20:01:58,955 - INFO - Going to convert document batch...
2025-12-12 20:01:58,955 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57fe606de9c74
2025-12-12 20:01:58,956 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-12 20:01:58,965 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 20:01:58,969 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx

  ✓ Saved markdown: data\rag-markdown\meta\meta 10-q q1 2024.md
  ✓ Saved 8 images to: data\rag-images\meta\meta 10-q q1 2024
  ⊘ Skipped 23 small images (< 500x500)
  ✓ Saved 3 tables (text) to: data\rag-tables\meta\meta 10-q q1 2024
  [DONE] meta 10-q q1 2024.pdf

Processing: meta 10-q q1 2025.pdf


[32m[INFO] 2025-12-12 20:01:59,039 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 20:01:59,045 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_rec_infer.onnx[0m
[32m[INFO] 2025-12-12 20:01:59,046 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_rec_infer.onnx[0m
2025-12-12 20:01:59,149 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 20:01:59,149 - INFO - Accelerator device: 'cuda:0'
2025-12-12 20:02:00,134 - INFO - Accelerator device: 'cuda:0'
2025-12-12 20:02:00,396 - INFO - Processing document meta 10-q q1 2025.pdf
2025-12-12 20:02:05,856 - INFO - Finished converting document meta 10-q q1 2025.pdf in 6.90 sec.
2025-12-12 20:02:06,009 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 20:02:06,022 - INFO - Going to convert document batch...
2025-12-12 20:02:06,023 -

  ✓ Saved markdown: data\rag-markdown\meta\meta 10-q q1 2025.md
  ✓ Saved 9 images to: data\rag-images\meta\meta 10-q q1 2025
  ⊘ Skipped 20 small images (< 500x500)
  ✓ Saved 3 tables (text) to: data\rag-tables\meta\meta 10-q q1 2025
  [DONE] meta 10-q q1 2025.pdf

Processing: meta 10-q q2 2024.pdf


[32m[INFO] 2025-12-12 20:02:06,082 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 20:02:06,084 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_ppocr_mobile_v2.0_cls_infer.onnx[0m
[32m[INFO] 2025-12-12 20:02:06,084 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_ppocr_mobile_v2.0_cls_infer.onnx[0m
[32m[INFO] 2025-12-12 20:02:06,109 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 20:02:06,114 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_rec_infer.onnx[0m
[32m[INFO] 2025-12-12 20:02:06,115 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_rec_infer.onnx[0m
2025-12-12 20:02:06,222 - INFO - Auto OCR model selected rapidocr with onnxruntime.


  ✓ Saved markdown: data\rag-markdown\meta\meta 10-q q2 2024.md
  ✓ Saved 9 images to: data\rag-images\meta\meta 10-q q2 2024
  ⊘ Skipped 20 small images (< 500x500)
  ✓ Saved 3 tables (text) to: data\rag-tables\meta\meta 10-q q2 2024
  [DONE] meta 10-q q2 2024.pdf

Processing: meta 10-q q2 2025.pdf


[32m[INFO] 2025-12-12 20:02:13,358 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_ppocr_mobile_v2.0_cls_infer.onnx[0m
[32m[INFO] 2025-12-12 20:02:13,358 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_ppocr_mobile_v2.0_cls_infer.onnx[0m
[32m[INFO] 2025-12-12 20:02:13,381 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 20:02:13,394 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_rec_infer.onnx[0m
[32m[INFO] 2025-12-12 20:02:13,396 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_rec_infer.onnx[0m
2025-12-12 20:02:13,481 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 20:02:13,483 - INFO - Accelerator device: 'cuda:0'
2025-12-12 20:02:14,564 - INFO -

  ✓ Saved markdown: data\rag-markdown\meta\meta 10-q q2 2025.md
  ✓ Saved 8 images to: data\rag-images\meta\meta 10-q q2 2025
  ⊘ Skipped 26 small images (< 500x500)
  ✓ Saved 3 tables (text) to: data\rag-tables\meta\meta 10-q q2 2025
  [DONE] meta 10-q q2 2025.pdf

Processing: meta 10-q q3 2024.pdf


[32m[INFO] 2025-12-12 20:02:20,374 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_rec_infer.onnx[0m
[32m[INFO] 2025-12-12 20:02:20,375 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_rec_infer.onnx[0m
2025-12-12 20:02:20,462 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 20:02:20,464 - INFO - Accelerator device: 'cuda:0'
2025-12-12 20:02:21,518 - INFO - Accelerator device: 'cuda:0'
2025-12-12 20:02:21,788 - INFO - Processing document meta 10-q q3 2024.pdf
2025-12-12 20:02:27,162 - INFO - Finished converting document meta 10-q q3 2024.pdf in 6.88 sec.
2025-12-12 20:02:27,321 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-12 20:02:27,333 - INFO - Going to convert document batch...
2025-12-12 20:02:27,334 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f3d5bd58852fb93aafa57f

  ✓ Saved markdown: data\rag-markdown\meta\meta 10-q q3 2024.md
  ✓ Saved 9 images to: data\rag-images\meta\meta 10-q q3 2024
  ⊘ Skipped 20 small images (< 500x500)
  ✓ Saved 3 tables (text) to: data\rag-tables\meta\meta 10-q q3 2024
  [DONE] meta 10-q q3 2024.pdf

Processing: meta 10-q q3 2025.pdf


[32m[INFO] 2025-12-12 20:02:27,404 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_ppocr_mobile_v2.0_cls_infer.onnx[0m
[32m[INFO] 2025-12-12 20:02:27,404 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_ppocr_mobile_v2.0_cls_infer.onnx[0m
[32m[INFO] 2025-12-12 20:02:27,429 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 20:02:27,436 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_rec_infer.onnx[0m
[32m[INFO] 2025-12-12 20:02:27,436 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_rec_infer.onnx[0m
2025-12-12 20:02:27,509 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 20:02:27,510 - INFO - Accelerator device: 'cuda:0'
2025-12-12 20:02:28,543 - INFO -

  ✓ Saved markdown: data\rag-markdown\meta\meta 10-q q3 2025.md
  ✓ Saved 8 images to: data\rag-images\meta\meta 10-q q3 2025
  ⊘ Skipped 25 small images (< 500x500)
  ✓ Saved 3 tables (text) to: data\rag-tables\meta\meta 10-q q3 2025
  [DONE] meta 10-q q3 2025.pdf

Processing: meta 10-q q4 2024.pdf


[32m[INFO] 2025-12-12 20:02:34,362 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-12 20:02:34,368 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_rec_infer.onnx[0m
[32m[INFO] 2025-12-12 20:02:34,368 [RapidOCR] main.py:53: Using C:\Users\laxmi\anaconda3\envs\ml\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_rec_infer.onnx[0m
2025-12-12 20:02:34,434 - INFO - Auto OCR model selected rapidocr with onnxruntime.
2025-12-12 20:02:34,434 - INFO - Accelerator device: 'cuda:0'
2025-12-12 20:02:35,443 - INFO - Accelerator device: 'cuda:0'
2025-12-12 20:02:35,710 - INFO - Processing document meta 10-q q4 2024.pdf
2025-12-12 20:02:41,232 - INFO - Finished converting document meta 10-q q4 2024.pdf in 6.96 sec.


  ✓ Saved markdown: data\rag-markdown\meta\meta 10-q q4 2024.md
  ✓ Saved 9 images to: data\rag-images\meta\meta 10-q q4 2024
  ⊘ Skipped 20 small images (< 500x500)
  ✓ Saved 3 tables (text) to: data\rag-tables\meta\meta 10-q q4 2024
  [DONE] meta 10-q q4 2024.pdf

Processing: meta10-k 2023.pdf


IndexError: list index out of range

### Verify Extraction

In [None]:
# Count extracted files
md_files = list(Path(OUTPUT_MD_DIR).rglob("*.md"))
image_files = list(Path(OUTPUT_IMAGES_DIR).rglob("*.png"))
table_files = list(Path(OUTPUT_TABLES_DIR).rglob("*.md"))

print(f"Extraction Summary:")
print(f"  Markdown files: {len(md_files)}")
print(f"  Images: {len(image_files)}")
print(f"  Tables: {len(table_files)}")

### Example: View Extracted Content

In [None]:
# Show sample markdown
if md_files:
    sample_md = md_files[0]
    print(f"Sample Markdown: {sample_md}\n")
    with open(sample_md, 'r', encoding='utf-8') as f:
        print(f.read()[:500])

# Show sample table
if table_files:
    sample_table = table_files[0]
    print(f"\n\nSample Table: {sample_table}\n")
    with open(sample_table, 'r', encoding='utf-8') as f:
        print(f.read())