In [1]:
#%pip install "natural-pdf[all]"

# Tutorial 12: OCR Integration for Scanned Documents

Optical Character Recognition (OCR) allows you to extract text from scanned documents where the text isn't embedded in the PDF. This tutorial demonstrates how to work with scanned documents.

In [2]:
from natural_pdf import PDF

# Load a PDF
pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/needs-ocr.pdf")
page = pdf.pages[0]

# Try extracting text without OCR
text_without_ocr = page.extract_text()
f"Without OCR: {len(text_without_ocr)} characters extracted"

'Without OCR: 0 characters extracted'

## Enabling OCR

In [3]:
# Enable OCR for text extraction
page.use_ocr = True

# Extract text with OCR enabled
text_with_ocr = page.extract_text()

# Preview the extracted text
text_with_ocr[:200] + "..." if len(text_with_ocr) > 200 else text_with_ocr

''

## Finding Text Elements with OCR

In [4]:
# Convert text-as-image to text elements
page.apply_ocr()

# Select all text pieces on the page
text_elements = page.find_all('text')
f"Found {len(text_elements)} text elements"

# Visualize the elements
text_elements.highlight()



<ElementCollection[Mixed](count=0)>

## OCR Configuration Options

In [5]:
# Set OCR configuration for better results
page.ocr_config = {
    'language': 'eng',  # English
    'dpi': 300,         # Higher resolution
}

# Extract text with the improved configuration
improved_text = page.extract_text()

# Preview the text
improved_text[:200] + "..." if len(improved_text) > 200 else improved_text

''

## Working with Multi-language Documents

In [6]:
# Configure for multiple languages
page.ocr_config = {
    'language': 'eng+fra+deu',  # English, French, German
    'dpi': 300
}

# Extract text with multi-language support
multilang_text = page.extract_text()
multilang_text[:200]

''

## Extracting Tables from Scanned Documents

In [7]:
# Enable OCR and analyze the document layout
page.use_ocr = True
page.analyze_layout()

# Find table regions
table_regions = page.find_all('region[type=table]')

# Visualize any detected tables
table_regions.highlight()

# Extract the first table if found
if table_regions:
    table_data = table_regions[0].extract_table()
    table_data
else:
    "No tables found in the document"




image 1/1 /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmppfct6zry/temp_layout_image.png: 1024x800 2 titles, 2 plain texts, 3 abandons, 1 table, 1631.0ms


Speed: 5.8ms preprocess, 1631.0ms inference, 0.9ms postprocess per image at shape (1, 3, 1024, 800)


## Finding Form Fields in Scanned Documents

In [8]:
# Look for potential form labels (containing a colon)
labels = page.find_all('text:contains(":")') 

# Visualize the labels
labels.highlight()

# Extract form data by looking to the right of each label
form_data = {}
for label in labels:
    # Clean the label text
    field_name = label.text.strip().rstrip(':')
    
    # Find the value to the right
    value_element = label.right(width=200)
    value = value_element.extract_text().strip()
    
    # Add to our dictionary
    form_data[field_name] = value

# Display the extracted data
form_data

{}

## Combining OCR with Layout Analysis

In [9]:
# Apply OCR and analyze layout
page.use_ocr = True
page.analyze_layout()

# Find document structure elements
headings = page.find_all('region[type=heading]')
paragraphs = page.find_all('region[type=paragraph]')

# Visualize the structure
headings.highlight(color="red", label="Headings")
paragraphs.highlight(color="blue", label="Paragraphs")

# Create a simple document outline
document_outline = []
for heading in headings:
    heading_text = heading.extract_text()
    document_outline.append(heading_text)

document_outline




image 1/1 /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmpwuo1j2tg/temp_layout_image.png: 1024x800 2 titles, 2 plain texts, 3 abandons, 1 table, 1584.3ms


Speed: 3.8ms preprocess, 1584.3ms inference, 0.6ms postprocess per image at shape (1, 3, 1024, 800)


[]

## Working with Multiple Pages

In [10]:
# Process all pages in the document
all_text = []

for i, page in enumerate(pdf.pages):
    # Enable OCR for each page
    page.use_ocr = True
    
    # Extract text
    page_text = page.extract_text()
    
    # Add to our collection with page number
    all_text.append(f"Page {i+1}: {page_text[:100]}...")

# Show the first few pages
all_text

['Page 1: ...']

OCR integration enables you to work with scanned documents, historical archives, and image-based PDFs that don't have embedded text. By combining OCR with natural-pdf's layout analysis capabilities, you can turn any document into structured, searchable data. 