Official Python SDK for LeapOCR - Transform documents into structured data using AI-powered OCR.
LeapOCR provides enterprise-grade document processing with AI-powered data extraction. This SDK offers a Python-native, async-first interface for seamless integration into your applications.
pip install leapocrOr using uv:
uv add leapocr- Python 3.9 or higher
- LeapOCR API key (sign up here)
import asyncio
import os
from leapocr import LeapOCR, ProcessOptions, Format
async def main():
# Initialize the SDK with your API key
async with LeapOCR(os.getenv("LEAPOCR_API_KEY")) as client:
# Submit document for processing
job = await client.ocr.process_url(
"https://example.com/document.pdf",
options=ProcessOptions(
format=Format.STRUCTURED,
),
)
print(f"Job created: {job.job_id}")
# Wait for processing to complete
result = await client.ocr.wait_until_done(job.job_id)
print(f"Credits used: {result.credits_used}")
print(f"Pages processed: {len(result.pages)}")
print(f"Extracted text: {result.pages[0].text[:200]}...")
# Optional: Delete job after processing (auto-deleted after 7 days)
await client.ocr.delete_job(job.job_id)
asyncio.run(main())- Async-First Design - Built on asyncio with httpx for high-performance concurrent processing
- Type-Safe API - Full type hints and mypy strict mode support
- Multiple Processing Formats - Structured data extraction, markdown output, or per-page processing
- Flexible Model Selection - Choose from standard, pro, or custom AI models
- Custom Schema Support - Define extraction schemas for your specific use case
- Built-in Retry Logic - Automatic exponential backoff for transient failures
- Context Manager Support - Proper resource cleanup with async context managers
- Direct File Upload - Efficient multipart uploads for local files
- Progress Tracking - Real-time callbacks for long-running operations
- Automatic Cleanup - Jobs and files are automatically deleted after 7 days, or delete immediately with
delete_job()
Use the model parameter in ProcessOptions to specify a model. Defaults to Model.STANDARD_V1.
You can also use custom model strings for organization-specific models:
from leapocr import ProcessOptions
options = ProcessOptions(
model="my-custom-model-v1", # Custom string model
)import asyncio
from leapocr import LeapOCR, ProcessOptions, Format, Model
async def process_url():
async with LeapOCR("your-api-key") as client:
# Submit job
job = await client.ocr.process_url(
"https://example.com/invoice.pdf",
options=ProcessOptions(
format=Format.STRUCTURED,
model=Model.PRO_V1, # Use premium model for better accuracy
instructions="Extract invoice number, date, and total amount",
),
)
# Wait for completion
result = await client.ocr.wait_until_done(job.job_id)
print(f"Processing time: {result.processing_time_seconds:.2f}s")
print(f"Credits used: {result.credits_used}")
print(f"Pages: {len(result.pages)}")
await client.ocr.delete_job(job.job_id)
asyncio.run(process_url())from pathlib import Path
async def process_file():
async with LeapOCR("your-api-key") as client:
# Submit file
job = await client.ocr.process_file(
Path("invoice.pdf"),
options=ProcessOptions(
format=Format.STRUCTURED,
model=Model.ENGLISH_PRO_V1, # High-accuracy for English documents
),
)
# Wait for completion
result = await client.ocr.wait_until_done(job.job_id)
for page in result.pages:
print(f"Page {page.page_number}: {len(page.text)} characters")
await client.ocr.delete_job(job.job_id)
asyncio.run(process_file())async def extract_with_schema():
schema = {
"type": "object",
"properties": {
"patient_name": {"type": "string"},
"date_of_birth": {"type": "string"},
"medications": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {"type": "string"},
"dosage": {"type": "string"},
},
},
},
},
}
async with LeapOCR("your-api-key") as client:
# Submit with schema
job = await client.ocr.process_file(
"medical-record.pdf",
options=ProcessOptions(
format=Format.STRUCTURED,
schema=schema,
instructions="Extract patient information and medications",
),
)
# Wait for completion
result = await client.ocr.wait_until_done(job.job_id)
# Parse the structured data
import json
data = json.loads(result.pages[0].text)
print(f"Patient: {data['patient_name']}")
print(f"Medications: {len(data['medications'])}")
await client.ocr.delete_job(job.job_id)
asyncio.run(extract_with_schema())| Format | Description | Use Case |
|---|---|---|
Format.STRUCTURED |
Single JSON object | Extract specific fields across entire document |
Format.MARKDOWN |
Text per page | Convert document to readable text |
Format.PER_PAGE_STRUCTURED |
JSON per page | Extract fields from multi-section documents |
async def manual_polling():
async with LeapOCR("your-api-key") as client:
# Start processing
job = await client.ocr.process_url(
"https://example.com/document.pdf",
options=ProcessOptions(format=Format.MARKDOWN),
)
print(f"Job created: {job.job_id}")
# Poll for status
import asyncio
while True:
status = await client.ocr.get_job_status(job.job_id)
print(f"Status: {status.status.value} - {status.progress:.1f}%")
if status.status.value == "completed":
break
await asyncio.sleep(2)
# Get results
result = await client.ocr.get_results(job.job_id)
print(f"Processing complete: {len(result.pages)} pages")
asyncio.run(manual_polling())from leapocr import PollOptions
async def track_progress():
def progress_callback(status):
print(f"Progress: {status.progress:.1f}% "
f"({status.processed_pages}/{status.total_pages} pages)")
async with LeapOCR("your-api-key") as client:
# Submit job
job = await client.ocr.process_file("large-document.pdf")
# Wait with progress tracking
result = await client.ocr.wait_until_done(
job.job_id,
poll_options=PollOptions(
poll_interval=2.0,
max_wait=300.0,
on_progress=progress_callback,
),
)
asyncio.run(track_progress())Use pre-configured templates for common document types. Templates include predefined schemas, instructions, and model settings:
async def use_template():
async with LeapOCR("your-api-key") as client:
# Use a pre-configured template by its slug
job = await client.ocr.process_file(
"invoice.pdf",
options=ProcessOptions(
template_slug="invoice-extraction", # Reference existing template
),
)
result = await client.ocr.wait_until_done(job.job_id)
print(f"Extracted data: {result.pages[0].text}")
await client.ocr.delete_job(job.job_id)
asyncio.run(use_template())Common Template Use Cases:
- Invoice extraction
- Receipt processing
- Medical records
- Identity documents
- Custom organizational templates
Automatic Deletion: All jobs and their associated files are automatically deleted after 7 days for security and storage management.
Manual Deletion: Delete jobs immediately after processing to remove sensitive data or free up resources sooner:
async def delete_after_processing():
async with LeapOCR("your-api-key") as client:
# Submit and process document
job = await client.ocr.process_file("sensitive-doc.pdf")
result = await client.ocr.wait_until_done(job.job_id)
# Use the extracted data
print(f"Extracted data: {result.pages[0].text}")
# Delete job immediately (redacts content and marks as deleted)
await client.ocr.delete_job(result.job_id)
print("Job deleted successfully")
asyncio.run(delete_after_processing())Best Practices:
- Sensitive Data: Delete jobs containing PII/PHI immediately after use
- Compliance: Manual deletion helps meet data retention requirements
- Auto-Cleanup: All jobs are automatically purged after 7 days regardless
- Irreversible: Deleted jobs cannot be recovered
async def batch_process():
urls = [
"https://example.com/doc1.pdf",
"https://example.com/doc2.pdf",
"https://example.com/doc3.pdf",
]
async with LeapOCR("your-api-key") as client:
# Submit all documents concurrently
jobs = await asyncio.gather(*[
client.ocr.process_url(url)
for url in urls
])
# Wait for all to complete
results = await asyncio.gather(*[
client.ocr.wait_until_done(job.job_id)
for job in jobs
])
total_credits = sum(r.credits_used for r in results)
total_pages = sum(len(r.pages) for r in results)
print(f"Processed {len(results)} documents")
print(f"Total credits: {total_credits}")
print(f"Total pages: {total_pages}")
await asyncio.gather(*[
client.ocr.delete_job(job.job_id)
for job in jobs
])
asyncio.run(batch_process())For more examples, see the examples/ directory.
from leapocr import LeapOCR, ClientConfig
config = ClientConfig(
base_url="https://api.leapocr.com/api/v1",
timeout=120.0,
max_retries=5,
retry_delay=2.0,
retry_multiplier=2.0,
)
async with LeapOCR("your-api-key", config) as client:
# Use client with custom configuration
passexport LEAPOCR_API_KEY="your-api-key"
export OCR_BASE_URL="https://api.leapocr.com/api/v1" # optionalThe SDK provides a comprehensive error hierarchy for robust error handling:
from leapocr import (
LeapOCRError,
AuthenticationError,
ValidationError,
APIError,
JobError,
NetworkError,
)
async def handle_errors():
try:
async with LeapOCR("your-api-key") as client:
result = await client.ocr.process_and_wait("document.pdf")
except AuthenticationError as e:
print(f"Authentication failed: {e.message}")
except ValidationError as e:
print(f"Invalid input: {e.message}")
if e.field:
print(f" Field: {e.field}")
except JobError as e:
print(f"Job failed: {e.message}")
print(f" Job ID: {e.job_id}")
except NetworkError as e:
print(f"Network error: {e.message}")
# Implement retry logic
except APIError as e:
print(f"API error: {e.message}")
print(f" Status code: {e.status_code}")
except LeapOCRError as e:
print(f"SDK error: {e.message}")
asyncio.run(handle_errors())LeapOCRError- Base class for all SDK errorsAuthenticationError- Invalid or missing API keyRateLimitError- Rate limit exceededValidationError- Invalid input parametersFileError- File-related errors (not found, too large, etc.)JobError- Job processing errorsJobFailedError- Job processing failedJobTimeoutError- Job processing timed outNetworkError- Network connectivity issuesAPIError- API returned an error responseInsufficientCreditsError- Not enough credits
Main client class for interacting with the LeapOCR API.
class LeapOCR:
def __init__(self, api_key: str, config: ClientConfig | None = None)
async def close(self) -> None
async def health(self) -> bool
# Use as async context manager
async with LeapOCR(api_key) as client:
...OCR operations service accessible via client.ocr.
# Process file or URL (submit job)
async def process_file(
file: str | Path | BinaryIO,
options: ProcessOptions | None = None,
) -> ProcessResult
async def process_url(
url: str,
options: ProcessOptions | None = None,
) -> ProcessResult
# Wait for job completion
async def wait_until_done(
job_id: str,
poll_options: PollOptions | None = None,
) -> JobResult
# Job management
async def get_job_status(job_id: str) -> JobStatus
async def get_results(job_id: str, page: int = 1, limit: int = 100) -> JobResult
async def delete_job(job_id: str) -> dict[str, Any]@dataclass
class ProcessOptions:
format: Format = Format.STRUCTURED
model: Model | None = None
schema: dict[str, Any] | None = None
instructions: str | None = None
template_slug: str | None = None # Use existing template by slug
metadata: dict[str, str] = field(default_factory=dict)@dataclass
class PollOptions:
poll_interval: float = 2.0 # seconds
max_wait: float = 300.0 # seconds (5 minutes)
on_progress: Callable[[JobStatus], None] | None = None@dataclass
class ClientConfig:
base_url: str = "https://api.leapocr.com/api/v1"
timeout: float = 30.0
max_retries: int = 3
retry_delay: float = 1.0
retry_multiplier: float = 2.0- Python 3.9+
- uv (recommended) or pip
- OpenAPI Generator (for code generation)
- Java 21+ (for OpenAPI Generator)
# Clone the repository
git clone https://github.com/leapocr/leapocr-python.git
cd leapocr-python
# Install dependencies
make dev-install
# Or using uv directly
uv syncmake test # Run unit tests
make test-cov # Run tests with coverage
make test-integration # Run integration tests (requires API key)
make lint # Run linters
make format # Format code
make type-check # Run type checking
make check # Run all checks (format, lint, type-check)The SDK includes generated client code from the OpenAPI specification:
make fetch-spec # Download OpenAPI spec
make generate # Generate client code
make regenerate # Fetch + generate (full refresh)# Unit tests only
pytest tests/unit/
# Integration tests (requires API key)
export LEAPOCR_API_KEY="your-api-key"
pytest tests/integration/
# All tests with coverage
pytest tests/ --cov=leapocr --cov-report=htmlleapocr-python/
├── leapocr/ # Main package
│ ├── __init__.py # Public API
│ ├── client.py # Main client class
│ ├── ocr.py # OCR service
│ ├── models.py # Data models
│ ├── errors.py # Error classes
│ ├── config.py # Configuration
│ ├── _internal/ # Internal utilities
│ │ ├── retry.py # Retry logic
│ │ ├── upload.py # File upload
│ │ ├── polling.py # Status polling
│ │ ├── validation.py # Input validation
│ │ └── utils.py # Common utilities
│ └── generated/ # Generated OpenAPI client
├── tests/
│ ├── unit/ # Unit tests
│ └── integration/ # Integration tests
├── examples/ # Usage examples
├── scripts/ # Development scripts
└── Makefile # Common development tasks
We welcome contributions! Please follow these guidelines:
- Fork the repository
- Create a feature branch (
git checkout -b feature/amazing-feature) - Commit your changes (
git commit -m 'feat: add amazing feature') - Push to the branch (
git push origin feature/amazing-feature) - Open a Pull Request
- Follow PEP 8 style guide
- Add type hints for all functions
- Write docstrings for public APIs
- Add tests for new features
- Update documentation as needed
This project is licensed under the MIT License - see the LICENSE file for details.
- Documentation: docs.leapocr.com
- API Reference: API Documentation
- Issues: GitHub Issues
- Examples: examples/
- Website: leapocr.com
- Support: support@leapocr.com
Version: 0.0.3
