# ExtractAI Notebook (Batch / Asynchronous)

Use this notebook for lower-cost, asynchronous processing with the OpenAI Batch API.
Flow: submit the batch once, check its status later, then collect the final results and the results CSV.


## Setup

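Install the package once in your environment (run this from the repository root, because `-e .` installs the current directory in editable mode):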
```bash
pip install -e .
```


In [10]:
from typing import Literal
from datetime import date

from pydantic import BaseModel

from extractai import (
    ExtractAIConfig,
    build_prompt,
    collect_batch_results,
    get_batch_status,
    submit_directory_batch,
)


In [11]:
# Define output schema and fully user-defined prompt
# ExtractedDocument is the structured record requested for every document; it is
# passed as `schema=` to the batch submit and collect helpers.
# NOTE(review): documented with `#` comments rather than a class docstring on
# purpose -- pydantic copies a model docstring into the generated JSON schema
# (as `description`), which could change what is sent with each request.
class ExtractedDocument(BaseModel):
    # Short free-text summary (the prompt asks for 2-4 sentences).
    summary: str
    # Date the document was published; optional, defaults to None.
    document_date: date | None = None
    # Exactly one of the four fixed labels; other values fail validation.
    category: Literal['Financial', 'Research', 'Government', 'Other']

# Instruction text for the extraction, kept separate from the call so it is
# easy to edit. It is handed to build_prompt exactly as written here
# (including the leading indentation of each line).
extraction_instructions = """
    Extract three fields from the document text: summary, document_date, and category.
    Document date should be the date the document was published. If not available, return None.
    Category must be exactly one of: Financial, Research, Government, Other.
    If uncertain, choose the best matching option.
    Keep summary concise (2-4 sentences).
    """

prompt = build_prompt(extraction_instructions)


In [12]:
# Optional directory to store batch manifest/input/output artifacts
output_dir = "outputs"

# Settings for the batch run. PDFs are read from `pdf_dir`; a document whose
# token count exceeds `max_input_tokens` is reported as skipped at submit time.
# `use_batch=True` presumably selects the Batch API path -- confirm in extractai.
batch_config = ExtractAIConfig(
    use_batch=True,
    model="gpt-5-nano",
    pdf_dir="sample_data",
    max_input_tokens=5000,
)


In [4]:
# 1) Submit batch job
# Every call creates a new submission, so run this cell once per batch.
batch_submission = submit_directory_batch(
    config=batch_config, schema=ExtractedDocument, prompt=prompt, output_dir=output_dir
)

# Echo the identifiers needed to look the batch up again later.
for label in ('submission_id', 'batch_id', 'status'):
    print(f'{label}:', getattr(batch_submission, label))


Skipping ScientificResearch.pdf: 14853 tokens exceeds the 5000 token limit.
submission_id: batch_69874e781a4881909da9899ec32cfdb4
batch_id: batch_69874e781a4881909da9899ec32cfdb4
status: SUBMITTED


In [13]:
# 2) Check status later (can take minutes to hours)
# Batches finish asynchronously, so this cell is often run in a fresh kernel
# where `batch_submission` from step 1 no longer exists. In that case, paste
# the id printed by step 1 (the 'submission_id:' line) below.
manual_submission_id = None  # e.g. 'batch_...'; leave as None to reuse step 1

if manual_submission_id is not None:
    submission_id = manual_submission_id
else:
    try:
        submission_id = batch_submission.submission_id
    except NameError:
        # Same exception type as before, but with an actionable message.
        raise NameError(
            "batch_submission is not defined in this kernel; set "
            "manual_submission_id to the submission id printed by step 1."
        ) from None

# Look up the current state of the batch for this submission.
status = get_batch_status(
    submission_id=submission_id,
    output_dir=output_dir,
)

print('batch status:', status.get('status'))
# Bare last expression so the notebook renders the full status payload.
status


batch status: completed


{'id': 'batch_69874e781a4881909da9899ec32cfdb4',
 'completion_window': '24h',
 'created_at': 1770475128,
 'endpoint': '/v1/chat/completions',
 'input_file_id': 'file-8d7GFMpWmGH8xizAjwkzuf',
 'object': 'batch',
 'status': 'completed',
 'cancelled_at': None,
 'cancelling_at': None,
 'completed_at': 1770475153,
 'error_file_id': None,
 'errors': None,
 'expired_at': None,
 'expires_at': 1770561528,
 'failed_at': None,
 'finalizing_at': 1770475150,
 'in_progress_at': 1770475130,
 'metadata': None,
 'model': 'gpt-5-nano-2025-08-07',
 'output_file_id': 'file-DRZRxRz5HM7UiNkTmamN1Y',
 'request_counts': {'completed': 2, 'failed': 0, 'total': 2},
 'usage': {'input_tokens': 3838,
  'input_tokens_details': {'cached_tokens': 0},
  'output_tokens': 2795,
  'output_tokens_details': {'reasoning_tokens': 2496},
  'total_tokens': 6633}}

In [14]:
# 3) Collect final results once status is terminal
# (completed / failed / expired / cancelled).
# Returns the per-file results together with the path of the CSV it wrote.
batch_results, batch_csv_path = collect_batch_results(
    submission_id=submission_id, schema=ExtractedDocument, output_dir=output_dir
)

# One line per input file: its name followed by its final status.
for result in batch_results:
    print(f'=== {result.file_name} === {result.status}')

print(f'Batch CSV saved to: {batch_csv_path}')


=== FY26_Q1_Consolidated_Financial_Statements.pdf === COMPLETE
=== IF10408.51.pdf === COMPLETE
=== ScientificResearch.pdf === SKIPPED
Batch CSV saved to: outputs/extractai_results.csv
