# ExtractAI Notebook (Main / Synchronous)

Use this notebook for the standard synchronous workflow:
1. Point to a PDF directory
2. Specify the model
3. Specify max input tokens
4. Define prompt + output schema


## Setup

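Install the package once in your environment. Run this from the repository root, because `pip install -e .` installs the current directory in editable mode: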
```bash
pip install -e .
```


In [None]:
from typing import Literal
from datetime import date

from pydantic import BaseModel

from extractai import (
    ExtractAIConfig,
    build_prompt,
    run_directory_extraction,
    save_results_to_csv,
)


In [None]:
# 4) Define output schema and fully user-defined prompt
# ExtractedDocument is the structured result requested for each PDF; it is
# passed to run_directory_extraction() as `schema` further down.
# The field names and category labels are repeated in the prompt text below,
# so update both together when changing either.
class ExtractedDocument(BaseModel):
    # Free-text summary of the document (the prompt asks for 2-4 sentences).
    summary: str
    # Publication date of the document; optional, defaults to None.
    document_date: date | None = None
    # Document category, restricted by the Literal type to these four labels.
    category: Literal['Financial', 'Research', 'Government', 'Other']

# Instruction text for the extraction, wrapped by build_prompt().
# The wording names the same three fields and four category labels declared
# on ExtractedDocument; keep the two in sync when editing.
# NOTE(review): the literal keeps its leading newline and 4-space indentation;
# whether build_prompt() strips or dedents it is not visible here - confirm.
prompt = build_prompt(
    """
    Extract three fields from the document text: summary, document_date, and category.
    Document date should be the date the document was published. If not available, return None.
    Category must be exactly one of: Financial, Research, Government, Other.
    If uncertain, choose the best matching option.
    Keep summary concise (2-4 sentences).
    """
)


In [None]:
# 1) PDF directory
# 2) Model
# 3) Max input tokens
# These three settings are bundled into one ExtractAIConfig, which is passed
# to run_directory_extraction() as `config`.
config = ExtractAIConfig(
    pdf_dir='sample_data',  # 1) directory containing the PDFs (relative path)
    model='gpt-5-nano',  # 2) model identifier
    max_input_tokens=5000,  # 3) presumably a per-document input-token cap - confirm in ExtractAIConfig
)

# Optional output directory for CSV export.
# Set to None to save in the current working directory.
# Passed to save_results_to_csv() as `output_dir`.
output_dir = 'outputs'


In [None]:
# Run the extraction with the config, schema, and prompt defined in the cells above.
results = run_directory_extraction(config=config, schema=ExtractedDocument, prompt=prompt)

# Per-file report: a header line with name and status, the input-token count,
# then a blank separator line (the empty string plus print's trailing newline).
for result in results:
    print(
        f'=== {result.file_name} === {result.status}',
        f'Input tokens: {result.input_tokens}',
        '',
        sep='\n',
    )

# Write all results to CSV and show where the file was saved.
csv_path = save_results_to_csv(results, output_dir=output_dir)
print(f'CSV saved to: {csv_path}')
