In [0]:
%pip install --upgrade "openai==1.105.0" "pymupdf>=1.24.0" pillow "pytesseract>=0.3.10" "jsonschema>=4.22.0"



[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
# Restart the Python process so the packages upgraded by the %pip cell above
# are the versions that get imported in the cells below.
dbutils.library.restartPython()

In [0]:
# ==== CONFIG ====
# Module-level defaults. USE_OCR, OCR_DPI and OCR_LANG are the default argument
# values of extract_structured_pdf() and can be overridden per call.
USE_OCR  = True                  # True -> get_text_ocr(); False -> get_text_native()
OCR_DPI  = 600                   # render resolution for OCR; gpt4o_complete() also renders page images at this value
OCR_LANG = "eng"                 # passed as `lang` to pytesseract.image_to_string
MODEL    = "gpt-4o-2024-08-06"   # model name passed to client.responses.create

import os, json, re, io, base64
from typing import List, Dict
from openai import OpenAI
from jsonschema import Draft202012Validator

# --- API KEY ---
# The key is read from the secret scope "kv" (never hard-coded) and exported via
# the environment, which is where the argument-less OpenAI() constructor picks it up.
OPENAI_API_KEY = dbutils.secrets.get(scope="kv", key="openai-api-key")
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
client = OpenAI()

# ==== Schema definition ====
# JSON Schema for the extraction result:
#   document -> sections[] -> items[] of {label, value}
# Every object lists all of its properties as required and forbids extra keys;
# "no value" is expressed with an explicit null rather than a missing key.
# The same dict is embedded in the system prompt (JSON_ONLY_SYSTEM) and used by
# validate_json() to check the model's reply.
structured_schema = {
    "type": "object",
    "additionalProperties": False,
    "properties": {
        "document_title": {"type": ["string", "null"]},
        "summary": {"type": ["string", "null"]},
        # One entry per section found in the document.
        "sections": {
            "type": "array",
            "items": {
                "type": "object",
                "additionalProperties": False,
                "properties": {
                    # Section heading; null when the section has no heading.
                    "section": {"type": ["string", "null"]},
                    # Label/value rows belonging to this section.
                    "items": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "additionalProperties": False,
                            "properties": {
                                "label": {"type": "string"},
                                "value": {"type": ["string", "null"]}
                            },
                            "required": ["label", "value"]
                        }
                    }
                },
                "required": ["section", "items"]
            }
        }
    },
    "required": ["document_title", "summary", "sections"]
}
# Built once at import time and reused by validate_json() for every reply.
validator = Draft202012Validator(structured_schema)

# ==== PDF OCR / Native extraction ====
import fitz
from PIL import Image, ImageEnhance

def get_text_native(path: str, max_pages=50) -> str:
    """Read the text layer of a PDF without OCR.

    Args:
        path: Location of the PDF, handed to ``fitz.open``.
        max_pages: Maximum number of leading pages to read.

    Returns:
        The text of the pages read, joined with newlines. A page whose
        ``get_text`` call yields nothing contributes an empty string.

    Raises:
        Whatever ``fitz.open`` or ``Page.get_text`` raise; the document is
        closed before the exception propagates.
    """
    doc = fitz.open(path)
    try:
        parts = []
        for i, p in enumerate(doc):
            if i >= max_pages:
                break
            # sort=True asks PyMuPDF to order the extracted text by position on the page.
            parts.append(p.get_text("text", sort=True) or "")
    finally:
        # Always release the document, including when a page fails to extract.
        doc.close()
    return "\n".join(parts)

def get_text_ocr(path: str, dpi=OCR_DPI, max_pages=50, lang=OCR_LANG) -> str:
    """Rasterize PDF pages and run Tesseract OCR on each of them.

    Each page is rendered without an alpha channel, converted to grayscale,
    then contrast- and sharpness-enhanced before being passed to
    ``pytesseract.image_to_string``.

    Args:
        path: Location of the PDF, handed to ``fitz.open``.
        dpi: Render resolution; converted to a zoom factor of ``dpi / 72``.
        max_pages: Maximum number of leading pages to OCR.
        lang: Language code forwarded to ``pytesseract.image_to_string``.

    Returns:
        The recognized text of the pages processed, joined with newlines.

    Raises:
        Whatever PyMuPDF, Pillow or pytesseract raise; the document is closed
        before the exception propagates.
    """
    import pytesseract  # local import: this is the only function in the file that uses it

    doc = fitz.open(path)
    try:
        parts = []
        # PDF page units are 1/72 inch, so dpi/72 is the zoom needed for `dpi`.
        scale = dpi/72.0
        matrix = fitz.Matrix(scale, scale)
        for i, p in enumerate(doc):
            if i >= max_pages:
                break
            pix = p.get_pixmap(matrix=matrix, alpha=False)
            png = pix.tobytes("png")
            # Grayscale + boosted contrast/sharpness before recognition.
            img = Image.open(io.BytesIO(png)).convert("L")
            img = ImageEnhance.Contrast(img).enhance(2.0)
            img = ImageEnhance.Sharpness(img).enhance(1.5)
            parts.append(pytesseract.image_to_string(img, lang=lang) or "")
    finally:
        # Always release the document, including when rendering or OCR fails.
        doc.close()
    return "\n".join(parts)

def render_pages_as_data_urls(path: str, dpi=OCR_DPI, max_pages=20) -> list:
    """Render PDF pages to PNG and wrap each one in a base64 ``data:`` URL.

    Args:
        path: Location of the PDF, handed to ``fitz.open``.
        dpi: Render resolution; converted to a zoom factor of ``dpi / 72``.
        max_pages: Maximum number of leading pages to render.

    Returns:
        A list of ``"data:image/png;base64,..."`` strings, one per rendered
        page, in page order.

    Raises:
        Whatever PyMuPDF raises; the document is closed before the exception
        propagates.
    """
    doc = fitz.open(path)
    try:
        urls = []
        # PDF page units are 1/72 inch, so dpi/72 is the zoom needed for `dpi`.
        scale = dpi/72.0
        matrix = fitz.Matrix(scale, scale)
        for i, p in enumerate(doc):
            if i >= max_pages:
                break
            pix = p.get_pixmap(matrix=matrix, alpha=False)
            png = pix.tobytes("png")
            b64 = base64.b64encode(png).decode("utf-8")
            urls.append("data:image/png;base64," + b64)
    finally:
        # Always release the document, including when a page fails to render.
        doc.close()
    return urls

# ==== Minimal seed ====
def parse_deterministic(text: str) -> dict:
    """Build the minimal seed document that is handed to the model.

    Only the title is guessed: it is the first line of ``text`` that is not
    blank, with surrounding whitespace removed. Leading blank lines are
    skipped so they do not hide a title that follows them.

    Args:
        text: Extracted document text; ``None`` or empty is treated as no text.

    Returns:
        A dict with the three top-level keys of ``structured_schema``:
        ``document_title`` (str or None), ``summary`` (always None) and
        ``sections`` (always an empty list).
    """
    stripped_lines = (line.strip() for line in (text or "").splitlines())
    title = next((line for line in stripped_lines if line), None)
    return {
        "document_title": title,
        "summary": None,
        "sections": []
    }

# ==== GPT-4o Prompt ====
# System message sent with every request by gpt4o_complete(). The schema is
# serialized into the prompt text so the model sees the exact target structure.
# The parentheses rule and the zero-normalization rule stated here are applied
# again in code by cleanup_json(), so they hold even if the model ignores them.
JSON_ONLY_SYSTEM = (
    "You are a financial OCR + information extraction agent. "
    "You MUST NOT hallucinate values. "
    "Return ONLY one JSON object with exactly this schema: "
    + json.dumps(structured_schema) +
    "\nRules:\n"
    "- Copy numbers exactly as printed.\n"
    "- If a label includes parentheses (e.g. '(0)', '(5)', '(9)'), always set value to '$0.00'.\n"
    "- Normalize zero-like values (0, 0.00, -, blank) to '$0.00'.\n"
    "- All values must be formatted as money: '$<amount with commas and 2 decimals>'.\n"
    "- If unclear, default to '$0.00'.\n"
    "- No commentary, JSON only."
)

def validate_json(s: str) -> dict:
    """Parse a model reply into a dict and check it against the schema.

    A leading ``` or ```json fence and a trailing ``` fence are removed from
    the whitespace-trimmed reply before it is decoded.

    Args:
        s: Raw text returned by the model.

    Returns:
        The decoded JSON object.

    Raises:
        json.JSONDecodeError: the remaining text is not valid JSON.
        Whatever ``validator.validate`` raises when the object does not
        conform to the schema.
    """
    fence = re.compile(r"^```(?:json)?\s*|\s*```$", re.I|re.DOTALL)
    payload = fence.sub("", s.strip())
    parsed = json.loads(payload)
    validator.validate(parsed)
    return parsed

# ==== AI Call ====
def gpt4o_complete(pdf_path: str, ocr_text: str, seed_doc: dict) -> dict:
    """Ask the model to turn the PDF into a document matching ``structured_schema``.

    The request carries the system prompt, the extracted text, the seed parse
    and one rendered image per page. The reply format is constrained with a
    strict JSON-schema response format, so the API itself enforces the
    structure instead of relying on prompt wording alone; the reply is still
    validated locally afterwards.

    Args:
        pdf_path: PDF to render into page images.
        ocr_text: Text extracted from the PDF (OCR or native).
        seed_doc: Partial parse produced by ``parse_deterministic``.

    Returns:
        The validated document dict.

    Raises:
        Whatever the OpenAI client raises for a failed request, and whatever
        ``validate_json`` raises for a reply that is not valid JSON or does
        not match the schema.
    """
    # NOTE(review): images are always rendered at OCR_DPI, independent of the
    # dpi a caller passed to extract_structured_pdf().
    images = render_pages_as_data_urls(pdf_path, dpi=OCR_DPI)

    # Both text inputs are capped at 120000 characters to bound request size.
    ocr_chunk = ocr_text[:120000]
    seed_chunk = json.dumps(seed_doc)[:120000]

    messages = [{
        "role": "system",
        "content": JSON_ONLY_SYSTEM
    },{
        "role": "user",
        "content": [
            {"type": "input_text", "text": "Here is OCR/native text:\n" + ocr_chunk},
            {"type": "input_text", "text": "Here is a partial parse:\n" + seed_chunk},
        ] + [{"type": "input_image", "image_url": u} for u in images]
    }]

    resp = client.responses.create(
        model=MODEL,
        input=messages,
        temperature=0,
        # Structured Outputs: the schema already lists every property as
        # required and sets additionalProperties to False on every object.
        text={
            "format": {
                "type": "json_schema",
                "name": "structured_document",
                "schema": structured_schema,
                "strict": True,
            }
        },
    )
    return validate_json(resp.output_text)

# ==== Post-processing cleanup (ensure $0.00 format everywhere) ====
def cleanup_json(data: dict) -> dict:
    """Force every item value in ``data`` into ``$#,###.##`` money format, in place.

    Rules, applied per item in this order:
      1. A label containing both "(" and ")" always gets "$0.00".
      2. A value that is None, blank, "0", "0.00" or "-" gets "$0.00".
      3. Otherwise "$" and "," are removed and the rest is parsed as a number
         and re-formatted with thousands separators and two decimals.
      4. Anything that cannot be parsed, is not a finite number (nan/inf), or
         rounds to zero gets "$0.00", so "$nan", "$inf" and "$-0.00" are
         never emitted.

    Args:
        data: Document dict shaped like ``structured_schema``. A missing or
            None ``sections`` / ``items`` list is treated as empty.

    Returns:
        The same ``data`` object, with item values rewritten.
    """
    import math  # local import: math is not imported at file level

    zero = "$0.00"
    for section in data.get("sections") or []:
        for item in section.get("items") or []:
            label = str(item.get("label") or "")
            val = item.get("value")

            # Force $0.00 if label has parentheses
            if "(" in label and ")" in label:
                item["value"] = zero
                continue

            # Normalize zero-like values
            text = "" if val is None else str(val).strip()
            if text in ("0", "0.00", "-", ""):
                item["value"] = zero
                continue

            # Format everything else with $ and commas
            try:
                num = float(text.replace("$", "").replace(",", "").strip())
            except ValueError:
                # Unparseable text falls back to the same default as before.
                item["value"] = zero
                continue

            # float() accepts "nan"/"inf"; neither is a money amount. Values
            # that round to zero are normalized so "-0.00" cannot appear.
            if not math.isfinite(num) or round(num, 2) == 0:
                item["value"] = zero
            else:
                item["value"] = f"${num:,.2f}"

    return data

# ==== Main Orchestrator ====
def extract_structured_pdf(pdf_path: str, use_ocr=USE_OCR, dpi=OCR_DPI, lang=OCR_LANG) -> dict:
    """Run the full pipeline on one PDF: text -> seed -> model -> cleanup.

    Args:
        pdf_path: PDF to process.
        use_ocr: When true, text comes from ``get_text_ocr``; otherwise from
            ``get_text_native``.
        dpi: Render resolution forwarded to ``get_text_ocr`` (unused on the
            native path).
        lang: OCR language forwarded to ``get_text_ocr`` (unused on the
            native path).

    Returns:
        The model's document after ``cleanup_json`` has normalized its values.
    """
    if use_ocr:
        page_text = get_text_ocr(pdf_path, dpi=dpi, lang=lang)
    else:
        page_text = get_text_native(pdf_path)

    seed_doc = parse_deterministic(page_text)
    model_doc = gpt4o_complete(pdf_path, page_text, seed_doc)
    return cleanup_json(model_doc)


In [0]:

pdf_path = "/Volumes/workspace/default/pdfvolume/IntraDay_Report.pdf"  # change if needed

# ---------- Extract the structured data using the pipeline ------------------
# dpi=800 overrides the OCR_DPI default for the OCR pass only; the page images
# sent to the model are still rendered at OCR_DPI inside gpt4o_complete().
doc = extract_structured_pdf(pdf_path, use_ocr=True, dpi=800, lang="eng")
display(doc)
#print(json.dumps(doc, indent=2))


{'document_title': 'Intraday Customized Report',
 'summary': None,
 'sections': [{'section': 'Account: Collections Pins - 2314 329681223815 Jitbank National Association 021300089',
   'items': [{'label': 'Opening Ledger', 'value': '$100,000.00'},
    {'label': 'Holds', 'value': '$0.00'},
    {'label': 'Uncollected Funds', 'value': '$0.00'},
    {'label': 'Current Ledger', 'value': '$100,000.00'},
    {'label': 'Current Available', 'value': '$100,000.00'},
    {'label': 'Early ACH Credits (0)', 'value': '$0.00'},
    {'label': 'Late ACH Credits (0)', 'value': '$0.00'},
    {'label': 'Lockbox Credits', 'value': '$0.00'},
    {'label': 'Wire Transfer Credits (0)', 'value': '$0.00'},
    {'label': 'Deposits', 'value': '$0.00'},
    {'label': 'Other Misc Credits (0)', 'value': '$0.00'},
    {'label': 'Real Time Payment Credits (0)', 'value': '$0.00'},
    {'label': 'Total Credits (0)', 'value': '$0.00'},
    {'label': 'Early ACH Debits (0)', 'value': '$0.00'},
    {'label': 'Late ACH Debits