In [0]:
%pip install --upgrade "openai==1.105.0" "pymupdf>=1.24.0" pillow "pytesseract>=0.3.10" "jsonschema>=4.22.0"



Note: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.


In [0]:
dbutils.library.restartPython()

In [0]:
# ==== CONFIG ====
# Default for the `use_ocr` argument of extract_structured_pdf: when True the
# text is obtained via OCR (for scanned/blurry PDFs) instead of the native layer.
USE_OCR: bool = True

# Default rendering resolution in dots per inch; higher values improve OCR accuracy.
OCR_DPI: int = 600

# Language code handed to pytesseract; use e.g. "eng+hin" for multi-language input.
OCR_LANG: str = "eng"

# Model identifier passed to client.responses.create.
MODEL: str = "gpt-4o-2024-08-06"

import os, json, re, io, base64
from typing import List, Dict

from openai import OpenAI
from jsonschema import Draft202012Validator

# --- API KEY ---
# The key is fetched from the secret scope "kv" (key "openai-api-key") and
# exported as the OPENAI_API_KEY environment variable BEFORE the client is
# built. OpenAI() is called with no arguments, so it presumably picks the key
# up from the environment -- keep these three statements in this order.
#
# If dbutils.secrets is not available (e.g. Community Edition), set the
# variable manually instead, but never commit a real key to the notebook:
# os.environ["OPENAI_API_KEY"] = "sk-xxxx"
OPENAI_API_KEY = dbutils.secrets.get(scope="kv", key="openai-api-key")
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
client = OpenAI()

# ==== Schema definition (matches your expected output) ====
# A single label/value pair inside a section; `value` may be null.
_label_value_schema = {
    "type": "object",
    "additionalProperties": False,
    "properties": {
        "label": {"type": "string"},
        "value": {"type": ["string", "null"]},
    },
    "required": ["label", "value"],
}

# A named section holding a list of label/value pairs.
_section_schema = {
    "type": "object",
    "additionalProperties": False,
    "properties": {
        "section": {"type": "string"},
        "items": {"type": "array", "items": _label_value_schema},
    },
    "required": ["section", "items"],
}

# One transaction row: a date and a balance (number, string, or null).
_transaction_schema = {
    "type": "object",
    "additionalProperties": False,
    "properties": {
        "value_date": {"type": ["string", "null"]},
        "value_balance": {"type": ["number", "string", "null"]},
    },
    "required": ["value_date", "value_balance"],
}

# Top-level document schema. Key order is kept stable on purpose because the
# schema is serialised with json.dumps into the system prompt.
structured_schema = {
    "type": "object",
    "additionalProperties": False,
    "properties": {
        "document_title": {"type": ["string", "null"]},
        "summary": {"type": ["string", "null"]},
        "sections": {"type": "array", "items": _section_schema},
        "transactions": {"type": "array", "items": _transaction_schema},
    },
    "required": ["document_title", "summary", "sections", "transactions"],
}

# Reusable validator instance used to check model output against the schema.
validator = Draft202012Validator(structured_schema)

# ==== PDF text extraction & OCR ====
import fitz
from PIL import Image, ImageEnhance

def get_text_native(path: str, max_pages: int = 50) -> str:
    """Return the embedded (native) text of the first pages of a PDF.

    Args:
        path: Location of the PDF, handed to ``fitz.open``.
        max_pages: Maximum number of leading pages to read.

    Returns:
        The per-page text joined with newlines. A page for which
        ``get_text`` returns a falsy value contributes an empty string.

    Raises:
        Whatever ``fitz.open`` or ``Page.get_text`` raise; the document is
        closed before the exception propagates.
    """
    parts = []
    # The context manager guarantees the document is closed even when a page
    # fails mid-loop (the previous explicit close() was skipped on error).
    with fitz.open(path) as doc:
        for i, page in enumerate(doc):
            if i >= max_pages:
                break
            parts.append(page.get_text("text", sort=True) or "")
    return "\n".join(parts)

def get_text_ocr(path: str, dpi: int = OCR_DPI, max_pages: int = 50, lang: str = OCR_LANG) -> str:
    """OCR the first pages of a PDF and return the recognised text.

    Each page is rasterised at ``dpi``, converted to grayscale, given a
    contrast (x2.0) and sharpness (x1.5) boost, and passed to
    ``pytesseract.image_to_string``.

    Args:
        path: Location of the PDF, handed to ``fitz.open``.
        dpi: Rendering resolution; the zoom factor applied is ``dpi / 72``.
        max_pages: Maximum number of leading pages to process.
        lang: Language code(s) forwarded to pytesseract.

    Returns:
        The per-page OCR text joined with newlines; a page with no recognised
        text contributes an empty string.

    Raises:
        Whatever PyMuPDF, Pillow or pytesseract raise; the document is closed
        before the exception propagates.
    """
    import pytesseract  # imported lazily, only the OCR path needs it

    parts = []
    # The context manager closes the document even if rendering or OCR fails
    # part-way through (the previous explicit close() was skipped on error).
    with fitz.open(path) as doc:
        zoom = dpi / 72.0
        matrix = fitz.Matrix(zoom, zoom)
        for i, page in enumerate(doc):
            if i >= max_pages:
                break
            pix = page.get_pixmap(matrix=matrix, alpha=False)
            # Round-trip through PNG bytes to obtain a PIL image, then grayscale.
            img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("L")
            img = ImageEnhance.Contrast(img).enhance(2.0)
            img = ImageEnhance.Sharpness(img).enhance(1.5)
            parts.append(pytesseract.image_to_string(img, lang=lang) or "")
    return "\n".join(parts)

def render_pages_as_data_urls(path: str, dpi: int = OCR_DPI, max_pages: int = 20) -> List[str]:
    """Render the first pages of a PDF as base64 PNG data URLs.

    Args:
        path: Location of the PDF, handed to ``fitz.open``.
        dpi: Rendering resolution; the zoom factor applied is ``dpi / 72``.
        max_pages: Maximum number of leading pages to render.

    Returns:
        One ``"data:image/png;base64,..."`` string per rendered page, in page
        order.

    Raises:
        Whatever PyMuPDF raises while opening or rendering; the document is
        closed before the exception propagates.
    """
    urls = []
    # The context manager closes the document even if a page fails to render
    # (the previous explicit close() was skipped on error).
    with fitz.open(path) as doc:
        zoom = dpi / 72.0
        matrix = fitz.Matrix(zoom, zoom)
        for i, page in enumerate(doc):
            if i >= max_pages:
                break
            png = page.get_pixmap(matrix=matrix, alpha=False).tobytes("png")
            urls.append("data:image/png;base64," + base64.b64encode(png).decode("utf-8"))
    return urls

# ==== Seed parse (minimal) ====
def parse_deterministic(text: str) -> dict:
    """Build the minimal seed document from extracted text.

    Only ``document_title`` is filled in: it is the first line that still has
    content after stripping whitespace. Leading blank lines are skipped so
    that text starting with empty lines still yields a title.

    Args:
        text: Extracted document text; ``None`` or an empty string is allowed.

    Returns:
        A dict with the keys ``document_title`` (str or None), ``summary``
        (None), ``sections`` (empty list) and ``transactions`` (empty list).
    """
    stripped_lines = (line.strip() for line in (text or "").splitlines())
    title = next((line for line in stripped_lines if line), None)
    return {
        "document_title": title,
        "summary": None,
        "sections": [],
        "transactions": [],
    }

# ==== GPT-4o System Prompt ====
# System prompt sent with every request: fixed instructions with the JSON
# schema (serialised via json.dumps) embedded between them.
JSON_ONLY_SYSTEM = "".join([
    "You are an OCR + information extraction agent. ",
    "You MUST NOT invent values. ",
    "Return ONLY one JSON object with exactly this schema: ",
    json.dumps(structured_schema),
    " Always include these sections if present in the document: ",
    "'Account details', 'Bank details', 'Account balances', 'Selected view'. ",
    "For each section, extract label→value pairs. ",
    "For transactions, extract value_date and value_balance. ",
    "If something is missing, still include it but set value to null. ",
    "Output JSON only, no commentary or code fences.",
])

def validate_json(s: str) -> dict:
    """Parse a model reply as JSON and check it against the module validator.

    A leading and/or trailing Markdown code fence (optionally tagged ``json``,
    case-insensitive) is removed from the stripped reply before parsing.

    Args:
        s: Raw text returned by the model.

    Returns:
        The decoded JSON value after ``validator.validate`` accepted it.

    Raises:
        json.JSONDecodeError: If the unfenced text is not valid JSON.
        Any exception raised by ``validator.validate`` for a non-conforming value.
    """
    fence_pattern = r"^```(?:json)?\s*|\s*```$"
    unfenced = re.sub(fence_pattern, "", s.strip(), flags=re.I | re.DOTALL)
    parsed = json.loads(unfenced)
    validator.validate(parsed)
    return parsed

def gpt4o_complete(pdf_path: str, ocr_text: str, seed_doc: dict, dpi: int = OCR_DPI) -> dict:
    """Ask the model to produce the structured document for one PDF.

    The request contains the system prompt, the extracted text, the seed
    parse, and one image per rendered page.

    Args:
        pdf_path: Location of the PDF whose pages are rendered as images.
        ocr_text: Text previously extracted from the PDF (OCR or native).
        seed_doc: Partial parse passed to the model as a starting point.
        dpi: Resolution used to render the page images. Defaults to
            ``OCR_DPI``, which is what was always used before this parameter
            existed.

    Returns:
        The model reply, parsed and validated by ``validate_json``.

    Raises:
        Whatever the rendering, the API call or ``validate_json`` raise.
    """
    images = render_pages_as_data_urls(pdf_path, dpi=dpi)
    # Both text inputs are capped at 120000 characters. The cap on the seed is
    # applied to the serialised string, so an oversized seed is cut mid-JSON.
    ocr_chunk = ocr_text[:120000]
    seed_chunk = json.dumps(seed_doc)[:120000]
    user_content = [
        {"type": "input_text", "text": "Here is OCR/native text:\n" + ocr_chunk},
        {"type": "input_text", "text": "Here is a partial parse:\n" + seed_chunk},
    ]
    user_content.extend({"type": "input_image", "image_url": u} for u in images)
    messages = [
        {"role": "system", "content": JSON_ONLY_SYSTEM},
        {"role": "user", "content": user_content},
    ]
    resp = client.responses.create(model=MODEL, input=messages, temperature=0)
    return validate_json(resp.output_text)

# ==== Main orchestrator (AI always) ====
def extract_structured_pdf(pdf_path: str, use_ocr: bool = USE_OCR, dpi: int = OCR_DPI, lang: str = OCR_LANG) -> dict:
    """Extract text from a PDF and have the model turn it into structured data.

    Args:
        pdf_path: Location of the PDF.
        use_ocr: When true the text comes from ``get_text_ocr``; otherwise
            from ``get_text_native``.
        dpi: Resolution used for the OCR rendering and for the page images
            sent to the model (previously the images ignored this argument
            and were always rendered at ``OCR_DPI``).
        lang: OCR language code(s); only used when ``use_ocr`` is true.

    Returns:
        The validated structured document returned by ``gpt4o_complete``.
    """
    if use_ocr:
        text = get_text_ocr(pdf_path, dpi=dpi, lang=lang)
    else:
        text = get_text_native(pdf_path)
    seed = parse_deterministic(text)
    return gpt4o_complete(pdf_path, text, seed, dpi=dpi)



In [0]:

# Input PDF to process -- change if needed.
pdf_path = "/Volumes/workspace/default/pdfvolume/Reffidity_pdf.pdf"

# ---------- Extract the structured data using the pipeline ----------
# Pass the CONFIG constants rather than repeating their literal values, so a
# change in the config cell is actually picked up by this run.
doc = extract_structured_pdf(pdf_path, use_ocr=USE_OCR, dpi=OCR_DPI, lang=OCR_LANG)
display(doc)


{'document_title': 'Liquid Account Details: ABCDEFGH/157942833/EUR',
 'summary': None,
 'sections': [{'section': 'Account details',
   'items': [{'label': 'Account', 'value': 'ABCDEFGH/1157942833/EUR'},
    {'label': 'Description', 'value': None},
    {'label': 'Account number', 'value': '134283795'},
    {'label': 'IBAN', 'value': 'NT 76 310154283379 0000'},
    {'label': 'Account owner', 'value': 'JITENDRA TRANSACTION SERVICES'},
    {'label': 'Country', 'value': 'Newzeland (NT)'},
    {'label': 'Currency', 'value': 'EUR (Euro)'},
    {'label': 'Account group', 'value': None}]},
  {'section': 'Bank details',
   'items': [{'label': 'Bank', 'value': '<Undefined>'},
    {'label': 'Bank code', 'value': '51000'},
    {'label': 'BIC', 'value': 'ABCDEFGH'},
    {'label': 'Bank access', 'value': 'Reffidity Bank International'}]},
  {'section': 'Account balances', 'items': []},
  {'section': 'Selected view',
   'items': [{'label': 'Filter settings', 'value': None},
    {'label': 'Starting dat