In [1]:
import typing_extensions as _te

if not hasattr(_te, "Sentinel"):
    # Compatibility shim: the installed typing_extensions has no `Sentinel`,
    # so a minimal replacement is attached to the module before the imports
    # that follow. NOTE(review): presumably one of those imports needs it at
    # import time -- confirm which, and drop the shim once typing_extensions
    # is upgraded.
    class _Sentinel:
        """Minimal stand-in for ``typing_extensions.Sentinel``.

        Stores the sentinel's name and returns a fixed string from ``repr()``.
        """

        def __init__(self, name: str, repr: "str | None" = None):
            self.name = name
            # Keep this shim's established default (the bare name) unless the
            # caller supplies an explicit repr string.
            self._repr = name if repr is None else repr

        def __repr__(self):
            return self._repr

    def Sentinel(name: str, repr: "str | None" = None):
        """Create a sentinel object.

        Args:
            name: Name of the sentinel; also its default ``repr()``.
            repr: Optional explicit ``repr()`` string. The upstream
                constructor accepts this argument as well, so it is accepted
                here to keep callers that pass it from raising ``TypeError``.

        Returns:
            A new ``_Sentinel`` instance.
        """
        return _Sentinel(name, repr)

    _te.Sentinel = Sentinel
# --- END PATCH ---

from openai import OpenAI
import json
import re
import pandas as pd
from pathlib import Path

In [2]:
from pathlib import Path

# Location of the extraction prompt, relative to the working directory.
PROMPT_PATH = Path("prompt/extract_sankey_data.txt")


def load_prompt(path: Path = PROMPT_PATH) -> str:
    """Return the full text of the prompt file.

    Args:
        path: File to read; defaults to ``PROMPT_PATH``.

    Returns:
        The file's contents decoded as UTF-8.
    """
    with path.open("r", encoding="utf-8") as prompt_file:
        contents = prompt_file.read()
    return contents

# Read the prompt from the default PROMPT_PATH once, when this cell runs.
prompt_text = load_prompt()

In [None]:
def get_report(code, prompt_text, model="gpt-5"):
    """Upload ``data/<code>.pdf`` and ask the model to process it with the prompt.

    The PDF is uploaded with ``client.files.create`` and then referenced by id,
    together with ``prompt_text``, in a single user message sent through
    ``client.responses.create``. The model's text output is printed.

    Args:
        code: Identifier used to locate the input file ``data/<code>.pdf``.
        prompt_text: Instruction text sent alongside the uploaded file.
        model: Model name passed to the Responses API (defaults to "gpt-5").

    Returns:
        The response object returned by ``client.responses.create``.

    Raises:
        FileNotFoundError: If ``data/<code>.pdf`` does not exist.
    """
    client = OpenAI()

    # Use a context manager so the local file handle is closed as soon as the
    # upload call returns (or fails), instead of staying open indefinitely.
    with open(f"data/{code}.pdf", "rb") as pdf_file:
        file = client.files.create(
            file=pdf_file,
            purpose="user_data"
        )

    response = client.responses.create(
        model=model,
        input=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "input_file",
                        "file_id": file.id,
                    },
                    {
                        "type": "input_text",
                        "text": prompt_text,
                    },
                ]
            }
        ]
    )

    print(response.output_text)

    return response

In [4]:
# Matches an optional Markdown code fence (``` or ```json) wrapping the whole payload.
_CODE_FENCE_RE = re.compile(
    r"\A\s*```[A-Za-z0-9_-]*[ \t]*\r?\n(.*?)\r?\n?[ \t]*```\s*\Z",
    re.DOTALL,
)

# Tokenizer for the repair pass. String literals are matched first so that the
# other two alternatives can never fire on text inside a JSON string value.
_JSON_REPAIR_RE = re.compile(
    r'''
    "(?:[^"\\]|\\.)*"        # a complete JSON string literal (kept as-is)
    | -\?(?=\s*[,}\]])       # placeholder `-?` standing where a value belongs
    | (?<=\d)\?              # uncertainty mark glued to a number, e.g. `108?`
    ''',
    re.VERBOSE,
)


def _strip_code_fence(text):
    """Return the payload inside a surrounding Markdown code fence, or text unchanged."""
    fenced = _CODE_FENCE_RE.match(text)
    return fenced.group(1) if fenced else text


def _repair_token(match):
    """Map one token found by ``_JSON_REPAIR_RE`` to its valid-JSON replacement."""
    token = match.group(0)
    if token.startswith('"'):
        # String literal: never rewrite user-visible text.
        return token
    # `-?` becomes null; a lone `?` after a digit is dropped.
    return "null" if token == "-?" else ""


def process_table(response, code):
    """Repair the model's near-JSON output, parse it, and save it as JSON.

    The text in ``response.output_text`` is cleaned so that it parses:

    * a Markdown code fence around the whole payload is removed;
    * ``-?`` used in place of a value (before ``,``, ``}`` or ``]``) becomes
      ``null``;
    * a ``?`` directly after a digit (``108?``) is removed.

    The repairs are applied only outside JSON string literals, so text values
    such as ``"Up in 2025?"`` are preserved exactly.

    Args:
        response: Object exposing the model's text as ``output_text``.
        code: Identifier used for the output file name ``output/<code>.json``.

    Raises:
        json.JSONDecodeError: If the text is still not valid JSON after repair.
    """
    # 1) get the text from the response
    output_text = response.output_text

    # 2) make it valid JSON without touching the contents of string values
    clean = _strip_code_fence(output_text)
    clean = _JSON_REPAIR_RE.sub(_repair_token, clean)

    # 3) parse to python
    data = json.loads(clean)

    # define output path
    output_json_path = Path("output") / f"{code}.json"
    output_json_path.parent.mkdir(parents=True, exist_ok=True)

    # save JSON to file (nicely formatted)
    with open(output_json_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f"✅ JSON saved to: {output_json_path}")


In [15]:
# Run the pipeline for one report: `code` selects data/<code>.pdf as the input
# and output/<code>.json as the saved result.
code = 'KEJU'
response = get_report(code, prompt_text)
process_table(response, code)

{
  "meta": {
    "company": "PT Mulia Boga Raya Tbk",
    "period_label": "9M2025 vs 9M2024",
    "currency": "IDR",
    "unit": "million",
    "source_type": "pdf",
    "source_notes": "Standalone interim financial statements: 9M2025 unaudited; 2024 audited",
    "version": "sankey-v1.1",
    "validation": {
      "rev_breakdown_basis": "segment",
      "rev_breakdown_sum": 1088770.621179,
      "rev_total": 1088770.621179,
      "rev_breakdown_tolerance": 0.005,
      "rev_breakdown_ok": true,
      "cogs_sub_sum": -752226.702799,
      "cogs_total": -752226.702799,
      "cogs_tolerance": 0.01,
      "cogs_reconciles": true,
      "opex_parts_sum": -162161.415984,
      "opex_total": -162161.415984,
      "opex_tolerance": 0.01,
      "opex_reconciles": true
    }
  },
  "table": [
    {
      "anchor": "REV_TOTAL",
      "hierarchy": ["Revenue", "", ""],
      "display_name": "Net sales",
      "description": "Total revenue for the period",
      "current": 1088770.621179,
      "