In [1]:

import json
import os
import sys
from typing import Any, List, Dict, Union
import yaml

# Make the repository's `src/` directory importable from this notebook.
# NOTE(review): this takes the parent of the current working directory as the
# repo root, so the notebook must be started from a direct subdirectory of it.
root_dir = os.path.dirname(os.getcwd())
src_dir = os.path.join(root_dir, "src")
# Guard the append so re-running this cell does not accumulate duplicate
# entries in sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

from anthropic import Anthropic
from openai import OpenAI

from config import get_settings
from data_cutter.types.prompt.template import (
    TextTemplate,
    ImageTemplate,
    IterableTemplate,
    PromptTemplate
)
from data_cutter.types.output_schema import StructuredOutputSchema
from data_cutter.model_maker.maker import PydanticModelMaker
from data_cutter.formatter import (
    OpenAIPromptFormatter,
    AnthropicPromptFormatter
)

# Load the project settings once and read which VLM provider is configured.
# Later cells branch on `provider` being "openai" or "anthropic".
settings = get_settings()
provider = settings.VLM_PROVIDER

In [2]:
# Location of the YAML prompt template (relative to the working directory).
prompt_template_fpath = "tasks/table_extraction/prompt_template.yaml"

# Parse the YAML with the safe loader, then validate the parsed data as a
# PromptTemplate.
with open(prompt_template_fpath, "r", encoding="utf-8") as f:
    data = yaml.safe_load(f)
template = PromptTemplate.model_validate(data)

In [3]:
# Display the first message of the loaded template.
template.messages[0]

MessageTemplate(role='user', contents=[TextTemplate(type='text', value='Parse all tables (parsed_text) into {grammar} from given page images and extract their bounding boxes (bbox)\n- parsed_text should be like "<table>...</table>"\nReturn in the following JSON\n{{\n  "items": [\n    {{\n      "document_no": int,\n      "page_no": int,\n      "parsed_text": "{grammar} parsed table text",\n      "bbox": {{\n        "x1": int,\n        "y1": int,\n        "x2": int,\n        "y2": int\n      }}\n    }}\n  ]\n}}', input_variables=['grammar'])])

In [4]:
# Display the second message of the loaded template.
template.messages[1]

MessageTemplate(role='user', contents=[IterableTemplate(type='iterable', input_variable='documents', items=[TextTemplate(type='text', value='Document Title: {title}', input_variables=['title']), IterableTemplate(type='iterable', input_variable='pages', items=[TextTemplate(type='text', value='Page {page_no}', input_variables=['page_no']), ImageTemplate(type='image', input_name='page_image')])])])

In [5]:
# Example values for the template's input variables, stored as JSON.
input_example_fpath = "tasks/table_extraction/input_example.json"

# Read the whole file as UTF-8 text and decode it in one step.
with open(input_example_fpath, encoding="utf-8") as f:
    input_variables = json.loads(f.read())

# Show which top-level variables the example provides.
print(input_variables.keys())

dict_keys(['grammar', 'documents'])


In [6]:
# Reject unknown providers up front, then pick the matching formatter class
# and render the template with the example variables into `messages`.
if provider not in ("openai", "anthropic"):
    raise ValueError(f"Provider {provider} not recognized")

formatter_cls = (
    OpenAIPromptFormatter if provider == "openai" else AnthropicPromptFormatter
)
messages = formatter_cls.format(template=template, variables=input_variables)

In [7]:
# Pretty-print the formatted messages. `json` is already imported in the
# notebook's first cell, so it is not re-imported here.
print(json.dumps(messages, indent=2))

[
  {
    "role": "user",
    "content": [
      {
        "type": "text",
        "text": "Parse all tables (parsed_text) into html from given page images and extract their bounding boxes (bbox)\n- parsed_text should be like \"<table>...</table>\"\nReturn in the following JSON\n{\n  \"items\": [\n    {\n      \"document_no\": int,\n      \"page_no\": int,\n      \"parsed_text\": \"html parsed table text\",\n      \"bbox\": {\n        \"x1\": int,\n        \"y1\": int,\n        \"x2\": int,\n        \"y2\": int\n      }\n    }\n  ]\n}"
      }
    ]
  },
  {
    "role": "user",
    "content": [
      {
        "type": "text",
        "text": "Document Title: document1"
      },
      {
        "type": "text",
        "text": "Page 1"
      },
      {
        "type": "image",
        "source": {
          "type": "url",
          "url": "https://ofasys-multimodal-wlcb-3-toshanghai.oss-cn-shanghai.aliyuncs.com/Qwen3VL/demo/omni_parsing/179729.jpg"
        }
      },
      {
        "type

# Output Schema

In [8]:
# Location of the JSON description of the expected structured output.
output_schema_fpath = "tasks/table_extraction/output_schema.json"

# Keep the raw JSON dict and the validated schema under separate names, so
# `output_schema` only ever refers to a StructuredOutputSchema instance.
with open(output_schema_fpath, "r", encoding="utf-8") as f:
    output_schema_raw = json.load(f)

output_schema = StructuredOutputSchema.model_validate(output_schema_raw)

In [9]:
# Display the validated schema's `definition`, which the next cell passes to
# the model maker.
output_schema.definition

ModelSpecification(name='Result', fields=[FieldSpec(name='items', specification=DtypeSpecification(dim=1, dtype='Table', allowed_values=None, optional=False, description=None, pattern=None, format=None, multipleOf=None, minimum=None, maximum=None, exclusiveMinimum=None, exclusiveMaximum=None, minItems=None, maxItems=None))], custom_dtypes=[CustomDTypeSpecification(name='Table', fields=[FieldSpec(name='document_no', specification=DtypeSpecification(dim=0, dtype='integer', allowed_values=None, optional=False, description=None, pattern=None, format=None, multipleOf=None, minimum=None, maximum=None, exclusiveMinimum=None, exclusiveMaximum=None, minItems=None, maxItems=None)), FieldSpec(name='page_no', specification=DtypeSpecification(dim=0, dtype='integer', allowed_values=None, optional=False, description=None, pattern=None, format=None, multipleOf=None, minimum=None, maximum=None, exclusiveMinimum=None, exclusiveMaximum=None, minItems=None, maxItems=None)), FieldSpec(name='parsed_text', s

In [10]:
# Build `output_model` from the schema definition. It is used below both to
# print its JSON schema and as the structured-output format of the VLM call.
output_model = PydanticModelMaker().make(output_schema.definition)

In [11]:
# Pretty-print the JSON schema generated for `output_model`.
print(json.dumps(output_model.model_json_schema(), indent=2))

{
  "$defs": {
    "Bbox": {
      "additionalProperties": false,
      "description": "Bounding box for grounding tasks",
      "properties": {
        "x1": {
          "title": "X1",
          "type": "integer"
        },
        "y1": {
          "title": "Y1",
          "type": "integer"
        },
        "x2": {
          "title": "X2",
          "type": "integer"
        },
        "y2": {
          "title": "Y2",
          "type": "integer"
        }
      },
      "required": [
        "x1",
        "y1",
        "x2",
        "y2"
      ],
      "title": "Bbox",
      "type": "object"
    },
    "Table": {
      "additionalProperties": false,
      "properties": {
        "document_no": {
          "title": "Document No",
          "type": "integer"
        },
        "page_no": {
          "title": "Page No",
          "type": "integer"
        },
        "parsed_text": {
          "title": "Parsed Text",
          "type": "string"
        },
        "bbox": {
          "$r

# Generation (VLM)

In [12]:
# Request a structured completion from the configured provider.
# Each provider branch leaves the raw SDK response in `response` and the
# parsed output (expected to be an `output_model` instance) in `result`.
if provider == "openai":
    client = OpenAI(
        base_url=settings.VLM_BASE_URL,
        api_key=settings.VLM_API_KEY,
    )
    response = client.chat.completions.parse(
        model=settings.VLM_MODEL,
        messages=messages,
        response_format=output_model,
        temperature=0.2,
    )
    result = response.choices[0].message.parsed

elif provider == "anthropic":
    client = Anthropic(
        base_url=settings.VLM_BASE_URL,
        api_key=settings.VLM_API_KEY,
    )
    response = client.beta.messages.parse(
        model=settings.VLM_MODEL,
        max_tokens=4096,
        # Structured outputs are requested through this beta flag.
        betas=["structured-outputs-2025-11-13"],
        messages=messages,
        output_format=output_model,
    )
    result = response.parsed_output

else:
    raise ValueError(f"Provider {provider} not recognized")

# The SDKs may hand back no parsed object (for example on a refusal or an
# incomplete response). Fail here with a clear message instead of letting a
# later cell crash on `None`; `response` is already bound for inspection.
if result is None:
    raise RuntimeError(
        "VLM returned no parsed output (possible refusal or incomplete "
        "response); inspect `response` for details"
    )

In [13]:
# Display the raw SDK response object returned by the generation cell.
response

ParsedBetaMessage[TypeVar](id='msg_01CNMY51B2XD9UsDipS9WPb2', container=None, content=[ParsedBetaTextBlock[TypeVar](citations=None, text='{"items":[{"document_no":1,"page_no":1,"parsed_text":"<table><tr><td>行业</td><td>汽车和汽车零部件</td></tr><tr><td>公司网址</td><td>www.yutong.com</td></tr><tr><td>大股东/持股</td><td>宇通集团/37.19%</td></tr><tr><td>总股本(百万股)</td><td>2,214</td></tr><tr><td>流通 A 股(百万股)</td><td>1,903</td></tr><tr><td>流通 B/H 股(百万股)</td><td>0</td></tr><tr><td>总市值(亿元)</td><td>488.39</td></tr><tr><td>流通 A 股市值(亿元)</td><td>419.91</td></tr><tr><td>每股净资产(元)</td><td>6.28</td></tr><tr><td>资产负债率(%)</td><td>52.30</td></tr></table>","bbox":{"x1":86,"y1":354,"x2":358,"y2":577}},{"document_no":1,"page_no":1,"parsed_text":"<table><tr><td></td><td>2015A</td><td>2016A</td><td>2017E</td><td>2018E</td><td>2019E</td></tr><tr><td>营业收入(百万元)</td><td>31211</td><td>35,850</td><td>38,715</td><td>43,226</td><td>46,108</td></tr><tr><td>YoY(%)</td><td>21.3</td><td>14.9</td><td>8.0</td><td>11.7</td><td>6.7</td></tr><tr><

In [14]:
# Pretty-print the parsed result as indented JSON.
print(result.model_dump_json(indent=2))

{
  "items": [
    {
      "document_no": 1,
      "page_no": 1,
      "parsed_text": "<table><tr><td>行业</td><td>汽车和汽车零部件</td></tr><tr><td>公司网址</td><td>www.yutong.com</td></tr><tr><td>大股东/持股</td><td>宇通集团/37.19%</td></tr><tr><td>总股本(百万股)</td><td>2,214</td></tr><tr><td>流通 A 股(百万股)</td><td>1,903</td></tr><tr><td>流通 B/H 股(百万股)</td><td>0</td></tr><tr><td>总市值(亿元)</td><td>488.39</td></tr><tr><td>流通 A 股市值(亿元)</td><td>419.91</td></tr><tr><td>每股净资产(元)</td><td>6.28</td></tr><tr><td>资产负债率(%)</td><td>52.30</td></tr></table>",
      "bbox": {
        "x1": 86,
        "y1": 354,
        "x2": 358,
        "y2": 577
      }
    },
    {
      "document_no": 1,
      "page_no": 1,
      "parsed_text": "<table><tr><td></td><td>2015A</td><td>2016A</td><td>2017E</td><td>2018E</td><td>2019E</td></tr><tr><td>营业收入(百万元)</td><td>31211</td><td>35,850</td><td>38,715</td><td>43,226</td><td>46,108</td></tr><tr><td>YoY(%)</td><td>21.3</td><td>14.9</td><td>8.0</td><td>11.7</td><td>6.7</td></tr><tr><td>净利润(百万元)</td>