In [1]:
import json
import os
import sys
from typing import Any, List, Dict, Union
import yaml

# The project root is the parent of the current working directory; expose its
# `src` folder on the import path so the project-local imports below resolve.
root_dir = os.path.split(os.getcwd())[0]
sys.path += [os.path.join(root_dir, "src")]

from openai import OpenAI

from config import get_settings
from data_cutter.types.prompt.template import (
    TextTemplate,
    ImageTemplate,
    IterableTemplate,
    PromptTemplate
)
from data_cutter.types.output_schema import StructuredOutputSchema
from data_cutter.model_maker.maker import PydanticModelMaker

# Project settings object; VLM_BASE_URL, VLM_API_KEY and VLM_MODEL are read from it
# when the client is created and the completion is requested further down.
settings = get_settings()

In [2]:
prompt_template_fpath = "tasks/table_extraction/prompt_template.yaml"

# Read the YAML prompt definition as UTF-8 text, parse it, then validate the
# parsed mapping into a PromptTemplate.
with open(prompt_template_fpath, encoding="utf-8") as f:
    data = yaml.safe_load(f.read())
template = PromptTemplate.model_validate(data)

In [3]:
# First message: the instruction text, parameterised by the `grammar` variable.
template.messages[0]

MessageTemplate(role='user', contents=[TextTemplate(type='text', value='Parse all tables (parsed_text) into {grammar} from given page images and extract their bounding boxes (bbox)\n- parsed_text should be like "<table>...</table>"\nReturn in the following JSON\n{{\n  "items": [\n    {{\n      "document_no": int,\n      "page_no": int,\n      "parsed_text": "{grammar} parsed table text",\n      "bbox": {{\n        "x1": int,\n        "y1": int,\n        "x2": int,\n        "y2": int\n      }}\n    }}\n  ]\n}}', input_variables=['grammar'])])

In [4]:
# Second message: a nested iterable (documents -> pages) yielding a title, page label and page image.
template.messages[1]

MessageTemplate(role='user', contents=[IterableTemplate(type='iterable', input_variable='documents', items=[TextTemplate(type='text', value='Document Title: {title}', input_variables=['title']), IterableTemplate(type='iterable', input_variable='pages', items=[TextTemplate(type='text', value='Page {page_no}', input_variables=[]), ImageTemplate(type='image', input_name='page_image')])])])

In [5]:
class PromptTemplateProcessor:
    """Render a validated ``PromptTemplate`` into a list of chat messages.

    Every message template becomes ``{"role": ..., "content": [...]}``, where
    ``content`` is a flat list of ``{"type": "text", ...}`` and
    ``{"type": "image_url", ...}`` parts.

    Annotations that name ``data_cutter`` models are written as forward
    references (strings), so defining this class does not by itself require
    those names to be resolvable.
    """

    def __init__(self, template: "PromptTemplate"):
        """Keep a reference to the already-validated template to render."""
        self.template = template

    @classmethod
    def from_yaml(cls, yaml_path: str) -> "PromptTemplateProcessor":
        """Load and validate a prompt template from a UTF-8 YAML file.

        Args:
            yaml_path: Path of the YAML file describing the prompt template.

        Returns:
            A processor wrapping the validated template.
        """
        with open(yaml_path, "r", encoding="utf-8") as f:
            # Single parsing/validation path shared with from_yaml_string.
            return cls.from_yaml_string(f.read())

    @classmethod
    def from_yaml_string(cls, yaml_string: str) -> "PromptTemplateProcessor":
        """Load and validate a prompt template from a YAML document string.

        Args:
            yaml_string: YAML text describing the prompt template.

        Returns:
            A processor wrapping the validated template.
        """
        data = yaml.safe_load(yaml_string)
        return cls(PromptTemplate.model_validate(data))

    def _format_string(self, value: str, variables: Dict[str, Any]) -> str:
        """Substitute ``variables`` into ``value`` according to ``template_format``.

        Args:
            value: Template text.
            variables: Every variable visible at this point of the template;
                names the text does not reference are ignored.

        Returns:
            The rendered text. A ``template_format`` other than ``"f-string"``
            or ``"jinja2"`` leaves ``value`` unchanged.

        Raises:
            KeyError: In ``"f-string"`` mode, when the text references a
                placeholder that ``variables`` cannot resolve.
        """
        template_format = self.template.template_format

        if template_format == "f-string":
            try:
                return value.format(**variables)
            except KeyError as exc:
                # str.format reports only the bare key; re-raise the same
                # exception type with enough context to locate the problem.
                raise KeyError(
                    f"Template placeholder {exc.args[0]!r} could not be resolved "
                    f"in {value!r}; available variables: {list(variables)}"
                ) from exc

        if template_format == "jinja2":
            # Imported lazily so jinja2 is only needed for jinja2 templates.
            from jinja2 import Template

            return Template(value).render(**variables)

        return value

    def _process_text_template(
        self,
        item: "TextTemplate",
        variables: Dict[str, Any],
    ) -> List[Dict[str, Any]]:
        """Render a text item into a single ``{"type": "text"}`` content part."""
        formatted_text = self._format_string(item.value, variables)
        return [{"type": "text", "text": formatted_text}]

    def _process_image_template(
        self,
        item: "ImageTemplate",
        variables: Dict[str, Any],
    ) -> List[Dict[str, Any]]:
        """Render an image item into a single ``{"type": "image_url"}`` part.

        Args:
            item: Image template; ``item.input_name`` names the variable that
                holds the image URL.
            variables: Variables visible at this point of the template.

        Raises:
            KeyError: When ``item.input_name`` is absent from ``variables``.
                An empty URL is never emitted on the caller's behalf.
        """
        if item.input_name not in variables:
            raise KeyError(
                f"Image input {item.input_name!r} was not provided; "
                f"available variables: {list(variables)}"
            )
        image_url = variables[item.input_name]
        return [{"type": "image_url", "image_url": {"url": image_url}}]

    def _process_iterable_template(
        self,
        item: "IterableTemplate",
        variables: Dict[str, Any],
    ) -> List[Dict[str, Any]]:
        """Expand an iterable item once per element of its input variable.

        Args:
            item: Iterable template; ``item.input_variable`` names the list to
                iterate and ``item.items`` are the sub-templates rendered for
                every element.
            variables: Variables visible at this point of the template. A
                missing ``item.input_variable`` is treated as an empty list.

        Returns:
            The content parts of all elements, concatenated in order.

        Raises:
            ValueError: When the input variable is neither a list nor a tuple.
        """
        results: List[Dict[str, Any]] = []
        iterable_data = variables.get(item.input_variable, [])

        if not isinstance(iterable_data, (list, tuple)):
            raise ValueError(
                f"Variable '{item.input_variable}' must be a list, "
                f"got {type(iterable_data).__name__}"
            )

        for item_data in iterable_data:
            if isinstance(item_data, dict):
                # Element keys shadow same-named parent variables.
                merged_vars = {**variables, **item_data}
            else:
                # A scalar element is exposed under the iterable's own name.
                merged_vars = {**variables, item.input_variable: item_data}

            for sub_item in item.items:
                results.extend(self._process_template_item(sub_item, merged_vars))

        return results

    def _process_template_item(
        self,
        item: "Union[TextTemplate, ImageTemplate, IterableTemplate]",
        variables: Dict[str, Any],
    ) -> List[Dict[str, Any]]:
        """Dispatch ``item`` to the renderer matching its template type.

        Raises:
            ValueError: When ``item`` is none of the supported template types.
        """
        if isinstance(item, TextTemplate):
            return self._process_text_template(item, variables)
        elif isinstance(item, ImageTemplate):
            return self._process_image_template(item, variables)
        elif isinstance(item, IterableTemplate):
            return self._process_iterable_template(item, variables)
        else:
            raise ValueError(f"Unknown template item type: {type(item)}")

    def format(self, variables: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Format the prompt template with provided variables.

        Args:
            variables: Dictionary containing all variables needed for formatting

        Returns:
            List of formatted messages, one ``{"role", "content"}`` dict per
            message template, in template order
        """
        messages = []

        for message_template in self.template.messages:
            content: List[Dict[str, Any]] = []
            for item in message_template.contents:
                content.extend(self._process_template_item(item, variables))

            messages.append({
                "role": message_template.role,
                "content": content
            })

        return messages

In [6]:
# Wrap the template validated above in a processor; nothing is rendered yet.
# (PromptTemplateProcessor.from_yaml(prompt_template_fpath) performs the same
# load + validate + wrap in a single call.)
processor = PromptTemplateProcessor(template=template)

In [7]:
input_example_fpath = "tasks/table_extraction/input_example.json"

# Decode the example inputs from their UTF-8 JSON file.
with open(input_example_fpath, encoding="utf-8") as f:
    input_variables = json.loads(f.read())

# Show which top-level variables the example provides.
print(input_variables.keys())

dict_keys(['grammar', 'documents'])


In [8]:
# Render the template with the example inputs into a list of {"role", "content"} messages.
messages = processor.format(input_variables)

In [9]:
# Pretty-print the rendered messages for inspection (`json` is imported in the first cell).
print(json.dumps(messages, indent=2))

[
  {
    "role": "user",
    "content": [
      {
        "type": "text",
        "text": "Parse all tables (parsed_text) into html from given page images and extract their bounding boxes (bbox)\n- parsed_text should be like \"<table>...</table>\"\nReturn in the following JSON\n{\n  \"items\": [\n    {\n      \"document_no\": int,\n      \"page_no\": int,\n      \"parsed_text\": \"html parsed table text\",\n      \"bbox\": {\n        \"x1\": int,\n        \"y1\": int,\n        \"x2\": int,\n        \"y2\": int\n      }\n    }\n  ]\n}"
      }
    ]
  },
  {
    "role": "user",
    "content": [
      {
        "type": "text",
        "text": "Document Title: document1"
      },
      {
        "type": "text",
        "text": "Page 1"
      },
      {
        "type": "image_url",
        "image_url": {
          "url": "https://ofasys-multimodal-wlcb-3-toshanghai.oss-cn-shanghai.aliyuncs.com/Qwen3VL/demo/omni_parsing/179729.jpg"
        }
      },
      {
        "type": "text",
       

# Output Schema

In [10]:
output_schema_fpath = "tasks/table_extraction/output_schema.json"

# Keep the raw JSON under its own name so `output_schema` only ever refers to
# the validated StructuredOutputSchema, never to the undecoded dict.
with open(output_schema_fpath, "r", encoding="utf-8") as f:
    output_schema_raw = json.load(f)

output_schema = StructuredOutputSchema.model_validate(output_schema_raw)

In [11]:
# The model specification that is handed to PydanticModelMaker in the next cell.
output_schema.definition

ModelSpecification(name='Result', fields=[FieldSpec(name='items', specification=DtypeSpecification(dim=1, dtype='Table', allowed_values=None, optional=False, description=None, pattern=None, format=None, multipleOf=None, minimum=None, maximum=None, exclusiveMinimum=None, exclusiveMaximum=None, minItems=None, maxItems=None))], custom_dtypes=[CustomDTypeSpecification(name='Table', fields=[FieldSpec(name='document_no', specification=DtypeSpecification(dim=0, dtype='integer', allowed_values=None, optional=False, description=None, pattern=None, format=None, multipleOf=None, minimum=None, maximum=None, exclusiveMinimum=None, exclusiveMaximum=None, minItems=None, maxItems=None)), FieldSpec(name='page_no', specification=DtypeSpecification(dim=0, dtype='integer', allowed_values=None, optional=False, description=None, pattern=None, format=None, multipleOf=None, minimum=None, maximum=None, exclusiveMinimum=None, exclusiveMaximum=None, minItems=None, maxItems=None)), FieldSpec(name='parsed_text', s

In [12]:
# Build the output model from the specification; it is later passed as
# `response_format` to the completion request.
output_model = PydanticModelMaker().make(output_schema.definition)

In [13]:
# Show the JSON schema generated for the output model.
print(json.dumps(output_model.model_json_schema(), indent=2))

{
  "$defs": {
    "Bbox": {
      "additionalProperties": false,
      "description": "Bounding box for grounding tasks",
      "properties": {
        "x1": {
          "title": "X1",
          "type": "integer"
        },
        "y1": {
          "title": "Y1",
          "type": "integer"
        },
        "x2": {
          "title": "X2",
          "type": "integer"
        },
        "y2": {
          "title": "Y2",
          "type": "integer"
        }
      },
      "required": [
        "x1",
        "y1",
        "x2",
        "y2"
      ],
      "title": "Bbox",
      "type": "object"
    },
    "Table": {
      "additionalProperties": false,
      "properties": {
        "document_no": {
          "title": "Document No",
          "type": "integer"
        },
        "page_no": {
          "title": "Page No",
          "type": "integer"
        },
        "parsed_text": {
          "title": "Parsed Text",
          "type": "string"
        },
        "bbox": {
          "$r

# Generation (VLM)

In [14]:
# Client for the VLM endpoint; base URL and API key come from the project
# settings rather than being hard-coded in the notebook.
client = OpenAI(
    base_url=settings.VLM_BASE_URL,
    api_key=settings.VLM_API_KEY,
)

In [15]:
# Send the rendered prompt and ask for the reply to be parsed into `output_model`.
response = client.chat.completions.parse(
    model=settings.VLM_MODEL,
    messages=messages,
    response_format=output_model,
    temperature=0.2
)

In [16]:
# Display the full completion object, including the raw JSON content.
response

ParsedChatCompletion[Result](id='chatcmpl-95451e29c9d9571c', choices=[ParsedChoice[Result](finish_reason='stop', index=0, logprobs=None, message=ParsedChatCompletionMessage[Result](content='{\n  "items": [\n    {\n      "document_no": 1,\n      "page_no": 1,\n      "parsed_text": "<table><tr><th>主要数据</th></tr><tr><td>行业</td><td>汽车和汽车零部件</td></tr><tr><td>公司网址</td><td>www.yutong.com</td></tr><tr><td>大股东/持股</td><td>宇通集团/37.19%</td></tr><tr><td>总股本(百万股)</td><td>2,214</td></tr><tr><td>流通 A 股(百万股)</td><td>1,903</td></tr><tr><td>流通 B/H 股(百万股)</td><td>0</td></tr><tr><td>总市值 ( 亿元 )</td><td>488.39</td></tr><tr><td>流通 A 股市值(亿元)</td><td>419.91</td></tr><tr><td>每股净资产(元)</td><td>6.28</td></tr><tr><td>资产负债率(%)</td><td>52.30</td></tr></table>",\n      "bbox": {\n        "x1": 104,\n        "y1": 275,\n        "x2": 380,\n        "y2": 448\n      }\n    },\n    {\n      "document_no": 1,\n      "page_no": 1,\n      "parsed_text": "<table><tr><th>2015A</th><th>2016A</th><th>2017E</th><th>2018E</th><th>2

In [17]:
# `parsed` is None when the model produced no structured output (e.g. a refusal).
# Fail here with the reason instead of with an AttributeError in a later cell.
result = response.choices[0].message.parsed
if result is None:
    raise ValueError(
        "VLM returned no parsed output "
        f"(refusal: {response.choices[0].message.refusal!r})"
    )

In [18]:
# Pretty-print the parsed result as JSON.
print(result.model_dump_json(indent=2))

{
  "items": [
    {
      "document_no": 1,
      "page_no": 1,
      "parsed_text": "<table><tr><th>主要数据</th></tr><tr><td>行业</td><td>汽车和汽车零部件</td></tr><tr><td>公司网址</td><td>www.yutong.com</td></tr><tr><td>大股东/持股</td><td>宇通集团/37.19%</td></tr><tr><td>总股本(百万股)</td><td>2,214</td></tr><tr><td>流通 A 股(百万股)</td><td>1,903</td></tr><tr><td>流通 B/H 股(百万股)</td><td>0</td></tr><tr><td>总市值 ( 亿元 )</td><td>488.39</td></tr><tr><td>流通 A 股市值(亿元)</td><td>419.91</td></tr><tr><td>每股净资产(元)</td><td>6.28</td></tr><tr><td>资产负债率(%)</td><td>52.30</td></tr></table>",
      "bbox": {
        "x1": 104,
        "y1": 275,
        "x2": 380,
        "y2": 448
      }
    },
    {
      "document_no": 1,
      "page_no": 1,
      "parsed_text": "<table><tr><th>2015A</th><th>2016A</th><th>2017E</th><th>2018E</th><th>2019E</th></tr><tr><td>营业收入(百万元)</td><td>31211</td><td>35,850</td><td>38,715</td><td>43,226</td><td>46,108</td></tr><tr><td>YoY(%)</td><td>21.3</td><td>14.9</td><td>8.0</td><td>11.7</td><td>6.7</td></tr><tr>