In [None]:
# !wget https://huggingface.co/datasets/bsmock/FinTabNet.c/resolve/main/FinTabNet.c-PDF_Annotations.tar.gz -O finqa_pdf.tar.gz
# !tar -xzvf finqa_pdf.tar.gz

In [None]:
# # download dataset from github
# !wget https://github.com/czyssrs/FinQA/archive/refs/heads/main.zip -O finqa.zip
# !unzip -o finqa.zip 

In [11]:
import json
import os
import xml.etree.ElementTree as ET
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()
client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=os.environ.get("NVIDIA_API_KEY")
)

# 2. Read in train.json
def load_finqa_sample(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data  # 先取前 5 筆做實驗

# 假設 train.json 與 notebook 在同一目錄
dataset = load_finqa_sample("/root/hsin_research/FinQA-main/dataset/train.json")

print(f"Loaded {len(dataset)} samples for testing.")
print("Sample Question 1:", dataset[0]['qa']['question'])

Loaded 6251 samples for testing.
Sample Question 1: what is the the interest expense in 2009?


In [19]:
print(dataset[0]['qa'].keys())
print(dataset[0]['qa']['question'])
print()
print(dataset[0]['qa']['gold_inds'])
print()
print(dataset[0]['qa']['program'])
print()
print(dataset[0]['qa']['answer'])
print()
print(dataset[0]['qa']['ann_table_rows'])
# divide(100, 100), divide(3.8, #0)
#                          =>
# divide(
# 100
#     ......
# #0
# )

dict_keys(['question', 'answer', 'explanation', 'ann_table_rows', 'ann_text_rows', 'steps', 'program', 'gold_inds', 'exe_ans', 'tfidftopn', 'program_re', 'model_input'])
what is the the interest expense in 2009?

{'text_1': 'if libor changes by 100 basis points , our annual interest expense would change by $ 3.8 million .'}

divide(100, 100), divide(3.8, #0)

380
[]


In [3]:
#  XML Rulebook
rulebook_xml_content = """
<Rulebook domain="finqa_reasoning">
    <Rule id="01" phase="generation", confidence="1", source="log_1">
        <Trigger>write program reasoning steps math operation finqa format</Trigger>
        <Action>CRITICAL FORMATTING RULE: You must output the answer as a Domain Specific Language (DSL) program. Use functions: add(), subtract(), multiply(), divide(). Do NOT write Python code. Do NOT write explanations. Example output: "subtract(10, 5), divide(#0, 2)"</Action>
    </Rule>
    <Rule id="02" phase="generation", confidence="1", source="log_1">
        <Trigger>basis points interest rate change bps fluctuation</Trigger>
        <Action>KNOWLEDGE INJECTION: "Basis points" are a unit of measure for interest rates. 100 basis points = 1% = 0.01. If the text says "100 basis points change results in $3.8 million", use this ratio for calculation.</Action>
    </Rule>
</Rulebook>
"""

class RuleRetriever:
    def __init__(self, xml_content):
        self.root = ET.fromstring(xml_content)
    
    def retrieve(self, query, top_k=2):
        # Simulated Vector Search
        # TODO: Change to Embedding Cosine Similarity or Stochastic Sampling
        query_lower = query.lower()
        hits = []
        for rule in self.root.findall('Rule'):
            triggers = rule.find('Trigger').text.split()
            
            # Hit trigger word then recall 
            score = sum(1 for t in triggers if t in query_lower)
            if score > 0 or rule.get('id') == 'fin_fmt_01': 
                hits.append(rule.find('Action').text)
        return hits[:top_k]

retriever = RuleRetriever(rulebook_xml_content)
print("Test Retrieval:", retriever.retrieve("calculate interest rate basis points"))

ParseError: not well-formed (invalid token): line 3, column 36 (<string>)

In [4]:
def generate_response(prompt, model="meta/llama-3.3-70b-instruct"):
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.1,
        max_tokens=128
    )
    return response.choices[0].message.content

def run_experiment(sample, with_rules=False):
    # build context
    context_text = " ".join(sample['pre_text'] + sample['post_text'])
    table_text = str(sample['table'])                                   # store table in string format
    question = sample['qa']['question']
    
    # system prompt
    base_prompt = f"""
    You are a financial reasoning expert. 
    Context: {context_text}
    Table Data: {table_text}
    
    Question: {question}
    
    Task: Write a logical program steps to answer the question.
    """
    print(base_prompt)
    
    if with_rules:
        # rule retrieve and inject
        rules = retriever.retrieve(question + " write program")
        rules_block = "\n### IMPORTANT RULES (Must Follow):\n" + "\n".join([f"- {r}" for r in rules])
        final_prompt = base_prompt + rules_block
    else:
        final_prompt = base_prompt

    # call model
    return generate_response(final_prompt)

In [None]:
from docling.document_converter import DocumentConverter

source = "https://www.canmat.org/wp-content/uploads/2019/07/Yatham-LN-2018-CANMAT-ISBD-guidelines-for-bipolar-disorder-Bipol-Disord.pdf"  # document per local path or URL
converter = DocumentConverter()
result = converter.convert(source)
print(result.document.export_to_markdown())  # output: "## Docling Technical Report[...]"

In [5]:
import math
from typing import List, Union

def add(a: float, b: float) -> float:
    """
    Calculates the sum of two numbers.

    Args:
        a: The first number to add.
        b: The second number to add.

    Returns:
        The sum of a and b.
    """
    return a + b

def subtract(a: float, b: float) -> float:
    """
    Calculates the difference between two numbers.

    Args:
        a: The number to be subtracted from (minuend).
        b: The number to subtract (subtrahend).

    Returns:
        The difference of a minus b.
    """
    return a - b

def multiply(a: float, b: float) -> float:
    """
    Calculates the product of two numbers.

    Args:
        a: The first factor.
        b: The second factor.

    Returns:
        The product of a and b.
    """
    return a * b

def divide(a: float, b: float) -> float:
    """
    Calculates the ratio of two numbers. Handles division by zero.

    Args:
        a: The dividend (numerator).
        b: The divisor (denominator).

    Returns:
        The result of a divided by b. Returns 0.0 if the divisor is zero to prevent crashes.
    """
    if b == 0:
        return 0.0
    return a / b

def exp(a: float, b: float) -> float:
    """
    Calculates the power of a number.

    Args:
        a: The base.
        b: The exponent.

    Returns:
        The result of a raised to the power of b.
    """
    return math.pow(a, b)

def greater(a: float, b: float) -> bool:
    """
    Compares two numbers to see if the first is larger than the second.

    Args:
        a: The first number to compare.
        b: The second number to compare.

    Returns:
        True if a is greater than b, False otherwise.
    """
    return a > b

def table_sum(values: List[float]) -> float:
    """
    Calculates the total sum of a list of numerical values extracted from a table.

    Args:
        values: A list of floats to be summed.

    Returns:
        The total sum.
    """
    return sum(values)

def table_average(values: List[float]) -> float:
    """
    Calculates the arithmetic mean of a list of numerical values from a table.

    Args:
        values: A list of floats.

    Returns:
        The average value. Returns 0.0 if the list is empty.
    """
    if not values:
        return 0.0
    return sum(values) / len(values)

def table_max(values: List[float]) -> float:
    """
    Identifies the maximum value in a list of numerical values from a table.

    Args:
        values: A list of floats.

    Returns:
        The highest value in the list.
    """
    if not values:
        return 0.0
    return max(values)

def table_min(values: List[float]) -> float:
    """
    Identifies the minimum value in a list of numerical values from a table.

    Args:
        values: A list of floats.

    Returns:
        The lowest value in the list.
    """
    if not values:
        return 0.0
    return min(values)

tools_map = {
    "add": add,
    "subtract": subtract,
    "multiply": multiply,
    "divide": divide,
    "exp": exp,
    "greater": greater,
    "table_sum": table_sum,
    "table_average": table_average,
    "table_max": table_max,
    "table_min": table_min
}

print(f"成功映射 {len(tools_map)} 個工具。")
print(f"範例測試 (divide): {tools_map['divide'](3.8, 0.01)}") 


成功映射 10 個工具。
範例測試 (divide): 380.0


In [6]:
import json
from typing import List, Dict, Any, Callable
from openai import OpenAI

class ProofOfConcept:
    def __init__(
        self, 
        client: OpenAI, 
        model: str = "meta/llama-3.3-70b-instruct",
        rulebook_xml: str = "",
        tools_map: Dict[str, Callable] = None
    ):
        """
        初始化 Protocol-Z 執行引擎。
        
        Args:
            client: OpenAI SDK client 
            model: Tool Calling Model
            rulebook_xml: Rulebook
            tools_map: Python 函式映射表 (例如: {"add": add_func})。
        """
        self.client = client
        self.model = model
        self.rulebook_xml = rulebook_xml
        self.tools_map = tools_map or {}
        self.tools_schema = self._generate_tools_schema()

    def _generate_tools_schema(self) -> List[Dict]:
        """將 tools_map 中的函式轉換為 OpenAI/NIM 要求的 JSON Schema。"""
        # 這裡簡化處理，實際開發建議使用 pydantic 或 inspect 自動生成
        schemas = []
        for name, func in self.tools_map.items():
            # 假設你已經將 Docstrings 寫好，這裡手動定義核心架構
            schemas.append({
                "type": "function",
                "function": {
                    "name": name,
                    "description": func.__doc__.split("Args:")[0].strip() if func.__doc__ else "",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "a": {"type": "number"},
                            "b": {"type": "number"},
                            "values": {"type": "array", "items": {"type": "number"}}
                        },
                    }
                }
            })
        return schemas

    def _retrieve_rules(self, query: str) -> str:
        """
        [Protocol-Z 核心邏輯] 
        根據 Query 中的 Trigger 關鍵字檢索相關的 XML Rules。
        (未來可升級為 Whitening Transformation 向量檢索)
        """
        # 這裡先實作簡單的 Keyword-based 檢索作為 PoC
        relevant_rules = []
        # 假設你的規則庫已經被解析成 list of dict
        # if trigger in query: relevant_rules.append(rule)
        return f"<Rulebook_Snippet>\n{self.rulebook_xml[:500]}...\n</Rulebook_Snippet>"

    def run(self, user_query: str, context: str = "") -> str:
        """
        執行完整的推理循環：檢索規則 -> 規劃 -> 執行工具 -> 產出答案。
        """
        # 1. 檢索與注入規則
        rules_context = self._retrieve_rules(user_query)
        
        system_prompt = f"""
        You are a financial advisor using the Protocol-Z reasoning framework.
        Rules to follow:
        {rules_context}
        
        Context provided from table:
        {context}
        """

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_query}
        ]

        # 2. 發送請求 (NIM API)
        response = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            tools=self.tools_schema,
            tool_choice="auto"
        )

        response_message = response.choices[0].message
        tool_calls = response_message.tool_calls

        # 3. 處理工具調用 (Iterative Loop)
        if tool_calls:
            messages.append(response_message)
            
            for tool_call in tool_calls:
                function_name = tool_call.function.name
                function_args = json.loads(tool_call.function.arguments)
                
                # 執行 Python 端的工具
                print(f"[*] Executing Tool: {function_name} with {function_args}")
                function_to_call = self.tools_map[function_name]
                function_response = function_to_call(**function_args)

                messages.append({
                    "tool_call_id": tool_call.id,
                    "role": "tool",
                    "name": function_name,
                    "content": str(function_response),
                })
            
            # 4. 再次呼叫模型以總結結果
            second_response = self.client.chat.completions.create(
                model=self.model,
                messages=messages,
            )
            return second_response.choices[0].message.content
        
        return response_message.content

In [None]:
finqa_dataset = [dataset[i]['qa'] for i in range(len(dataset))]

In [10]:
import json

# 1. 準備資料樣本

sample = train_data_qa[0]
{
    'question': 'what is the interest expense in 2009?',
    'answer': '380',
    'gold_inds': {'text_1': 'if libor changes by 100 basis points , our annual interest expense would change by $ 3.8 million .'},
    # ... 其他欄位
}

# 2. 自動提取 Context 與 Ground Truth
# 我們將 gold_inds 的所有 value 合併為一段背景文字
context_from_gold = " ".join(sample['gold_inds'].values())
ground_truth = sample['answer']
program = sample['program']

print(f"[System Info] 提取 Context: {context_from_gold}")
print(f"[System Info] 目標答案 (GT): {ground_truth}")
print(f"[System Info] 程式 (PG): {program}")


client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=os.environ.get("NVIDIA_API_KEY")
)

# 3. 執行無規則測試 (Baseline)
# 我們將 rulebook_xml 設為空字串，模擬模型在沒有先驗規則下的表現
executor_no_rule = ProofOfConcept(
    client=client, 
    rulebook_xml="",  # Baseline: No Rules
    tools_map=tools_map
)

# 執行推論
predicted_answer = executor_no_rule.run(sample['question'], context_from_gold)

# 4. 判斷正確率 (考慮數值誤差)
def evaluate(pred, gt):
    try:
        # 財務計算通常比對數值，排除字串格式干擾
        return float(pred) == float(gt)
    except:
        return str(pred).strip() == str(gt).strip()

is_correct = evaluate(predicted_answer, ground_truth)
print(f"\n[實驗結果] 模型預測: {predicted_answer} | 是否正確: {is_correct}")

[System Info] 提取 Context: if libor changes by 100 basis points , our annual interest expense would change by $ 3.8 million .
[System Info] 目標答案 (GT): 380
[System Info] 程式 (PG): divide(100, 100), divide(3.8, #0)
[*] Executing Tool: table_sum with {'a': '2009', 'b': 'interest expense'}


TypeError: table_sum() got an unexpected keyword argument 'a'

In [None]:
import json

with open("results.jsonl") as file:
    lines = file.readlines()
    data = [json.loads(line) for line in lines] 

data
data[2]

{'question': 'what was the total operating expenses in 2018 in millions',
 'ground_truth': '41932',
 'raw_response': '41.86',
 'cleaned_prediction': '41.86',
 'is_correct': False}