In [None]:
# Download the FinTabNet.c PDF annotations archive from Hugging Face, saving it
# locally as finqa_pdf.tar.gz, then extract it into the current working directory.
# NOTE(review): the local file name says "finqa" but the archive is FinTabNet.c.
!wget https://huggingface.co/datasets/bsmock/FinTabNet.c/resolve/main/FinTabNet.c-PDF_Annotations.tar.gz -O finqa_pdf.tar.gz
!tar -xzvf finqa_pdf.tar.gz

In [None]:
# Download the FinQA repository (main branch) from GitHub as finqa.zip and
# unpack it in the current working directory; -o overwrites existing files
# without prompting, so the cell can be re-run.
!wget https://github.com/czyssrs/FinQA/archive/refs/heads/main.zip -O finqa.zip
!unzip -o finqa.zip 

In [None]:
import os
import json
import pathlib

# Directory holding the extracted FinTabNet.c PDF annotation files.
# (Named `annotations_dir` rather than `dir` so the builtin dir() is not shadowed
# for the rest of the kernel session.)
annotations_dir = "./FinTabNet.c-PDF_Annotations/"

# os.listdir() returns entries in arbitrary order; sort so that the "first"
# file is the same on every run and every machine.
annotation_files = sorted(os.listdir(annotations_dir))
if not annotation_files:
    raise FileNotFoundError(f"No annotation files found in {annotations_dir}")
filename = annotation_files[0]

full_path = os.path.join(annotations_dir, filename)
print(full_path)

# Read explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open(full_path, 'r', encoding='utf-8') as f:
    data = json.load(f)


In [30]:
import json
import os
import xml.etree.ElementTree as ET
from openai import OpenAI  # NVIDIA NIM 兼容 OpenAI SDK

# 1. Set up the NVIDIA NIM API client (served through an OpenAI-compatible endpoint).
#
# SECURITY: the API key must never be written into the notebook. A key was
# previously committed in this cell; treat it as leaked and revoke/rotate it.
# The key is now read from the NVIDIA_API_KEY environment variable, with an
# interactive (non-echoing) prompt as a fallback when the variable is unset.
import getpass

if not os.environ.get("NVIDIA_API_KEY"):
    os.environ["NVIDIA_API_KEY"] = getpass.getpass("Enter NVIDIA_API_KEY: ")

client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=os.environ["NVIDIA_API_KEY"],
)

# 2. Read the FinQA train.json file and keep a small slice for experiments.
def load_finqa_sample(file_path, sample_size=15):
    """Load a JSON file and return its first ``sample_size`` entries.

    Args:
        file_path: Path to a UTF-8 encoded JSON file whose top-level value
            supports slicing (a JSON array).
        sample_size: Number of leading entries to keep (default 15).

    Returns:
        The slice ``[:sample_size]`` of the parsed JSON value.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        records = json.loads(handle.read())
    # Only a leading slice is kept so experiments stay small and cheap.
    return records[:sample_size]

# Locate FinQA's train.json without relying on one machine's directory layout.
# Resolution order:
#   1. the FINQA_TRAIN_PATH environment variable, if set;
#   2. FinQA-main/dataset/train.json relative to the working directory;
#   3. the original absolute path, kept as a fallback for the authoring machine.
_FINQA_RELATIVE_PATH = os.path.join("FinQA-main", "dataset", "train.json")
_FINQA_LEGACY_PATH = "/root/hsin_research/FinQA-main/dataset/train.json"
finqa_train_path = os.environ.get("FINQA_TRAIN_PATH") or (
    _FINQA_RELATIVE_PATH if os.path.exists(_FINQA_RELATIVE_PATH) else _FINQA_LEGACY_PATH
)

dataset = load_finqa_sample(finqa_train_path)

print(f"Loaded {len(dataset)} samples for testing.")
print("Sample Question 1:", dataset[0]['qa']['question'])

Loaded 15 samples for testing.
Sample Question 1: what is the the interest expense in 2009?


In [37]:
# Inspect the first sample: its keys, the sizes of its text/table fields and
# the first pre-text entry, one item per output line.
print(
    dataset[0].keys(),
    len(dataset[0]['pre_text']),
    dataset[0]['pre_text'][0],
    len(dataset[0]['post_text']),
    len(dataset[0]['table']),
    sep="\n",
)

# Last expression of the cell: shown as the cell's output (the gold program).
dataset[0]['qa']['program']

dict_keys(['pre_text', 'post_text', 'filename', 'table_ori', 'table', 'qa', 'id', 'table_retrieved', 'text_retrieved', 'table_retrieved_all', 'text_retrieved_all'])
15
interest rate to a variable interest rate based on the three-month libor plus 2.05% ( 2.05 % ) ( 2.34% ( 2.34 % ) as of october 31 , 2009 ) .
35
4


'divide(100, 100), divide(3.8, #0)'

In [32]:
# XML rulebook: each <Rule> has a whitespace-separated <Trigger> keyword list
# and an <Action> text that gets injected into the prompt when the rule fires.
# XML attributes must be separated by whitespace only; the commas that used to
# sit between them made ET.fromstring() raise ParseError.
rulebook_xml_content = """
<Rulebook domain="finqa_reasoning">
    <Rule id="01" phase="generation" confidence="1" source="log_1">
        <Trigger>write program reasoning steps math operation finqa format</Trigger>
        <Action>CRITICAL FORMATTING RULE: You must output the answer as a Domain Specific Language (DSL) program. Use functions: add(), subtract(), multiply(), divide(). Do NOT write Python code. Do NOT write explanations. Example output: "subtract(10, 5), divide(#0, 2)"</Action>
    </Rule>
    <Rule id="02" phase="generation" confidence="1" source="log_1">
        <Trigger>basis points interest rate change bps fluctuation</Trigger>
        <Action>KNOWLEDGE INJECTION: "Basis points" are a unit of measure for interest rates. 100 basis points = 1% = 0.01. If the text says "100 basis points change results in $3.8 million", use this ratio for calculation.</Action>
    </Rule>
</Rulebook>
"""

class RuleRetriever:
    """Keyword-overlap retriever over an XML rulebook.

    The rulebook root is expected to contain ``<Rule>`` children, each with a
    ``<Trigger>`` (whitespace-separated keywords) and an ``<Action>`` (text to
    inject into a prompt). This stands in for a real vector search.
    """

    def __init__(self, xml_content, always_include_ids=("fin_fmt_01",)):
        """Parse the rulebook.

        Args:
            xml_content: Rulebook XML as a string.
            always_include_ids: Rule ids that are returned regardless of
                keyword overlap. The default keeps the id that was previously
                hard-coded in ``retrieve``.

        Raises:
            xml.etree.ElementTree.ParseError: If ``xml_content`` is not
                well-formed XML.
        """
        self.root = ET.fromstring(xml_content)
        self.always_include_ids = frozenset(always_include_ids)

    def retrieve(self, query, top_k=2):
        """Return the action texts of the best-matching rules.

        A rule's score is the number of its trigger keywords that occur (as
        case-insensitive substrings) in ``query``. Rules with a positive score
        or an always-include id qualify. Results are ordered: always-include
        rules first, then by descending score, ties in document order.

        Args:
            query: Free-text query.
            top_k: Maximum number of actions to return; ``None`` returns all
                qualifying actions, values below 1 return an empty list.

        Returns:
            A list of action strings, at most ``top_k`` long.
        """
        # TODO: Change to Embedding Cosine Similarity or Stochastic Sampling
        query_lower = query.lower()
        ranked = []
        for position, rule in enumerate(self.root.findall('Rule')):
            action = rule.findtext('Action')
            if not action:
                # A rule without action text has nothing to inject.
                continue

            # Lower-case triggers too: the query is lower-cased, so a trigger
            # containing capitals could otherwise never match.
            triggers = (rule.findtext('Trigger') or "").lower().split()
            score = sum(1 for t in triggers if t in query_lower)
            forced = rule.get('id') in self.always_include_ids

            if score > 0 or forced:
                # Sort key: forced rules first, higher score first, then
                # document order. `position` is unique, so the action text
                # itself is never compared.
                ranked.append((not forced, -score, position, action))

        ranked.sort()
        limit = None if top_k is None else max(top_k, 0)
        return [entry[3] for entry in ranked[:limit]]

# Build the shared retriever from the rulebook and run a quick smoke test.
retriever = RuleRetriever(xml_content=rulebook_xml_content)
print(
    "Test Retrieval:",
    retriever.retrieve(query="calculate interest rate basis points"),
)

Test Retrieval: ['KNOWLEDGE INJECTION: "Basis points" are a unit of measure for interest rates. 100 basis points = 1% = 0.01. If the text says "100 basis points change results in $3.8 million", use this ratio for calculation.']


In [33]:
def generate_response(prompt, model="meta/llama-3.3-70b-instruct",
                      temperature=0.1, max_tokens=128, api_client=None):
    """Send a single-turn user prompt to a chat-completions endpoint.

    Args:
        prompt: Text sent as the only ``user`` message.
        model: Model identifier passed to the endpoint.
        temperature: Sampling temperature (default 0.1, as before).
        max_tokens: Completion length cap (default 128, as before). Raise it
            when completions come back cut off mid-sentence.
        api_client: Object exposing ``chat.completions.create(...)``. When
            ``None`` the module-level ``client`` is used, which keeps existing
            callers working and lets tests pass in a stub.

    Returns:
        The ``content`` of the first choice's message.
    """
    # Resolve the global lazily so a stub can be supplied without `client`
    # having been created.
    active_client = client if api_client is None else api_client
    response = active_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        max_tokens=max_tokens,
    )
    return response.choices[0].message.content

def run_experiment(sample, with_rules=False, verbose=False):
    """Build a prompt for one FinQA sample and return the model's answer.

    Args:
        sample: Mapping with ``pre_text`` and ``post_text`` (lists of strings),
            ``table`` and ``qa['question']``.
        with_rules: When true, rules retrieved for the question are appended
            to the prompt under an "IMPORTANT RULES" header.
        verbose: When true, print the base prompt for debugging. Off by
            default because the prompt embeds the full context and table and
            would otherwise flood the cell output on every call.

    Returns:
        Whatever ``generate_response`` returns for the final prompt.
    """
    # Build the context: surrounding text joined by spaces, table as its
    # Python string representation.
    context_text = " ".join(sample['pre_text'] + sample['post_text'])
    table_text = str(sample['table'])
    question = sample['qa']['question']

    # Same text as the previous triple-quoted f-string, written with explicit
    # newlines so the significant whitespace survives editor trimming.
    base_prompt = (
        "\n"
        "    You are a financial reasoning expert. \n"
        f"    Context: {context_text}\n"
        f"    Table Data: {table_text}\n"
        "    \n"
        f"    Question: {question}\n"
        "    \n"
        "    Task: Write a logical program steps to answer the question.\n"
        "    "
    )
    if verbose:
        print(base_prompt)

    final_prompt = base_prompt
    if with_rules:
        # " write program" is appended so the retrieval query carries the
        # task keywords in addition to the question text.
        rules = retriever.retrieve(question + " write program")
        if rules:
            # Only add the header when there is at least one rule to list.
            rules_block = "\n### IMPORTANT RULES (Must Follow):\n" + "\n".join(
                f"- {r}" for r in rules
            )
            final_prompt = base_prompt + rules_block

    # Call the model.
    return generate_response(final_prompt)

In [34]:
import pandas as pd

# One comparison record per sample: gold program vs. baseline vs. rule-enhanced output.
results = []

print("Running Experiment on Llama 3.3 70B...")

for sample_no, sample in enumerate(dataset, start=1):
    print(f"Processing Sample {sample_no}...")

    # Baseline first, then the rule-enhanced variant, on the same sample.
    model_outputs = {
        "Baseline_Output": run_experiment(sample, with_rules=False),
        "Enhanced_Output": run_experiment(sample, with_rules=True),
    }

    results.append({
        "ID": sample['id'],
        "Question": sample['qa']['question'],
        "Gold_Program": sample['qa']['program'],
        **model_outputs,
    })

# Show the full, untruncated text of every column.
df = pd.DataFrame(results)
pd.set_option('display.max_colwidth', None)
display(df)


# df.to_csv("experiment_results_llama3.3.csv", index=False)

Running Experiment on Llama 3.3 70B...
Processing Sample 1...
Processing Sample 2...
Processing Sample 3...
Processing Sample 4...
Processing Sample 5...
Processing Sample 6...
Processing Sample 7...
Processing Sample 8...
Processing Sample 9...
Processing Sample 10...
Processing Sample 11...
Processing Sample 12...
Processing Sample 13...
Processing Sample 14...
Processing Sample 15...


Unnamed: 0,ID,Question,Gold_Program,Baseline_Output,Enhanced_Output
0,ADI/2009/page_49.pdf-1,what is the the interest expense in 2009?,"divide(100, 100), divide(3.8, #0)","To calculate the interest expense in 2009, we need to follow these steps:\n\n1. **Understand the given information**: The interest rate is based on the three-month LIBOR plus 2.05%. As of October 31, 2009, the LIBOR rate was 2.34%. If LIBOR changes by 100 basis points, the annual interest expense would change by $3.8 million.\n\n2. **Calculate the total interest rate**: The total interest rate is the sum of the LIBOR rate and the additional 2.05%. So, the total interest rate = LIBOR rate + 2","define(libor_change, 100)\ndefine(libor_change_in_percent, 1)\ndefine(interest_expense_change, 3800000)\ndefine(current_libor, 2.34)\ndefine(additional_interest_rate, 2.05)\ndefine(total_interest_rate, add(current_libor, additional_interest_rate))\ndefine(ratio, divide(interest_expense_change, libor_change_in_percent))\ndefine(current_interest_expense, multiply(ratio, total_interest_rate)) \ncurrent_interest_expense"
1,ABMD/2012/page_75.pdf-1,"during the 2012 year , did the equity awards in which the prescribed performance milestones were achieved exceed the equity award compensation expense for equity granted during the year?","multiply(607, 18.13), multiply(#0, const_1000), multiply(3.3, const_1000000), greater(#1, #2)","To answer the question, we need to compare the equity awards in which the prescribed performance milestones were achieved with the equity award compensation expense for equity granted during the year. Here are the logical program steps:\n\n1. **Extract relevant data**:\n\t* Equity awards in which the prescribed performance milestones were achieved: $3.3 million (mentioned in the text as ""the company has recorded $3.3 million in stock-based compensation expense for equity awards in which the prescribed performance milestones have been achieved or are probable of being achieved."")\n\t* Equity award compensation expense for equity granted during the year: Not explicitly mentioned, but we can calculate it","define equity_awards_achieved_expense 3.3\ndefine equity_award_granted_expense multiply(607, 18.13)\nsubtract(equity_awards_achieved_expense, equity_award_granted_expense)"
2,AAL/2018/page_13.pdf-2,what was the total operating expenses in 2018 in millions,"divide(9896, 23.6%)","To find the total operating expenses in 2018, we can use the information provided in the table. Here are the logical program steps to answer the question:\n\n1. **Identify the relevant data**: From the table, we need the ""aircraft fuel expense"" and ""percent of total operating expenses"" for the year 2018.\n - Aircraft fuel expense in 2018: $9896 million\n - Percent of total operating expenses: 23.6%\n\n2. **Set up the equation**: Let x be the total operating expenses. The equation based on the percentage can be set up as follows:\n -","divide(9896, 0.236)"
3,INTC/2013/page_71.pdf-4,what percentage of total cash and investments as of dec . 29 2012 was comprised of available-for-sale investments?,"divide(14001, 26302)","To find the percentage of total cash and investments as of Dec. 29, 2012, that was comprised of available-for-sale investments, we can follow these steps:\n\n1. **Identify the total cash and investments as of Dec. 29, 2012**: This value is given in the table as $26,302 million.\n\n2. **Identify the value of available-for-sale investments as of Dec. 29, 2012**: This value is also given in the table as $14,001 million.\n\n3. **Calculate the percentage**: To find the percentage, divide the value of available-for-sale investments","divide(14001, 26302), multiply(#0, 100)"
4,ETR/2008/page_313.pdf-3,what is the growth rate in net revenue in 2008?,"subtract(959.2, 991.1), divide(#0, 991.1)","To calculate the growth rate in net revenue in 2008 compared to 2007, we can follow these logical program steps:\n\n1. **Extract the necessary data**: From the given table, we need the net revenue values for 2007 and 2008.\n - 2007 net revenue: $991.1 million\n - 2008 net revenue: $959.2 million\n\n2. **Calculate the change in net revenue**: Subtract the 2008 net revenue from the 2007 net revenue to find the change.\n - Change = 2007 net revenue - 2008 net revenue\n","subtract(959.2, 991.1), divide(#0, 991.1), multiply(#1, 100)"
5,C/2010/page_272.pdf-1,what was the growth rate of the loans held-for-sale that are carried at locom from 2009 to 2010,"divide(2.5, 1.6), divide(#0, 1.6)","To calculate the growth rate of the loans held-for-sale that are carried at LCOM (Lower of Cost or Market) from 2009 to 2010, we need to follow these steps:\n\n1. **Identify the relevant data**: From the provided table, we need the aggregate cost or fair value of loans held-for-sale for both December 31, 2009, and December 31, 2010. Since the question does not specify whether to use aggregate cost or fair value, we will calculate the growth rate for both.\n\n2. **Extract the data for 2009 and 2010**:\n -","subtract(3.1, 2.5), divide(#0, 2.5)"
6,AMT/2012/page_121.pdf-1,"for acquired customer-related and network location intangibles , what is the expected annual amortization expenses , in millions?","add(75.0, 72.7), divide(#0, 20)","To calculate the expected annual amortization expenses for acquired customer-related and network location intangibles, we can follow these logical program steps:\n\n1. **Identify the total value of customer-related and network location intangibles**: From the given text, we have two sets of acquisitions with intangibles values:\n - The first acquisition has customer-related intangibles of approximately $10.7 million and network location intangibles of approximately $10.4 million.\n - The initial note mentions customer-related intangibles of approximately $75.0 million and network location intangibles of approximately $72.7 million, but these","add(75.0, 72.7), add(10.7, 10.4), add(#0, #2), divide(#4, 20)"
7,GIS/2019/page_45.pdf-1,in 2019 what was the percent of the net earnings to the net cash provided by operating activities,"divide(1786.2, 2807.0)","To find the percent of the net earnings to the net cash provided by operating activities in 2019, we can follow these logical program steps:\n\n1. **Extract Relevant Data**:\n - Net earnings including earnings attributable to redeemable and noncontrolling interests as reported: $1786.2 million\n - Net cash provided by operating activities: $2807.0 million\n\n2. **Calculate the Conversion Rate**:\n - The conversion rate of net earnings to net cash provided by operating activities can be calculated by dividing the net cash provided by operating activities by the net earnings and then multiplying by 100 to get the percentage","divide(2807.0, 1786.2)"
8,IPG/2009/page_89.pdf-3,what percentage decrease occurred from 2011-2012 for deferred acquisition payments?,"subtract(34.8, 1.2), divide(#0, 34.8), multiply(#1, const_100)","To calculate the percentage decrease in deferred acquisition payments from 2011 to 2012, we can follow these steps:\n\n1. **Identify the relevant data**: From the table provided, the deferred acquisition payments for 2011 and 2012 are $34.8 million and $1.2 million, respectively.\n\n2. **Calculate the decrease**: Subtract the 2012 amount from the 2011 amount to find the decrease.\n - Decrease = 2011 amount - 2012 amount\n - Decrease = $34.8 million - $1.2 million = $33.6 million\n\n3","subtract(34.8, 1.2), divide(#0, 34.8), multiply(#1, 100)"
9,CDNS/2018/page_32.pdf-2,"how is net change in cash from financing activity affected by the share repurchase during december 20018 , ( in millions ) ?","multiply(1327657, 42.61), divide(#0, const_1000000)","To determine the net change in cash from financing activity affected by the share repurchase during December 2018, we need to follow these logical program steps:\n\n1. **Calculate the total amount spent on share repurchases**:\n - Identify the total number of shares purchased during December 2018.\n - Identify the average price paid per share during December 2018.\n - Multiply the total number of shares purchased by the average price paid per share to get the total amount spent.\n\n2. **Determine the impact on cash**:\n - Recognize that share repurchases reduce the company's cash balance since cash is used to","subtract(175, multiply(1327657, 42.61, divide(1, 1000000)))"


In [None]:
from docling.document_converter import DocumentConverter

# Convert a remote PDF with docling and print its Markdown export.
# Judging by the URL, the document is the 2018 CANMAT/ISBD bipolar-disorder
# guidelines; running this cell requires network access.
source = "https://www.canmat.org/wp-content/uploads/2019/07/Yatham-LN-2018-CANMAT-ISBD-guidelines-for-bipolar-disorder-Bipol-Disord.pdf"  # document per local path or URL
converter = DocumentConverter()
result = converter.convert(source)
print(result.document.export_to_markdown())  # prints the converted document as Markdown text