Licensed under the MIT License.

Copyright (c) 2025-2035. All rights reserved by Hanhan Wu.

Permission is hereby granted to view this code for evaluation purposes only.
You may not reuse, copy, modify, merge, publish, distribute, sublicense,
or exploit this code without Hanhan Wu's EXPLICIT written permission.


# Create Large Dataset of Finance Q&A

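* About the data source:
  * [Sujet-Finance-QA-Vision-100k](https://huggingface.co/datasets/sujet-ai/Sujet-Finance-QA-Vision-100k) on Hugging Face. Each record holds a `doc_id`, the document `image`, its text `content`, and a `qa_pairs` string that encodes the question/answer pairs for that document.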


In [1]:
%load_ext autoreload
%autoreload 2

import ast
import pandas as pd
from pprint import pprint
from datasets import load_dataset
from concurrent.futures import ThreadPoolExecutor


import warnings
# Suppress every warning category for the rest of the session so cell outputs stay compact.
# NOTE(review): this also hides deprecation warnings from the libraries used below —
# consider narrowing the filter to specific categories or modules.
warnings.filterwarnings('ignore')

### Process Data

In [2]:
# Load the Sujet finance Q&A dataset by its Hugging Face identifier.
ds = load_dataset("sujet-ai/Sujet-Finance-QA-Vision-100k")

# Show the number of records in each split, then the field names of one record.
print(*(len(ds[split]) for split in ('train', 'test')))
print(ds['train'][0].keys())

9212 589
dict_keys(['doc_id', 'image', 'content', 'qa_pairs'])


In [3]:
def parse_row(row):
    """Convert one raw dataset record into a flat dict ready for a DataFrame.

    Args:
        row: Mapping with the keys ``'doc_id'``, ``'content'`` and
            ``'qa_pairs'``. ``'qa_pairs'`` must be something
            ``ast.literal_eval`` can evaluate (a string holding a Python
            literal).

    Returns:
        dict with three keys, in this order:
            ``'doc_id'``   - copied from the input row,
            ``'context'``  - the row's ``'content'`` wrapped in a one-element list,
            ``'qa_pairs'`` - the evaluated literal from the row's ``'qa_pairs'``.

    Raises:
        KeyError: if a required key is missing from ``row``.
        ValueError / SyntaxError: if ``'qa_pairs'`` is not a valid literal.
    """
    doc_id = row['doc_id']
    context = [row['content']]
    qa_pairs = ast.literal_eval(row['qa_pairs'])
    return dict(doc_id=doc_id, context=context, qa_pairs=qa_pairs)


def get_exploded_data(data):
    """Flatten a dataset split into one row per question/answer pair.

    Every record of ``data`` is parsed with ``parse_row``; the resulting
    ``qa_pairs`` lists are then exploded so each pair gets its own row, and the
    pair's ``'question'`` and ``'answer'`` entries become separate columns.

    Side effect: prints the shape of the per-document frame before exploding.

    Args:
        data: Iterable of records accepted by ``parse_row``.

    Returns:
        pandas.DataFrame with the columns ``doc_id``, ``context``,
        ``question`` and ``answer`` and a fresh 0..n-1 index. Documents whose
        ``qa_pairs`` list is empty contribute no rows. An empty ``data`` yields
        an empty frame with those four columns.
    """
    # NOTE(review): parse_row looks CPU-bound, so the thread pool probably
    # gives little speed-up under the GIL — confirm before relying on it.
    with ThreadPoolExecutor() as executor:
        rows = list(executor.map(parse_row, data))
    raw_df = pd.DataFrame(rows)
    print(raw_df.shape)

    # With no rows the frame has no 'qa_pairs' column and explode() would
    # raise KeyError, so return an empty frame with the output schema instead.
    if raw_df.empty:
        return pd.DataFrame(columns=['doc_id', 'context', 'question', 'answer'])

    # explode() turns an empty list into a single NaN row; drop those rows so
    # the dict look-ups below never see a non-dict value.
    exploded_df = (
        raw_df.explode('qa_pairs')
        .dropna(subset=['qa_pairs'])
        .reset_index(drop=True)
    )
    exploded_df['question'] = exploded_df['qa_pairs'].apply(lambda pair: pair['question'])
    exploded_df['answer'] = exploded_df['qa_pairs'].apply(lambda pair: pair['answer'])

    return exploded_df.drop(columns=['qa_pairs'])

In [4]:
# One row per question/answer pair for the training split.
exploded_train_df = get_exploded_data(data=ds['train'])

# Shape after exploding, followed by a preview of the first five rows.
print(exploded_train_df.shape)
display(exploded_train_df.head(n=5))

(9212, 3)
(100629, 4)


Unnamed: 0,doc_id,context,question,answer
0,175.jpeg,[### Document Analysis\n\n**Document Type:**\n...,What is the purpose of Recommendation No. 6?,The purpose is to grant approval for specific ...
1,175.jpeg,[### Document Analysis\n\n**Document Type:**\n...,What equipment does the memo recommend approvi...,The memo recommends approving funds for renova...
2,175.jpeg,[### Document Analysis\n\n**Document Type:**\n...,What is the total direct cost recommended in t...,"$25,380.00"
3,175.jpeg,[### Document Analysis\n\n**Document Type:**\n...,How much are indirect costs proposed to be all...,"$6,345.00"
4,175.jpeg,[### Document Analysis\n\n**Document Type:**\n...,What is the main reason for relocating the Ele...,To maximize resources by using facilities more...


In [5]:
# One row per question/answer pair for the test split.
exploded_test_df = get_exploded_data(data=ds['test'])

# Shape after exploding, followed by a preview of the first five rows.
print(exploded_test_df.shape)
display(exploded_test_df.head(n=5))

(589, 3)
(6421, 4)


Unnamed: 0,doc_id,context,question,answer
0,5370.jpeg,[### Document Type\nThis image represents a hi...,What is the date of the invoice document?,"The document date is February 6, 1941."
1,5370.jpeg,[### Document Type\nThis image represents a hi...,What is the company issuing the invoice?,The company issuing the invoice is Tabacalera ...
2,5370.jpeg,[### Document Type\nThis image represents a hi...,What is the commodity being shipped according ...,The commodity being shipped is 960 cigarettes ...
3,5370.jpeg,[### Document Type\nThis image represents a hi...,What is the total amount charged for the shipm...,The total amount charged is $3.51.
4,5370.jpeg,[### Document Type\nThis image represents a hi...,What is the insurance value of the shipment?,The shipment is insured for $100.00.


In [6]:
# Rename the 'answer' column to 'ground_truth' in both splits (in place).
for split_df in (exploded_train_df, exploded_test_df):
    split_df.rename(columns={'answer': 'ground_truth'}, inplace=True)

In [7]:
# Persist the full exploded splits so later sections can start from disk.
for split_df, parquet_path in (
    (exploded_train_df, 'bigger_finance_qa_train.parquet'),
    (exploded_test_df, 'bigger_finance_qa_test.parquet'),
):
    split_df.to_parquet(parquet_path, index=False)

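### Sample Data to Save Money 😉

This section reloads the parquet files written in the previous section, so after a kernel restart only the imports cell has to be run before it.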

In [2]:
# Reload the exploded splits that were saved to parquet earlier.
exploded_train_df, exploded_test_df = map(
    pd.read_parquet,
    ('bigger_finance_qa_train.parquet', 'bigger_finance_qa_test.parquet'),
)

In [3]:
# choose the 1st record for each doc_id
first_per_doc_train_df = exploded_train_df.groupby('doc_id', as_index=False).first()
first_per_doc_test_df = exploded_test_df.groupby('doc_id', as_index=False).first()

print(first_per_doc_train_df.shape, first_per_doc_test_df.shape)

(9212, 4) (589, 4)


In [5]:
# Sampling configuration: sizes are counted in documents (one Q&A row each).
SAMPLE_SEED = 10            # shared seed so every draw is reproducible
TRAIN_VAL_POOL_SIZE = 280   # rows drawn from the training split, then divided into train + val
TRAIN_SIZE = 100            # rows of the pool used for training; the remainder is validation
TEST_SIZE = 200             # rows drawn from the test split

# Draw the train/val pool, take the training rows out of it, and keep the rest as validation.
sampled_train_df = first_per_doc_train_df.sample(n=TRAIN_VAL_POOL_SIZE, random_state=SAMPLE_SEED)
final_train_df = sampled_train_df.sample(n=TRAIN_SIZE, random_state=SAMPLE_SEED)
# drop() removes the training rows by index label, so this must run before the index is reset.
final_val_df = sampled_train_df.drop(final_train_df.index).reset_index(drop=True)
final_train_df = final_train_df.reset_index(drop=True)

# Reset the index here too so all three frames use a clean 0..n-1 index.
final_test_df = first_per_doc_test_df.sample(n=TEST_SIZE, random_state=SAMPLE_SEED).reset_index(drop=True)

# Guard against leakage between the training and validation sets.
assert set(final_train_df['doc_id']).isdisjoint(final_val_df['doc_id']), "train/val share a doc_id"

print(final_train_df.shape, final_val_df.shape, final_test_df.shape)
display(final_train_df.head())

(100, 4) (180, 4) (200, 4)


Unnamed: 0,doc_id,context,question,ground_truth
0,2574.jpeg,[**Document Type:** \nThis is a check issued b...,Who issued the check?,"The Tobacco Institute, located at 1875 I Stree..."
1,4492.jpeg,[### Document Analysis\n\n**Document Type**: C...,What is the type of document being analyzed?,The document is a Check Request Form.
2,7281.jpeg,[**Document Type**: This is a commercial invoi...,What is the name of the organization issuing t...,Philip Morris Limited
3,1242.jpeg,[### Document Type\nThis is a financial docume...,What is the client company for this outdoor ad...,The client company is P.M. Inc.
4,7700.jpeg,[### Document Type\nThis is a Production Estim...,What is the client's name mentioned in the doc...,The client's name is RJR/NOW Family.


In [7]:
# Write the sampled train / validation / test sets to parquet without the index.
for split_df, parquet_path in (
    (final_train_df, 'final_finance_qa_train.parquet'),
    (final_val_df, 'final_finance_qa_val.parquet'),
    (final_test_df, 'final_finance_qa_test.parquet'),
):
    split_df.to_parquet(parquet_path, index=False)