# 0. load imports

In [172]:
!pip install -r requirements.txt




[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [173]:
import json
import re
from typing import List, Optional
from pydantic import BaseModel, Field
from langchain_core.tools import tool
from langchain.schema import BaseMessage, HumanMessage, SystemMessage
from langchain.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
import os
import dotenv


# Load environment variables
dotenv.load_dotenv()

# Get API key
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

if not GEMINI_API_KEY:
    print("Warning: GEMINI_API_KEY not found in environment variables")
    print("Please make sure to set your API key in the .env file")
else:
    print("GEMINI_API_KEY loaded successfully")

print("Imports loaded successfully!")

GEMINI_API_KEY loaded successfully
Imports loaded successfully!


In [174]:
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

In [175]:
try:
    llm = ChatGoogleGenerativeAI(
        model="gemini-2.0-flash", 
        temperature=0, 
        api_key=GEMINI_API_KEY,
        convert_system_message_to_human=True
    )
    print("LLM initialized successfully!")
except Exception as e:
    print(f"Error initializing LLM: {e}")
    llm = None

LLM initialized successfully!


In [176]:
DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant that extracts financial data from documents."

In [177]:
res = llm.invoke([
        SystemMessage(content=DEFAULT_SYSTEM_MESSAGE),
        HumanMessage(content="Hey how are you?")
    ])
print(res.content)

I'm doing well, thank you for asking! I'm ready to assist you with extracting financial data from documents. Just let me know what you need. How can I help you today?


# 1. Document extraction & Prompt engineering

In [178]:
DATA_SOURCE_FILE_NAME = "fy2024_analysis_of_revenue_and_expenditure.pdf"

In [179]:
class FinancialExtractionOutput(BaseModel):
    """Structured output for financial data extraction from the PDF"""
    
    corporate_income_tax_2024: Optional[float] = Field(
        description="Amount of Corporate Income Tax in 2024 (in millions or appropriate unit)"
    )
    
    yoy_percentage_corp_income_tax_2024: Optional[float] = Field(
        description="Year-on-year percentage difference of Corporate Income Tax in 2024"
    )
    
    total_top_ups_2024: Optional[float] = Field(
        description="Total amount of top ups in 2024 (in millions or appropriate unit)"
    )
    
    operating_revenue_taxes: List[str] = Field(
        description="List of taxes mentioned in the Operating Revenue section"
    )

print("Imports loaded successfully!")

Imports loaded successfully!


In [180]:
import pymupdf  

with pymupdf.open(DATA_SOURCE_FILE_NAME) as doc:
    pdf_text = {}
    for page_num in range(len(doc)):
        page = doc[page_num]
        pdf_text[page_num + 1] = page.get_text()

# page 5
print(pdf_text[5])

MINISTRY OF FINANCE 
 
5 
 
01 Update on Financial Year 2023 
 
1.1 
Expected Overall Fiscal Position for FY2023  
 
The basic deficit is revised to $5.4 billion (0.8% of GDP). After factoring in Top-
ups to Endowment and Trust Funds of $24.3 billion, Net Investment Returns 
Contribution (NIRC) of $22.9 billion, Capitalisation of Nationally Significant 
Infrastructure of $3.5 billion, and Significant Infrastructure Government Loan 
Act (SINGA) Interest Costs and Loan Expenses of $0.2 billion, the Overall Fiscal 
Position for Revised FY2023 is a deficit of $3.6 billion (0.5% of GDP). This is larger 
than the $0.4 billion deficit estimated at Budget 2023. This is due to higher Total 
Expenditure to meet priority areas such as catching up of projects deferred due 
to COVID-19; funding for public healthcare institutions; and a top-up to the 
Majulah Package Fund to boost the retirement and healthcare adequacy of 
Singaporeans, as announced by Prime Minister Lee Hsien Loong at the National 

Separate prompts so we don't blow the context window, or exceed free tier input token limit

In [181]:
# Prompt for Corporate Income Tax Amount in 2024 (page 5)
corp_income_tax_prompt = """
Extract the exact amount of Corporate Income Tax specified from the following page text. 
Assume units is in billions, return only the number as a float.

PAGE TEXT:
{page_text}
"""

# Prompt for YOY Percentage Difference of Corporate Income Tax in 2024 (page 5)
yoy_corp_income_tax_prompt = """
Extract the year-over-year percentage change of Corporate Income Tax from from the following page text. The text may not specify the specific year. 
Assume units is in billions, return only the number as a float.

PAGE TEXT:
{page_text}
"""

# Prompt for Total Amount of Top-ups in 2024 (page 20)
total_top_ups_prompt = """
Extract the total amount of all top-ups to endowment and trust funds for FY2024 from the following page text.
Assume units is in billions, return only the number as a float.

PAGE TEXT:
{page_text}
"""

# Prompt for List of Taxes in Operating Revenue Section (pages 5-6)
operating_revenue_taxes_prompt = """
Identify all types of taxes mentioned in the "Operating Revenue" section from the following page text. 
Return a list of the exact tax names as strings.

PAGE TEXT:
{page_text}
"""

In [182]:
def extract_financial_data_from_pdf(pdf_text: dict, llm) -> FinancialExtractionOutput:
    """
    Extracts financial data from specific pages of the PDF using LLM prompts.
    Args:
        pdf_text (dict): Dictionary mapping page numbers to page text.
        llm: The language model instance to use for extraction.
    Returns:
        FinancialExtractionOutput: Structured extraction result.
    """
    # Corporate Income Tax Amount for 2024 (page 5)
    corp_income_tax_resp = llm.invoke([
        SystemMessage(content=DEFAULT_SYSTEM_MESSAGE),
        HumanMessage(content=corp_income_tax_prompt.format(page_text=pdf_text[5]))
    ]).content
    print(f"Corporate Income Tax Response: {corp_income_tax_resp}")
    try:
        corporate_income_tax_2024 = float(corp_income_tax_resp)
    except Exception:
        corporate_income_tax_2024 = None

    # YOY Percentage Difference of Corporate Income Tax in 2024 (page 5)
    yoy_corp_income_tax_resp = llm.invoke([
        SystemMessage(content=DEFAULT_SYSTEM_MESSAGE),
        HumanMessage(content=yoy_corp_income_tax_prompt.format(page_text=pdf_text[5]))
    ]).content
    print(f"YOY Corporate Income Tax Response: {yoy_corp_income_tax_resp}")
    try:
        yoy_percentage_corp_income_tax_2024 = float(yoy_corp_income_tax_resp)
    except Exception:
        yoy_percentage_corp_income_tax_2024 = None

    # Total Amount of Top-ups in 2024 (page 20)
    total_top_ups_resp = llm.invoke([
        SystemMessage(content=DEFAULT_SYSTEM_MESSAGE),
        HumanMessage(content=total_top_ups_prompt.format(page_text=pdf_text[20]))
    ]).content
    print(f"Total Top-ups Response: {total_top_ups_resp}")
    try:
        total_top_ups_2024 = float(total_top_ups_resp)
    except Exception:
        total_top_ups_2024 = None

    # List of Taxes in Operating Revenue Section (pages 5-6)
    operating_revenue_text = pdf_text[5] + "\n" + pdf_text[6]
    operating_revenue_taxes_resp = llm.invoke([
        SystemMessage(content=DEFAULT_SYSTEM_MESSAGE),
        HumanMessage(content=operating_revenue_taxes_prompt.format(page_text=operating_revenue_text))
    ]).content
    print(f"Operating Revenue Taxes Response: {operating_revenue_taxes_resp}")
    try:
        taxes = json.loads(operating_revenue_taxes_resp)
        if not isinstance(taxes, list):
            raise ValueError
        operating_revenue_taxes = taxes
    except Exception:
        operating_revenue_taxes = [operating_revenue_taxes_resp]

    return FinancialExtractionOutput(
        corporate_income_tax_2024=corporate_income_tax_2024,
        yoy_percentage_corp_income_tax_2024=yoy_percentage_corp_income_tax_2024,
        total_top_ups_2024=total_top_ups_2024,
        operating_revenue_taxes=operating_revenue_taxes
    )

In [183]:
extracted_data = extract_financial_data_from_pdf(pdf_text, llm)

Corporate Income Tax Response: 28.4
YOY Corporate Income Tax Response: 17.0
Total Top-ups Response: 20.352
Operating Revenue Taxes Response: Here's a list of the tax names mentioned in the "Operating Revenue" section of the provided text:

*   "Corporate Income Tax"
*   "Other Taxes"
*   "Personal Income Tax"
*   "Assets Taxes"
*   "Betting Taxes"
*   "Goods and Services Tax"
*   "Water Conservation Tax"
*   "Annual Tonnage Tax"
*   "Casino Taxes"


### Final output for extracted data

In [184]:
from pprint import pprint

pprint(extracted_data.dict())

{'corporate_income_tax_2024': 28.4,
 'operating_revenue_taxes': ["Here's a list of the tax names mentioned in the "
                             '"Operating Revenue" section of the provided '
                             'text:\n'
                             '\n'
                             '*   "Corporate Income Tax"\n'
                             '*   "Other Taxes"\n'
                             '*   "Personal Income Tax"\n'
                             '*   "Assets Taxes"\n'
                             '*   "Betting Taxes"\n'
                             '*   "Goods and Services Tax"\n'
                             '*   "Water Conservation Tax"\n'
                             '*   "Annual Tonnage Tax"\n'
                             '*   "Casino Taxes"'],
 'total_top_ups_2024': 20.352,
 'yoy_percentage_corp_income_tax_2024': 17.0}


C:\Users\riley\AppData\Local\Temp\ipykernel_10776\1254583981.py:3: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  pprint(extracted_data.dict())


# 2. Tool Calling & Reasoning Integration

In [185]:
from langchain.output_parsers import PydanticOutputParser
from datetime import datetime
from typing import List, Optional
from pydantic import BaseModel, Field


Define new tool with decorator

In [186]:


@tool
def normalize_submission_date(date_str: str) -> str:
    """
    Use this tool to convert dates into YYYY-MM-DD format.
    """
    try:
        dt = datetime.strptime(date_str.strip(), "%d %B %Y")
        return dt.strftime("%Y-%m-%d")
    except Exception as e:
        print(f"Error normalizing date: {e}")
        return None

Define a pydantic class to force structured output

In [187]:

class DateExtractionOutput(BaseModel):
    original_text: str = Field(description="Original text parsed into the LLM")
    normalized_date: Optional[str] = Field(description="Date in YYYY-MM-DD format, if found")
    status: Optional[str] = Field(description="Status of the date (Upcoming if > 2024-01-01, Expired if < 2024-01-01, Ongoing if = 2024-01-01), compared against the date of 2024-01-01")


In [192]:
# Create an agent that can use tools for date normalization with structured output
from langchain.agents import AgentExecutor, create_tool_calling_agent

agent_system_prompt = """You are a helpful assistant that processes text and normalizes dates.

CRITICAL INSTRUCTIONS - YOU MUST FOLLOW THESE EXACTLY:

1. TOOL USAGE IS MANDATORY:
   - When you find ANY date in "DD Month YYYY" format (e.g., "15 March 2024"), you MUST call the normalize_submission_date tool
   - DO NOT convert dates manually or use your own logic
   - DO NOT proceed to step 2 until you have called the tool and received the normalized date
   - If there is a date pattern, calling the tool is REQUIRED, not optional

2. DATE COMPARISON:
   After receiving the normalized date from the tool, compare it against 2024-01-01:
   - "Upcoming" if normalized_date > 2024-01-01
   - "Expired" if normalized_date < 2024-01-01
   - "Ongoing" if normalized_date = 2024-01-01

3. OUTPUT FORMAT:
   Return ONLY this JSON structure with no additional text:
   {{
       "original_text": "the original input text",
       "normalized_date": "YYYY-MM-DD from tool output, or null if no date found",
       "status": "Ongoing/Upcoming/Expired/null if no date found"
   }}

WORKFLOW:
Step 1: Analyze input for dates in "DD Month YYYY" format
Step 2: If date found → CALL normalize_submission_date tool (MANDATORY)
Step 3: Wait for tool response
Step 4: Use tool output to determine status
Step 5: Return JSON with results

If no date is found in the input, return the JSON with normalized_date and status as null.

Remember: You cannot determine the normalized date yourself. You MUST use the tool."""

# Create the prompt template for the agent
agent_prompt = ChatPromptTemplate.from_messages([
    ("system", agent_system_prompt),
    ("human", "{input}"),
    ("placeholder", "{agent_scratchpad}"),
])

# Create the tool list
tools = [normalize_submission_date]

# Create output parser for post-processing
output_parser = PydanticOutputParser(pydantic_object=DateExtractionOutput)

# Create the agent
agent = create_tool_calling_agent(llm, tools, agent_prompt)

agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

# Wrapper function to parse agent output
def process_with_structured_output(text: str) -> DateExtractionOutput:
    result = agent_executor.invoke({"input": text})
    # Parse the agent's output into structured format
    return output_parser.parse(result["output"])

print("Agent with structured output created successfully!")

Agent with structured output created successfully!


Test process_with_structured_output

In [193]:
result_structured = process_with_structured_output("The submission was on 15 March 2024")




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `normalize_submission_date` with `{'date_str': '15 March 2024'}`


[0m[36;1m[1;3m2024-03-15[0m[32;1m[1;3m```json
{
    "original_text": "The submission was on 15 March 2024",
    "normalized_date": "2024-03-15",
    "status": "Upcoming"
}
```[0m

[1m> Finished chain.[0m


Test with pdf_text page 1 and 36

In [194]:

page_1_result = process_with_structured_output(pdf_text[1])


print(page_1_result)
print("Original text:", page_1_result.original_text)
print("Normalized date:", page_1_result.normalized_date)
print("Status:", page_1_result.status)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `normalize_submission_date` with `{'date_str': '16 February 2024'}`


[0m[36;1m[1;3m2024-02-16[0m[32;1m[1;3m```json
{
    "original_text": "REVENUE AND \nEXPENDITURE \nFinancial Year 2024  \n \n \n \n \n \n \n \n \nDistributed on Budget Day: 16 February 2024",
    "normalized_date": "2024-02-16",
    "status": "Upcoming"
}
```[0m

[1m> Finished chain.[0m
original_text='REVENUE AND \nEXPENDITURE \nFinancial Year 2024  \n \n \n \n \n \n \n \n \nDistributed on Budget Day: 16 February 2024' normalized_date='2024-02-16' status='Upcoming'
Original text: REVENUE AND 
EXPENDITURE 
Financial Year 2024  
 
 
 
 
 
 
 
 
Distributed on Budget Day: 16 February 2024
Normalized date: 2024-02-16
Status: Upcoming


**NOTE: Sometimes the LLM may not invoke the tool, highly depends on the model capabilities**

In [None]:

page_36_result = process_with_structured_output(pdf_text[36])


print(page_36_result)
print("\n---------\n")
print("Original text:", page_36_result.original_text)
print("Normalized date:", page_36_result.normalized_date)
print("Status:", page_36_result.status)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
       "original_text": "MINISTRY OF FINANCE \n \n36 \n \nGlossary of Terms \n \nAssets Taxes  \nAssets Taxes refer to Property Tax and Estate \nDuty. Property Tax is a tax on the ownership of \nproperty and is payable by all property owners \non the properties owned by them. Estate Duty \nis a tax on the total market value of a person's \nassets (cash and non-cash) at the date of his or \nher death. Estate Duty does not apply to a \nperson who dies after 15 February 2008.  \n \nCustoms Duties \nTaxes on goods imported into Singapore. In \nSingapore, Customs Duties are principally \nimposed on alcoholic beverages. \n \nDevelopment Expenditure \nExpenses \nthat \nrepresent \na \nlonger-term \ninvestment and/or are incurred on capital assets \nin respect of or in connection with the economic \ndevelopment or general welfare of Singapore. \nExamples of spending areas are on the \nacquisition of heavy equipment and capi