# 0. Load imports

In [25]:
!pip install -r requirements.txt




[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [26]:
import json
import re
from typing import List, Optional
from pydantic import BaseModel, Field
from langchain.schema import BaseMessage, HumanMessage, SystemMessage
from langchain.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
import os
import dotenv

# Populate the process environment via python-dotenv.
dotenv.load_dotenv()

# Read the Gemini credential; os.getenv yields None when the variable is unset.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

# Report whether the key is available (the key itself is never printed).
if GEMINI_API_KEY:
    print("GEMINI_API_KEY loaded successfully")
else:
    print("Warning: GEMINI_API_KEY not found in environment variables")
    print("Please make sure to set your API key in the .env file")

print("Imports loaded successfully!")

GEMINI_API_KEY loaded successfully
Imports loaded successfully!


In [27]:
# Re-read the Gemini API key from the environment (None when it is unset).
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

In [None]:
# Build the Gemini chat client. Any failure during construction is reported
# and leaves `llm` set to None instead of stopping the notebook.
try:
    # Settings are collected first so they can be read at a glance.
    gemini_settings = dict(
        model="gemini-2.0-flash",
        temperature=0,  # NOTE(review): presumably chosen for repeatable extraction output
        api_key=GEMINI_API_KEY,
        convert_system_message_to_human=True,
    )
    llm = ChatGoogleGenerativeAI(**gemini_settings)
    print("LLM initialized successfully!")
except Exception as init_error:
    print(f"Error initializing LLM: {init_error}")
    llm = None

LLM initialized successfully!


In [29]:
# System message sent along with the LLM requests in this notebook.
DEFAULT_SYSTEM_MESSAGE = (
    "You are a helpful assistant that extracts financial data from documents."
)

In [30]:
# Smoke test: send one short conversation to the model and show its reply.
smoke_test_messages = [
    SystemMessage(content=DEFAULT_SYSTEM_MESSAGE),
    HumanMessage(content="Hey how are you?"),
]
res = llm.invoke(smoke_test_messages)
print(res.content)

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 2
Please retry in 30.558478021s. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-pro"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 2
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 30
}
].
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals

Hello! I'm doing well, thank you for asking.

I'm ready to help you extract financial data from your documents. How can I assist you today? You can upload a file, paste the text, or just describe the information you need.


# 1. Document extraction & Prompt engineering

In [31]:
# Path of the PDF that the extraction cells below read from.
DATA_SOURCE_FILE_NAME: str = "fy2024_analysis_of_revenue_and_expenditure.pdf"

In [32]:
class FinancialExtractionOutput(BaseModel):
    """Structured output for financial data extraction from the PDF.

    Every field has a default, so an instance can be built even when some
    values could not be extracted: numeric fields fall back to None and the
    tax list falls back to an empty list.
    """

    # Corporate Income Tax amount for 2024; None when it could not be extracted.
    corporate_income_tax_2024: Optional[float] = Field(
        default=None,
        description="Amount of Corporate Income Tax in 2024 (in millions or appropriate unit)"
    )

    # Year-on-year percentage change of Corporate Income Tax; None when unavailable.
    yoy_percentage_corp_income_tax_2024: Optional[float] = Field(
        default=None,
        description="Year-on-year percentage difference of Corporate Income Tax in 2024"
    )

    # Total top-ups for 2024; None when it could not be extracted.
    total_top_ups_2024: Optional[float] = Field(
        default=None,
        description="Total amount of top ups in 2024 (in millions or appropriate unit)"
    )

    # Names of the taxes found in the Operating Revenue section.
    # default_factory gives each instance its own fresh list.
    operating_revenue_taxes: List[str] = Field(
        default_factory=list,
        description="List of taxes mentioned in the Operating Revenue section"
    )

# NOTE(review): this message is the same text printed by the imports cell,
# although this cell defines the output schema — confirm the intended wording.
print("Imports loaded successfully!")

Imports loaded successfully!


In [33]:
import pymupdf

# Read the text of every page, keyed by its 1-based page number.
with pymupdf.open(DATA_SOURCE_FILE_NAME) as doc:
    pdf_text = {index + 1: doc[index].get_text() for index in range(len(doc))}

# Preview: first 500 characters of page 5
print(pdf_text[5][:500])

MINISTRY OF FINANCE 
 
5 
 
01 Update on Financial Year 2023 
 
1.1 
Expected Overall Fiscal Position for FY2023  
 
The basic deficit is revised to $5.4 billion (0.8% of GDP). After factoring in Top-
ups to Endowment and Trust Funds of $24.3 billion, Net Investment Returns 
Contribution (NIRC) of $22.9 billion, Capitalisation of Nationally Significant 
Infrastructure of $3.5 billion, and Significant Infrastructure Government Loan 
Act (SINGA) Interest Costs and Loan Expenses of $0.2 billion, th


Each field is extracted with its own prompt so that no single request overflows the model's context window or exceeds the free-tier input token limit.

In [34]:
# Prompt for Corporate Income Tax Amount in 2024 (page 5)
# Prompt templates for the extraction step. Each one contains a single
# {page_text} placeholder that is filled with str.format() before the request.

# Prompt for Corporate Income Tax Amount in 2024 (page 5)
corp_income_tax_prompt = """
You are a financial analyst. Extract the exact amount of Corporate Income Tax for FY2024 from the following page text. 
Return only the number as a float (in millions if specified, otherwise in the unit mentioned). 
If not found, return null.

PAGE TEXT:
{page_text}
"""

# Prompt for YOY Percentage Difference of Corporate Income Tax in 2024 (page 5)
yoy_corp_income_tax_prompt = """
You are a financial analyst. Extract the year-over-year percentage change of Corporate Income Tax from 2023 to 2024 from the following page text. 
Return only the percentage as a float (positive for increase, negative for decrease). 
If not found, return null.

PAGE TEXT:
{page_text}
"""

# Prompt for Total Amount of Top-ups in 2024 (page 20)
total_top_ups_prompt = """
You are a financial analyst. Extract the total amount of all top-ups to endowment and trust funds for FY2024 from the following page text. 
Sum all individual top-up amounts if presented separately. Return only the total as a float (in millions if specified). 
If not found, return null.

PAGE TEXT:
{page_text}
"""

# Prompt for List of Taxes in Operating Revenue Section (pages 5-6).
# The reply is parsed as JSON downstream, so the prompt asks for a JSON array
# explicitly and tells the model not to return the section heading as a tax.
# The example deliberately avoids curly braces, which str.format() would
# treat as placeholders.
operating_revenue_taxes_prompt = """
You are a financial analyst. Identify all types of taxes mentioned in the "Operating Revenue" section from the following page text. 
Return only a JSON array of the exact tax names as strings, for example ["Tax A", "Tax B"], with no extra text or markdown formatting.
Do not include section headings such as "Operating Revenue" itself. If not found, return an empty JSON array: [].

PAGE TEXT:
{page_text}
"""

In [35]:
# Optionally signed number that is not glued to a preceding word character
# (so the "2024" inside "FY2024" is skipped). Accepts thousands separators
# ("20,352"), plain integers, decimals, and bare decimals such as ".5".
_NUMBER_RE = re.compile(
    r"(?<![\w.])[-+]?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?"
    r"|(?<![\w.])[-+]?\.\d+"
)


def _parse_first_float(response_text) -> Optional[float]:
    """Return the first number found in an LLM reply, or None if there is none."""
    if not isinstance(response_text, str):
        return None
    # Treat the Unicode minus sign like an ASCII hyphen so negatives keep their sign.
    normalized = response_text.replace("\u2212", "-")
    match = _NUMBER_RE.search(normalized)
    if match is None:
        return None
    return float(match.group(0).replace(",", ""))


def _parse_string_list(response_text) -> List[str]:
    """Parse a list of strings from an LLM reply, tolerating text around the JSON array."""
    if not isinstance(response_text, str):
        return []
    # Replies are often wrapped in prose or a markdown fence; isolate the
    # outermost [...] span before handing it to the JSON parser.
    bracketed = re.search(r"\[.*\]", response_text, re.DOTALL)
    candidate = bracketed.group(0) if bracketed else response_text
    try:
        parsed = json.loads(candidate)
    except (TypeError, ValueError):
        parsed = None
    if isinstance(parsed, list):
        return [item.strip() for item in parsed if isinstance(item, str) and item.strip()]
    # Fallback: collect every double-quoted string in the reply.
    return re.findall(r'"([^"]+)"', response_text)


def _pages_text(pdf_text: dict, pages) -> str:
    """Join the text of the requested pages with newlines; raise KeyError if any is missing."""
    missing = [page for page in pages if page not in pdf_text]
    if missing:
        raise KeyError(
            f"Page(s) {missing} not found in pdf_text ({len(pdf_text)} page(s) available)"
        )
    return "\n".join(pdf_text[page] for page in pages)


def _ask_llm(llm, prompt_template: str, page_text: str, label: str):
    """Send one extraction prompt to the model, print the raw reply, and return it."""
    reply = llm.invoke([
        SystemMessage(content=DEFAULT_SYSTEM_MESSAGE),
        HumanMessage(content=prompt_template.format(page_text=page_text)),
    ]).content
    print(f"{label} Response: {reply}")
    return reply


def extract_financial_data_from_pdf(
    pdf_text: dict,
    llm,
    corp_income_tax_page: int = 5,
    top_ups_page: int = 20,
    operating_revenue_pages: tuple = (5, 6),
) -> FinancialExtractionOutput:
    """
    Extracts financial data from specific pages of the PDF using LLM prompts.

    One request is sent per field (four in total). Numeric replies are reduced
    to the first number they contain; a reply without a number (for example
    "null") yields None for that field.

    Args:
        pdf_text (dict): Dictionary mapping page numbers to page text.
        llm: The language model instance to use for extraction.
        corp_income_tax_page (int): Page used for the Corporate Income Tax
            amount and its year-on-year change. Defaults to 5.
        top_ups_page (int): Page used for the total top-ups. Defaults to 20.
        operating_revenue_pages (tuple): Pages whose text is joined for the
            Operating Revenue tax list. Defaults to (5, 6).
    Returns:
        FinancialExtractionOutput: Structured extraction result.
    Raises:
        KeyError: If a requested page number is not present in pdf_text.
    """
    # Resolve all page texts up front so a missing page fails before any
    # request is sent to the model.
    corp_tax_text = _pages_text(pdf_text, [corp_income_tax_page])
    top_ups_text = _pages_text(pdf_text, [top_ups_page])
    operating_revenue_text = _pages_text(pdf_text, operating_revenue_pages)

    # Corporate Income Tax Amount for 2024
    corporate_income_tax_2024 = _parse_first_float(
        _ask_llm(llm, corp_income_tax_prompt, corp_tax_text, "Corporate Income Tax")
    )

    # YOY Percentage Difference of Corporate Income Tax in 2024
    yoy_percentage_corp_income_tax_2024 = _parse_first_float(
        _ask_llm(llm, yoy_corp_income_tax_prompt, corp_tax_text, "YOY Corporate Income Tax")
    )

    # Total Amount of Top-ups in 2024
    total_top_ups_2024 = _parse_first_float(
        _ask_llm(llm, total_top_ups_prompt, top_ups_text, "Total Top-ups")
    )

    # List of Taxes in Operating Revenue Section
    operating_revenue_taxes = _parse_string_list(
        _ask_llm(llm, operating_revenue_taxes_prompt, operating_revenue_text, "Operating Revenue Taxes")
    )

    return FinancialExtractionOutput(
        corporate_income_tax_2024=corporate_income_tax_2024,
        yoy_percentage_corp_income_tax_2024=yoy_percentage_corp_income_tax_2024,
        total_top_ups_2024=total_top_ups_2024,
        operating_revenue_taxes=operating_revenue_taxes
    )

In [36]:
# Run the extraction over the parsed PDF pages with the configured model.
extracted_data = extract_financial_data_from_pdf(pdf_text=pdf_text, llm=llm)

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 2
Please retry in 45.233754798s. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-pro"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 2
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 45
}
].
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals

KeyboardInterrupt: 

In [None]:
# Display the extraction result in its plain-text form.
print(extracted_data)

corporate_income_tax_2024=None yoy_percentage_corp_income_tax_2024=None total_top_ups_2024=20352.0 operating_revenue_taxes=['Operating Revenue', 'Corporate Income Tax', 'Other Taxes', 'Personal Income Tax', 'Assets Taxes', 'Betting Taxes', 'Goods and Services Tax', 'Foreign Worker Levy', 'Water Conservation Tax', 'Land Betterment Charge', 'Annual Tonnage Tax', 'Casino Taxes']
