# 0. load imports

In [172]:
!pip install -r requirements.txt




[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [173]:
import json
import re
from typing import List, Optional
from pydantic import BaseModel, Field
from langchain_core.tools import tool
from langchain.schema import BaseMessage, HumanMessage, SystemMessage
from langchain.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
import os
import dotenv


# Load environment variables
dotenv.load_dotenv()

# Get API key
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

if not GEMINI_API_KEY:
    print("Warning: GEMINI_API_KEY not found in environment variables")
    print("Please make sure to set your API key in the .env file")
else:
    print("GEMINI_API_KEY loaded successfully")

print("Imports loaded successfully!")

GEMINI_API_KEY loaded successfully
Imports loaded successfully!


In [174]:
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

In [175]:
try:
    llm = ChatGoogleGenerativeAI(
        model="gemini-2.0-flash", 
        temperature=0, 
        api_key=GEMINI_API_KEY,
        convert_system_message_to_human=True
    )
    print("LLM initialized successfully!")
except Exception as e:
    print(f"Error initializing LLM: {e}")
    llm = None

LLM initialized successfully!


In [176]:
DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant that extracts financial data from documents."

In [177]:
res = llm.invoke([
        SystemMessage(content=DEFAULT_SYSTEM_MESSAGE),
        HumanMessage(content="Hey how are you?")
    ])
print(res.content)

I'm doing well, thank you for asking! I'm ready to assist you with extracting financial data from documents. Just let me know what you need. How can I help you today?


# 1. Document extraction & Prompt engineering

In [178]:
DATA_SOURCE_FILE_NAME = "fy2024_analysis_of_revenue_and_expenditure.pdf"

In [None]:
class FinancialExtractionOutput(BaseModel):
    """Structured output for financial data extraction from the PDF"""
    
    corporate_income_tax_2024: Optional[float] = Field(
        description="Amount of Corporate Income Tax in 2024 (in billions or appropriate unit)"
    )
    
    yoy_percentage_corp_income_tax_2024: Optional[float] = Field(
        description="Year-on-year percentage difference of Corporate Income Tax in 2024"
    )
    
    total_top_ups_2024: Optional[float] = Field(
        description="Total amount of top ups in 2024 (in billions or appropriate unit)"
    )
    
    operating_revenue_taxes: List[str] = Field(
        description="List of taxes mentioned in the Operating Revenue section"
    )

print("Imports loaded successfully!")

Imports loaded successfully!


In [180]:
import pymupdf  

with pymupdf.open(DATA_SOURCE_FILE_NAME) as doc:
    pdf_text = {}
    for page_num in range(len(doc)):
        page = doc[page_num]
        pdf_text[page_num + 1] = page.get_text()

# page 5
print(pdf_text[5])

MINISTRY OF FINANCE 
 
5 
 
01 Update on Financial Year 2023 
 
1.1 
Expected Overall Fiscal Position for FY2023  
 
The basic deficit is revised to $5.4 billion (0.8% of GDP). After factoring in Top-
ups to Endowment and Trust Funds of $24.3 billion, Net Investment Returns 
Contribution (NIRC) of $22.9 billion, Capitalisation of Nationally Significant 
Infrastructure of $3.5 billion, and Significant Infrastructure Government Loan 
Act (SINGA) Interest Costs and Loan Expenses of $0.2 billion, the Overall Fiscal 
Position for Revised FY2023 is a deficit of $3.6 billion (0.5% of GDP). This is larger 
than the $0.4 billion deficit estimated at Budget 2023. This is due to higher Total 
Expenditure to meet priority areas such as catching up of projects deferred due 
to COVID-19; funding for public healthcare institutions; and a top-up to the 
Majulah Package Fund to boost the retirement and healthcare adequacy of 
Singaporeans, as announced by Prime Minister Lee Hsien Loong at the National 

Separate prompts so we don't blow the context window, or exceed free tier input token limit

In [181]:
# Prompt for Corporate Income Tax Amount in 2024 (page 5)
corp_income_tax_prompt = """
Extract the exact amount of Corporate Income Tax specified from the following page text. 
Assume units is in billions, return only the number as a float.

PAGE TEXT:
{page_text}
"""

# Prompt for YOY Percentage Difference of Corporate Income Tax in 2024 (page 5)
yoy_corp_income_tax_prompt = """
Extract the year-over-year percentage change of Corporate Income Tax from from the following page text. The text may not specify the specific year. 
Assume units is in billions, return only the number as a float.

PAGE TEXT:
{page_text}
"""

# Prompt for Total Amount of Top-ups in 2024 (page 20)
total_top_ups_prompt = """
Extract the total amount of all top-ups to endowment and trust funds for FY2024 from the following page text.
Assume units is in billions, return only the number as a float.

PAGE TEXT:
{page_text}
"""

# Prompt for List of Taxes in Operating Revenue Section (pages 5-6)
operating_revenue_taxes_prompt = """
Identify all types of taxes mentioned in the "Operating Revenue" section from the following page text. 
Return a list of the exact tax names as strings.

PAGE TEXT:
{page_text}
"""

In [182]:
def extract_financial_data_from_pdf(pdf_text: dict, llm) -> FinancialExtractionOutput:
    """
    Extracts financial data from specific pages of the PDF using LLM prompts.
    Args:
        pdf_text (dict): Dictionary mapping page numbers to page text.
        llm: The language model instance to use for extraction.
    Returns:
        FinancialExtractionOutput: Structured extraction result.
    """
    # Corporate Income Tax Amount for 2024 (page 5)
    corp_income_tax_resp = llm.invoke([
        SystemMessage(content=DEFAULT_SYSTEM_MESSAGE),
        HumanMessage(content=corp_income_tax_prompt.format(page_text=pdf_text[5]))
    ]).content
    print(f"Corporate Income Tax Response: {corp_income_tax_resp}")
    try:
        corporate_income_tax_2024 = float(corp_income_tax_resp)
    except Exception:
        corporate_income_tax_2024 = None

    # YOY Percentage Difference of Corporate Income Tax in 2024 (page 5)
    yoy_corp_income_tax_resp = llm.invoke([
        SystemMessage(content=DEFAULT_SYSTEM_MESSAGE),
        HumanMessage(content=yoy_corp_income_tax_prompt.format(page_text=pdf_text[5]))
    ]).content
    print(f"YOY Corporate Income Tax Response: {yoy_corp_income_tax_resp}")
    try:
        yoy_percentage_corp_income_tax_2024 = float(yoy_corp_income_tax_resp)
    except Exception:
        yoy_percentage_corp_income_tax_2024 = None

    # Total Amount of Top-ups in 2024 (page 20)
    total_top_ups_resp = llm.invoke([
        SystemMessage(content=DEFAULT_SYSTEM_MESSAGE),
        HumanMessage(content=total_top_ups_prompt.format(page_text=pdf_text[20]))
    ]).content
    print(f"Total Top-ups Response: {total_top_ups_resp}")
    try:
        total_top_ups_2024 = float(total_top_ups_resp)
    except Exception:
        total_top_ups_2024 = None

    # List of Taxes in Operating Revenue Section (pages 5-6)
    operating_revenue_text = pdf_text[5] + "\n" + pdf_text[6]
    operating_revenue_taxes_resp = llm.invoke([
        SystemMessage(content=DEFAULT_SYSTEM_MESSAGE),
        HumanMessage(content=operating_revenue_taxes_prompt.format(page_text=operating_revenue_text))
    ]).content
    print(f"Operating Revenue Taxes Response: {operating_revenue_taxes_resp}")
    try:
        taxes = json.loads(operating_revenue_taxes_resp)
        if not isinstance(taxes, list):
            raise ValueError
        operating_revenue_taxes = taxes
    except Exception:
        operating_revenue_taxes = [operating_revenue_taxes_resp]

    return FinancialExtractionOutput(
        corporate_income_tax_2024=corporate_income_tax_2024,
        yoy_percentage_corp_income_tax_2024=yoy_percentage_corp_income_tax_2024,
        total_top_ups_2024=total_top_ups_2024,
        operating_revenue_taxes=operating_revenue_taxes
    )

In [183]:
extracted_data = extract_financial_data_from_pdf(pdf_text, llm)

Corporate Income Tax Response: 28.4
YOY Corporate Income Tax Response: 17.0
Total Top-ups Response: 20.352
Operating Revenue Taxes Response: Here's a list of the tax names mentioned in the "Operating Revenue" section of the provided text:

*   "Corporate Income Tax"
*   "Other Taxes"
*   "Personal Income Tax"
*   "Assets Taxes"
*   "Betting Taxes"
*   "Goods and Services Tax"
*   "Water Conservation Tax"
*   "Annual Tonnage Tax"
*   "Casino Taxes"


### Final output for extracted data

In [184]:
from pprint import pprint

pprint(extracted_data.dict())

{'corporate_income_tax_2024': 28.4,
 'operating_revenue_taxes': ["Here's a list of the tax names mentioned in the "
                             '"Operating Revenue" section of the provided '
                             'text:\n'
                             '\n'
                             '*   "Corporate Income Tax"\n'
                             '*   "Other Taxes"\n'
                             '*   "Personal Income Tax"\n'
                             '*   "Assets Taxes"\n'
                             '*   "Betting Taxes"\n'
                             '*   "Goods and Services Tax"\n'
                             '*   "Water Conservation Tax"\n'
                             '*   "Annual Tonnage Tax"\n'
                             '*   "Casino Taxes"'],
 'total_top_ups_2024': 20.352,
 'yoy_percentage_corp_income_tax_2024': 17.0}


C:\Users\riley\AppData\Local\Temp\ipykernel_10776\1254583981.py:3: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  pprint(extracted_data.dict())


# 2. Tool Calling & Reasoning Integration

In [None]:
from langchain.output_parsers import PydanticOutputParser
from datetime import datetime
from typing import Optional
from pydantic import BaseModel, Field


Define new tool with decorator

In [186]:


@tool
def normalize_submission_date(date_str: str) -> str:
    """
    Use this tool to convert dates into YYYY-MM-DD format.
    """
    try:
        dt = datetime.strptime(date_str.strip(), "%d %B %Y")
        return dt.strftime("%Y-%m-%d")
    except Exception as e:
        print(f"Error normalizing date: {e}")
        return None

Define a pydantic class to force structured output

In [187]:

class DateExtractionOutput(BaseModel):
    original_text: str = Field(description="Original text parsed into the LLM")
    normalized_date: Optional[str] = Field(description="Date in YYYY-MM-DD format, if found")
    status: Optional[str] = Field(description="Status of the date (Upcoming if > 2024-01-01, Expired if < 2024-01-01, Ongoing if = 2024-01-01), compared against the date of 2024-01-01")


In [192]:
# Create an agent that can use tools for date normalization with structured output
from langchain.agents import AgentExecutor, create_tool_calling_agent

agent_system_prompt = """You are a helpful assistant that processes text and normalizes dates.

CRITICAL INSTRUCTIONS - YOU MUST FOLLOW THESE EXACTLY:

1. TOOL USAGE IS MANDATORY:
   - When you find ANY date in "DD Month YYYY" format (e.g., "15 March 2024"), you MUST call the normalize_submission_date tool
   - DO NOT convert dates manually or use your own logic
   - DO NOT proceed to step 2 until you have called the tool and received the normalized date
   - If there is a date pattern, calling the tool is REQUIRED, not optional

2. DATE COMPARISON:
   After receiving the normalized date from the tool, compare it against 2024-01-01:
   - "Upcoming" if normalized_date > 2024-01-01
   - "Expired" if normalized_date < 2024-01-01
   - "Ongoing" if normalized_date = 2024-01-01

3. OUTPUT FORMAT:
   Return ONLY this JSON structure with no additional text:
   {{
       "original_text": "the original input text",
       "normalized_date": "YYYY-MM-DD from tool output, or null if no date found",
       "status": "Ongoing/Upcoming/Expired/null if no date found"
   }}

WORKFLOW:
Step 1: Analyze input for dates in "DD Month YYYY" format
Step 2: If date found → CALL normalize_submission_date tool (MANDATORY)
Step 3: Wait for tool response
Step 4: Use tool output to determine status
Step 5: Return JSON with results

If no date is found in the input, return the JSON with normalized_date and status as null.

Remember: You cannot determine the normalized date yourself. You MUST use the tool."""

# Create the prompt template for the agent
agent_prompt = ChatPromptTemplate.from_messages([
    ("system", agent_system_prompt),
    ("human", "{input}"),
    ("placeholder", "{agent_scratchpad}"),
])

# Create the tool list
tools = [normalize_submission_date]

# Create output parser for post-processing
output_parser = PydanticOutputParser(pydantic_object=DateExtractionOutput)

# Create the agent
agent = create_tool_calling_agent(llm, tools, agent_prompt)

agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

# Wrapper function to parse agent output
def process_with_structured_output(text: str) -> DateExtractionOutput:
    result = agent_executor.invoke({"input": text})
    # Parse the agent's output into structured format
    return output_parser.parse(result["output"])

print("Agent with structured output created successfully!")

Agent with structured output created successfully!


Test process_with_structured_output

In [193]:
result_structured = process_with_structured_output("The submission was on 15 March 2024")




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `normalize_submission_date` with `{'date_str': '15 March 2024'}`


[0m[36;1m[1;3m2024-03-15[0m[32;1m[1;3m```json
{
    "original_text": "The submission was on 15 March 2024",
    "normalized_date": "2024-03-15",
    "status": "Upcoming"
}
```[0m

[1m> Finished chain.[0m


Test with pdf_text page 1 and 36

In [196]:

page_1_result = process_with_structured_output(pdf_text[1])


print(page_1_result)
print("\n---------\n")
print("Original text:", page_1_result.original_text)
print("Normalized date:", page_1_result.normalized_date)
print("Status:", page_1_result.status)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `normalize_submission_date` with `{'date_str': '16 February 2024'}`


[0m[36;1m[1;3m2024-02-16[0m[32;1m[1;3m```json
{
    "original_text": "REVENUE AND \nEXPENDITURE \nFinancial Year 2024  \n \n \n \n \n \n \n \n \nDistributed on Budget Day: 16 February 2024",
    "normalized_date": "2024-02-16",
    "status": "Upcoming"
}
```[0m

[1m> Finished chain.[0m
original_text='REVENUE AND \nEXPENDITURE \nFinancial Year 2024  \n \n \n \n \n \n \n \n \nDistributed on Budget Day: 16 February 2024' normalized_date='2024-02-16' status='Upcoming'

---------

Original text: REVENUE AND 
EXPENDITURE 
Financial Year 2024  
 
 
 
 
 
 
 
 
Distributed on Budget Day: 16 February 2024
Normalized date: 2024-02-16
Status: Upcoming


**NOTE: Sometimes the LLM may not invoke the tool, highly depends on the model capabilities**

In [197]:

page_36_result = process_with_structured_output(pdf_text[36])


print(page_36_result)
print("\n---------\n")
print("Original text:", page_36_result.original_text)
print("Normalized date:", page_36_result.normalized_date)
print("Status:", page_36_result.status)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
       "original_text": "MINISTRY OF FINANCE \n \n36 \n \nGlossary of Terms \n \nAssets Taxes  \nAssets Taxes refer to Property Tax and Estate \nDuty. Property Tax is a tax on the ownership of \nproperty and is payable by all property owners \non the properties owned by them. Estate Duty \nis a tax on the total market value of a person's \nassets (cash and non-cash) at the date of his or \nher death. Estate Duty does not apply to a \nperson who dies after 15 February 2008.  \n \nCustoms Duties \nTaxes on goods imported into Singapore. In \nSingapore, Customs Duties are principally \nimposed on alcoholic beverages. \n \nDevelopment Expenditure \nExpenses \nthat \nrepresent \na \nlonger-term \ninvestment and/or are incurred on capital assets \nin respect of or in connection with the economic \ndevelopment or general welfare of Singapore. \nExamples of spending areas are on the \nacquisition of heavy equipment and capi

# 3. Multi-Agent supervisor

1. Define a state to store information
    - query: Original user question (string)
    - revenue_info: Revenue data extracted (string/dict, initially None)
    - expenditure_info: Expenditure data extracted (string/dict, initially None)
    - supervisor_decision: Which agent(s) to call (list of strings)
    - final_answer: Aggregated response (string, initially None)
    - messages: Conversation history (list of messages)
2. Create the 3 agent nodes
    - Supervisor agent:  to determine which nodes are required
    - Revenue agent: Focuses on revenue related queries. Uses pdf_data as the tool
    - Expenditure agent: Focuses on Expenditure related queries. Uses pdf_data as the tool
3. Design workflow graph
    ```
    ↓
    [Supervisor Agent] ← Entry point, analyzes query
    ↓
    [Conditional Edge] ← Routes based on supervisor_decision
    ↓↓
    ↓└─→ [Revenue Agent] (if "revenue" in decision)
    ↓
    └──→ [Expenditure Agent] (if "expenditure" in decision)
    
    Both agents can run in parallel ↓↓
    
    [Aggregator Node] ← Combines revenue_info + expenditure_info
    ↓
    [Final Answer Generation] ← Creates comprehensive response
    ↓
    END
    ```

In [250]:
from typing import TypedDict, Annotated, Literal
from langgraph.graph import MessagesState
from langchain_core.messages import BaseMessage
import operator
from langchain_core.prompts import ChatPromptTemplate
from langgraph.graph import StateGraph, END
from langchain.tools import tool
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage, ToolMessage
from langchain_core.prompts import ChatPromptTemplate

## Step 1. State creation

In [251]:
class BudgetAnalysisState(TypedDict):
    """
    Shared state that flows through all agents in the graph.
    Each agent reads from and writes to this state.
    """
    
    # Original user query
    query: str
    
    # Supervisor's routing decision
    supervisor_decision: Annotated[list[str], operator.add]  # Can accumulate decisions
    
    # Revenue agent's findings
    revenue_info: str  
    
    # Expenditure agent's findings
    expenditure_info: str  
    
    # Final aggregated answer
    final_answer: str
    
    # Messages for conversation history (inherited from MessagesState pattern)
    messages: Annotated[list[BaseMessage], operator.add]

In [252]:
# Test: Create an initial state to verify schema
initial_state = BudgetAnalysisState(
    query="What are the key government revenue streams, and how will the Budget for the Future Energy Fund be supported?",
    supervisor_decision=[],
    revenue_info="",
    expenditure_info="",
    final_answer="",
    messages=[]
)

In [253]:
@tool
def get_pdf_data() -> dict:
    """
    Retrieves the complete PDF text data containing government budget information.
    
    Returns a dictionary where:
    - Keys are page numbers (integers)
    - Values are the text content of each page (strings)
    
    This data contains information about government revenue streams, expenditure,
    budget allocations, and specific fund details.
    """
    return pdf_text

## Step 2. Agent node creation

In [254]:
def supervisor_agent(state: BudgetAnalysisState) -> BudgetAnalysisState:
    """
    Supervisor Agent: Analyzes the query and determines which specialized 
    agents (revenue and/or expenditure) should be called.
    """
    
    supervisor_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are a Supervisor Agent for a budget analysis system. Your role is to analyze user queries and determine which specialized agents should handle them.

You have two specialized agents available:
1. REVENUE AGENT - Handles queries about government income, tax revenues, revenue streams, income sources, and earnings
2. EXPENDITURE AGENT - Handles queries about government spending, budget allocations, funds, expenses, and financial disbursements

ANALYSIS RULES:
- If the query asks about revenue, income, or earnings → route to "revenue"
- If the query asks about expenditure, spending, budgets, or funds → route to "expenditure"  
- If the query asks about BOTH revenue AND expenditure → route to BOTH ["revenue", "expenditure"]
- Be thorough: if a query mentions both aspects implicitly, route to both agents

RESPONSE FORMAT:
You must respond with ONLY a JSON object in this exact format:
{{
    "reasoning": "Brief explanation of your routing decision",
    "agents_needed": ["revenue"] OR ["expenditure"] OR ["revenue", "expenditure"]
}}

Do not include any other text. Only return the JSON object."""),
        ("human", "Query to analyze: {query}")
    ])
    
    # Invoke LLM
    chain = supervisor_prompt | llm
    response = chain.invoke({"query": state["query"]})
    
    # Parse response to extract agents_needed
    import json
    try:
        # Extract JSON from response
        response_text = response.content
        json_start = response_text.find('{')
        json_end = response_text.rfind('}') + 1
        json_str = response_text[json_start:json_end]
        decision_data = json.loads(json_str)
        
        agents_needed = decision_data.get("agents_needed", [])
        reasoning = decision_data.get("reasoning", "No reasoning provided")
        
        print(f"\n🎯 SUPERVISOR DECISION:")
        print(f"   Reasoning: {reasoning}")
        print(f"   Agents needed: {agents_needed}")
        
    except Exception as e:
        print(f"⚠️  Error parsing supervisor response: {e}")
        # Default: call both agents if parsing fails
        agents_needed = ["revenue", "expenditure"]
    
    # Update state
    return {
        **state,
        "supervisor_decision": agents_needed,
        "messages": state["messages"] + [response]
    }

In [None]:
def revenue_agent_node(state: BudgetAnalysisState) -> BudgetAnalysisState:
    """
    Revenue Agent: Specializes in extracting and analyzing government revenue 
    information using PDF data tool.
    """
    
    revenue_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are a Revenue Agent specializing in government revenue analysis. Your expertise includes:
- Identifying government revenue streams (taxes, fees, charges, levies)
- Analyzing income sources (corporate tax, GST, personal income tax, etc.)
- Understanding revenue projections and forecasts
- Recognizing revenue policies and changes

INSTRUCTIONS:
1. FIRST, you MUST call the get_pdf_data tool to retrieve the budget document data
2. Search through the PDF data for ANY information related to government revenue or income
3. Extract and summarize ALL relevant revenue information, including:
   - Types of revenue streams mentioned
   - Specific revenue sources and their amounts
   - Revenue amounts or figures with exact numbers
   - Revenue policies or changes
   - Revenue projections or forecasts
4. Be comprehensive but concise
5. Cite specific pages when referencing information (e.g., "According to page 1...")

CRITICAL: You MUST use the get_pdf_data tool to access the budget document. Do not rely on general knowledge.

RESPONSE FORMAT:
Provide a clear, structured summary of revenue findings with page citations. Use bullet points for clarity.

Example response format:
"Revenue Information Found:
- Primary revenue stream (Page 1): Corporate taxation - $45 billion
- GST revenue (Page 1): Projected at $38 billion
- Total government revenue (Page 4): $156 billion projected for 2024-2025"

Now analyze the budget document for revenue information."""),
        ("human", "User Query: {query}\n\nUse the get_pdf_data tool to retrieve budget information, then extract all revenue-related information that answers this query.")
    ])
    
    # Create LLM with tool binding
    llm_with_tools = llm.bind_tools([get_pdf_data])
    
    # Invoke LLM
    chain = revenue_prompt | llm_with_tools
    response = chain.invoke({"query": state["query"]})
    
    # Check if tool was called
    messages = [response]
    
    if response.tool_calls:
        print(f"\n💰 REVENUE AGENT: Calling tool to get PDF data...")
        
        # Execute tool calls
        for tool_call in response.tool_calls:
            tool_name = tool_call["name"]
            tool_args = tool_call["args"]
            
            print(f"   Tool: {tool_name}")
            
            # Execute the tool
            if tool_name == "get_pdf_data":
                tool_result = get_pdf_data.invoke(tool_args)
                
                # Create tool message
                tool_message = ToolMessage(
                    content=str(tool_result),
                    tool_call_id=tool_call["id"]
                )
                messages.append(tool_message)
        
        # Get final response after tool execution
        final_response = llm.invoke(messages + [
            HumanMessage(content="Based on the PDF data retrieved, provide your comprehensive revenue analysis with specific figures and page citations.")
        ])
        
        revenue_findings = final_response.content
        messages.append(final_response)
    else:
        print(f"\n⚠️  REVENUE AGENT: Tool not called - using direct response")
        revenue_findings = response.content
    
    print(f"\n💰 REVENUE AGENT FINDINGS:")
    print(f"   {revenue_findings[:300]}...")
    
    # Update state
    del state["query"] # Prevent duplicate state assignment
    del state["expenditure_info"] # Prevent duplicate state assignment
    return {
        **state,
        "revenue_info": revenue_findings,
        "messages": state["messages"] + messages
    }


In [None]:
def expenditure_agent_node(state: BudgetAnalysisState) -> BudgetAnalysisState:
    """
    Expenditure Agent: Specializes in extracting and analyzing government 
    expenditure information using PDF data tool.
    """
    
    expenditure_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are an Expenditure Agent specializing in government spending analysis. Your expertise includes:
- Identifying budget allocations and spending commitments
- Analyzing expenditure across different sectors (education, healthcare, defense, etc.)
- Tracking specific funds and their purposes
- Understanding spending figures and amounts with precision
- Recognizing budget changes and spending policies

INSTRUCTIONS:
1. FIRST, you MUST call the get_pdf_data tool to retrieve the budget document data
2. Search through the PDF data for ANY information related to government expenditure or spending
3. Extract and summarize ALL relevant expenditure information, including:
   - Specific funds mentioned (e.g., "Future Energy Fund") with exact allocations
   - Budget allocations and amounts with precise figures
   - Spending categories or sectors
   - Expenditure policies or changes
   - Financial support mechanisms and their sources
4. Be PRECISE with numerical figures - accuracy is critical
5. Cite specific pages when referencing information (e.g., "According to page 2...")

CRITICAL: You MUST use the get_pdf_data tool to access the budget document. Do not rely on general knowledge.

RESPONSE FORMAT:
Provide a clear, structured summary of expenditure findings with page citations. Use bullet points for clarity.

Example response format:
"Expenditure Information Found:
- Future Energy Fund (Page 2): $250 million allocation for 2024-2025
- Fund breakdown (Page 2): 65% solar energy, 35% wind power
- Support mechanisms (Page 3): 40% from carbon tax, $100M green bonds, $50M fossil fuel subsidy reallocation
- Total budget priorities (Page 5): Healthcare $28B, Education $24B, Defense $18B"

Now analyze the budget document for expenditure information."""),
        ("human", "User Query: {query}\n\nUse the get_pdf_data tool to retrieve budget information, then extract all expenditure-related information that answers this query.")
    ])
    
    # Create LLM with tool binding
    llm_with_tools = llm.bind_tools([get_pdf_data])
    
    # Invoke LLM
    chain = expenditure_prompt | llm_with_tools
    response = chain.invoke({"query": state["query"]})
    
    # Check if tool was called
    messages = [response]
    
    if response.tool_calls:
        print(f"\n💸 EXPENDITURE AGENT: Calling tool to get PDF data...")
        
        # Execute tool calls
        for tool_call in response.tool_calls:
            tool_name = tool_call["name"]
            tool_args = tool_call["args"]
            
            print(f"   Tool: {tool_name}")
            
            # Execute the tool
            if tool_name == "get_pdf_data":
                tool_result = get_pdf_data.invoke(tool_args)
                
                # Create tool message
                tool_message = ToolMessage(
                    content=str(tool_result),
                    tool_call_id=tool_call["id"]
                )
                messages.append(tool_message)
        
        # Get final response after tool execution
        final_response = llm.invoke(messages + [
            HumanMessage(content="Based on the PDF data retrieved, provide your comprehensive expenditure analysis with specific figures and page citations.")
        ])
        
        expenditure_findings = final_response.content
        messages.append(final_response)
    else:
        print(f"\n⚠️  EXPENDITURE AGENT: Tool not called - using direct response")
        expenditure_findings = response.content
    
    print(f"\n💸 EXPENDITURE AGENT FINDINGS:")
    print(f"   {expenditure_findings[:300]}...")
    
    # Update state
    del state["revenue_info"] # Prevent duplicate state assignment
    del state["final_answer"] # Prevent duplicate state assignment
    return {
        **state,
        "expenditure_info": expenditure_findings,
        "messages": state["messages"] + messages
    }


In [265]:
def aggregator_node(state: BudgetAnalysisState) -> BudgetAnalysisState:
    """
    Aggregator Node: Combines information from revenue and expenditure agents
    into a comprehensive final answer.
    """
    
    aggregator_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are an Aggregator Agent that synthesizes information from specialized agents into a comprehensive response.

You will receive:
1. The original user query
2. Revenue information (from Revenue Agent) - may be empty if not applicable
3. Expenditure information (from Expenditure Agent) - may be empty if not applicable

YOUR TASK:
- Combine the revenue and expenditure information into a coherent, well-structured answer
- Address the original query directly and completely
- Maintain accuracy - do not add information not provided by the agents
- If only one type of information is available, provide a complete answer with that information
- Use clear sections or paragraphs to organize the response
- Be professional and concise

RESPONSE FORMAT:
Provide a natural, flowing response that addresses the user's query. Structure your response logically.

Example:
"Based on the analysis:

Revenue Streams:
[Revenue agent findings here]

Budget Support:
[Expenditure agent findings here]

In summary, [brief conclusion]"

Now synthesize the information below."""),
        ("human", """Original Query: {query}

Revenue Information:
{revenue_info}

Expenditure Information:
{expenditure_info}

Please provide a comprehensive answer to the original query by synthesizing the above information.""")
    ])
    
    # Invoke LLM
    chain = aggregator_prompt | llm
    response = chain.invoke({
        "query": state["query"],
        "revenue_info": state.get("revenue_info", "No revenue information available"),
        "expenditure_info": state.get("expenditure_info", "No expenditure information available")
    })
    
    final_answer = response.content
    
    print(f"\n📊 AGGREGATOR - FINAL ANSWER GENERATED")
    print(f"   Length: {len(final_answer)} characters")
    
    # Update state with final answer
    return {
        **state,
        "final_answer": final_answer,
        "messages": state["messages"] + [response]
    }

# Step 3. Create the workflow

In [266]:
def route_query(state: BudgetAnalysisState) -> list[str]:
    """
    Routing function that determines which agent nodes to execute next
    based on the supervisor's decision.
    """
    decision = state.get("supervisor_decision", [])
    
    print(f"\n🔀 ROUTING DECISION: {decision}")
    
    next_nodes = []
    
    if "revenue" in decision:
        next_nodes.append("revenue_agent")
    
    if "expenditure" in decision:
        next_nodes.append("expenditure_agent")
    
    # If no specific routing, go directly to aggregator
    if not next_nodes:
        next_nodes.append("aggregator")
    
    print(f"   Routing to: {next_nodes}")
    
    return next_nodes


In [None]:

# For supervisor if there's loop
def should_continue(state: BudgetAnalysisState) -> str:
    """
    Determines if we should continue to aggregator or end.
    This checks if both required agents have completed their work.
    """
    decision = state.get("supervisor_decision", [])
    revenue_completed = bool(state.get("revenue_info"))
    expenditure_completed = bool(state.get("expenditure_info"))
    
    # Check if all required agents have completed
    if "revenue" in decision and not revenue_completed:
        return "continue"
    if "expenditure" in decision and not expenditure_completed:
        return "continue"
    
    # All required agents completed, go to aggregator
    return "aggregator"

In [268]:
def build_graph():
    """
    Builds and compiles the multi-agent budget analysis graph.
    
    Returns:
        Compiled LangGraph application ready for invocation
    """
    
    print("="*70)
    print("BUILDING MULTI-AGENT GRAPH")
    print("="*70)
    
    # Initialize the StateGraph
    workflow = StateGraph(BudgetAnalysisState)
    
    # Add all nodes to the graph
    workflow.add_node("supervisor", supervisor_agent)
    workflow.add_node("revenue_agent", revenue_agent_node)
    workflow.add_node("expenditure_agent", expenditure_agent_node)
    workflow.add_node("aggregator", aggregator_node)
    
    print("✅ Nodes added to graph:")
    print("   1. supervisor")
    print("   2. revenue_agent")
    print("   3. expenditure_agent")
    print("   4. aggregator")
    
    # Set the entry point
    workflow.set_entry_point("supervisor")
    print("\n✅ Entry point set: supervisor")
    
    # Add conditional edges from supervisor based on routing decision
    workflow.add_conditional_edges(
        "supervisor",
        route_query,
        {
            "revenue_agent": "revenue_agent",
            "expenditure_agent": "expenditure_agent",
            "aggregator": "aggregator"
        }
    )
    print("\n✅ Conditional edges added from supervisor")
    
    # Add edges from specialized agents to aggregator
    workflow.add_edge("revenue_agent", "aggregator")
    workflow.add_edge("expenditure_agent", "aggregator")
    print("✅ Edges added: revenue_agent → aggregator")
    print("✅ Edges added: expenditure_agent → aggregator")
    
    # Add edge from aggregator to END
    workflow.add_edge("aggregator", END)
    print("✅ Edge added: aggregator → END")
    
    # Compile the graph
    app = workflow.compile()
    print("\n" + "="*70)
    print("🎉 GRAPH COMPILED SUCCESSFULLY!")
    print("="*70)
    
    return app

In [269]:
app = build_graph()

BUILDING MULTI-AGENT GRAPH
✅ Nodes added to graph:
   1. supervisor
   2. revenue_agent
   3. expenditure_agent
   4. aggregator

✅ Entry point set: supervisor

✅ Conditional edges added from supervisor
✅ Edges added: revenue_agent → aggregator
✅ Edges added: expenditure_agent → aggregator
✅ Edge added: aggregator → END

🎉 GRAPH COMPILED SUCCESSFULLY!


In [270]:
result = app.invoke(initial_state)


🎯 SUPERVISOR DECISION:
   Reasoning: The query asks about 'key government revenue streams' (revenue) and how the 'Budget for the Future Energy Fund' will be supported (expenditure). Therefore, both agents are needed.
   Agents needed: ['revenue', 'expenditure']

🔀 ROUTING DECISION: ['revenue', 'expenditure']
   Routing to: ['revenue_agent', 'expenditure_agent']

💸 EXPENDITURE AGENT: Calling tool to get PDF data...
   Tool: get_pdf_data

💰 REVENUE AGENT: Calling tool to get PDF data...
   Tool: get_pdf_data

💸 EXPENDITURE AGENT FINDINGS:
   Okay, I will provide a comprehensive expenditure analysis based on the provided PDF data, including specific figures and page citations.

**Overall Expenditure Trends**

*   **FY2023 (Revised):** Total Expenditure is revised to $106.9 billion (Page 6), which is $2.7 billion (2.6%) higher than the Es...

💰 REVENUE AGENT FINDINGS:
   Okay, here's a revenue analysis based on the provided PDF data, with specific figures and page citations.

**Overall Re

In [274]:
print(result["final_answer"])

Based on the analysis:

**Key Government Revenue Streams:**

*   **Overall Revenue:** Total Operating Revenue was $104.3 billion in FY2023 (Revised) and is projected to be $108.6 billion in FY2024.
*   **Corporate Income Tax:** $28.38 billion (FY2023 Revised), $28.03 billion (FY2024 Estimated).
*   **Personal Income Tax:** $17.53 billion (FY2023 Revised), $18.07 billion (FY2024 Estimated).
*   **Goods and Services Tax (GST):** $16.36 billion (FY2023 Revised), $19.39 billion (FY2024 Estimated).
*   **Assets Taxes:** $5.92 billion (FY2023 Revised), $6.67 billion (FY2024 Estimated).
*   **Motor Vehicle Taxes:** $2.60 billion (FY2023 Revised), $2.84 billion (FY2024 Estimated).
*   **Vehicle Quota Premiums:** $4.66 billion (FY2023 Revised), $4.72 billion (FY2024 Estimated).

**Budget Support for the Future Energy Fund:**

*   The Future Energy Fund will receive a top-up of $5.0 billion in FY2024. This is part of the total $20.4 billion top-ups to Endowment and Trust Funds.
