# Using Structured LLMs

In [1]:
from datetime import datetime
from pydantic import BaseModel
from pydantic.fields import Field

# Schema for a single extracted line item.
# NOTE: the class docstring and the Field descriptions below are part of the
# model's JSON schema (this notebook dumps that schema into a prompt and also
# turns the model into an LLM tool), so treat them as prompt text, not as
# ordinary comments.
class LineItem(BaseModel):
    """A line item in an invoice."""

    # Free-text label of the item.
    item_name: str = Field(description="The name of this item")
    # Plain float with no currency or scale attached; the sample outputs in
    # this notebook show the same figure extracted both as 456800000000.0 and
    # as 456.8, so downstream code should not assume a unit.
    price: float = Field(description="The price of this item")


# Top-level extraction target used by every structured-output call in this
# notebook. As with LineItem, the docstring and Field descriptions end up in
# the JSON schema, so they act as instructions to the model.
class Invoice(BaseModel):
    """A representation of information from an invoice."""

    # Identifier as a string; the description tells the model it is often
    # numeric, but no format is enforced.
    invoice_id: str = Field(
        description="A unique identifier for this invoice, often a number"
    )
    # NOTE(review): the sample outputs in this notebook contain both a
    # UTC-aware datetime and a naive one for this field, depending on what the
    # model returned — normalise before comparing values.
    date: datetime = Field(description="The date this invoice was created")
    # Nested LineItem models; nothing here sets a minimum length.
    line_items: list[LineItem] = Field(
        description="A list of all the items in this invoice"
    )

In [2]:
from llama_index.readers.file import PDFReader
from pathlib import Path

import os

# Resolve the sample PDF relative to a configurable data directory rather than
# a machine-specific absolute path. Set the DATA_DIR environment variable to
# point at another location; by default a `data/` folder next to the notebook's
# working directory is used.
DATA_DIR = Path(os.environ.get("DATA_DIR", "data"))
pdf_path = DATA_DIR / "2023_canadian_budget.pdf"

pdf_reader = PDFReader()
documents = pdf_reader.load_data(file=pdf_path)
# Only the first returned document is used as input for the cells below.
text = documents[0].text
text

'‹\xa02022 2024›\n2023 budget of the\nCanadian federal\ngovernment\nSubmitted 28 March 2023\nPresented 28 March 2023\nParliament 44th\nParty Liberal\nFinance\nminister\nChrystia Freeland\nTotal revenue $456.8 billion\n(projected)\nTotal\nexpenditures\n$496.9 billion\n(projected)\nDeﬁcit $40.1 billion\n(projected)[ 1 ] \nGDP TBA\nWebsite 2023 Budget (http\ns://www.budget.can\nada.ca/2023/home-\naccueil-en.html)\n2023 Canadian federal budget\nThe Canadian federal budget for the fiscal years of 2023–24 was\npresented to the House of Commons by Finance Minister Chrystia\nFreeland on 28 March 2023.[2] The budget was meant to reflect\nPrime Minister Justin Trudeau\'s stated policy objective to "make\nlife more affordable for Canadians"[3] while also reducing\ngovernment expenditures.[4]\nThe 2023 budget is the seventh budget document introduced in\nthe House of Commons under the premiership of Justin Trudeau.\nIt comes at the heel of the first anniversary of the Russian\ninvasion of Ukraine,

In [3]:
import dotenv
# Load settings from a local .env file into the environment. Presumably this
# provides the OpenAI credentials needed by the cells below — confirm. The
# return value is displayed as the cell output.
dotenv.load_dotenv()

True

In [4]:
from llama_index.llms.openai import OpenAI

# Base LLM client; later cells also call methods on `llm` directly.
llm = OpenAI(model="gpt-4o")
# Wrap the client so that completions are produced against the Invoice schema.
sllm = llm.as_structured_llm(Invoice)

# The reply is read in two ways further down: `.text` is parsed as JSON and
# `.raw` is pretty-printed as an Invoice instance.
response = sllm.complete(text)

In [5]:
import json

In [6]:
# Decode the JSON text of the structured completion, then render it with
# two-space indentation for reading.
json_response = json.loads(response.text)
formatted_response = json.dumps(json_response, indent=2)
print(formatted_response)

{
  "invoice_id": "2023-CAN-BUDGET",
  "date": "2023-03-28T00:00:00Z",
  "line_items": [
    {
      "item_name": "Total Revenue",
      "price": 456800000000.0
    },
    {
      "item_name": "Total Expenditures",
      "price": 496900000000.0
    },
    {
      "item_name": "Deficit",
      "price": 40100000000.0
    },
    {
      "item_name": "Net New Spending (6 years)",
      "price": 43000000000.0
    },
    {
      "item_name": "Green Technology Tax Credit",
      "price": 20000000000.0
    },
    {
      "item_name": "Means-Tested Dental Care Program",
      "price": 13000000000.0
    },
    {
      "item_name": "Grocery Rebate",
      "price": 467.0
    },
    {
      "item_name": "Canada Student Grants Increase",
      "price": 0.0
    },
    {
      "item_name": "Spending Cuts",
      "price": -15000000000.0
    },
    {
      "item_name": "Stock Buybacks Tax Revenue",
      "price": 0.0
    }
  ]
}


In [7]:
from pprint import pprint

# `.raw` carries the parsed object itself (an Invoice in the output below),
# as opposed to the JSON string available through `.text`.
pprint(response.raw)

Invoice(invoice_id='2023-CAN-BUDGET', date=datetime.datetime(2023, 3, 28, 0, 0, tzinfo=TzInfo(UTC)), line_items=[LineItem(item_name='Total Revenue', price=456800000000.0), LineItem(item_name='Total Expenditures', price=496900000000.0), LineItem(item_name='Deficit', price=40100000000.0), LineItem(item_name='Net New Spending (6 years)', price=43000000000.0), LineItem(item_name='Green Technology Tax Credit', price=20000000000.0), LineItem(item_name='Means-Tested Dental Care Program', price=13000000000.0), LineItem(item_name='Grocery Rebate', price=467.0), LineItem(item_name='Canada Student Grants Increase', price=0.0), LineItem(item_name='Spending Cuts', price=-15000000000.0), LineItem(item_name='Stock Buybacks Tax Revenue', price=0.0)])


In [8]:
from llama_index.core.prompts import PromptTemplate

# Template with two placeholders, {company_name} and {text}; both are filled
# from the keyword arguments of structured_predict below.
prompt = PromptTemplate(
    "Extract an invoice from the following text. "
    "If you cannot find an invoice ID, use the company name "
    "'{company_name}' and the date as the invoice ID: {text}"
)

# Note that `response` is rebound here to the value returned by
# structured_predict (displayed as an Invoice in the next cell).
response = llm.structured_predict(
    Invoice,
    prompt,
    company_name="Uber",
    text=text,
)

In [9]:
# Display the value returned by structured_predict (rendered below as an
# Invoice repr).
response

Invoice(invoice_id='Uber-2023-03-28', date=datetime.datetime(2023, 3, 28, 0, 0), line_items=[LineItem(item_name='Total revenue', price=456800000000.0), LineItem(item_name='Total expenditures', price=496900000000.0), LineItem(item_name='Deficit', price=40100000000.0)])

In [10]:
# Serialise the Invoice to JSON, then round-trip it through the json module
# purely to get an indented rendering.
json_output = response.model_dump_json()
parsed_output = json.loads(json_output)
print(json.dumps(parsed_output, indent=2))

{
  "invoice_id": "Uber-2023-03-28",
  "date": "2023-03-28T00:00:00",
  "line_items": [
    {
      "item_name": "Total revenue",
      "price": 456800000000.0
    },
    {
      "item_name": "Total expenditures",
      "price": 496900000000.0
    },
    {
      "item_name": "Deficit",
      "price": 40100000000.0
    }
  ]
}


# Structured Prediction

In [11]:
from llama_index.core.prompts import PromptTemplate

# Prompt template for one-shot structured prediction; {company_name} and
# {text} are substituted from the keyword arguments passed below.
prompt = PromptTemplate(
    "Extract an invoice from the following text. "
    "If you cannot find an invoice ID, use the company name "
    "'{company_name}' and the date as the invoice ID: {text}"
)

# structured_predict formats the template and returns the result as the
# requested model class (serialised with model_dump_json in the next cell).
response = llm.structured_predict(
    Invoice,
    prompt,
    company_name="Uber",
    text=text,
)

In [12]:
# Convert the predicted Invoice to a JSON string and re-indent it for display.
json_output = response.model_dump_json()
parsed_output = json.loads(json_output)
print(json.dumps(parsed_output, indent=2))

{
  "invoice_id": "Uber_2023-03-28",
  "date": "2023-03-28T00:00:00Z",
  "line_items": [
    {
      "item_name": "Total revenue",
      "price": 456800000000.0
    },
    {
      "item_name": "Total expenditures",
      "price": 496900000000.0
    },
    {
      "item_name": "Deficit",
      "price": 40100000000.0
    },
    {
      "item_name": "Net new spending over six years",
      "price": 43000000000.0
    },
    {
      "item_name": "Refundable tax credit for green technologies",
      "price": 20000000000.0
    },
    {
      "item_name": "Means-tested dental care program",
      "price": 13000000000.0
    },
    {
      "item_name": "Grocery rebate for eligible families",
      "price": 467.0
    },
    {
      "item_name": "Grocery rebate for eligible single people",
      "price": 234.0
    },
    {
      "item_name": "Spending cuts",
      "price": 15000000000.0
    }
  ]
}


# Low-level structured data extraction

## Calling tools directly

In [14]:
from llama_index.core.program.function_program import get_function_tool

# Expose the Invoice model as a tool the LLM may call with matching arguments.
tool = get_function_tool(Invoice)
# NOTE: this rebinds `llm`; every later cell that uses `llm` now talks to
# gpt-3.5-turbo instead of the model configured earlier.
llm = OpenAI(model="gpt-3.5-turbo")
resp = llm.chat_with_tools(
    [tool],
    # chat_history=chat_history,  # can optionally pass in chat history instead of user_msg
    user_msg="Extract an invoice from the following text: " + text,
    # tool_choice="Invoice",  # can optionally force the tool call
)

# The tool call is not forced above, so the model may answer without calling
# the tool; error_on_no_tool_calls=False asks the library not to raise then.
tool_calls = llm.get_tool_calls_from_response(
    resp, error_on_no_tool_calls=False
)

# Validate the arguments of every Invoice tool call into an Invoice instance.
outputs = []
for tool_call in tool_calls:
    if tool_call.tool_name == "Invoice":
        outputs.append(Invoice(**tool_call.tool_kwargs))

# use your outputs
# `outputs` can legitimately be empty (see above), so guard the index instead
# of failing with an IndexError.
if outputs:
    print(outputs[0])
else:
    print("The model did not return an Invoice tool call.")

invoice_id='2023 Canadian federal budget' date=datetime.datetime(2023, 3, 28, 0, 0, tzinfo=TzInfo(UTC)) line_items=[LineItem(item_name='Total revenue', price=456.8), LineItem(item_name='Total expenditures', price=496.9), LineItem(item_name='Deficit', price=40.1)]


In [16]:
from llama_index.core.program.function_program import get_function_tool

# Register LineItem itself as the tool, so each extracted item can arrive as
# a separate tool call.
tool = get_function_tool(LineItem)

resp = llm.chat_with_tools(
    [tool],
    allow_parallel_tool_calls=True,
    user_msg="Extract line items from the following text: " + text,
)

tool_calls = llm.get_tool_calls_from_response(
    resp, error_on_no_tool_calls=False
)

# Keep only the calls addressed to the LineItem tool and validate their
# arguments into LineItem instances.
outputs = [
    LineItem(**call.tool_kwargs)
    for call in tool_calls
    if call.tool_name == "LineItem"
]

# use your outputs
print(outputs)

[LineItem(item_name='Total revenue', price=456.8)]


In [17]:
# Lowest-level variant: no tool calling, just the JSON schema pasted into a
# plain completion prompt followed by the instructions and the source text.
schema = Invoice.model_json_schema()
prompt = "".join(
    [
        "Here is a JSON schema for an invoice: ",
        json.dumps(schema, indent=2),
        """
  Extract an invoice from the following text.
  Format your output as a JSON object according to the schema above.
  Do not include any other text than the JSON object.
  Omit any markdown formatting. Do not include any preamble or explanation.
""",
        text,
    ]
)

response = llm.complete(prompt)

print(response)

# Validation is strict: the completion text must be a bare JSON object that
# matches the Invoice schema, otherwise this call raises.
invoice = Invoice.model_validate_json(response.text)

pprint(invoice)

{
  "invoice_id": "2023-001",
  "date": "2023-03-28T00:00:00Z",
  "line_items": [
    {
      "item_name": "Total Revenue",
      "price": 456.8
    },
    {
      "item_name": "Total Expenditures",
      "price": 496.9
    },
    {
      "item_name": "Deficit",
      "price": 40.1
    }
  ]
}
Invoice(invoice_id='2023-001', date=datetime.datetime(2023, 3, 28, 0, 0, tzinfo=TzInfo(UTC)), line_items=[LineItem(item_name='Total Revenue', price=456.8), LineItem(item_name='Total Expenditures', price=496.9), LineItem(item_name='Deficit', price=40.1)])
