In [5]:
from langchain_openai import AzureChatOpenAI
from langchain_unstructured import UnstructuredLoader
import os
from dotenv import load_dotenv
# Load variables from a local .env file so the lookups below can see them.
load_dotenv()

# Chat model client; endpoint, key and API version are read from the environment.
llm = AzureChatOpenAI(
    temperature=0,
    azure_deployment="gpt-4o",
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
    openai_api_version=os.environ.get("OPENAI_API_VERSION"),
    openai_api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
)

# Source files handed to the loader.
file_paths = ["../data/Wine-PO.pdf"]

# The loader is configured with partition_via_api=True and the API key
# taken from the environment.
loader = UnstructuredLoader(
    file_paths,
    partition_via_api=True,
    api_key=os.environ.get("UNSTRUCTURED_API_KEY"),
)
docs = loader.load()

INFO: Preparing to split document for partition.
INFO: Concurrency level set to 5
INFO: Splitting pages 1 to 1 (1 total)
INFO: Determined optimal split size of 2 pages.
INFO: Document has too few pages (1) to be split efficiently. Partitioning without split.
INFO: Successfully partitioned the document.


In [11]:
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import List


# Schema for one entry of Order.items (declared as List[Item] on Order).
# No field declares a default value.
# NOTE(review): currency/units of the float amounts are not stated here — confirm.
class Item(BaseModel):
    item_name: str = Field(description="Item Name")
    quantity: int = Field(description="Quantity")  # typed as int, so whole units only
    item_price: float = Field(description="Item Price")
    item_discount: float = Field(description="Item Discount")
    total: float = Field(description="Total Price")

# Target schema for the extraction: it is passed to
# JsonOutputParser(pydantic_object=Order), whose format instructions are
# injected into the prompt. No field declares a default value.
class Order(BaseModel):
    # --- order header ---
    po_number: str = Field(description="PO Number")
    order_status: str = Field(description="Order Status")
    client_id: str = Field(description="Client ID")
    # --- customer party ---
    customer_contact_name: str = Field(description="Customer Contact Name")
    customer_company_name: str = Field(description="Customer Company Name")
    customer_address: str = Field(description="Customer Address")
    # --- supplier party ---
    supplier_contact_name: str = Field(description="Supplier Contact Name")
    supplier_company_name: str = Field(description="Supplier Company Name")
    supplier_address: str = Field(description="Supplier Address")
    delivery_date: str = Field(description="Delivery Date")  # kept as a plain string, not a date type
    # --- monetary amounts (all float) ---
    # NOTE(review): no validator ties order_total to the other amounts — confirm if that is intended.
    order_total: float = Field(description="Order Total")
    subtotal: float = Field(description="Subtotal")
    sales_tax: float = Field(description="Sales Tax")
    shipping_cost: float = Field(description="Shipping Cost")
    other_costs: float = Field(description="Other Costs")
    # --- line items ---
    items: List[Item] = Field(description="Items")

In [12]:
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate

# Parser that turns the model's reply into a dict and supplies JSON format
# instructions derived from the Order schema.
parser = JsonOutputParser(pydantic_object=Order)

# The prompt carries the fixed format instructions plus the document text.
prompt = PromptTemplate(
    template="Extract the information as specified.\n{format_instructions}\n{context}\n",
    input_variables=["context"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

chain = prompt | llm | parser

# `docs` is a list of Document objects. Formatting the list straight into the
# template would insert its Python repr (metadata included) instead of the
# document text, so join the text of each document explicitly.
# NOTE(review): if table rows are still missed, check whether the loader puts
# table markup in doc.metadata (e.g. "text_as_html") and include it — confirm.
response = chain.invoke({
    "context": "\n\n".join(doc.page_content for doc in docs)
})


INFO: HTTP Request: POST https://aoaisec.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-02-01 "HTTP/1.1 200 OK"


In [7]:
# Display the parsed extraction result (bare last expression of the cell).
# NOTE(review): this cell's execution count (7) is lower than the chain cell's (12),
# so the output shown may predate the latest chain run — re-run before trusting it.
response

{'po_number': 'P/O NUMBER',
 'order_status': '',
 'client_id': 'CLIENT ID',
 'customer_contact_name': '',
 'customer_company_name': '',
 'customer_address': '',
 'supplier_contact_name': '',
 'supplier_company_name': '',
 'supplier_address': '',
 'delivery_date': 'DATE',
 'order_total': 0,
 'subtotal': 0,
 'sales_tax': 0,
 'shipping_cost': 0,
 'other_costs': 0,
 'items': []}