In [None]:
import sys
sys.path.append('../../../')

import pypdfium2
from marker.convert import convert_single_pdf
from marker.models import load_all_models
from marker.output import save_markdown
import os
from dotenv import dotenv_values
from openai import AzureOpenAI
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from modules.app_settings import AppSettings
from modules.invoice import InvoiceData, InvoiceDataProduct
from modules.medical import MedicalRecord, MedicalRecordPatient, MedicalRecordReferral

In [None]:
# Read the .env file located three directory levels above the notebook and
# wrap its key/value pairs in the project's AppSettings object.
working_dir = os.path.abspath('../../../')
env_values = dotenv_values(f"{working_dir}/.env")
settings = AppSettings(env_values)

In [None]:
# Load Models
# load_all_models() is imported from marker.models; its result is handed to
# convert_single_pdf() as the second argument when the PDF is converted.
model_lst = load_all_models()

In [None]:
# Configure Azure OpenAI
# Authentication is token based: a DefaultAzureCredential is wrapped in a bearer
# token provider for the Cognitive Services scope, so no API key is passed here.
credential = DefaultAzureCredential()
token_provider = get_bearer_token_provider(
    credential,
    "https://cognitiveservices.azure.com/.default",
)

# The endpoint comes from the loaded settings; the API version is pinned.
openai_client = AzureOpenAI(
    api_version="2024-02-01",
    azure_endpoint=settings.completions_openai_endpoint,
    azure_ad_token_provider=token_provider,
)

In [None]:
# Setup the PDF file
pdf_path = '../../Assets/Invoices/'
pdf_file_name = 'Invoice-Complex-Scanned.pdf'

# Expected line items, one row per product:
# (id, description, unit_price, quantity, total)
expected_product_rows = [
    ('MA197', 'STRETCHWRAP ROLL', 16.62, 5, 83.10),
    ('ST4086', 'BALLPOINT PEN MED.', 2.49, 10, 24.90),
    ('JF9912413BF', 'BUBBLE FILM ROLL CL.', 15.46, 12, 185.52),
]

expected_products = [
    InvoiceDataProduct(
        id=product_id,
        description=description,
        unit_price=unit_price,
        quantity=quantity,
        total=total,
    )
    for product_id, description, unit_price, quantity, total in expected_product_rows
]

# Expected invoice values for this PDF; the extracted invoice is compared
# against this object later in the notebook.
expected = InvoiceData(
    invoice_number='3847193',
    purchase_order_number='15931',
    customer_name='Sharp Consulting',
    customer_address='73 Regal Way, Leeds, LS1 5AB, UK',
    delivery_date='2024-05-16',
    payable_by='2024-05-24',
    products=expected_products,
    returns=[],
    total_product_quantity=27,
    total_product_price=293.52,
)

In [None]:
# Build the input path with os.path.join rather than string concatenation so it
# stays valid even if pdf_path is edited to drop its trailing separator.
fname = os.path.join(pdf_path, pdf_file_name)

# Fail fast with a clear, absolute path in the message instead of relying on
# whatever error the converter raises for a missing file.
if not os.path.isfile(fname):
    raise FileNotFoundError(f"PDF not found: {os.path.abspath(fname)}")

# Convert the PDF with the previously loaded models. The call returns the
# extracted text, the images and the metadata that save_markdown() writes out.
full_text, images, out_meta = convert_single_pdf(fname, model_lst, langs=["English"], batch_multiplier=2, start_page=None)

# Write the conversion result under the current directory; save_markdown()
# returns the folder it wrote to, which later cells reuse for the accuracy report.
subfolder_path = save_markdown('./', pdf_file_name, full_text, images, out_meta)

print(f"Saved markdown to the {subfolder_path} folder")


In [None]:
# JSON structure the model is told to follow (appended to the extraction prompt).
json_structure = InvoiceData.empty_json_str()

system_prompt = "You are an AI assistant that extracts data from documents and returns them as structured JSON objects. Do not return as a code block."
extraction_prompt = "Extract the data from this invoice. If a value is not present, provide null. Use the following structure: " + json_structure

# One system message, then two user messages: the instruction with the target
# structure, followed by the converted document text.
chat_messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": extraction_prompt},
    {"role": "user", "content": full_text},
]

completion = openai_client.chat.completions.create(
    messages=chat_messages,
    model=settings.completions_openai_completion_model_deployment,
)

In [None]:
# Take the text of the first choice returned by the completion call.
response_json_str = completion.choices[0].message.content

# The message content can be None; stop with a clear error rather than letting
# the JSON parser or the file write fail on a non-string value.
if response_json_str is None:
    raise ValueError("The completion returned no message content; cannot parse an invoice from it.")

invoice = InvoiceData.from_json_str(response_json_str)

# Compare the extracted invoice against the expected values; the 'overall'
# entry of the result is scaled by 100 to report it as a percentage.
invoice_json = invoice.to_json()
accuracy = expected.compare_accuracy(invoice)
overall_accuracy_percent = accuracy['overall'] * 100

print(f"Response: {invoice_json}")
print(f"Overall Accuracy: {overall_accuracy_percent:.2f}%")

# Save accuracy result to a file
accuracy_file_path = f"{subfolder_path}/Accuracy.txt"
report = "\n".join([
    "Response:",
    response_json_str,
    "",
    f"Overall Accuracy: {overall_accuracy_percent:.2f}%",
    "",
    "Accuracy Detail:",
    str(accuracy),
])

# Write as UTF-8 explicitly: the raw model response may contain non-ASCII
# characters that the platform default encoding cannot represent.
with open(accuracy_file_path, "w", encoding="utf-8") as f:
    f.write(report)

print(f"Accuracy result saved to {accuracy_file_path}")