In [None]:
import sys
sys.path.append('../../../')

import pypdfium2
from marker.convert import convert_single_pdf
from marker.models import load_all_models
from marker.output import save_markdown
import os
from dotenv import dotenv_values
from openai import AzureOpenAI
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from modules.app_settings import AppSettings
from modules.invoice import InvoiceData

In [None]:
# Read the key/value pairs from the .env file that lives three directory
# levels above the current working directory and wrap them in AppSettings.
working_dir = os.path.abspath('../../../')
env_values = dotenv_values(f"{working_dir}/.env")
settings = AppSettings(env_values)

In [None]:
# Load Models
# Load the marker models up front; the result (`model_lst`) is handed to
# convert_single_pdf in the conversion cell further down.
model_lst = load_all_models()

In [None]:
# Build the Azure OpenAI client. Authentication goes through an Azure AD
# bearer-token provider (no API key is passed); the endpoint comes from the
# application settings loaded earlier.
credential = DefaultAzureCredential()
token_provider = get_bearer_token_provider(
    credential,
    "https://cognitiveservices.azure.com/.default",
)

openai_client = AzureOpenAI(
    api_version="2024-02-01",
    azure_ad_token_provider=token_provider,
    azure_endpoint=settings.completions_openai_endpoint,
)

In [None]:
# Setup the PDF file
# pdf_path is a relative directory (resolved against the current working
# directory) and pdf_file_name is the invoice PDF inside it. The two are
# combined into the input path in the conversion cell, and pdf_file_name is
# also passed to save_markdown there.
pdf_path = '../../Assets/Invoices/'
pdf_file_name = 'Invoice-Complex-Scanned.pdf'

In [None]:
# Build the input path with os.path.join so it stays valid whether or not
# pdf_path ends with a path separator.
fname = os.path.join(pdf_path, pdf_file_name)

# Fail early with a clear message: the path is relative, so it depends on the
# directory the notebook kernel is running in.
if not os.path.isfile(fname):
    raise FileNotFoundError(f"PDF not found: {os.path.abspath(fname)}")

# Convert the PDF with marker. The result is unpacked as the document text,
# the images and the conversion metadata; full_text is reused by the
# extraction cell below.
full_text, images, out_meta = convert_single_pdf(
    fname,
    model_lst,
    langs=["English"],
    batch_multiplier=2,
    start_page=None,
)

# Write the conversion output under the current directory and report where
# it was saved.
subfolder_path = save_markdown('./', pdf_file_name, full_text, images, out_meta)

print(f"Saved markdown to the {subfolder_path} folder")


In [None]:
# Ask the chat model to extract the invoice fields from the converted text,
# using the JSON structure produced by InvoiceData.empty_json_str() as the
# target shape, then parse the reply into an InvoiceData object.
json_structure = InvoiceData.empty_json_str()

completion = openai_client.chat.completions.create(
    model=settings.completions_openai_completion_model_deployment,
    messages=[
        {
            "role": "system",
            "content": "You are an AI assistant that extracts data from documents and returns them as structured JSON objects. Do not return as a code block.",
        },
        {
            "role": "user",
            "content": "Extract the data from this invoice. If a value is not present, provide null. Use the following structure: " + json_structure,
        },
        {
            "role": "user",
            "content": full_text,
        }
    ],
)

# message.content is optional in the API response (for example when the reply
# is withheld by a content filter), so verify it before parsing instead of
# letting from_json fail with an unrelated-looking error.
first_choice = completion.choices[0]
response_text = first_choice.message.content
if not response_text:
    raise ValueError(
        "The model returned no content to parse "
        f"(finish_reason={first_choice.finish_reason!r})"
    )

invoice = InvoiceData.from_json(response_text)
print(invoice.to_json())