In [None]:
import sys
sys.path.append('../../../')

import pymupdf
import os
from dotenv import dotenv_values
from openai import AzureOpenAI
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from modules.app_settings import AppSettings
from modules.invoice import InvoiceData

In [None]:
# Configure Tesseract.
# Keep a TESSDATA_PREFIX that is already set in the environment and only fall back to the
# Tesseract 4 data directory this notebook was written against when nothing is configured.
# `tessdata` is handed to PyMuPDF's OCR call in the extraction step.
tessdata = os.environ.setdefault('TESSDATA_PREFIX', '/usr/share/tesseract-ocr/4.00/tessdata')

# Get environment variables from the .env file three directories above this notebook.
working_dir = os.path.abspath('../../../')
settings = AppSettings(dotenv_values(os.path.join(working_dir, '.env')))

# Configure Azure OpenAI.
# Authentication uses an Entra ID bearer token from DefaultAzureCredential rather than an API key.
token_provider = get_bearer_token_provider(DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default")

openai_client = AzureOpenAI(
  azure_endpoint = settings.completions_openai_endpoint,
  azure_ad_token_provider=token_provider,
  api_version="2024-02-01"
)

# Setup the PDF file
pdf_path = '../../Assets/Invoices/'
pdf_file_name = 'Invoice-Complex.pdf'

# Set to True when the PDF pages are images whose text has to be recovered with OCR.
is_scanned = False

# File-name templates for the OCR artifacts; `%s` is filled in by the extraction step.
# They are defined unconditionally so that switching `is_scanned` later cannot leave them undefined.
image_file_name = pdf_file_name + '.image.page_%s.png'
ocr_pdf_file_name = pdf_file_name + '.ocr.page_%s.pdf'

In [None]:
# Extract the text of every page of the PDF into `doc_text`, printing each piece as it is read.
#   - is_scanned == False: the page's embedded text is read directly.
#   - is_scanned == True:  every image referenced by the page is saved as a PNG (debug aid),
#                          OCR-ed into a one-page PDF, and the text of that PDF is collected.
doc_text = ''

# Context managers make sure the source PDF and each intermediate OCR PDF are closed again.
with pymupdf.open(os.path.join(pdf_path, pdf_file_name)) as original_doc:
    for page_index, original_page in enumerate(original_doc):
        if not is_scanned:
            page_text = original_page.get_text()
            doc_text += page_text
            print(page_text)
            continue

        for page_image_idx, page_image in enumerate(original_page.get_images(), start=1):
            page_image_ref = page_image[0]  # image reference (xref) accepted by the Pixmap constructor
            page_image_pixmap = pymupdf.Pixmap(original_doc, page_image_ref)

            # More than three colour components once alpha is excluded (e.g. CMYK): convert to RGB.
            if page_image_pixmap.n - page_image_pixmap.alpha > 3:
                page_image_pixmap = pymupdf.Pixmap(pymupdf.csRGB, page_image_pixmap)

            # The first image of a page keeps the plain page index in its file names; further
            # images on the same page get the image index appended so they do not overwrite it.
            if page_image_idx == 1:
                file_suffix = str(page_index)
            else:
                file_suffix = f"{page_index}_{page_image_idx}"

            # DEBUG: Save the image
            page_image_pixmap.save(image_file_name % file_suffix)

            # Perform OCR on the image
            ocr_file_name = ocr_pdf_file_name % file_suffix
            page_image_pixmap.pdfocr_save(ocr_file_name, tessdata=tessdata)

            with pymupdf.open(ocr_file_name) as ocr_doc:
                for ocr_page in ocr_doc:
                    ocr_text = ocr_page.get_text()
                    doc_text += ocr_text
                    print(ocr_text)

In [None]:
# Ask the chat model to map the extracted invoice text onto the InvoiceData JSON layout.
# NOTE(review): `empty_json_str()` presumably returns an empty JSON template of the invoice
# schema - confirm against modules.invoice.
json_structure = InvoiceData.empty_json_str()

system_prompt = "You are an AI assistant that extracts data from documents and returns them as structured JSON objects. Do not return as a code block."
extraction_prompt = "Extract the data from this invoice. If a value is not present, provide null. Use the following structure: " + json_structure

# One system message, then the instructions and the document text as two separate user messages.
chat_messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": extraction_prompt},
    {"role": "user", "content": doc_text},
]

completion = openai_client.chat.completions.create(
    model=settings.completions_openai_completion_model_deployment,
    messages=chat_messages,
)

# Parse the content of the first returned choice into an InvoiceData object and show it.
raw_reply = completion.choices[0].message.content
invoice = InvoiceData.from_json(raw_reply)
print(invoice.to_json())