# Essencif.AI Document Intelligence Solution

#### SDK and libraries installation

In [None]:
%pip install azure-ai-documentintelligence==1.0.0

In [None]:
%pip install PyPDF2

In [None]:
%pip install python-dotenv

In [None]:
%pip install azure-storage-blob

#### Importing Utilities and Libraries

In [None]:
import os
import io
from PyPDF2 import PdfReader, PdfWriter
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
from dotenv import load_dotenv
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient

#### Loading the Environment Variables

In [None]:
# Load settings from the .env file, then read every value the notebook needs.
# A variable that is not set comes back as None.
load_dotenv()
(
    document_intelligence_endpoint,
    document_intelligence_key,
    storage_connection_string,
    storage_container_name,
    storage_account_name,
) = (
    os.getenv(setting_name)
    for setting_name in (
        "DOCUMENT_INTELLIGENCE_ENDPOINT",
        "DOCUMENT_INTELLIGENCE_KEY",
        "STORAGE_CONNECTION_STRING",
        "STORAGE_CONTAINER_NAME",
        "STORAGE_ACCOUNT_NAME",
    )
)

#### Creating a storage account client to upload blobs (PDFs) to container

In [None]:
# Client used to interact with the Azure Storage Account, built from the connection string
blob_service_client = BlobServiceClient.from_connection_string(
    storage_connection_string
)

#### Creating a Document Intelligence Client

In [None]:
# Client for the Document Intelligence resource's analyze API, authenticated with the API key
document_intelligence_client = DocumentIntelligenceClient(
    credential=AzureKeyCredential(document_intelligence_key),
    endpoint=document_intelligence_endpoint,
)

### Splitting the Parent Combined PDF page-by-page

In [None]:
from PyPDF2 import PdfReader, PdfWriter
import os

# Combined source PDF and the folder that receives one PDF per page
input_pdf_path = "./Invoices Upwork.pdf"  # Change this to your PDF file
output_folder = "output_pages"

# Make sure the destination folder is there before writing into it
os.makedirs(output_folder, exist_ok=True)

reader = PdfReader(input_pdf_path)

# Write every page to "<1-based page number>.pdf" inside output_folder
for page_no, page in enumerate(reader.pages, start=1):
    writer = PdfWriter()
    writer.add_page(page)

    output_pdf_path = os.path.join(output_folder, f"{page_no}.pdf")
    with open(output_pdf_path, "wb") as output_pdf:
        writer.write(output_pdf)

    print(f"Saved: {output_pdf_path}")

print("PDF split completed!")


#### Uploading the single-page PDF files to the Azure Storage Account

In [None]:
# Upload every split page to the container; the file name doubles as the blob name,
# which is what process_pdf() relies on when it builds "<page number>.pdf" blob URLs.
for file in os.listdir(output_folder):
    print(file)
    blob_client = blob_service_client.get_blob_client(container=storage_container_name, blob=file)
    print("\nUploading to Azure Storage as blob:\n\t" + file)

    # Upload the created file
    upload_file_path = os.path.join(output_folder, file)
    with open(file=upload_file_path, mode="rb") as data:
        # overwrite=True: without it a re-run fails as soon as a blob with this name
        # already exists, leaving the previous run's pages in the container.
        blob_client.upload_blob(data, overwrite=True)

#### Creating a hashMap that records, for each page number, the label assigned to that page while pages are grouped into invoices

In [None]:
# Maps a page number to the label document_intelligence() assigned to that page
# ("InvoiceId", "InvoiceTotal" or "child page"). process_pdf() clears it whenever
# it starts collecting pages for a new invoice.
hashMap={}

#### Creating a function to compare Vendor Names across pages

In [None]:
def is_vendor_name_same(vendor_name_1: str, vendor_name_2: str) -> bool:
    """
    Report whether two VendorName strings have at least one word in common.

    Each name is lower-cased and split on whitespace (newlines included); the two
    resulting word sets are then compared. A single shared word counts as a match,
    so the names do not have to be identical.

    Args:
        vendor_name_1 (str): The first VendorName string.
        vendor_name_2 (str): The second VendorName string.

    Returns:
        bool: True if the names share at least one word, False otherwise
        (including when either name contains no words).
    """
    def word_set(name: str) -> set:
        # str.split() with no argument already treats newlines and
        # leading/trailing blanks as separators.
        return set(name.lower().split())

    first_words = word_set(vendor_name_1)
    second_words = word_set(vendor_name_2)
    return not first_words.isdisjoint(second_words)

#### Creating a function to detect a change of Invoice ID across pages

In [None]:
import re

def is_invoice_id_changed(invoice_id_1: str, invoice_id_2: str) -> bool:
    """
    Decide whether two invoice IDs refer to different invoices.

    The IDs are compared by their digits only, so differences in prefixes,
    separators or letter case (e.g. "INV-001" vs "inv 001") are ignored.
    When neither ID contains a digit there is nothing numeric to compare, so
    the IDs are compared as text instead (ignoring case and surrounding
    whitespace); otherwise two digit-free IDs would always look identical.

    Args:
        invoice_id_1 (str): The first invoice ID.
        invoice_id_2 (str): The second invoice ID.

    Returns:
        bool: True if the IDs differ, False otherwise.
    """
    # Keep only the digits of each ID
    numeric_part_1 = re.sub(r'\D', '', invoice_id_1)
    numeric_part_2 = re.sub(r'\D', '', invoice_id_2)

    if not numeric_part_1 and not numeric_part_2:
        # Both numeric parts are empty: "" != "" would report "unchanged" for
        # any pair of purely alphabetic IDs, so fall back to the text itself.
        return invoice_id_1.strip().casefold() != invoice_id_2.strip().casefold()

    return numeric_part_1 != numeric_part_2

#### Creating Function to analyse a PDF file using Azure Doc Intelligence's Analyze API

In [None]:
def document_intelligence(blob_url, page_number):
    """Analyze one single-page PDF with the "prebuilt-invoice" model and classify the page.

    The VendorName and InvoiceId found on the page are compared with the values
    remembered in the module-level ``current_vendor_name`` and
    ``current_invoice_id`` (both must already be defined, e.g. by
    ``process_pdf``). When one of them changed, the remembered value is updated
    and the change is reported *without* recording the page in ``hashMap``.
    Otherwise the page is labelled in ``hashMap`` and the label is returned.

    Args:
        blob_url: URL of the single-page PDF, passed to the service as ``url_source``.
        page_number: Page number used as the ``hashMap`` key and in the log output.

    Returns:
        "VendorNameChanged" if the page's vendor shares no word with the remembered vendor,
        "InvoiceIdChanged" if the page's invoice ID differs from the remembered one,
        "InvoiceId" if a CustomerName field was found on the page,
        "InvoiceTotal" if a TotalTax field, or an InvoiceTotal field with
        confidence >= 0.5, was found,
        "child page" if none of the above applies,
        or None if any exception occurred (the error is printed, not raised).
    """
    global current_vendor_name  # VendorName remembered across pages
    global current_invoice_id  # InvoiceId remembered across pages

    try:
        poller = document_intelligence_client.begin_analyze_document(
            "prebuilt-invoice", AnalyzeDocumentRequest(url_source=blob_url)
        )
        invoices = poller.result()

        # Our principal fields of interest
        fields_to_extract = (
            "InvoiceId",
            "VendorName",
            "VendorAddressRecipient",
            "InvoiceTotal",
            "CustomerName",
            "InvoiceDate",
            "TotalTax",
        )

        found_invoice_total = False
        found_customer_name = False
        current_page_vendor_name = None
        current_page_invoice_id = None

        # A page without any detected document (or a document without fields) is
        # not an error: it simply contributes no fields and ends up as "child page".
        for invoice in invoices.documents or []:
            fields = invoice.fields or {}
            print(f"Page {page_number} fields: {fields.keys()}")

            for field in fields_to_extract:
                field_value = fields.get(field)
                if not field_value:
                    continue

                if field == "InvoiceId":
                    current_page_invoice_id = field_value.value_string
                    print("page number:{}, field: {},  field_value: {}".format(page_number, field, field_value))
                    continue

                if field == "InvoiceTotal":
                    # Only trust the total when the service is reasonably confident.
                    # A missing confidence is treated as "not confident" instead of
                    # raising a TypeError that would discard the whole page.
                    confidence = field_value.confidence
                    if confidence is None or confidence < 0.5:
                        continue
                    found_invoice_total = True
                elif field == "TotalTax":
                    found_invoice_total = True
                elif field == "CustomerName":
                    found_customer_name = True
                elif field == "VendorName":
                    current_page_vendor_name = field_value.value_string
                # VendorAddressRecipient and InvoiceDate are only logged.

                print("page number:{}, field: {}, field_value: {} ".format(page_number, field, field_value))

        if current_page_vendor_name:
            if current_vendor_name and not is_vendor_name_same(current_page_vendor_name, current_vendor_name):
                print(f"Vendor name changed from {current_vendor_name} to {current_page_vendor_name} on page {page_number}.")
                # Remember the new vendor so that re-processing this page no
                # longer reports a change.
                current_vendor_name = current_page_vendor_name
                return "VendorNameChanged"

            # First vendor seen, or same vendor: remember the latest spelling
            current_vendor_name = current_page_vendor_name

        if current_page_invoice_id:
            if current_invoice_id and is_invoice_id_changed(current_page_invoice_id, current_invoice_id):
                print(f"InvoiceId changed from {current_invoice_id} to {current_page_invoice_id} on page {page_number}.")
                current_invoice_id = current_page_invoice_id
                return "InvoiceIdChanged"

            # First invoice ID seen, or same ID: remember the latest value
            current_invoice_id = current_page_invoice_id

        # NOTE(review): the "InvoiceId" label is driven by the presence of
        # CustomerName, not of the InvoiceId field itself; process_pdf() depends
        # on this label, so the behaviour is kept as is - confirm it is intended.
        if found_customer_name:
            hashMap[page_number] = "InvoiceId"
            print("InvoiceId")
            return "InvoiceId"

        if found_invoice_total:
            hashMap[page_number] = "InvoiceTotal"
            print("InvoiceTotal")
            return "InvoiceTotal"

        # No CustomerName and no usable total: mark as "child page"
        hashMap[page_number] = "child page"
        print("page number : {} child page".format(page_number))
        return "child page"

    except Exception as e:
        print(f"Error processing page {page_number}: {e}")
        return None

#### Function to copy selected pages of the combined PDF into a single PDF file - for creating a consolidated invoice

In [None]:
def create_pdf_from_pages(input_pdf: str, output_pdf: str, pages: list):
    """
    Write the selected pages of ``input_pdf`` to a new PDF inside ``output_files``.

    Page numbers outside the document are reported and skipped. Any error raised
    while reading or writing is printed rather than propagated.

    :param input_pdf: Path to the original multi-page PDF.
    :param output_pdf: File name of the new PDF (created inside ``output_files``).
    :param pages: List of page numbers (1-based index) to include in the new PDF.
    """
    if not pages:
        print("No pages specified for creating the PDF.")
        return

    try:
        source = PdfReader(input_pdf)
        selection = PdfWriter()

        for page_num in pages:
            if not 1 <= page_num <= len(source.pages):
                print(f"Invalid page number: {page_num}")
                continue
            # Page numbers are 1-based, the reader's page list is 0-based
            selection.add_page(source.pages[page_num - 1])

        # The destination folder is created on demand
        output_dir = "output_files"
        os.makedirs(output_dir, exist_ok=True)
        destination = os.path.join(output_dir, output_pdf)

        with open(destination, "wb") as out_file:
            selection.write(out_file)

        print(f"New PDF created: {output_pdf}")

    except Exception as e:
        print(f"Error creating PDF: {e}")

#### Reads a multi-page PDF and sends to Document Intelligence for Analysis

In [None]:
def process_pdf(pdf_path: str):
    """Group the pages of a multi-page PDF into invoices and write one PDF per invoice.

    Every page is analyzed through ``document_intelligence`` using the blob
    ``<page number>.pdf`` in the configured storage container, so the single-page
    PDFs must already have been uploaded. Pages are collected until either

    * the vendor name or the invoice ID changes (the pages collected so far are
      written out and the current page starts the next invoice), or
    * a page labelled "InvoiceTotal" is reached while a page labelled
      "InvoiceId" is already recorded in ``hashMap`` (the collected pages plus
      the current one are written out).

    Pages left over at the end are written to a final PDF. Output files are
    named ``output_<n>.pdf`` and written by ``create_pdf_from_pages``. Errors
    are printed, not raised.

    Args:
        pdf_path: Path to the combined multi-page PDF.
    """
    global current_vendor_name
    global current_invoice_id

    try:
        pdf_reader = PdfReader(pdf_path)
        hashMap.clear()  # Start from a clean page -> label mapping
        pageList = []  # 1-based page numbers of the invoice currently being collected
        count = 1  # Sequence number used to name the next output file

        # Reset the values tracked across pages by document_intelligence()
        current_vendor_name = None
        current_invoice_id = None

        for page_number in range(1, len(pdf_reader.pages) + 1):
            # Blob URL of the single-page PDF for this page
            blob_url = f"https://{storage_account_name}.blob.core.windows.net/{storage_container_name}/{page_number}.pdf"

            result = document_intelligence(blob_url, page_number)

            if result == "VendorNameChanged":
                # Finalize the current invoice (exclude the current page)
                if pageList:
                    create_pdf_from_pages(pdf_path, f"output_{count}.pdf", pageList)
                    print(f"VendorName changed: New PDF created: output_{count}.pdf with pages: {pageList}")
                    count += 1
                    pageList = []

                hashMap.clear()

                # The change was reported before the page was labelled, so the
                # page is analyzed again to obtain its label for the new invoice.
                result = document_intelligence(blob_url, page_number)

            # document_intelligence() records "InvoiceTotal" for this page in
            # hashMap before returning it, so only "InvoiceId" needs checking.
            if result == "InvoiceTotal" and "InvoiceId" in hashMap.values():
                # pageList already holds every earlier page of this invoice,
                # including pages whose analysis failed and which therefore
                # never made it into hashMap; add the current page and finish.
                pageList.append(page_number)

                create_pdf_from_pages(pdf_path, f"output_{count}.pdf", pageList)
                print(f"pageList: {pageList}")
                print(f"New PDF created: output_{count}.pdf")
                print(f"hashMap: {hashMap}")
                count += 1

                # Start the next invoice from scratch
                hashMap.clear()
                pageList = []
                continue

            if result == "InvoiceIdChanged":
                # Finalize the current invoice (exclude the current page)
                if pageList:
                    create_pdf_from_pages(pdf_path, f"output_{count}.pdf", pageList)
                    print(f"Invoice ID changed: New PDF created: output_{count}.pdf with pages: {pageList}")
                    count += 1
                    pageList = []

                hashMap.clear()

                # Analyze the page again so that it gets labelled in hashMap
                result = document_intelligence(blob_url, page_number)

            # The page belongs to the invoice currently being collected
            pageList.append(page_number)

        if pageList:
            # Finalize the last invoice from the pages that are left
            create_pdf_from_pages(pdf_path, f"output_{count}.pdf", pageList)
            print(f"Remaining pages consolidated: New PDF created: output_{count}.pdf with pages: {pageList}")
        elif count == 1:
            # Nothing left over AND nothing written earlier
            print("No valid invoice data found in the PDF.")

    except Exception as e:
        print(f"An error occurred while processing the PDF: {e}")

    print("\nProcessing completed!")

In [None]:
# Group the pages of the combined PDF into per-invoice PDFs
process_pdf(input_pdf_path)


#### Uploading consolidated PDFs to Azure Storage Account

In [None]:
# Upload every consolidated invoice PDF to the container; the file name doubles
# as the blob name.
for file in os.listdir("output_files"):
    print(file)
    blob_client = blob_service_client.get_blob_client(container=storage_container_name, blob=file)
    print("\nUploading to Azure Storage as blob:\n\t" + file)

    # Upload the created file
    upload_file_path = os.path.join("output_files", file)
    with open(file=upload_file_path, mode="rb") as data:
        # overwrite=True: output names (output_1.pdf, ...) repeat on every run,
        # so without it a re-run fails on the first blob that already exists.
        blob_client.upload_blob(data, overwrite=True)

#### Function to call the Prompt Flow endpoint with the consolidated invoice as the input

In [None]:
import os
import urllib.request
import json
import shutil
import zipfile

def call_prompt_flow_endpoint(url, file_path):
    """Send a document URL to the Prompt Flow endpoint and store the results locally.

    The endpoint and key are read from the ``PROMPT_FLOW_ENDPOINT`` and
    ``PROMPT_FLOW_API_KEY`` environment variables. On success the following is
    written under ``final_output``:

    * ``<stem>/<stem>.md``   - ``output.markdown_text`` from the response
    * ``<stem>/<stem>.json`` - ``output.json_struct`` from the response
    * ``<stem>/<file name>`` - a copy of the local file at ``file_path``
    * ``<stem>.zip``         - an archive of the ``<stem>`` folder

    where ``<stem>`` is the file name of ``file_path`` without its extension.

    Args:
        url: URL of the document, sent to the endpoint as ``{"url": url}``.
        file_path: Local path of the same document; used for naming and copied
            next to the results.

    Raises:
        Exception: If the API key or the endpoint URL is not configured.
        KeyError: If the response lacks ``output.markdown_text`` or
            ``output.json_struct``.

    HTTP error responses are printed (status, headers, body) and not raised.
    """
    # Check the configuration first, so that a misconfigured environment fails
    # before any folder is created or any request is sent.
    url_endpoint = os.getenv("PROMPT_FLOW_ENDPOINT")
    api_key = os.getenv("PROMPT_FLOW_API_KEY")
    if not api_key:
        raise Exception("A key should be provided to invoke the endpoint")
    if not url_endpoint:
        raise Exception("An endpoint URL should be provided to invoke the endpoint")

    # Results go to final_output/<file name without extension>/
    parent_folder = "final_output"
    file_name = os.path.basename(file_path)
    file_stem, _ = os.path.splitext(file_name)
    sub_folder = os.path.join(parent_folder, file_stem)
    os.makedirs(sub_folder, exist_ok=True)  # also creates parent_folder

    # Request data
    body = json.dumps({"url": url}).encode("utf-8")
    headers = {
        'Content-Type': 'application/json',
        'Accept': 'application/json',
        'Authorization': 'Bearer ' + api_key
    }
    req = urllib.request.Request(url_endpoint, body, headers)

    try:
        # The context manager closes the HTTP response once it has been read
        with urllib.request.urlopen(req) as response:
            result = response.read()
    except urllib.error.HTTPError as error:
        print("The request failed with status code: " + str(error.code))
        print(error.info())
        print(error.read().decode("utf8", 'ignore'))
        return

    print(result)

    # Decode and parse JSON
    response_json = json.loads(result.decode("utf-8"))

    # Extract markdown text and JSON structure
    markdown_text = response_json["output"]["markdown_text"]
    json_struct = response_json["output"]["json_struct"]

    # Save Markdown file
    md_file_path = os.path.join(sub_folder, f"{file_stem}.md")
    with open(md_file_path, "w", encoding="utf-8") as md_file:
        md_file.write(markdown_text)

    # Save JSON file
    json_file_path = os.path.join(sub_folder, f"{file_stem}.json")
    with open(json_file_path, "w", encoding="utf-8") as json_file:
        json.dump(json_struct, json_file, indent=4)

    # Copy the original PDF file to the subfolder
    pdf_destination_path = os.path.join(sub_folder, file_name)
    shutil.copy(file_path, pdf_destination_path)

    # Create a ZIP archive of the subfolder. make_archive() appends ".zip" to
    # the base name itself, so the base is built directly instead of stripping
    # ".zip" from the final path (which would also hit a ".zip" elsewhere in it).
    zip_file_path = os.path.join(parent_folder, f"{file_stem}.zip")
    shutil.make_archive(os.path.join(parent_folder, file_stem), 'zip', sub_folder)

    print(f"Files saved in: {sub_folder}")
    print(f"- {md_file_path}")
    print(f"- {json_file_path}")
    print(f"- {pdf_destination_path}")
    print(f"Zipped Folder: {zip_file_path}")


In [None]:
# Send each consolidated invoice to the Prompt Flow endpoint: the endpoint gets
# the blob URL, while the local copy is used for naming and packaging the results.
for file in os.listdir("output_files"):
    blob_url = f"https://{storage_account_name}.blob.core.windows.net/{storage_container_name}/{file}"
    print(blob_url)
    call_prompt_flow_endpoint(url=blob_url, file_path="output_files/"+file)
    
    
    