In [None]:
%pip install azure-ai-documentintelligence==1.0.0b4

In [None]:
%pip install PyPDF2

In [None]:
%pip install python-dotenv

In [None]:
%pip install azure-storage-blob

In [None]:
import os
import io
from PyPDF2 import PdfReader, PdfWriter
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
from dotenv import load_dotenv
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient

In [None]:
# Load variables from a .env file into the process environment, then read the
# Azure settings used by the rest of the notebook. A variable that is not set
# resolves to None.
load_dotenv()
document_intelligence_endpoint = os.environ.get("DOCUMENT_INTELLIGENCE_ENDPOINT")
document_intelligence_key = os.environ.get("DOCUMENT_INTELLIGENCE_KEY")
storage_connection_string = os.environ.get("STORAGE_CONNECTION_STRING")
storage_container_name = os.environ.get("STORAGE_CONTAINER_NAME")
storage_account_name = os.environ.get("STORAGE_ACCOUNT_NAME")

In [None]:
# Storage client built from STORAGE_CONNECTION_STRING; the upload cell uses it
# to obtain one blob client per split page.
blob_service_client = BlobServiceClient.from_connection_string(storage_connection_string)

In [None]:
# Document Intelligence client, authenticated with the endpoint and API key
# read from the environment. Used by document_intelligence() to analyse pages.
document_intelligence_client = DocumentIntelligenceClient(
    endpoint=document_intelligence_endpoint, credential=AzureKeyCredential(document_intelligence_key)
    )

In [None]:
from PyPDF2 import PdfReader, PdfWriter
import os

# Input PDF file. Set the INPUT_PDF_PATH environment variable (for example in
# the .env file) to point at your own document; the original machine-specific
# location is kept only as a fallback so existing setups keep working.
input_pdf_path = os.getenv("INPUT_PDF_PATH") or (
    r"C:\Users\HP VICTUS\Downloads\essencif_AI\doc_intelli_flow\finalised_invoice_dataset_disorganised.pdf"
)
output_folder = "output_pages"  # Folder to store split pages

# Create output directory if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Open the input PDF
reader = PdfReader(input_pdf_path)

# Save every page as its own single-page PDF named "<1-based page number>.pdf";
# process_pdf() later builds blob URLs from exactly these names.
for page_number, page in enumerate(reader.pages, start=1):
    writer = PdfWriter()
    writer.add_page(page)

    output_pdf_path = os.path.join(output_folder, f"{page_number}.pdf")
    with open(output_pdf_path, "wb") as output_pdf:
        writer.write(output_pdf)

    print(f"Saved: {output_pdf_path}")

print("PDF split completed!")


In [None]:
# Upload every split page to the storage container, using the file name as the
# blob name. Sorted so the upload order is deterministic across runs.
for file in sorted(os.listdir(output_folder)):
    upload_file_path = os.path.join(output_folder, file)

    # Only page PDFs belong in the container; skip sub-directories and stray
    # files (e.g. OS metadata) that would otherwise fail or pollute the container.
    if not (file.lower().endswith(".pdf") and os.path.isfile(upload_file_path)):
        continue

    blob_client = blob_service_client.get_blob_client(container=storage_container_name, blob=file)
    print("\nUploading to Azure Storage as blob:\n\t" + file)

    # overwrite=True keeps the cell re-runnable: without it a second run fails
    # with ResourceExistsError because the blob names are the same every time.
    with open(file=upload_file_path, mode="rb") as data:
        blob_client.upload_blob(data, overwrite=True)

In [None]:
# Shared state for the invoice currently being assembled: maps a 1-based page
# number to the label document_intelligence() assigned to that page
# ("InvoiceId", "SubTotal", "AmountDue" or "child page").
# process_pdf() clears it at the start and after each completed invoice.
hashMap={}

In [None]:
def document_intelligence(blob_url, page_number):
    """Classify one single-page PDF with the prebuilt invoice model.

    The page is labelled with the first of "InvoiceId", "SubTotal", "AmountDue"
    that the service extracted with a truthy value, or "child page" when none
    of them is present. The label is stored in the module-level ``hashMap``
    under ``page_number``.

    :param blob_url: URL of the single-page PDF blob to analyse.
    :param page_number: 1-based page number used as the ``hashMap`` key.
    :return: The label recorded for the page, with one exception: when the page
        carries an "InvoiceId" and a "SubTotal"/"AmountDue" page is already in
        ``hashMap``, "SubTotal" is returned so the caller treats the invoice as
        complete. Returns ``None`` if the analysis raised an exception.
    """
    try:
        poller = document_intelligence_client.begin_analyze_document(
            "prebuilt-invoice", AnalyzeDocumentRequest(url_source=blob_url)
        )
        invoices = poller.result()

        fields_to_extract = ["InvoiceId", "SubTotal", "AmountDue"]
        # `documents` and `fields` can be None when nothing was recognised on
        # the page. Treat that as "no fields" so the page is still recorded as
        # a child page instead of being dropped through the except branch.
        for invoice in invoices.documents or []:
            fields = invoice.fields or {}
            for field in fields_to_extract:
                if not fields.get(field):
                    continue

                hashMap[page_number] = field
                print(field)

                # An invoice-id page arriving after its totals page closes the
                # invoice, so report it as if the totals page had just arrived.
                totals_seen = "SubTotal" in hashMap.values() or "AmountDue" in hashMap.values()
                if field == "InvoiceId" and totals_seen:
                    return "SubTotal"
                return field

        # If no relevant fields are found, mark as "child page"
        hashMap[page_number] = "child page"
        print("child page")
        return "child page"

    except Exception as e:
        # Best effort: report the failure and let the caller move on to the next page.
        print(f"Error processing page {page_number}: {e}")
        return None

In [None]:
def create_pdf_from_pages(input_pdf: str, output_pdf: str, pages: list):
    """
    Creates a new PDF containing only the specified pages from the input PDF.

    Pages are written in the order given. Out-of-range page numbers are
    reported and skipped; if no requested page is valid, nothing is written.
    Errors are printed rather than raised.

    :param input_pdf: Path to the original multi-page PDF.
    :param output_pdf: File name of the new PDF; it is saved inside the "output_files" directory.
    :param pages: List of page numbers (1-based index) to include in the new PDF.
    """
    if not pages:
        print("No pages specified for creating the PDF.")
        return

    try:
        reader = PdfReader(input_pdf)
        writer = PdfWriter()
        total_pages = len(reader.pages)

        added = 0
        for page_num in pages:
            if 1 <= page_num <= total_pages:
                writer.add_page(reader.pages[page_num - 1])  # Convert 1-based to 0-based index
                added += 1
            else:
                print(f"Invalid page number: {page_num}")

        # Do not leave an empty, misleading PDF behind when every page was invalid.
        if added == 0:
            print("No valid pages to write; skipping PDF creation.")
            return

        # Ensure the output directory exists
        output_dir = "output_files"
        os.makedirs(output_dir, exist_ok=True)

        output_path = os.path.join(output_dir, output_pdf)
        with open(output_path, "wb") as out_file:
            writer.write(out_file)

        # Report the location that was actually written, not just the file name.
        print(f"New PDF created: {output_path}")

    except Exception as e:
        print(f"Error creating PDF: {e}")

In [None]:
def process_pdf(pdf_path: str):
    """Group the pages of a multi-page PDF into per-invoice PDFs.

    Each page is analysed through document_intelligence() using the blob URL
    of its pre-uploaded single-page PDF ("<page number>.pdf"). Page labels
    accumulate in the module-level ``hashMap``. As soon as a totals signal
    ("SubTotal"/"AmountDue") is returned while an "InvoiceId" page is present,
    the accumulated pages are written out as "output_<invoice page>.pdf" via
    create_pdf_from_pages() and ``hashMap`` is cleared for the next invoice.

    Errors are printed rather than raised.

    :param pdf_path: Path to the original multi-page PDF.
    """
    try:
        pdf_reader = PdfReader(pdf_path)
        hashMap.clear()  # Clear the hashMap before processing
        invoiceID = None
        pageList = []

        for page_num in range(len(pdf_reader.pages)):
            page_number = page_num + 1  # Pages and blob names are 1-based

            # Generate the blob URL for the current page
            blob_url = f"https://{storage_account_name}.blob.core.windows.net/{storage_container_name}/{page_number}.pdf"

            # Call Azure Document Intelligence for this page
            result = document_intelligence(blob_url, page_number)

            # An invoice is complete only once both a totals signal and an
            # invoice-id page have been seen; otherwise keep accumulating.
            if result not in {"SubTotal", "AmountDue"} or "InvoiceId" not in hashMap.values():
                continue

            pageList = list(hashMap.keys())  # Collect all pages in hashMap

            # The first page labelled "InvoiceId" names the output file.
            for page, field in hashMap.items():
                print("field: {}, page: {}".format(field, page))
                if field == "InvoiceId":
                    invoiceID = page
                    break

            # If the invoice-id page did not come first, the pages arrived in
            # reverse order; flip them so the invoice-id page leads.
            if invoiceID > pageList[0]:
                pageList.sort(reverse=True)
            print("pageList:", pageList)
            create_pdf_from_pages(pdf_path, f"output_{invoiceID}.pdf", pageList)
            print("hashMap:", hashMap)
            hashMap.clear()  # Clear the hashMap for the next invoice

        if not pageList:
            print("No valid invoice data found in the PDF.")

        # Pages still pending here never completed an invoice; report them
        # instead of dropping them silently.
        if hashMap:
            print(f"Warning: pages {sorted(hashMap)} were not assigned to any invoice.")

    except Exception as e:
        print(f"An error occurred while processing the PDF: {e}")

    print("\nProcessing completed!")

In [None]:
# Run the invoice grouping on the source PDF; its pages must already have been
# split and uploaded, because process_pdf() reads them via their blob URLs.
process_pdf(input_pdf_path)
