# Document Analysis
Compare two versions of a document and summarize the differences between them.

### 1. Install required libraries

In [None]:
%pip install azure-storage-blob azure-ai-formrecognizer azure-identity openai

### 2. Import helper libraries and load credentials from .env file

In [16]:
import os
from openai import AzureOpenAI
import numpy as np
from dotenv import load_dotenv
# Load key/value pairs from a local .env file into the process environment so
# that the os.environ lookups in later cells can find the service endpoints
# and keys.
load_dotenv()

True

### 3. Create DocumentComparisonController class
The `DocumentComparisonController` class compares two PDF documents using the Azure Form Recognizer and Azure OpenAI services. It extracts the text content of each document and then uses OpenAI to analyze and compare the documents based on a user-provided prompt.

In [22]:
from azure.storage.blob import BlobServiceClient, generate_blob_sas, BlobSasPermissions
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from openai import AzureOpenAI
import datetime

class DocumentComparisonController:
    """Compare two PDF documents with Azure Form Recognizer and Azure OpenAI.

    The text content of each document is extracted with the Form Recognizer
    ``prebuilt-layout`` model, then both texts are sent to an Azure OpenAI
    chat deployment together with a user-supplied question.

    NOTE: document URLs are passed to Form Recognizer exactly as given; this
    class does not append a SAS token, so the URLs must already be readable
    by the service.

    Attributes:
        model: Result of the most recent successful comparison (empty until
            the first successful call).
    """

    # The system prompt speaks of a "first" and a "second" document, so
    # exactly two URLs are required.
    _EXPECTED_DOCUMENT_COUNT = 2

    def __init__(self):
        """Read service endpoints, keys and deployment name from the environment.

        Raises:
            KeyError: If any of the required environment variables is unset.
        """
        self.FormRecogEndpoint = os.environ['FORM_RECOGNIZER_ENDPOINT']
        self.FormRecogSubscriptionKey = os.environ['FORM_RECOGNIZER_KEY']
        self.AOAIendpoint = os.environ['OPENAI_API_ENDPOINT']
        self.AOAIsubscriptionKey = os.environ['OPENAI_API_KEY']
        self.AOAIDeploymentName = os.environ['DEPLOYMENT_NAME']
        self.model = {}

    def document_comparison_post(self, document_urls, prompt):
        """Extract text from two documents and ask Azure OpenAI about them.

        Args:
            document_urls: Sequence of exactly two document URLs.
            prompt: The user's question about the two documents.

        Returns:
            A dict with keys ``'PdfUrl1'``, ``'PdfUrl2'`` and ``'Message'``
            (the model's answer). The same dict is stored on ``self.model``.
            A fresh dict is created per call, so results returned by earlier
            calls are not overwritten.

        Raises:
            ValueError: If ``document_urls`` does not contain exactly two
                entries. Raised before any remote service is called.

        Errors from the Form Recognizer or Azure OpenAI SDK calls propagate
        unchanged; ``self.model`` is left untouched when the call fails.
        """
        if len(document_urls) != self._EXPECTED_DOCUMENT_COUNT:
            raise ValueError(
                f"document_urls must contain exactly "
                f"{self._EXPECTED_DOCUMENT_COUNT} URLs, got {len(document_urls)}"
            )

        # Call AI Doc Intelligence service. One client serves both requests.
        doc_client = DocumentAnalysisClient(
            endpoint=self.FormRecogEndpoint,
            credential=AzureKeyCredential(self.FormRecogSubscriptionKey),
        )
        first_text, second_text = (
            doc_client.begin_analyze_document_from_url("prebuilt-layout", url).result().content
            for url in document_urls
        )

        # Use the credentials captured in __init__ rather than re-reading the
        # environment, so the instance behaves consistently over its lifetime.
        aoai_client = AzureOpenAI(
            azure_endpoint=self.AOAIendpoint,
            api_key=self.AOAIsubscriptionKey,
            api_version='2023-05-15',
        )

        messages = [
            {"role": "system", "content": f"You are specialized in analyzing different versions of the same PDF document. The first Document OCR result is: <<<{first_text}>>> and the second Document OCR result is: <<<{second_text}>>>"},
            {"role": "user", "content": f"User question: {prompt}"}
        ]

        # Call OpenAI service
        response = aoai_client.chat.completions.create(
            model=self.AOAIDeploymentName,
            messages=messages,
            max_tokens=1000,
            temperature=0.7,
            frequency_penalty=0,
            presence_penalty=0,
            top_p=0.95
        )

        # Publish the result only after every remote call succeeded, so a
        # failure never leaves a half-updated model behind.
        self.model = {
            'PdfUrl1': document_urls[0],
            'PdfUrl2': document_urls[1],
            'Message': response.choices[0].message.content,
        }
        return self.model

# Example usage: run the comparison on two sample agreement PDFs.
controller = DocumentComparisonController()

document_urls = [
    "https://ukhosampledocs.blob.core.windows.net/docs/20200515 TEMPLATE New Port Agreement (with Tidal Licence).pdf",
    "https://ukhosampledocs.blob.core.windows.net/docs/Cleowent Agreement with Tidal Licence - 03.01.2012.pdf",
]
# Keep the individual URL names available for any later cells.
doc_url1, doc_url2 = document_urls

prompt = "Compare the documents and provide similary and difference as table format"

# Returns a dict with the two URLs and the model's answer under 'Message'.
result = controller.document_comparison_post(document_urls, prompt)
print("result----->", result)

result-----> {'PdfUrl1': 'https://ukhosampledocs.blob.core.windows.net/docs/20200515 TEMPLATE New Port Agreement (with Tidal Licence).pdf', 'PdfUrl2': 'https://ukhosampledocs.blob.core.windows.net/docs/Cleowent Agreement with Tidal Licence - 03.01.2012.pdf', 'Message': "Sure! Here's a comparison table highlighting the similarities and differences between the two documents:\n\n| Similarities | Differences |\n|--------------|-------------|\n| Purpose: Both agreements are entered into for the mutual benefit of the UK Hydrographic Office (UKHO) and the respective port/harbour commissions (CHC and Cleowent Harbour Commission) |\n| Exchange of Materials: Both agreements outline the exchange of hydrographic surveys, data, and related information between the parties |\n| Use of Material: Both agreements grant the UKHO a license to use and reproduce the material supplied by the port/harbour commissions in the UKHO's products |\n| Licensing: Both agreements allow for the sub-licensing of intelle