In [11]:
import fitz
import json
import time
from requests import get, post
from PIL import Image
import io

In [3]:
# Connection settings for the Form Recognizer resource (fill in before running).
endpoint = "https://**************.cognitiveservices.azure.com"  # service endpoint
apim_key = " "  # key for accessing the service

### Analyze forms using custom models with labels GA version

#### If you have models trained using the labelling tool, you can use the model id for analyzing similar kind of documents.

In [12]:
# Id of the custom model to analyze with, and the analyze URL built from it.
model_id = " "  # paste your model_id here
post_url = "{}/formrecognizer/v2.1/custom/models/{}/analyze".format(endpoint, model_id)

In [47]:
# Name of the test document; its raw bytes are loaded into `data_bytes`.
file = " "  # Test document name
# file = "image1.pdf"
with open(file, "rb") as document_stream:
    data_bytes = document_stream.read()


### For displaying the pdf file

In [1]:
# Render page 0 of the document named by `file` and show it as the cell output.
# NOTE(review): loadPage / getPixmap / getPNGData are camelCase PyMuPDF names;
# newer PyMuPDF releases presumably expect load_page / get_pixmap /
# tobytes("png") instead — confirm against the installed version.
doc = fitz.open(file)
page = doc.loadPage(0)  # index 0, i.e. the first page
pix = page.getPixmap(alpha=False)  # rasterize the page; alpha=False requests no alpha channel
data=pix.getPNGData()  # bytes handed to PIL below (PNG, going by the method name)
image = Image.open(io.BytesIO(data))  # wrap the bytes in a file-like object for PIL
image

### Calling the Rest Service for analyzing 

In [2]:

# Submit the local document to the custom model's analyze endpoint.
# On success (HTTP 202) the URL to poll for the result is stored in `get_url`.
# Any failure raises, so the cell stops at the error. The previous quit() calls
# ended the whole session instead, which in a notebook loses the kernel state.
source = file
params = {
    "includeTextDetails": True
}

headers = {
    # Request headers
    'Content-Type': 'application/pdf',
    'Ocp-Apim-Subscription-Key': apim_key,
}
with open(source, "rb") as f:
    data_bytes = f.read()

try:
    resp = post(url = post_url, data = data_bytes, headers = headers, params = params)
except Exception as e:
    raise RuntimeError("POST analyze failed:\n%s" % str(e)) from e

if resp.status_code != 202:
    # Report the raw body: resp.json() would itself fail on a non-JSON error
    # response and hide the real problem.
    raise RuntimeError("POST analyze failed:\n%s" % resp.text)

print("POST analyze succeeded:\n%s" % resp.headers)
get_url = resp.headers["operation-location"]

In [3]:
# Poll `get_url` until the analysis succeeds, fails, or the retry budget runs out.
# On success the JSON result stays in `resp_json` and is also written next to the
# source document as "<name>custom_jsonoutput_newmodel.txt".
# Failures raise instead of calling quit(), so the kernel and its state survive.
n_tries = 15
n_try = 0
wait_sec = 30
max_wait_sec = 60
time.sleep(wait_sec)  # give the service a head start before the first poll
while n_try < n_tries:
    try:
        resp = get(url = get_url, headers = {"Ocp-Apim-Subscription-Key": apim_key})
        resp_json = resp.json()
    except Exception as e:
        raise RuntimeError("GET analyze results failed:\n%s" % str(e)) from e

    if resp.status_code != 200:
        raise RuntimeError("GET analyze results failed:\n%s" % json.dumps(resp_json))

    status = resp_json["status"]
    if status == "succeeded":
        print("Analysis succeeded:\n%s" % json.dumps(resp_json))
        # Use a dedicated handle name: assigning to `file` would overwrite the
        # document name that other cells read.
        output_path = source.split('.')[0]+"custom_jsonoutput_newmodel.txt"
        with open(output_path, "w") as out_file:
            out_file.write(json.dumps(resp_json))
        break
    if status == "failed":
        raise RuntimeError("Analysis failed:\n%s" % json.dumps(resp_json))

    # Analysis still running. Wait and retry, doubling the wait up to max_wait_sec.
    time.sleep(wait_sec)
    n_try += 1
    wait_sec = min(2*wait_sec, max_wait_sec)
else:
    # Reached only when the loop ran out of tries without hitting `break`.
    raise TimeoutError("Analyze operation did not complete within the allocated time.")

#### Please make necessary changes in the code below. This is just a sample to retrieve the necessary information

In [4]:
import pandas as pd
from IPython.display import display

pd.options.display.max_columns = None

# Collect the text of every OCR line, in page order, into `ocrresult`.
ocrresult=[]
for read_result in resp_json["analyzeResult"]["readResults"]:
    print("--------------Page %d: Extracted OCR---------------" % read_result["page"])
    for line in read_result["lines"]:
        ocrresult.append(line["text"])

# keyvaluedict[i][field name] -> list of extracted texts for the i-th entry of
# "documentResults". (The original also assigned into `keyvaluerelevant`, a name
# that is never defined and was never read; that line raised NameError.)
keyvaluedict={}
counter=0
for page_result in resp_json["analyzeResult"]["documentResults"]:
    fields = page_result['fields']
    keyvaluedict[counter]={}
    for newkey, field in fields.items():
        # NOTE(review): the service presumably returns null for a labelled field
        # it could not find — record None instead of crashing; confirm.
        text = field['text'] if field is not None else None
        keyvaluedict[counter].setdefault(newkey, []).append(text)
    counter=counter+1
keyvaluedict
    

### Custom model training with labels (v2.1)
###### The code below shows how to train the model with labels using the REST API. Here, I used the labelling tool to label the data, but the training is done through the REST API. It is not necessary to go this way; this just showcases an option.
#### useLabelFile = True. If this value is set to False, the same code can be used to train a custom model without labels.
###### As in the previous case, you can also take the model id you get from the labelling tool and use it for analysis, skipping cells [9] to [13].

In [9]:
# Endpoint and key of the Form Recognizer resource used for training.
endpointnew="https://**********.cognitiveservices.azure.com/"

# `endpointnew` ends with "/" and the path starts with one; strip the trailing
# slash so the training URL does not contain "//formrecognizer".
post_url1 = endpointnew.rstrip("/") + "/formrecognizer/v2.1/custom/models"
apim_key1="*********************"

#### Paste the SAS URL

In [10]:
# SAS URL of the blob container passed as "source" in the training request.
source = "https://adfstoragejr.blob.core.windows.net/**********?sp=****************************"

In [11]:
# Start training a custom model from the documents at `source`, then poll the
# model URL until the model is "ready". The final response stays in `resp_json`.
# Every failure raises, so a failed POST can no longer fall through to the
# polling loop with a stale `get_url`, and an "invalid" model stops the polling.
prefix=''
includeSubFolders = False
useLabelFile = True ## change this to false if we are doing custom model without labels

headers = {
    # Request headers. The training request body is JSON; an explicit
    # 'application/pdf' here would override the type that `json=` sets.
    'Content-Type': 'application/json',
    'Ocp-Apim-Subscription-Key': apim_key1,
}

body = {
    "source": source,
    "sourceFilter": {
        "prefix": prefix,
        "includeSubFolders": includeSubFolders
    },
    "useLabelFile": useLabelFile
}

try:
    resp = post(url = post_url1, json = body, headers = headers)
except Exception as e:
    raise RuntimeError("POST model failed:\n%s" % str(e)) from e

if resp.status_code != 201:
    # Raw body, so a non-JSON error response is still reported.
    raise RuntimeError("POST model failed (%s):\n%s" % (resp.status_code, resp.text))

print("POST model succeeded:\n%s" % resp.headers)
get_url = resp.headers["location"]
print(get_url)

n_tries = 10
n_try = 0
wait_sec = 5
max_wait_sec = 60
while n_try < n_tries:
    try:
        resp = get(url = get_url, headers = headers)
        resp_json = resp.json()
    except Exception as e:
        raise RuntimeError("GET model failed:\n%s" % str(e)) from e

    if resp.status_code != 200:
        raise RuntimeError("GET model failed (%s):\n%s" % (resp.status_code, json.dumps(resp_json)))

    model_status = resp_json["modelInfo"]["status"]
    if model_status == "ready":
        print("Training succeeded:\n%s" % json.dumps(resp_json))
        break
    if model_status == "invalid":
        raise RuntimeError("Training failed. Model is invalid:\n%s" % json.dumps(resp_json))

    # Training still running. Wait and retry, doubling the wait up to max_wait_sec.
    time.sleep(wait_sec)
    n_try += 1
    wait_sec = min(2*wait_sec, max_wait_sec)
else:
    # Reached only when the loop ran out of tries without hitting `break`.
    # Previously this message sat in the loop body and printed on every pass.
    raise TimeoutError("Train operation did not complete within the allocated time.")

In [12]:
# Display the id of the model described by the latest `resp_json`.
resp_json["modelInfo"]["modelId"]

### Analyze Invoices based on custom models with labels after training by code

In [14]:

# Model to analyze with. If you trained with the labelling tool instead, paste
# that model id here.
model_id = resp_json['modelInfo']['modelId']
# Analyze with the same v2.1 API version the training request used (this was
# "v2.1-preview.1"). `endpointnew` ends with "/", so strip it to avoid
# "//formrecognizer" in the URL.
post_url = endpointnew.rstrip("/") + "/formrecognizer/v2.1/custom/models/%s/analyze" % model_id

In [15]:
# Document to analyze with the newly trained model.
file = "*************.pdf"

In [5]:

# Submit the local document to the trained model's analyze endpoint.
# On success (HTTP 202) the URL to poll for the result is stored in `get_url`.
# Any failure raises, so the cell stops at the error. The previous quit() calls
# ended the whole session instead, which in a notebook loses the kernel state.
source = file

params = {
    "includeTextDetails": True
}

headers = {
    # Request headers
    'Content-Type': 'application/pdf',
    'Ocp-Apim-Subscription-Key': apim_key1,
}
with open(source, "rb") as f:
    data_bytes = f.read()

try:
    resp = post(url = post_url, data = data_bytes, headers = headers, params = params)
except Exception as e:
    raise RuntimeError("POST analyze failed:\n%s" % str(e)) from e

if resp.status_code != 202:
    # Report the raw body: resp.json() would itself fail on a non-JSON error
    # response and hide the real problem.
    raise RuntimeError("POST analyze failed:\n%s" % resp.text)

print("POST analyze succeeded:\n%s" % resp.headers)
get_url = resp.headers["operation-location"]

In [6]:
# Poll `get_url` until the analysis succeeds, fails, or the retry budget runs out.
# On success the JSON result stays in `resp_json` and is also written next to the
# source document as "<name>custom_jsonoutput_newmodel.txt".
# Failures raise instead of calling quit(), so the kernel and its state survive.
n_tries = 15
n_try = 0
wait_sec = 30
max_wait_sec = 60
time.sleep(wait_sec)  # give the service a head start before the first poll
while n_try < n_tries:
    try:
        resp = get(url = get_url, headers = {"Ocp-Apim-Subscription-Key": apim_key1})
        resp_json = resp.json()
    except Exception as e:
        raise RuntimeError("GET analyze results failed:\n%s" % str(e)) from e

    if resp.status_code != 200:
        raise RuntimeError("GET analyze results failed:\n%s" % json.dumps(resp_json))

    status = resp_json["status"]
    if status == "succeeded":
        print("Analysis succeeded:\n%s" % json.dumps(resp_json))
        # Use a dedicated handle name: assigning to `file` would overwrite the
        # document name that other cells read.
        output_path = source.split('.')[0]+"custom_jsonoutput_newmodel.txt"
        with open(output_path, "w") as out_file:
            out_file.write(json.dumps(resp_json))
        break
    if status == "failed":
        raise RuntimeError("Analysis failed:\n%s" % json.dumps(resp_json))

    # Analysis still running. Wait and retry, doubling the wait up to max_wait_sec.
    time.sleep(wait_sec)
    n_try += 1
    wait_sec = min(2*wait_sec, max_wait_sec)
else:
    # Reached only when the loop ran out of tries without hitting `break`.
    raise TimeoutError("Analyze operation did not complete within the allocated time.")

In [7]:
import pandas as pd
from IPython.display import display

pd.options.display.max_columns = None

# Collect the text of every OCR line, in page order, into `ocrresults`.
ocrresults=[]
for read_result in resp_json["analyzeResult"]["readResults"]:
    print("--------------Page %d: Extracted OCR---------------" % read_result["page"])
    for line in read_result["lines"]:
        ocrresults.append(line["text"])

# keyvaluedict[i][field name] -> list of extracted texts for the i-th entry of
# "documentResults". (The original also assigned into `keyvaluerelevant`, a name
# that is never defined and was never read; that line raised NameError.)
keyvaluedict={}
counter=0
for page_result in resp_json["analyzeResult"]["documentResults"]:
    fields = page_result['fields']
    keyvaluedict[counter]={}
    for newkey, field in fields.items():
        # NOTE(review): the service presumably returns null for a labelled field
        # it could not find — record None instead of crashing; confirm.
        text = field['text'] if field is not None else None
        keyvaluedict[counter].setdefault(newkey, []).append(text)
    counter=counter+1
keyvaluedict
    