In [8]:
import json
import time
import os
#import ntpath
import sys
from requests import get, post
import csv

#### Paste your credentials

In [2]:
# Connection settings for the Form Recognizer resource (values are shown in the Azure portal).
# Environment variables take precedence so that real keys never have to be saved
# inside the notebook; the placeholders are used only when the variables are unset.
endpoint = os.environ.get("FORM_RECOGNIZER_ENDPOINT", r"https://************.cognitiveservices.azure.com/")
apim_key = os.environ.get("FORM_RECOGNIZER_KEY", "***********")

# Path to the invoice file to analyze (it can be pdf, jpg, png, tiff etc).
invoiceFullFilename = "*.pdf"

In [3]:
def analyzeInvoice(filename):
    """Analyze an invoice file with the prebuilt invoice model and return the result.

    The result is cached next to the input as ``<filename>.invoice.json``; when
    that file already exists it is loaded and returned without calling the service.

    Uses the notebook-level ``endpoint`` and ``apim_key`` settings.

    Args:
        filename: Path of the invoice document (PDF, JPEG, PNG or TIFF).

    Returns:
        The parsed JSON response (dict) of a succeeded analysis, or ``None`` when
        the submission fails, the service reports failure, a request raises, or
        the analysis does not finish within the polling budget.

    Raises:
        OSError: if ``filename`` cannot be opened for reading.
    """
    invoiceResultsFilename = filename + ".invoice.json"

    # do not run analyze if .invoice.json file is present on disk
    if os.path.isfile(invoiceResultsFilename):
        with open(invoiceResultsFilename) as json_file:
            return json.load(json_file)

    # The configured endpoint may end with "/"; strip it so the URL has no "//".
    post_url = endpoint.rstrip("/") + "/formrecognizer/v2.1/prebuilt/invoice/analyze"

    # Pick the Content-Type from the file extension; unknown extensions keep the
    # previous behaviour of sending the document as PDF.
    content_types = {
        ".pdf": "application/pdf",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".tif": "image/tiff",
        ".tiff": "image/tiff",
    }
    extension = os.path.splitext(filename)[1].lower()
    headers = {
        # Request headers
        'Content-Type': content_types.get(extension, 'application/pdf'),
        'Ocp-Apim-Subscription-Key': apim_key,
    }

    params = {
        "includeTextDetails": True
    }

    with open(filename, "rb") as f:
        data_bytes = f.read()

    try:
        resp = post(url = post_url, data = data_bytes, headers = headers, params = params)
        if resp.status_code != 202:
            print("POST analyze failed:\n%s" % resp.text)
            return None
        # The service answers 202 and names the URL to poll for the result.
        get_url = resp.headers["operation-location"]
        print("POST analyze succeeded: %s" % get_url)
    except Exception as e:
        print("POST analyze failed:\n%s" % str(e))
        return None

    n_tries = 50
    wait_sec = 6

    for _ in range(n_tries):
        try:
            resp = get(url = get_url, headers = {"Ocp-Apim-Subscription-Key": apim_key})
            # Check the status before parsing: an error body is not guaranteed to be JSON.
            if resp.status_code != 200:
                print("GET Invoice results failed:\n%s" % resp.text)
                return None
            resp_json = json.loads(resp.text)
            status = resp_json["status"]
            if status == "succeeded":
                print("Invoice analysis succeeded.")
                with open(invoiceResultsFilename, 'w') as outfile:
                    json.dump(resp_json, outfile, indent=4)
                return resp_json
            if status == "failed":
                print("Analysis failed:\n%s" % resp_json)
                return None
            # Analysis still running. Wait and retry.
            time.sleep(wait_sec)
        except Exception as e:
            msg = "GET analyze results failed:\n%s" % str(e)
            print(msg)
            return None

    # Polling budget exhausted: the last response is still unfinished and has no
    # analyzeResult, so report failure instead of handing it back as a result.
    print("Invoice analysis did not finish after %d attempts." % n_tries)
    return None

def parseInvoiceResults(resp_json):
    """Flatten the recognized invoice fields into a dict and print them.

    For every field under ``analyzeResult.documentResults[*].fields`` the raw
    ``text`` is stored under the field name. When the field carries exactly one
    ``value*`` entry other than ``valueString``, that entry is additionally
    stored under ``<fieldName>_normalized``.

    Fields that are null or have no ``text`` entry are recorded with an empty
    string instead of raising.

    Args:
        resp_json: Parsed analysis response containing
            ``["analyzeResult"]["documentResults"]``.

    Returns:
        dict mapping field names (and ``*_normalized`` names) to their values.
        If several documents contain the same field, the last one wins.
    """
    docResults = resp_json["analyzeResult"]["documentResults"]
    invoiceResult = {}
    for docResult in docResults:
        for fieldName, fieldValue in sorted(docResult["fields"].items()):
            # Guard against null fields and against fields without a "text" key.
            fieldValue = fieldValue or {}
            text = fieldValue.get("text") or ""
            # Typed values (valueNumber, valueDate, ...); valueString only repeats the text.
            valueFields = [
                (key, value)
                for key, value in fieldValue.items()
                if "value" in key and "valueString" not in key
            ]
            invoiceResult[fieldName] = text
            if len(valueFields) == 1:
                print("{0:26} : {1:50}      NORMALIZED VALUE: {2}".format(fieldName , text, valueFields[0][1]))
                invoiceResult[fieldName + "_normalized"] = valueFields[0][1]
            else:
                print("{0:26} : {1}".format(fieldName , text))
    print("")
    return invoiceResult



In [5]:
# Analyze the configured invoice (or load its cached result) and, when a
# result is available, flatten its fields and tag them with the source path.
resp_json = analyzeInvoice(invoiceFullFilename)
if resp_json is not None:
    invoiceResults = {
        **parseInvoiceResults(resp_json),
        "FullFilename": invoiceFullFilename,
    }
    # (Writing invoiceResults out as a CSV row was left disabled in this notebook.)

In [6]:
# Show the raw value held in resp_json: the parsed JSON response, or None
# when analyzeInvoice() reported a failure.
print(resp_json)

In [7]:
import pandas as pd
from IPython.display import display

# Show every table column instead of letting pandas elide the middle ones.
pd.options.display.max_columns = None

# resp_json is None when the analysis failed, and an unfinished response has no
# "analyzeResult"; in both cases there is nothing to display.
analyze_result = (resp_json or {}).get("analyzeResult")

if not analyze_result:
    print("No analysis result available; skipping OCR and table display.")
else:
    # Plain OCR text, page by page.
    for read_result in analyze_result.get("readResults", []):
        print("--------------Page %d: Extracted OCR---------------" % read_result["page"])
        for line in read_result.get("lines", []):
            print(line["text"])

    # Tables, page by page, each rendered as a DataFrame.
    for pageresult in analyze_result.get("pageResults", []):
        for table in pageresult.get("tables", []):
            print("--------------Page %d: Extracted table---------------" % pageresult["page"])
            print("No of Rows: %s" % table["rows"])
            print("No of Columns: %s" % table["columns"])
            # Pre-size the grid; positions that no cell reports stay None.
            tableList = [[None] * table["columns"] for _ in range(table["rows"])]
            for cell in table["cells"]:
                tableList[cell["rowIndex"]][cell["columnIndex"]] = cell["text"]
            df = pd.DataFrame.from_records(tableList)
            display(df)
