# Document AI Asynchronous API
This notebook shows you how to use Python to make asynchronous calls to the Document AI API.

You must replace the `processor_id` variable value in the second cell with the Processor ID of the Document AI processor that you want to use. The processor may not support all of the Document AI output properties. For example, entity data is only returned by processors that use specialized parsers.

In [1]:
# Import Libraries
from google.cloud import documentai_v1beta3 as documentai
from google.cloud import storage
from prettytable import PrettyTable

import re
import os
import pandas as pd


In [2]:
# Set your Processor ID
# This value becomes the last segment of the processor resource name used in the API request.
processor_id = "7b55c435adb3e2de"  # TODO: Replace with a valid Processor ID

In [3]:
# Set your variables
# `%system` is an IPython magic: it runs the shell command and returns its output as a list of lines.
project_id = %system gcloud config get-value core/project
project_id = project_id[0]  # Keep only the first line of the command output
location = 'us'           # Replace with 'eu' if processor does not use 'us' location
gcs_input_bucket  = project_id+"_doc_ai_async"   # Bucket name only, no gs:// prefix
gcs_input_prefix  = "input/"                     # Input bucket folder e.g. input/
gcs_output_bucket = project_id+"_doc_ai_async"   # Bucket name only, no gs:// prefix
gcs_output_prefix = "output/"                    # Output bucket folder e.g. output/
timeout = 300  # Maximum wait (in seconds) passed to operation.result() for the batch job

In [4]:
# Define Google Cloud client objects
# The Document AI endpoint host name is prefixed with the processor location.
client_options = {"api_endpoint": f"{location}-documentai.googleapis.com"}
client = documentai.DocumentProcessorServiceClient(client_options=client_options)
# Cloud Storage client used to list the input files and read the output files.
storage_client = storage.Client()

In [5]:
# Create input configuration
# Build one BatchInputConfig for every PDF stored under the input prefix.
blobs = storage_client.list_blobs(gcs_input_bucket, prefix=gcs_input_prefix)
input_configs = []
print("Input Files:")
for blob in blobs:
    # Test the file extension, not a substring: names such as "scan.pdf.json"
    # or "input/.pdf-old/readme.txt" contain ".pdf" but are not PDF documents.
    if not blob.name.lower().endswith(".pdf"):
        continue
    source = f"gs://{gcs_input_bucket}/{blob.name}"
    print(source)
    input_configs.append(
        documentai.types.document_processor_service.BatchProcessRequest.BatchInputConfig(
            gcs_source=source, mime_type="application/pdf"
        )
    )

# Fail here with a clear message instead of submitting an empty batch request later.
if not input_configs:
    raise ValueError(
        f"No PDF files found under gs://{gcs_input_bucket}/{gcs_input_prefix}"
    )


Input Files:
gs://qwiklabs-gcp-03-b8c9b73a1a12_doc_ai_async/input/google_invoice.pdf
gs://qwiklabs-gcp-03-b8c9b73a1a12_doc_ai_async/input/office-depot-receipt.pdf


In [6]:
# Create output configuration
# Cloud Storage URI under which the processor writes its JSON results.
destination_uri = "gs://{}/{}".format(gcs_output_bucket, gcs_output_prefix)
output_config = documentai.types.document_processor_service.BatchProcessRequest.BatchOutputConfig(
    gcs_destination=destination_uri,
)

In [7]:
# Create the Document AI API request
# Fully qualified resource name of the processor that will handle the batch.
name = "projects/{}/locations/{}/processors/{}".format(project_id, location, processor_id)
# Bundle the processor name with the input and output configurations built above.
request = documentai.types.document_processor_service.BatchProcessRequest(
    name=name, input_configs=input_configs, output_config=output_config
)


In [8]:
# Start the batch (asynchronous) API operation
operation = client.batch_process_documents(request=request)
# Block here, waiting up to `timeout` for the operation to finish
operation.result(timeout=timeout)
print("Batch process  completed.")

Batch process  completed.


In [9]:
# Fetch list of output files
# Split the destination URI back into its bucket name and object prefix.
# The prefix group is `.*` (not `.+`) so an empty prefix, i.e. the bucket root, still parses.
match = re.match(r"gs://([^/]+)/(.*)", destination_uri)
if match is None:
    # Without this check a malformed URI surfaces as an AttributeError on None.
    raise ValueError(f"Expected a gs://<bucket>/<prefix> URI, got {destination_uri!r}")
output_bucket, prefix = match.groups()
bucket = storage_client.get_bucket(output_bucket)
# Materialise the listing so later cells can iterate over it more than once.
blob_list = list(bucket.list_blobs(prefix=prefix))

In [10]:
# Display detected text from asynchronous output JSON files
for i, blob in enumerate(blob_list):
    # Test the file extension, not a substring: a name such as "result.json.bak"
    # contains ".json" but must not be parsed as a Document.
    if not blob.name.endswith(".json"):
        print(f"Skipping non-supported file type {blob.name}")
        continue
    # Download the contents of this blob as a bytes object and parse it as a Document.
    blob_as_bytes = blob.download_as_bytes()
    document = documentai.types.Document.from_json(blob_as_bytes)
    print(f"Fetched file {i + 1}:{blob.name}")
    # print the text data output from the processor
    print(f"Text Data:\n {document.text}")

Fetched file 1:output/4869069084488981550/0/google_invoice-0.json
Text Data:
 Google
INVOICE
# 23413561D
John Smith
Bill To:
Date:
Sep 24, 2019
Due Date:
Sep 30, 2019
Jane Smith,
1600 Amphitheatre Pkway
Mountain View, CA 94043
Balance Due:
$4,647.68
Item
Quantity
Rate
Amount
12 ft HDMI cable
12
$9.99
$119.88
27" Computer Monitor
12
$399.99
$4,799.88
Ergonomic Keyboard
12
$59.99
$719.88
22222-
Optical mouse
12
$19.99
$239.88
Laptop
12
$1,299.99
$15,599.88
Misc processing fees
1
$899.99
$899.99
Subtotal:
$22,379.39
Discounts (21%):
$4,699.67
Tax (10%):
$1,767.97
Shipping:
$199.99
Total:
$19,647.68
Amount Paid:
$15,000.00
Notes:
This is a test order. No actual transactions took place.
Terms:
Delivery scheduled for second week of October 2019.

Fetched file 2:output/4869069084488981550/1/office-depot-receipt-0.json
Text Data:
 Office DEPOT
OfficeMaxcomo
YA LOS GATOS - (408) 356-3757nibuloxe)
07/20/2020 5:23 PM
s 101 omo
te
LOILLOL
V2VT7XYPQX555YXM6
SALE
950-1-2020-958724-20.6.2
328374
MAT,

In [11]:
# Display entity data from asynchronous output JSON files
for i, blob in enumerate(blob_list):
    # Test the file extension, not a substring, so only real JSON output is parsed.
    if not blob.name.endswith(".json"):
        print(f"Skipping non-supported file type {blob.name}")
        continue
    # Download the contents of this blob as a bytes object and parse it as a Document.
    blob_as_bytes = blob.download_as_bytes()
    document = documentai.types.Document.from_json(blob_as_bytes)
    print(f"Fetched file {i + 1}:{blob.name}")
    # Check the entity list itself: dir() only reports which attribute names exist,
    # not whether the processor actually returned any entities.
    entities = document.entities
    if not entities:
        print(f"No entity data returned by the Document AI processor for file {blob.name}")
        continue
    # print the entity data output from the processor, one table row per entity
    table = PrettyTable(['Type', 'Value', 'Confidence'])
    for entity in entities:
        table.add_row([entity.type_, entity.mention_text, round(entity.confidence, 4)])
    print(table)

Fetched file 1:output/4869069084488981550/0/google_invoice-0.json
+------------------+-------+------------+
|       Type       | Value | Confidence |
+------------------+-------+------------+
| generic_entities |       |    0.0     |
+------------------+-------+------------+
Fetched file 2:output/4869069084488981550/1/office-depot-receipt-0.json
+------------------+-------+------------+
|       Type       | Value | Confidence |
+------------------+-------+------------+
| generic_entities |       |    0.0     |
+------------------+-------+------------+
