## Import all the necessary modules and global variables

In [28]:
from docai_module.config import *

In [2]:
from docai_module.docai_methods import *
from docai_module.storage_methods import *
from docai_module.firestore_methods import *

# Architecture 1: API Centric

Quick overview

<img src="./images/4_arch_front.png"
     alt="Processor"
     style="width:90%"
     />
     

## Lab Details

Details

## Cloud Storage

#### Cloud Storage Methods

In our architecture, everytime a system or a user needs to upload or download an object to the "Original Document" bucket, in this case a PDF document, it is necessary to generate a temporary signed URL to perform that activity (red dotted line).

<img src="./images/4_11_storage_upload.png"
     style="width:40%"
     />

Let's define a function to generate a signed url for download/upload and another function to download and upload the bytes itself.

In [3]:
# Generates a signed URL for downloading a blob using HTTP GET.
def create_signed_url_download(
    blob_name: str,
    bucket_name: str
) -> str:
    bucket = STORAGE_CLIENT.bucket(bucket_name)
    blob = bucket.blob(blob_name)

    url = blob.generate_signed_url(
        version="v4",
        expiration=datetime.timedelta(minutes=15),
        method="GET",
    )
    return url

# Generates a signed URL for uploading a blob using HTTP POST.
def create_signed_url_upload(
    blob_name: str,
    bucket_name: str,
    content_type: str
):
    bucket = STORAGE_CLIENT.bucket(bucket_name)
    blob = bucket.blob(blob_name)

    signed_url = blob.generate_signed_url(
        version="v4",
        expiration=datetime.timedelta(minutes=15),
        method="PUT",
        content_type=content_type,
    )
    return signed_url

# Download object from cloud storage
def download_blob_bytes(bucket_name, source_blob_name):
    """Downloads a blob from the bucket."""    
    bucket = STORAGE_CLIENT.bucket(bucket_name)
    blob = bucket.blob(source_blob_name)
    return blob.download_as_bytes()

# Upload bytes to cloud storage
def upload_blob_bytes(bucket_name, 
                        image_bytes, 
                        destination_blob_name,
                        content_type):
    """Uploads a file to the bucket."""
    bucket = STORAGE_CLIENT.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_string(image_bytes, content_type=content_type)

Let's do a quick test and generate a signed URL and upload a PDF to the ORIGINAL bucket.

In [4]:
signed_test_upload = create_signed_url_upload('loan_form.pdf', ORIGINAL_BUCKET, MIME_TYPE)

In [5]:
print(signed_test_upload)

https://storage.googleapis.com/cool-ml-demos-original/loan_form.pdf?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=sa-geral%40cool-ml-demos.iam.gserviceaccount.com%2F20210129%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20210129T210617Z&X-Goog-Expires=900&X-Goog-SignedHeaders=content-type%3Bhost&X-Goog-Signature=07096e1f0f34675085aa01e0bfc71d8ac91c0f98af09c4cb00063cbfe24dac67c5793371c3f022c0a3cac0b15e05a4885af10e3904bb9e22dd8296d1504303e49871ad485312575e9fc9abb72726d62a51c5d2c15707f0e422afaf2ab1949131a74981caa8044f33975a9aed6bb29972d08bbc7f162f6c59d74d834627e94f7822f04f4230abda7e75cf0325669959b016592789360057151a4a90566d73afe5ca6236b3ef21c4a7a1652d1e74c8c13cf9c6eef65efe447f10f7ab72edc93d2ff2c613de8866b688598748b69016dd4f3528d86ad3a0e8fea02efdbd5e6f825f6cf811f3278ece64fdbeeb36a4da0f34f48a41a97ae50a786057ba1d52f4b040


In [6]:
# Upload a file to this signed URL
headers = {'Content-Type': 'application/pdf'}
data = open('./files/loan_form.pdf', 'rb')
r = requests.put(signed_test_upload, data=data, headers=headers)
print(r)

<Response [200]>


On the Google Cloud web console, navigate back to your bucket ORIGINAL-BUCKET and check if the object was uploaded properly.

<img src="./images/4_7_storage_created.png"
     style="width:40%"
     />

## Document AI

We will use the processor created in the previous laboratory to call the API.  
Recall that the type of the processor is "Form Parsing".

In [7]:
print(PROCESSOR_ID)

ff4bad3352769404


We will use async processing with the method "async_process_document" defined in the Document AI lab.

Test the method to check if everything is working OK.  
Let's use the same document we uploaded in our Cloud Storage test.

In [8]:
gcs_output_uri = 'gs://' + DOCAI_BUCKET
gcs_output_uri_prefix = 'results'
gcs_input_uri = 'gs://' + ORIGINAL_BUCKET + '/' + 'loan_form.pdf'

In [9]:
OP_ID = async_process_document(gcs_input_uri, gcs_output_uri, gcs_output_uri_prefix, MIME_TYPE)
op_id_only = OP_ID.operation.name.split(sep='/')[-1]

print(f'Operation ID: {op_id_only} created.')
import time
while not OP_ID.done():
    print('Still processing, please wait ...')
    time.sleep(10)
else:
    print('Finish processing the document.')

Operation ID: 12739865725164768158 created.
Still processing, please wait ...
Still processing, please wait ...
Still processing, please wait ...
Still processing, please wait ...
Still processing, please wait ...
Still processing, please wait ...
Still processing, please wait ...
Still processing, please wait ...
Finish processing the document.


On the Google Cloud web console, navigate back to your bucket and check if there is a folder "results".  
This folder will hold the results from the Document AI call.

<img src="./images/4_8_results_folder.png"
     style="width:50%"
     />

## Data Loss Prevention

The next step in our architecture is to create a method to anonymize sensitive information from our document.  
We will use a simple function to identify any street address present in the JSON file.

For the sake of simplicity, we will inspect the document with a basic DLP API call.

<img src="./images/4_3_arch_dlp.png"
     style="width:70%"
     />

Method to detect street address using DLP API.

In [18]:
def inspect_string(content_string):
    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    # The info types to search for in the content. Required.
    info_types = [{"name": "STREET_ADDRESS"}]

    # The minimum likelihood to constitute a match. Optional.
    min_likelihood = dlp_v2.Likelihood.POSSIBLE

    # The maximum number of findings to report (0 = server maximum). Optional.
    max_findings = 0

    # Whether to include the matching string in the results. Optional.
    include_quote = True

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        "info_types": info_types,
        "min_likelihood": min_likelihood,
        "include_quote": include_quote,
        "limits": {"max_findings_per_request": max_findings},
    }

    # Construct the `item`.
    item = {"value": content_string}

    # Convert the project id into a full resource id.
    parent = f"projects/{PROJECT_ID}"

    # Call the API.
    response = DLP_CLIENT.inspect_content(
        request={"parent": parent, "inspect_config": inspect_config, "item": item}
    )

    # Print out the results.
    if response.result.findings:
        for finding in response.result.findings:
            try:
                if finding.quote:
                    print("Quote: {}".format(finding.quote))
            except AttributeError:
                pass
            print("Info type: {}".format(finding.info_type.name))
            print("Likelihood: {}".format(finding.likelihood))
    else:
        print("No findings.")

    return response

In [19]:
import json
with open('./files/result_loan_form.json') as f:
    data = json.load(f)
data['text']

'Loan Agreement Form\nAgreement Number:\n0123456789\nAgreement date:\n01/01/2020\nThis loan agreement is commenced between the parties:\nMortgage company contact details:\nName:\nMortgage company A\nAddress:\n100 Franklin Street, Mountain View, CA, 94035\nPhone number: 1-800-843-8623\n(hereinafter referred to as the lender)\nIndividual details:\nName:\nArjun Patel\nMarital status:\nSingle\nMarried ☐\nOther\nAddress:\n500 Castro Street, Mountain View, CA 94035\nPhone number: 650-987-0934\n(hereinafter referred to as the borrower)\n[Fill in all details as per instructions]\n6.0\n%.\nThe lender is ready to sanction $ 2000 as the loan amount at\n[Total loan amount along with the agreed percentage rate].\nThis loan agreement is valid from 01/01/2020 and is ending on 12/31/2020.\nTerms & agreements:\n38.67\nper month for\n5\nyears.\nThe borrower will pay an installment of $\n[Amount & tenure of loan]\nAny late installment will be accepted with $\n40\nas a fine.\n'

In [20]:
dlp_response = inspect_string(data['text'])

Quote: 100 Franklin Street, Mountain View, CA, 94035
Info type: STREET_ADDRESS
Likelihood: 4
Quote: 500 Castro Street, Mountain View, CA 94035
Info type: STREET_ADDRESS
Likelihood: 4


## Firestore

In this architecture we will use Firestore database to store the results of the API calls.  
After each API call to Document AI, DLP, etc., the location of the result or the intermediate ID of the Long Running Operation (LRO) will be stored there.  

<img src="./images/4_4_arch_firestore.png"
     style="width:70%"
     />
     
     
Next, let's define some methods to interact with Firestore.

In [42]:
parent = f'projects/{PROJECT_ID}/databases/(default)/collectionGroups/documents'
index = {
    'name':'documents',
    'query_scope': 'COLLECTION',
    'fields': {
        'blob_name'}
}

SyntaxError: invalid syntax (<ipython-input-42-33f5d23f62fc>, line 6)

In [41]:
FIRESTORE_ADMIN.create_index(parent=parent, index=index)

TypeError: Parameter to MergeFrom() must be instance of same class: expected google.firestore.admin.v1.Index.IndexField got str.

In [26]:
# Create a document in the collection with doc_values
def create_document_firestore(
    collection: str,
    doc_values: dict
):
    FIRESTORE_CLIENT.collection(collection).add(doc_values)

def create_firestore_index(
    collection_id: str,    
):
    parent = f'projects/{PROJECT_ID}/databases/(default)/collectionGroups/{collection_id}'
    index = {
        'name':'documents',
        'fields': [
            {'name':'blob_name'},{'name':'bucket_name'} 
        ]
    }
    return FIRESTORE_ADMIN.create_index(parent, index)
    
# Query all documents from collection, with filter
def get_all_documents_firebase(
    collection: str,
    field_name: str,
    filter_value: str,
    limit: int = 1
) -> dict:
    docs = FIRESTORE_CLIENT.collection(collection).where(
        field_name, u'==', filter_value).order_by(
            'creation_time').limit(limit).stream()
    response = {}

    for doc in docs:
        response[doc.id] = doc.to_dict()

    return response

# Get document content with blob_name
def get_document_firebase(
    collection: str,
    blob_name: str
) -> dict:
    doc = FIRESTORE_CLIENT.collection(collection).where(
            'blob_name', '==', blob_name).limit(1).get()
    
    return doc[0].to_dict() if doc else doc

# Get specific field from document in collection
def get_field_from_doc_firebase(
    collection: str,
    blob_name: str,
    field_name: str
) -> dict:
    doc_field = FIRESTORE_CLIENT.collection(collection).where(
            'blob_name', '==', blob_name).limit(1).get()

    return doc_field[0].get(field_name) if doc_field else doc_field

# Update a specific document field
def update_document_field_firebase(
    collection: str,
    blob_name: str,
    field_name: str,
    value: str
) -> bool:
    doc_id = FIRESTORE_CLIENT.collection(collection).where(
        'blob_name', '==', blob_name).limit(1).get()

    if doc_id:
        doc_ref = FIRESTORE_CLIENT.collection(
            'documents').document(doc_id[0].id)
        doc_ref.update({field_name:value})
        return True
    else:
        return False

Let's test and understand how each component works.

In [25]:
# The following function call will create a document in collection 'document' with doc_values
doc_values = {'user_id': 'user','creation_time':'now',
                'blob_name':'loan_form.pdf','bucket_name':ORIGINAL_BUCKET, 'content_type':'application/pdf'}
create_document_firestore('documents', doc_values)

To verify if the document was created, navigate to Cloud Firestore in the web console.

<img src="./images/4_9_firestore.png"
     style="width:20%"
     />
     
Then, click on the document we just created and check if the information is there:

<img src="./images/4_10_firestore_info.png"
     style="width:70%"
     />
     
As we specified just the collection_id as "documents", the document_id was assigned with a random number.

Next, let's create an index in Firebase to allow more complex queries.

To keep it simple, we will index both blob_name and bucket_name from documents fields.

In [30]:
create_firestore_index('documents')

TypeError: create_index() takes from 1 to 2 positional arguments but 3 were given

In [None]:
_get_all_documents_firebase('documents', 'blob_name', 'loan_form.pdf')

In [None]:
_get_document_firebase('documents', 'loan_form.pdf')