# Modern Slavery Analysis Pipeline PoC

### High Level Overview of Process

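1. Textract - Convert PDFs into text
2. Comprehend - Analyse the extracted text (entities, key phrases, sentiment, event detection)
3. Model Training
4. Model Hosting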


In [27]:
import boto3
import time
import pandas as pd
import json
import os
from sagemaker import get_execution_role

In [28]:
# Resolve the AWS region from the default boto3 session so the regional
# clients below all target the same region.
mySession = boto3.session.Session()
region = mySession.region_name

# Service clients shared by the rest of the notebook:
#   s3_client  - uploads extracted text and downloads Comprehend output
#   client     - Textract client (passed to start_job / is_job_complete / get_job_results)
#   comprehend - Comprehend client used for the entities detection job
s3_client = boto3.client('s3')
client = boto3.client('textract', region_name=region)
comprehend = boto3.client('comprehend', region_name=region)

# Execution role of this notebook; later passed to Comprehend as DataAccessRoleArn.
role = get_execution_role()

---------------
## 1. Convert PDFs into text using Textract

In [49]:
# Sourced from: https://github.com/aws-samples/amazon-textract-code-samples/blob/master/python/Textract.ipynb
def start_job(client, s3_bucket_name, object_name):
    """
    Start an asynchronous text-detection job for a single S3 object.

    :param client: object exposing ``start_document_text_detection`` (the Textract client)
    :param s3_bucket_name: name of the bucket that holds the document
    :param object_name: key of the document inside the bucket
    :return: the ``JobId`` value taken from the service response
    """
    document_location = {
        'S3Object': {
            'Bucket': s3_bucket_name,
            'Name': object_name,
        }
    }
    started = client.start_document_text_detection(DocumentLocation=document_location)
    return started["JobId"]


def is_job_complete(client, job_id):
    """
    Fetch the current status of a text-detection job.

    Despite the name, this does not return a boolean: it returns the raw
    ``JobStatus`` string from the response, and callers compare it themselves.

    :param client: object exposing ``get_document_text_detection`` (the Textract client)
    :param job_id: id of the job to query
    :return: the ``JobStatus`` value from the service response
    """
    return client.get_document_text_detection(JobId=job_id)["JobStatus"]


def get_job_results(client, job_id):
    """
    Collect every result page of a text-detection job.

    The first request carries only the job id; each following request also
    carries the ``NextToken`` from the previous response. Paging stops when a
    response has no ``NextToken`` (or an empty one). A one second pause is made
    before every request.

    :param client: object exposing ``get_document_text_detection`` (the Textract client)
    :param job_id: id of the job whose results are fetched
    :return: list of response pages, in the order they were received
    """
    collected = []
    request_args = {"JobId": job_id}

    while True:
        time.sleep(1)
        batch = client.get_document_text_detection(**request_args)
        collected.append(batch)

        token = batch['NextToken'] if 'NextToken' in batch else None
        if not token:
            return collected
        request_args["NextToken"] = token


In [30]:
def iterate_bucket_items(bucket, prefix=""):
    """
    Lazily yield the metadata of every object in an S3 bucket, page by page.

    See http://boto3.readthedocs.io/en/latest/reference/services/s3.html#S3.Client.list_objects_v2
    for return data format
    :param bucket: name of s3 bucket
    :param prefix: only keys starting with this prefix are listed (default: all keys)
    :return: dict of metadata for an object
    """
    s3 = boto3.client('s3')
    listing_pages = s3.get_paginator('list_objects_v2').paginate(Bucket=bucket, Prefix=prefix)

    for listing in listing_pages:
        # Pages reporting no keys are skipped without touching 'Contents'.
        if listing['KeyCount'] <= 0:
            continue
        for entry in listing['Contents']:
            yield entry

In [32]:
# Documents: the bucket holding the source PDFs and the key prefix that
# restricts which of them are processed.
s3_bucket_name="modern-slavery-act-poc-124234234"
prefix='2020' # TODO: CHANGE BACK TO ALL FILES

# Data frame the extracted text is collected into: one row per document,
# with a "label" column and a "text" column.
extracted_text = pd.DataFrame(columns=["label","text"])

In [33]:
# Start one asynchronous Textract job per PDF found under the configured prefix.
# job_ids holds the jobs still pending; finished_jobs is filled by the polling cell.
job_ids = []
finished_jobs = []
s3_objects = iterate_bucket_items(bucket=s3_bucket_name, prefix=prefix)

print("Starting Textract extraction jobs")
# Start Textract jobs
for s3_object in s3_objects:
    document_name = s3_object["Key"]
    # Compare the extension case-insensitively so keys such as "report.PDF"
    # are not silently skipped.
    if document_name.lower().endswith(".pdf"):
        job_id = start_job(client, s3_bucket_name, document_name)
        job_ids.append(job_id)

# Total count of documents being processed.
print("Started {} Textract jobs".format(len(job_ids)))

Starting Textract extraction jobs


In [51]:
# Check if jobs are complete
# Jobs that end in FAILED are collected here so they can be inspected or retried.
failed_jobs = []

while job_ids:
    print("Waiting on #: ", len(job_ids))
    # Iterate over a snapshot: removing items from job_ids while iterating it
    # directly makes the loop skip the element that follows each removal.
    for job_id in list(job_ids):
        status = is_job_complete(client, job_id)
        if status in ("SUCCEEDED", "PARTIAL_SUCCESS"):
            print(status + ": " + str(job_id))
            # add response to list
            response = get_job_results(client, job_id)
            finished_jobs.append(response)
            job_ids.remove(job_id)
        elif status == "FAILED":
            # A failed job never becomes SUCCEEDED; drop it from the pending
            # list so the outer loop can terminate.
            print("FAILED: " + str(job_id))
            failed_jobs.append(job_id)
            job_ids.remove(job_id)

    # Only wait when something is still pending.
    if job_ids:
        time.sleep(30)

Waiting on #:  58
SUCCEEDED: ddd2a9cc638d14c890aca040b7f71d7f361d37c02bcf863ba2a512ba41c65c83
SUCCEEDED: 5511f85f3a85dc9450124fecb1899a3109b2178b27885decfa6b858d847e52ba
SUCCEEDED: 76ccd1026c128efe6372995c405cc603fd04fd48cf6828459399a03d8e2bf9dc
SUCCEEDED: 1d977180c6a03222f669d7879ce7475203b66a731c84e181d79bc7084d349041
SUCCEEDED: 069331d89439e0556cbe8d11d332a38d6f86c581ddc15ddb0bfe69c47cbfc7ba
SUCCEEDED: a0a6ce7925f65bbd5cfb914026cce584132bbc16f446f45bea6ae4728d307d4e
SUCCEEDED: ef0947c1586049d73849c57fc9f65b196ec3d0a5e0a75d0c932c54c047e14f27
SUCCEEDED: 340b42d7c493555368f511874bf1ef270401e9f9db218e589e2538838b77d18a
SUCCEEDED: 2bd0dd815d002dd1d757b617035fe85d0f10ce46a2e1e23d491b69e0b4c83906
SUCCEEDED: 5eb430413e2d32214cbeaf57fec79cb4d272f26ed0b5c2a2169a4826e2624aa1
SUCCEEDED: ceb9bd37f29e593a2384590a9c628ebf4c19ff7fe517e6c5fb07ffaa4a5d1c00
SUCCEEDED: 282502ea26f1cba8116555e4c886beaa8b72b05cdb2eff959da1b8fcb842ffa3
SUCCEEDED: 54733e8a673b9b6b877a83607b140ceb458bd328d72d9b4ba459e3827fb

In [52]:
# Flatten every finished job into plain text: one .txt file per document plus
# one row per document in extracted_text.
raw_text_dir = "raw_text"
# open() below fails if the directory is missing, so create it up front.
os.makedirs(raw_text_dir, exist_ok=True)

text_records = []
# enumerate() gives each document its position directly; looking the position
# up with finished_jobs.index(response) rescans the list for every document and
# returns the first match when two responses compare equal.
for doc_index, response in enumerate(finished_jobs):
    # Get only text LINES and add to a list
    temp_list = [
        item["Text"]
        for result_page in response
        for item in result_page["Blocks"]
        if item["BlockType"] == "LINE"
    ]

    # write to text file (explicit UTF-8 so non-ASCII text is written the same
    # way regardless of the platform default encoding)
    with open(os.path.join(raw_text_dir, str(doc_index) + '.txt'), 'w', encoding='utf-8') as f:
        f.write('\n'.join(temp_list))

    text_records.append({"label": 1, "text": ' '.join(temp_list)})

# Build the frame once from the collected records: DataFrame.append was removed
# in pandas 2.0 and copied the whole frame on every call. Rebuilding also keeps
# this cell idempotent when it is re-run.
extracted_text = pd.DataFrame(text_records, columns=["label", "text"])

In [53]:
# Persist the extracted text (columns "label" and "text") to a local CSV, without the index column.
extracted_text.to_csv('2020_ModernSlaveryStatements_rawtext_1.csv', index=False)

In [54]:
# Upload Directory
raw_text_folder = "raw_text/"
# Join prefix and folder with an explicit "/": plain concatenation produced
# keys like "2020raw_text/1.txt", while the Comprehend cell reads
# "2020/raw_text/1.txt". An empty prefix uploads straight under "raw_text/".
if prefix:
    s3_save_location = prefix.rstrip("/") + "/" + raw_text_folder
else:
    s3_save_location = raw_text_folder

for root, dirs, files in os.walk(raw_text_folder):
    # Directories whose path contains a "." are ignored
    # (presumably hidden folders such as notebook checkpoints - verify).
    if "." in root:
        continue
    for file in files:
        if file.endswith(".txt"):
            s3_client.upload_file(os.path.join(root, file), s3_bucket_name, s3_save_location + file)


## 2. Text Analysis with Comprehend
### (Entities, Key Phrases, Sentiment, Event Detection)
-----

In [None]:
# Parameters
# uuid is needed for job_uuid below and is not imported in any of the cells
# above, so this cell raised NameError on a fresh kernel.
import uuid

# Key of the text file (uploaded by the previous section) to analyse.
filename = "2020/raw_text/1.txt"


input_data_s3_path = f's3://{s3_bucket_name}/' + filename
input_data_format = 'ONE_DOC_PER_LINE'
output_data_s3_path = f's3://{s3_bucket_name}/comprehend-test/'
# A fresh UUID per run keeps the job name unique.
job_uuid = uuid.uuid1()
job_name = f"entities-job-{job_uuid}"

In [34]:
# Begin the inference job: describe where the input lives and where the
# results should be written, then submit the entities detection job.
input_config = {
    'S3Uri': input_data_s3_path,
    'InputFormat': input_data_format,
}
output_config = {'S3Uri': output_data_s3_path}

response = comprehend.start_entities_detection_job(
    InputDataConfig=input_config,
    OutputDataConfig=output_config,
    DataAccessRoleArn=role,
    JobName=job_name,
    LanguageCode='en',
    # TargetEventTypes=event_types
)

# Get the job ID (used by the polling cell to track this job)
events_job_id = response['JobId']

In [36]:
# Get current job status
job = comprehend.describe_entities_detection_job(JobId=events_job_id)

# Loop until job is completed
waited = 0
timeout_minutes = 30
poll_interval_seconds = 10
# Statuses after which the job will not change any more.
terminal_statuses = ("COMPLETED", "FAILED", "STOPPED")

# The original condition had an unbalanced "(" and did not parse.
while job['EntitiesDetectionJobProperties']['JobStatus'] not in terminal_statuses:
    print(job['EntitiesDetectionJobProperties']['JobStatus'])
    time.sleep(poll_interval_seconds)
    waited += poll_interval_seconds
    # Explicit raise instead of assert so the timeout still fires when
    # assertions are disabled.
    if waited // 60 >= timeout_minutes:
        raise TimeoutError("Job timed out after %d seconds." % waited)
    job = comprehend.describe_entities_detection_job(JobId=events_job_id)

# Surface unsuccessful jobs here rather than letting the download cell fail
# on a missing output file.
final_status = job['EntitiesDetectionJobProperties']['JobStatus']
if final_status != "COMPLETED":
    raise RuntimeError(
        "Entities detection job ended with status %s: %s"
        % (final_status, job['EntitiesDetectionJobProperties'].get('Message', 'no message returned'))
    )

In [48]:
# The output filename is the input filename + ".out"
def split_s3_path(s3_path):
    """
    Split an S3 path into its bucket and key parts.

    Every "s3://" occurrence is removed first; the text before the first "/"
    is the bucket and everything after it is the key (empty when there is no "/").

    :param s3_path: path such as "s3://bucket/some/key"
    :return: tuple of (bucket, key)
    """
    without_scheme = s3_path.replace("s3://", "")
    bucket, _, key = without_scheme.partition("/")
    return bucket, key

# S3 URI of the job's output archive, as reported by the finished job.
# (output_data_s3_file was printed without ever being assigned, which raised NameError.)
output_data_s3_file = job['EntitiesDetectionJobProperties']['OutputDataConfig']['S3Uri']
bucket, key = split_s3_path(output_data_s3_file)
print(output_data_s3_file)
# download file from s3, using the bucket parsed from the URI so bucket and
# key always come from the same source
s3_client.download_file(bucket, key, 'output.tar.gz')

s3://modern-slavery-act-poc-124234234/comprehend-test/704631844570-NER-563ae2ef711b482ba65756f8f464700d/output/output.tar.gz


In [49]:
# importing the "tarfile" module
import tarfile

# Open the archive in a context manager so it is closed even if extraction raises.
# NOTE(review): extractall() writes whatever member paths the archive contains;
# only use it on archives from a trusted source such as this job's own output.
with tarfile.open('output.tar.gz') as archive:
    # extracting file
    archive.extractall('./extract')

## 3. Model Training
-----

What output do you want from the model?
- Classification? I.e. a yes/no answer on whether a document meets a given criterion
    - Would need a large sample of submitted statements that did not meet the requirements
- Confidence levels for each criterion
- Identification of which paragraphs are relevant to each criterion? This could potentially speed up processing

---


In [None]:
# For example we will train a text classification model


## 4. Model Hosting
-----