# This notebook provides an example of processing a file with the WIPRO IEF Controller extraction model, which extracts a company's controllers from an annual report

Make sure you have the necessary permissions in your IAM role to execute the notebook code below.

In [None]:
import base64
import io
import json

import boto3
import botocore.exceptions
import sagemaker
from sagemaker import get_execution_role

# Look up the execution role for this notebook session via the SageMaker SDK helper.
# NOTE(review): `role` is not referenced by any of the cells visible below — confirm whether it is still needed.
role = get_execution_role()

### Create a base64-encoded version of the file to be used as input for the transform job.
This allows users to process files larger than 5 MB, which is the file size limit in live hosting services.

In [None]:
# Download the PDF from S3 and base64-encode it so it can be embedded in the
# JSON input of the transform job.

# S3 bucket and key of the PDF file to be used for extraction
bucket = 'bucket.temp'
file_path = 'temp/sample.pdf'

data_location = 's3://{}/{}'.format(bucket, file_path)
print('File {} will be downloaded and converted to base64 format'.format(data_location))

s3 = boto3.resource('s3')
try:
    obj = s3.Object(bucket, file_path)
    # Read the whole object into memory, then encode the bytes as base64 and
    # decode to text so the result is JSON-serializable.
    b64_data = base64.b64encode(obj.get()['Body'].read()).decode('utf-8')
except botocore.exceptions.ClientError as e:
    # GetObject reports a missing key with the error code "NoSuchKey"; "404"
    # is still accepted so the original check keeps working.
    if e.response['Error']['Code'] in ("404", "NoSuchKey"):
        # NOTE: b64_data is not assigned on this path, so the cell that builds
        # the JSON input will fail until bucket/file_path point at a real file.
        print("The file does not exist.")
    else:
        raise

### Create a JSON file with the required parameters and upload it to the S3 bucket for use in the transform job


In [None]:
# Destination bucket and key of the JSON input document for the transform job
bucket = 'bucket.temp'
file_path = 'input/inpt.json'

# Request parameters for the model, assembled in one literal:
#   geo       - either CAN or AUS
#   org       - organisation name
#   threshold - threshold value for the confidence score (passed as a string)
#   file      - base64-encoded file data produced by the download cell
inpt_data = {
    'geo': 'AUS',
    'org': 'XYZ corp',
    'threshold': '0.50',
    'file': b64_data,
}

# Serialize the parameters to JSON and write them to the transform job's input path
obj = s3.Object(bucket, file_path)
obj.put(Body=json.dumps(inpt_data))

### Create and run a transform job

In [None]:
# IEF batch transform: read the JSON input from S3, run the subscribed model,
# and write the results under the output prefix.
bucket = 'bucket.temp'
input_key = 'bucket.temp/input/inpt.json'
input_location = 's3://' + input_key

# Prefix under which the transform job writes its results
output_location = 's3://' + 'bucket.temp/output'

s3_client = boto3.client('s3')
# Disabled alternative that uploads a local copy of the input instead:
# s3_client.upload_file('inpt.json', bucket, input_key)
s3 = boto3.resource('s3')

# Configure the batch transformer; model_name must be replaced with the model
# created from your product subscription.
transformer = sagemaker.transformer.Transformer(
    model_name='model_name for your product subscription',
    instance_count=1,
    instance_type='ml.m5.2xlarge',
    output_path=output_location,
    base_transform_job_name='Batch-Transform',
)

# Start the transform job on the JSON input document
transformer.transform(
    input_location,
    content_type='application/json',
    split_type='None',
)

# Block until the transform job has finished
transformer.wait()

print("Output path {}".format(output_location))