In [46]:
import boto3
import json
from time import sleep

## S3 Targets

In [8]:
# Open a resource-level S3 handle and bind the project bucket; the cells
# below list keys from this bucket under the "Source_Files" prefix.
s3 = boto3.resource('s3')
bucket = s3.Bucket('uwm-textract-910')

In [9]:
# Collect the key of every object under the Source_Files prefix whose
# name ends in '.pdf'; these are the documents handed to Textract.
target_files = [
    s3_entry.key
    for s3_entry in bucket.objects.filter(Prefix="Source_Files")
    if s3_entry.key.endswith('.pdf')
]

In [10]:
# Sanity check: number of PDF keys collected above.
len(target_files)

60

## Textract

In [12]:
# Low-level Textract client; every detection call in this notebook goes through it.
textract = boto3.client('textract')

In [16]:
# Start an asynchronous text-detection job for the first PDF found in the
# bucket. The reply carries the 'JobId' needed to fetch the results later.
response = textract.start_document_text_detection(
    DocumentLocation=dict(
        S3Object=dict(Bucket='uwm-textract-910', Name=target_files[0])
    )
)
response

{'JobId': 'fc2bce1b38bfdd711f42301fdd5b0b467ea00981e2461d7aad17e29e98a55355',
 'ResponseMetadata': {'RequestId': '7460d57b-0fa6-46b6-8601-2305e7b92fe9',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '7460d57b-0fa6-46b6-8601-2305e7b92fe9',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '76',
   'date': 'Fri, 03 Nov 2023 20:23:14 GMT'},
  'RetryAttempts': 0}}

In [21]:
# Fetch the results of the job started above. No status check is done here:
# the reply reports the job's current 'JobStatus', and the batch loop at the
# end of this notebook waits for 'SUCCEEDED' before using the blocks.
response2 = textract.get_document_text_detection(
    JobId=response['JobId']
)
response2

{'DocumentMetadata': {'Pages': 1},
 'JobStatus': 'SUCCEEDED',
 'Blocks': [{'BlockType': 'PAGE',
   'Geometry': {'BoundingBox': {'Width': 1.0,
     'Height': 1.0,
     'Left': 0.0,
     'Top': 0.0},
    'Polygon': [{'X': 0.0, 'Y': 8.169882903530379e-08},
     {'X': 1.0, 'Y': 0.0},
     {'X': 1.0, 'Y': 1.0},
     {'X': 9.408725532011886e-08, 'Y': 1.0}]},
   'Id': '58a82fd6-9c8e-4d14-85f2-ce5d20bca184',
   'Relationships': [{'Type': 'CHILD',
     'Ids': ['a18764fc-5ba7-43df-8450-4f801140604e',
      '892374c9-dddb-4012-812d-8217f3bc5e45',
      '5aeeb70c-302e-4b80-8880-9118a4ed862e',
      'a6096415-5379-4a20-b596-ea4a3f6de87d',
      '7130c73c-7ad2-4958-8a0f-5431300e3cb1',
      'bfe95b3c-136c-47bc-8dc5-4b18aa2bd1e8',
      '26d0f498-6c9d-4581-b60a-b8c72d47be2a',
      '87445e80-3ac4-4ed8-93da-eeb5fa17bbaa',
      '618573a8-cb85-4bb9-b62d-cedcc30a508b',
      '2c9ff53e-7c29-4e49-86fb-18c9e7b2d270',
      'f2e6f02b-2076-4c98-b4df-bb0154f03be4',
      'e9c48dab-6a79-4fdc-acf9-ad6c1c789a02'

In [30]:
# Reduce the raw Textract response to its LINE blocks, keeping only the
# block type, confidence, text, bounding box and id of each line.
# NOTE: the key 'Confidnece' is misspelled but is part of the emitted
# structure, so it is reproduced as-is.
json_objs = {
    'Page': 1,
    'Blocks': [
        {
            'BlockType': block['BlockType'],
            'Confidnece': block['Confidence'],
            'Text': block['Text'],
            'BoundingBox': {
                side: block['Geometry']['BoundingBox'][side]
                for side in ('Width', 'Height', 'Left', 'Top')
            },
            'Id': block['Id'],
        }
        for block in response2['Blocks']
        if block['BlockType'] == 'LINE'
    ],
}

In [31]:
# Inspect the reduced structure before serializing it.
json_objs

{'Page': 1,
 'Blocks': [{'BlockType': 'LINE',
   'Confidnece': 99.89444732666016,
   'Text': '16',
   'BoundingBox': {'Width': 0.01270796824246645,
    'Height': 0.00822802446782589,
    'Left': 0.14141608774662018,
    'Top': 0.0827944353222847},
   'Id': 'a18764fc-5ba7-43df-8450-4f801140604e'},
  {'BlockType': 'LINE',
   'Confidnece': 83.62879180908203,
   'Text': 'Knowl. Org. (2006)M',
   'BoundingBox': {'Width': 0.13195845484733582,
    'Height': 0.01113848015666008,
    'Left': 0.7039221525192261,
    'Top': 0.0818491131067276},
   'Id': '892374c9-dddb-4012-812d-8217f3bc5e45'},
  {'BlockType': 'LINE',
   'Confidnece': 99.06912994384766,
   'Text': 'Ingetraut Dahlberg. Knowledge Organization: A New Science?',
   'BoundingBox': {'Width': 0.3566962480545044,
    'Height': 0.011461430229246616,
    'Left': 0.5014880299568176,
    'Top': 0.09442509710788727},
   'Id': '5aeeb70c-302e-4b80-8880-9118a4ed862e'},
  {'BlockType': 'LINE',
   'Confidnece': 99.8537826538086,
   'Text': 'stateme

## Upload to S3

In [33]:
# Serialize the reduced structure to JSON and encode it as UTF-8 bytes;
# the upload cell below passes these bytes as the S3 object body.
json_binary = json.dumps(json_objs).encode('utf-8')

In [41]:
# Preview the destination key: 'Textract_Output/' + the source file name up
# to its first '.' + '.json'.
'Textract_Output/'+target_files[0].split('/')[-1].split('.')[0]+'.json'

'Textract_Output/Sample 1.json'

In [42]:
# Method 1: Object.put()
# Upload the serialized JSON under Textract_Output/, named after the first
# source PDF (file name up to its first '.', with a .json extension).
s3 = boto3.resource('s3')
# Named `s3_object` rather than `object` so the builtin `object` is not
# shadowed for the rest of the kernel session.
s3_object = s3.Object('uwm-textract-910', 'Textract_Output/'+target_files[0].split('/')[-1].split('.')[0]+'.json')
s3_object.put(Body=json_binary)

{'ResponseMetadata': {'RequestId': 'CQ9N0A8B6Z12ER7M',
  'HostId': 'Ur/OU8/Bk3eXs/aAIZdV+20Dj2GpcQJRXs1a/i73XomgUPkJunlE7t8Of55TSvoEbbuSNu7LEGA=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'Ur/OU8/Bk3eXs/aAIZdV+20Dj2GpcQJRXs1a/i73XomgUPkJunlE7t8Of55TSvoEbbuSNu7LEGA=',
   'x-amz-request-id': 'CQ9N0A8B6Z12ER7M',
   'date': 'Fri, 03 Nov 2023 20:51:34 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"998c3ea32361059cea6f17d850d668a3"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"998c3ea32361059cea6f17d850d668a3"',
 'ServerSideEncryption': 'AES256'}

## Full Program

In [61]:
def response_gather(JobId, NextToken):
    """Fetch one page of results for a Textract text-detection job.

    Parameters
    ----------
    JobId : str
        Identifier returned by ``start_document_text_detection``.
    NextToken : str or None
        Pagination token from a previous page of results, or ``None`` to
        request the first page (the token is then omitted from the call).

    Returns
    -------
    dict
        The raw ``get_document_text_detection`` response.
    """
    request_args = {'JobId': JobId}
    if NextToken is not None:
        request_args['NextToken'] = NextToken

    return textract.get_document_text_detection(**request_args)

In [54]:
def format_json(response):
    """Reduce one Textract text-detection response to its LINE blocks.

    Parameters
    ----------
    response : dict
        A response shaped like the output of ``get_document_text_detection``.
        Only its ``'Blocks'`` list is read; a response without that key is
        treated as having no blocks.

    Returns
    -------
    dict
        ``{'Page': 1, 'Blocks': [...]}`` where each entry holds the block's
        type, confidence, text, bounding box (Width/Height/Left/Top) and id.
        ``'Page'`` is always the constant 1; it is not derived from the
        response.
    """
    json_objs={
        'Page': 1,
        'Blocks': []
    }
    # Read from the `response` argument. Iterating the notebook-level
    # `response2` instead would ignore the caller's input and raise a
    # NameError on a fresh kernel.
    for obj in response.get('Blocks', []):
        if obj['BlockType'] != 'LINE':
            continue
        bounding_box = obj['Geometry']['BoundingBox']
        json_objs['Blocks'].append({
            'BlockType': obj['BlockType'],
            # NOTE: 'Confidnece' is misspelled, but it is the key already
            # emitted by this notebook, so it is kept for compatibility.
            'Confidnece': obj['Confidence'],
            'Text': obj['Text'],
            'BoundingBox': {
                'Width': bounding_box['Width'],
                'Height': bounding_box['Height'],
                'Left': bounding_box['Left'],
                'Top': bounding_box['Top']
            },
            'Id': obj['Id']
        })

    return json_objs

In [63]:
# Run Textract text detection on every PDF in target_files and store the
# extracted LINE blocks as one JSON document per source file in S3.
s3 = boto3.resource('s3')  # one resource handle serves every upload
total_files = len(target_files)

for idx, file in enumerate(target_files):
    print(f'Working on file {idx+1} of {total_files}')

    #set up
    source = file
    print(source)
    # NOTE: split('.')[0] keeps only the part of the file name before its
    # first '.', so 'a.b.pdf' is written as 'a.json'.
    dest_file = 'Textract_Output/'+file.split('/')[-1].split('.')[0]+'.json'

    #start text detection
    response = textract.start_document_text_detection(
        DocumentLocation={
            'S3Object': {
                'Bucket': 'uwm-textract-910',
                'Name': source
            }
        }
    )

    #gather results
    JobId = response['JobId']
    NextToken = None
    final_obj = None

    while True:
        # Poll until the requested page of results is ready. Any status other
        # than IN_PROGRESS / SUCCEEDED is terminal, so fail loudly instead of
        # polling forever.
        # The name `response2` is kept on purpose: other cells in this
        # notebook refer to it at module level.
        while True:
            response2 = response_gather(JobId, NextToken)
            job_status = response2['JobStatus']

            if job_status == 'SUCCEEDED':
                print(job_status)
                break
            if job_status != 'IN_PROGRESS':
                raise RuntimeError(
                    f"Textract job {JobId} for {source} ended with status "
                    f"{job_status}: {response2.get('StatusMessage')}"
                )
            sleep(2)

        formatted = format_json(response2)

        # The first page seeds the result; later pages only add their blocks.
        if final_obj is None:
            final_obj = formatted
        else:
            final_obj['Blocks'].extend(formatted['Blocks'])

        # A NextToken in the reply means more pages of results remain.
        NextToken = response2.get('NextToken')
        if NextToken is None:
            break
        print(f'NextToken Detected {JobId}')

    #write to S3
    json_binary = json.dumps(final_obj).encode('utf-8')
    # Named `s3_object` rather than `object` so the builtin is not shadowed.
    s3_object = s3.Object('uwm-textract-910', dest_file)
    s3_object.put(Body=json_binary)

Working on file 1 of 60
Source_Files/Sample 1.pdf
SUCCEEDED
Working on file 2 of 60
Source_Files/Sample 10.pdf
SUCCEEDED
Working on file 3 of 60
Source_Files/Sample 11.pdf
SUCCEEDED
Working on file 4 of 60
Source_Files/Sample 12.pdf
SUCCEEDED
Working on file 5 of 60
Source_Files/Sample 13.pdf
SUCCEEDED
Working on file 6 of 60
Source_Files/Sample 14.pdf
SUCCEEDED
NextToken Detected aba3d8a9edf93eb8eebd6ad90c60eb0ee6dfd0384221fc5157edacadcc67dc30
SUCCEEDED
Working on file 7 of 60
Source_Files/Sample 15.pdf
SUCCEEDED
NextToken Detected 3a62e4415e4eb99735bb98140efabf556b37cc6303609e5d9bd074afb416fa22
SUCCEEDED
Working on file 8 of 60
Source_Files/Sample 16.pdf
SUCCEEDED
Working on file 9 of 60
Source_Files/Sample 17.pdf
SUCCEEDED
Working on file 10 of 60
Source_Files/Sample 18.pdf
SUCCEEDED
Working on file 11 of 60
Source_Files/Sample 19.pdf
SUCCEEDED
Working on file 12 of 60
Source_Files/Sample 2.pdf
SUCCEEDED
NextToken Detected d0b9ae0b34a168efd3a6423db0b7907166a1472b87a18027fb8cadd58262