In [1]:
import datetime
import boto3
import botocore
import pandas as pd
import numpy as np
import json
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN, Birch
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
import hdbscan

import warnings
warnings.filterwarnings("ignore")

In [2]:
############################
#     AWS Functions        #
############################

def boto3_session(flavor):
  """
  boto3_session instantiates an AWS session. 
  
  :flavor: indicates if the session should be created at the client or the resouce level
  :return: session
  """

  if flavor == 'client':
    return(boto3.client('s3', 'us-east-2'))
  else:
    return(boto3.resource('s3', 'us-east-2'))


def write_to_s3(sesh, key, obj):
  """
  write_to_s3 will write textract formatted results to the artifact bucket using the provided key.
  
  :sesh: S3 session
  :key: the prefix and key for the object in S3
  """
  
  sesh.Object('uwm-textract-910', key).put(Body=bytes(json.dumps(obj).encode('UTF-8')))
  return

def read_s3_subfiles_src(s3_sesh, prefix):
  """
  read_s3_subfiles gathers all of the object keys from a specific directory in S3.
  
  :session: a boto3 resouce session
  :folder: the directory for inspection
  :return: an array of object keys
  """
  bucket_src = s3_sesh.Bucket('uwm-textract-910')
  files = []
  

  for object_summary in bucket_src.objects.filter(Prefix=prefix):
    if object_summary.key.endswith('.json'):
        files.append(object_summary.key)

    #remove for live automation
    # if len(files) == 1000:
    #     break
  
  return files

def retrieve_s3_subfile (s3_sesh, file):
  
  con_obj = s3_sesh.Object('uwm-textract-910', file)
  file_content = con_obj.get()['Body'].read().decode('utf-8')
  json_content = json.loads(file_content)
  
  return json_content

############################
#       METADATA           #
############################

def append_height_diff_above(json_page):
    for i in range(len(json_page['Blocks']) - 1):
        previous_line = i
        current_line = i + 1
            
        previous_font_size = json_page['Blocks'][previous_line]['BoundingBox']['Height']
        current_t = json_page['Blocks'][current_line]['BoundingBox']['Top']
        previous_t = json_page['Blocks'][previous_line]['BoundingBox']['Top']
        height_diff = current_t - (previous_t + previous_font_size)
        # ive tested with absolute value but do we want the possibility of negative differences?
        json_page['Blocks'][current_line]['height_diff_above'] = abs(height_diff)

    json_page['Blocks'][0]['height_diff_above'] = 0
    return json_page
  
  
def append_height_diff_below(json_page):
    for i in range(len(json_page['Blocks']) - 1):
        previous_line = i
        current_line = i + 1
            
        previous_font_size = json_page['Blocks'][previous_line]['BoundingBox']['Height']
        current_t = json_page['Blocks'][current_line]['BoundingBox']['Top']
        previous_t = json_page['Blocks'][previous_line]['BoundingBox']['Top']
        height_diff = current_t - (previous_t + previous_font_size)
        # ive tested with absolute value but do we want the possibility of negative differences?
        json_page['Blocks'][previous_line]['height_diff_below'] = abs(height_diff)

    json_page['Blocks'][-1]['height_diff_below'] = 0
    return json_page

  
def append_paragraph_number(json_page):
    blocks = json_page['Blocks']
    precision = 2
    p_count = 0
    for i in range(len(json_page['Blocks'])):
        below = round(json_page['Blocks'][i]['height_diff_below'],precision)
        above = round(json_page['Blocks'][i]['height_diff_above'],precision)
        if above > below:
            p_count +=1
        json_page['Blocks'][i]['paragraph'] = p_count
    return json_page

  
def append_paragraph_number_std(json_page):
    blocks = json_page['Blocks']
    data = pd.json_normalize(blocks)
    p_counter = 0 
    p_counts = []
    # remove first observation from calculation since it
    # has a zero in it since first line doesnt have a height above
    height_diff_std = data.height_diff_above.iloc[1::].std()
    height_diff_mean = data.height_diff_above.iloc[1::].mean()
    json_page['height_diff_mean'] = height_diff_mean
    json_page['height_diff_std'] = height_diff_std
    
    for i in range(len(json_page['Blocks'])):
        if json_page['Blocks'][i]['height_diff_above'] > (height_diff_mean+height_diff_std):
            p_counter+=1
        json_page['Blocks'][i]['paragraph_std'] = p_counter
    return json_page
      
      
def append_scaled(json_page):
    blocks = json_page['Blocks']
    data = pd.json_normalize(blocks)
    feats = ['BoundingBox.Top', 'BoundingBox.Left', 'paragraph_std', 'paragraph']
    X = data[feats]
    
    scaler = MinMaxScaler()
    X_scl = scaler.fit_transform(X)
    dim1 = X_scl.shape[0]
    dim2 = X_scl.shape[-1]
    for i in range(dim1):
        for j in range(dim2):
            if j==0:
                json_page['Blocks'][i]['top_scl'] = X_scl[i][j]
            if j == 1:
                json_page['Blocks'][i]['left_scl'] = X_scl[i][j]
            elif j == 2:
                json_page['Blocks'][i]['paragraph_std_scl'] = X_scl[i][j]
            else:
                json_page['Blocks'][i]['paragraph_scl'] = X_scl[i][j]
    return json_page
        

def append_cluster(json_page, pca=True):
    blocks = json_page['Blocks']
    feats = ['top_scl', 'left_scl', 'paragraph_std_scl']
    X_scl = pd.json_normalize(blocks)[feats]

    if pca:
        pca = PCA(n_components=2)
        pca_out = pca.fit_transform(X_scl)

        clusterer = hdbscan.HDBSCAN(min_cluster_size=2)
        clusterer.fit(pca_out)
        hdbscan_labels = clusterer.labels_
        cluster_probs = clusterer.probabilities_
    else:
        clusterer = hdbscan.HDBSCAN(min_cluster_size=2)
        clusterer.fit(pca_out)
        hdbscan_labels = clusterer.labels_
        cluster_probs = clusterer.probabilities_


    for i in range(len(json_page['Blocks'])):
        json_page['Blocks'][i]['text_cluster'] = int(hdbscan_labels[i])
        json_page['Blocks'][i]['cluster_probability'] = float(cluster_probs[i])

    return json_page


In [7]:
############################
#       MAIN               #
############################

def main():
    file_arr = []

    s3_sesh = boto3_session('resource')
    target = f'Textract_Output'
  
    #for target in targets
    #gather the files paths from S3
    files = read_s3_subfiles_src(s3_sesh, target)
    files = [files[0]]
    
    #gather the file contents 
    for idx, file in enumerate(files):

      page = retrieve_s3_subfile(s3_sesh, file)
      file_name = file.split('/')[-1].split('.')[0]+'.pdf'
      dest_name = file.split('/')[-1]
      dest = f'T2_Model/{dest_name}'

      #calculate quadrant borders
      df = pd.DataFrame(page)
      df = df.join(pd.json_normalize(df["Blocks"].tolist()).add_prefix("Blocks.")).drop(["Blocks"], axis=1)
      
      #vertical
      top = df['Blocks.BoundingBox.Top'].min()
      df['Blocks.BoundingBox.Bottom'] = df['Blocks.BoundingBox.Top'] + df['Blocks.BoundingBox.Height']
      bottom = df['Blocks.BoundingBox.Bottom'].max()
      v_midpoint = (bottom-top)/2

      #horizontal
      left = df['Blocks.BoundingBox.Left'].min()
      df['Blocks.BoundingBox.Right'] = df['Blocks.BoundingBox.Left'] + df['Blocks.BoundingBox.Width']
      right = df['Blocks.BoundingBox.Right'].max()
      h_midpoint = (right-left)/2

      #assign quadrants
      for idx, block in enumerate(page['Blocks']):
        if block['BoundingBox']['Top'] < v_midpoint and block['BoundingBox']['Left'] < h_midpoint:
            block['Quadrant'] = 'Q1'
        elif block['BoundingBox']['Top'] < v_midpoint and block['BoundingBox']['Left'] > h_midpoint:
            block['Quadrant'] = 'Q2'
        elif block['BoundingBox']['Top'] > v_midpoint and block['BoundingBox']['Left'] < h_midpoint:
            block['Quadrant'] = 'Q3'
        elif block['BoundingBox']['Top'] > v_midpoint and block['BoundingBox']['Left'] > h_midpoint:
            block['Quadrant'] = 'Q4'

      #parition quadrants
      core = page['Blocks']
      core_q1 = {'Blocks':[x for x in core if x['Quadrant'] in ['Q1']]}
      core_q2 = {'Blocks':[x for x in core if x['Quadrant'] in ['Q2']]}
      core_q3 = {'Blocks':[x for x in core if x['Quadrant'] in ['Q3']]}
      core_q4 = {'Blocks':[x for x in core if x['Quadrant'] in ['Q4']]}

      core_full = [core_q1, core_q2, core_q3, core_q4]

      
      final_blocks = []
      for quad in core_full:
        if len(quad['Blocks']) > 1:
            height_diff_above_json = append_height_diff_above(quad)
            height_diff_below_json = append_height_diff_below(height_diff_above_json)
            paragraph_number_json = append_paragraph_number(height_diff_below_json)
            pg_num_std_json = append_paragraph_number_std(paragraph_number_json)
            scaled_json = append_scaled(pg_num_std_json)
            clustered_json = append_cluster(scaled_json)
            final_blocks.extend(clustered_json['Blocks'])
    


      final_json = {
        'Page': 1,
        'Blocks': final_blocks,
        'file_name': file_name
      }
      
      write_to_s3(s3_sesh, dest, final_json)
      file_arr.append(dest)

    return final_json

In [8]:
file_arr = main()
file_arr

{'Page': 1,
 'Blocks': [{'BlockType': 'LINE',
   'Confidnece': 99.89444732666016,
   'Text': '16',
   'BoundingBox': {'Width': 0.01270796824246645,
    'Height': 0.00822802446782589,
    'Left': 0.14141608774662018,
    'Top': 0.0827944353222847},
   'Id': '371f4a21-2453-4350-878b-30087aca9773',
   'Quadrant': 'Q1',
   'height_diff_above': 0,
   'height_diff_below': 0.048604803159832954,
   'paragraph': 0,
   'paragraph_std': 0,
   'top_scl': 0.0,
   'paragraph_scl': 0.0,
   'left_scl': 0.06640953197713628,
   'paragraph_std_scl': 0.0,
   'text_cluster': -1,
   'cluster_probability': 0.0},
  {'BlockType': 'LINE',
   'Confidnece': 99.8537826538086,
   'Text': 'statement to the subject field of knowledge organiza-',
   'BoundingBox': {'Width': 0.33351844549179077,
    'Height': 0.012791251763701439,
    'Left': 0.14064253866672516,
    'Top': 0.13962726294994354},
   'Id': 'bfe1faf7-b5c9-461e-bbae-7e3ca74dd8da',
   'Quadrant': 'Q1',
   'height_diff_above': 0.048604803159832954,
   'heigh

## Retrieve File Results

In [5]:
s3 = boto3_session('resouce')
res_files = read_s3_subfiles_src(s3, 'T2_Model')
res_files

['T2_Model/Sample 1.json',
 'T2_Model/Sample 10.json',
 'T2_Model/Sample 11.json',
 'T2_Model/Sample 12.json',
 'T2_Model/Sample 13.json',
 'T2_Model/Sample 14.json',
 'T2_Model/Sample 15.json',
 'T2_Model/Sample 16.json',
 'T2_Model/Sample 17.json',
 'T2_Model/Sample 18.json',
 'T2_Model/Sample 19.json',
 'T2_Model/Sample 2.json',
 'T2_Model/Sample 20.json',
 'T2_Model/Sample 21.json',
 'T2_Model/Sample 22.json',
 'T2_Model/Sample 23.json',
 'T2_Model/Sample 24.json',
 'T2_Model/Sample 25.json',
 'T2_Model/Sample 26.json',
 'T2_Model/Sample 27.json',
 'T2_Model/Sample 28.json',
 'T2_Model/Sample 29.json',
 'T2_Model/Sample 3.json',
 'T2_Model/Sample 30.json',
 'T2_Model/Sample 31.json',
 'T2_Model/Sample 32.json',
 'T2_Model/Sample 33.json',
 'T2_Model/Sample 34.json',
 'T2_Model/Sample 35.json',
 'T2_Model/Sample 36.json',
 'T2_Model/Sample 37.json',
 'T2_Model/Sample 38.json',
 'T2_Model/Sample 39.json',
 'T2_Model/Sample 4.json',
 'T2_Model/Sample 40.json',
 'T2_Model/Sample 41.jso

In [6]:
for file in res_files: 
    file_name = file.split('/')[-1]
    j_obj = retrieve_s3_subfile(s3, file)

    # Serializing json
    json_object = json.dumps(j_obj, indent=4)
    
    # Writing to sample.json
    with open(f'Export Data/T2 Model Results/{file_name}', "w") as outfile:
        outfile.write(json_object)