In [1]:
import datetime
import boto3
import botocore
import pandas as pd
import numpy as np
import json
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN, Birch
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
import hdbscan

import warnings
warnings.filterwarnings("ignore")

In [2]:
############################
#     AWS Functions        #
############################

def boto3_session(flavor):
  """
  boto3_session instantiates an AWS session. 
  
  :flavor: indicates if the session should be created at the client or the resouce level
  :return: session
  """

  if flavor == 'client':
    return(boto3.client('s3', 'us-east-2'))
  else:
    return(boto3.resource('s3', 'us-east-2'))


def write_to_s3(sesh, key, obj):
  """
  write_to_s3 will write textract formatted results to the artifact bucket using the provided key.
  
  :sesh: S3 session
  :key: the prefix and key for the object in S3
  """
  
  sesh.Object('uwm-textract-910', key).put(Body=bytes(json.dumps(obj).encode('UTF-8')))
  return

def read_s3_subfiles_src(s3_sesh, prefix):
  """
  read_s3_subfiles gathers all of the object keys from a specific directory in S3.
  
  :session: a boto3 resouce session
  :folder: the directory for inspection
  :return: an array of object keys
  """
  bucket_src = s3_sesh.Bucket('uwm-textract-910')
  files = []
  

  for object_summary in bucket_src.objects.filter(Prefix=prefix):
    if object_summary.key.endswith('.json'):
        files.append(object_summary.key)

    #remove for live automation
    # if len(files) == 1000:
    #     break
  
  return files

def retrieve_s3_subfile (s3_sesh, file):
  
  con_obj = s3_sesh.Object('uwm-textract-910', file)
  file_content = con_obj.get()['Body'].read().decode('utf-8')
  json_content = json.loads(file_content)
  
  return json_content

############################
#       METADATA           #
############################

def append_height_diff_above(json_page):
    for i in range(len(json_page['Blocks']) - 1):
        previous_line = i
        current_line = i + 1
            
        previous_font_size = json_page['Blocks'][previous_line]['BoundingBox']['Height']
        current_t = json_page['Blocks'][current_line]['BoundingBox']['Top']
        previous_t = json_page['Blocks'][previous_line]['BoundingBox']['Top']
        height_diff = current_t - (previous_t + previous_font_size)
        # ive tested with absolute value but do we want the possibility of negative differences?
        json_page['Blocks'][current_line]['height_diff_above'] = abs(height_diff)

    json_page['Blocks'][0]['height_diff_above'] = 0
    return json_page
  
  
def append_height_diff_below(json_page):
    for i in range(len(json_page['Blocks']) - 1):
        previous_line = i
        current_line = i + 1
            
        previous_font_size = json_page['Blocks'][previous_line]['BoundingBox']['Height']
        current_t = json_page['Blocks'][current_line]['BoundingBox']['Top']
        previous_t = json_page['Blocks'][previous_line]['BoundingBox']['Top']
        height_diff = current_t - (previous_t + previous_font_size)
        # ive tested with absolute value but do we want the possibility of negative differences?
        json_page['Blocks'][previous_line]['height_diff_below'] = abs(height_diff)

    json_page['Blocks'][-1]['height_diff_below'] = 0
    return json_page

  
def append_paragraph_number(json_page):
    blocks = json_page['Blocks']
    precision = 2
    p_count = 0
    for i in range(len(json_page['Blocks'])):
        below = round(json_page['Blocks'][i]['height_diff_below'],precision)
        above = round(json_page['Blocks'][i]['height_diff_above'],precision)
        if above > below:
            p_count +=1
        json_page['Blocks'][i]['paragraph'] = p_count
    return json_page

  
def append_paragraph_number_std(json_page):
    blocks = json_page['Blocks']
    data = pd.json_normalize(blocks)
    p_counter = 0 
    p_counts = []
    # remove first observation from calculation since it
    # has a zero in it since first line doesnt have a height above
    height_diff_std = data.height_diff_above.iloc[1::].std()
    height_diff_mean = data.height_diff_above.iloc[1::].mean()
    json_page['height_diff_mean'] = height_diff_mean
    json_page['height_diff_std'] = height_diff_std
    
    for i in range(len(json_page['Blocks'])):
        if json_page['Blocks'][i]['height_diff_above'] > (height_diff_mean+height_diff_std):
            p_counter+=1
        json_page['Blocks'][i]['paragraph_std'] = p_counter
    return json_page
      
      
def append_scaled(json_page):
    blocks = json_page['Blocks']
    data = pd.json_normalize(blocks)
    feats = ['BoundingBox.Top', 'BoundingBox.Left', 'paragraph_std', 'paragraph']
    X = data[feats]
    
    scaler = MinMaxScaler()
    X_scl = scaler.fit_transform(X)
    dim1 = X_scl.shape[0]
    dim2 = X_scl.shape[-1]
    for i in range(dim1):
        for j in range(dim2):
            if j==0:
                json_page['Blocks'][i]['top_scl'] = X_scl[i][j]
            if j == 1:
                json_page['Blocks'][i]['left_scl'] = X_scl[i][j]
            elif j == 2:
                json_page['Blocks'][i]['paragraph_std_scl'] = X_scl[i][j]
            else:
                json_page['Blocks'][i]['paragraph_scl'] = X_scl[i][j]
    return json_page
        

def append_cluster(json_page, pca=True):
    blocks = json_page['Blocks']
    feats = ['top_scl', 'left_scl', 'paragraph_std_scl']
    X_scl = pd.json_normalize(blocks)[feats]

    if pca:
        pca = PCA(n_components=2)
        pca_out = pca.fit_transform(X_scl)

        clusterer = hdbscan.HDBSCAN(min_cluster_size=2)
        clusterer.fit(pca_out)
        hdbscan_labels = clusterer.labels_
        cluster_probs = clusterer.probabilities_
    else:
        clusterer = hdbscan.HDBSCAN(min_cluster_size=2)
        clusterer.fit(pca_out)
        hdbscan_labels = clusterer.labels_
        cluster_probs = clusterer.probabilities_


    for i in range(len(json_page['Blocks'])):
        json_page['Blocks'][i]['text_cluster'] = int(hdbscan_labels[i])
        json_page['Blocks'][i]['cluster_probability'] = float(cluster_probs[i])

    return json_page


In [4]:
s3_sesh = boto3_session('resource')
target = f'Textract_Output'
files = read_s3_subfiles_src(s3_sesh, target)
print(files)

['Textract_Output/Sample 1.json', 'Textract_Output/Sample 10.json', 'Textract_Output/Sample 11.json', 'Textract_Output/Sample 12.json', 'Textract_Output/Sample 13.json', 'Textract_Output/Sample 14.json', 'Textract_Output/Sample 15.json', 'Textract_Output/Sample 16.json', 'Textract_Output/Sample 17.json', 'Textract_Output/Sample 18.json', 'Textract_Output/Sample 19.json', 'Textract_Output/Sample 2.json', 'Textract_Output/Sample 20.json', 'Textract_Output/Sample 21.json', 'Textract_Output/Sample 22.json', 'Textract_Output/Sample 23.json', 'Textract_Output/Sample 24.json', 'Textract_Output/Sample 25.json', 'Textract_Output/Sample 26.json', 'Textract_Output/Sample 27.json', 'Textract_Output/Sample 28.json', 'Textract_Output/Sample 29.json', 'Textract_Output/Sample 3.json', 'Textract_Output/Sample 30.json', 'Textract_Output/Sample 31.json', 'Textract_Output/Sample 32.json', 'Textract_Output/Sample 33.json', 'Textract_Output/Sample 34.json', 'Textract_Output/Sample 35.json', 'Textract_Output

In [73]:
page = retrieve_s3_subfile(s3_sesh, files[3])
page

{'Page': 1,
 'Blocks': [{'BlockType': 'LINE',
   'Confidnece': 99.91316223144531,
   'Text': '602',
   'BoundingBox': {'Width': 0.02108588255941868,
    'Height': 0.007797531317919493,
    'Left': 0.09483252465724945,
    'Top': 0.06895650923252106},
   'Id': '55217c52-606c-44c5-9e53-a177056dcec3'},
  {'BlockType': 'LINE',
   'Confidnece': 99.25527954101562,
   'Text': 'Knowl. Org. 46(2019)No.8',
   'BoundingBox': {'Width': 0.1567317545413971,
    'Height': 0.011315872892737389,
    'Left': 0.7480316758155823,
    'Top': 0.07095503807067871},
   'Id': '496002de-a71a-462c-a4be-84543ace97f4'},
  {'BlockType': 'LINE',
   'Confidnece': 99.46185302734375,
   'Text': 'V. Broughton. The Respective Roles of Intellectual Creativity and Automation in Representing Diversity',
   'BoundingBox': {'Width': 0.6103330254554749,
    'Height': 0.01169310137629509,
    'Left': 0.29476508498191833,
    'Top': 0.08680137991905212},
   'Id': 'b2a6c83f-390c-4afa-aa3e-aec08f1e6ffd'},
  {'BlockType': 'LINE',
 

In [74]:
df = pd.DataFrame(page)
df = df.join(pd.json_normalize(df["Blocks"].tolist()).add_prefix("Blocks.")).drop(["Blocks"], axis=1)
df.head(3)

Unnamed: 0,Page,Blocks.BlockType,Blocks.Confidnece,Blocks.Text,Blocks.Id,Blocks.BoundingBox.Width,Blocks.BoundingBox.Height,Blocks.BoundingBox.Left,Blocks.BoundingBox.Top
0,1,LINE,99.913162,602,55217c52-606c-44c5-9e53-a177056dcec3,0.021086,0.007798,0.094833,0.068957
1,1,LINE,99.25528,Knowl. Org. 46(2019)No.8,496002de-a71a-462c-a4be-84543ace97f4,0.156732,0.011316,0.748032,0.070955
2,1,LINE,99.461853,V. Broughton. The Respective Roles of Intellec...,b2a6c83f-390c-4afa-aa3e-aec08f1e6ffd,0.610333,0.011693,0.294765,0.086801


In [75]:
#top of the document = min of bounding box top
top = df['Blocks.BoundingBox.Top'].min()
top


0.06895650923252106

In [76]:
#bottom of the document = max of bounding box top + height
df['Blocks.BoundingBox.Bottom'] = df['Blocks.BoundingBox.Top'] + df['Blocks.BoundingBox.Height']
bottom = df['Blocks.BoundingBox.Bottom'].max()
bottom

0.9268646026030183

In [23]:
#midpoint vertical
v_midpoint = (bottom-top)/2
v_midpoint

0.40344604663550854

In [77]:
#left of the document = min of bounding box left
left = df['Blocks.BoundingBox.Left'].min()
left

0.0942993089556694

In [78]:
# right of the document = max of bounding box left + width
df['Blocks.BoundingBox.Right'] = df['Blocks.BoundingBox.Left'] + df['Blocks.BoundingBox.Width']
right = df['Blocks.BoundingBox.Right'].max()
right 

0.9051090180873871

In [79]:
#midpoint horizontal
h_midpoint = (right-left)/2
h_midpoint

0.40540485456585884

In [None]:
#definitions 
# Q1 = top < v_midpoint and left < h_midpoint
# Q2 = top < v_midpoint and left > h_midpoint
# Q3 = top > v_midpoint and left < h_midpoint
# Q4 = top > v_midpoint and left > h_midpoint 


In [80]:
def assign_quad(top, left):
    if top < v_midpoint and left < h_midpoint:
        return 'Q1'
    elif top < v_midpoint and left > h_midpoint:
        return 'Q2'
    elif top > v_midpoint and left < h_midpoint:
        return 'Q3'
    elif top > v_midpoint and left > h_midpoint:
        return 'Q4'

In [81]:
df['Quadrant'] = df.apply(lambda x: assign_quad(x['Blocks.BoundingBox.Top'], x['Blocks.BoundingBox.Left']), axis=1)
df.head(3)

Unnamed: 0,Page,Blocks.BlockType,Blocks.Confidnece,Blocks.Text,Blocks.Id,Blocks.BoundingBox.Width,Blocks.BoundingBox.Height,Blocks.BoundingBox.Left,Blocks.BoundingBox.Top,Blocks.BoundingBox.Bottom,Blocks.BoundingBox.Right,Quadrant
0,1,LINE,99.913162,602,55217c52-606c-44c5-9e53-a177056dcec3,0.021086,0.007798,0.094833,0.068957,0.076754,0.115918,Q1
1,1,LINE,99.25528,Knowl. Org. 46(2019)No.8,496002de-a71a-462c-a4be-84543ace97f4,0.156732,0.011316,0.748032,0.070955,0.082271,0.904763,Q2
2,1,LINE,99.461853,V. Broughton. The Respective Roles of Intellec...,b2a6c83f-390c-4afa-aa3e-aec08f1e6ffd,0.610333,0.011693,0.294765,0.086801,0.098494,0.905098,Q1


In [83]:
df[df['Quadrant']=='Q2'][['Blocks.Text', 'Blocks.BoundingBox.Top']].sort_values(by=['Blocks.BoundingBox.Top'])

Unnamed: 0,Blocks.Text,Blocks.BoundingBox.Top
1,Knowl. Org. 46(2019)No.8,0.070955
4,how serious these efforts are. We learn of a C...,0.129677
5,priest in Wittenberg which radiates light from...,0.14491
7,pronounces blessings in five languages as part...,0.159977
9,tion to celebrate 500 years since the inventio...,0.174802
11,"technology, instrumental in the Reformation an...",0.190257
13,Protestantism (Sherwood 2017). Other cases inc...,0.205274
15,Buddhist monk in China (Tatlow 2016) which rea...,0.220308
17,"ture and can answer questions, and another in ...",0.235644
19,"2017) which can ""chant prayers and tap drums a...",0.250584


In [38]:
df_q4 = df[df['Quadrant']=='Q4'].sort_values(by=['Blocks.BoundingBox.Top'])
df_q4.head(3)

Unnamed: 0,Page,Blocks.BlockType,Blocks.Confidnece,Blocks.Text,Blocks.Id,Blocks.BoundingBox.Width,Blocks.BoundingBox.Height,Blocks.BoundingBox.Left,Blocks.BoundingBox.Top,Blocks.BoundingBox.Bottom,Blocks.BoundingBox.Right,Quadrant
38,1,LINE,99.74025,"Hartmann (1964), presuppose and build themselves",8992ea29-905e-453c-aa43-110052895bc3,0.333041,0.012868,0.525323,0.404565,0.417433,0.858364,Q4
40,1,LINE,99.961006,"upon each other, and which can be specified ea...",81985e80-b3fb-4cdb-9779-8ba91d6959ca,0.333973,0.012577,0.525128,0.419564,0.432142,0.859101,Q4
42,1,LINE,99.825294,nine aspect areas.,ab821410-50d1-4ea8-aeeb-7a6138791967,0.107659,0.011704,0.524958,0.434916,0.44662,0.632617,Q4


In [41]:
json_4 = df_q4.to_json(orient='records')
parsed = json.loads(json_4)
parsed

[{'Page': 1,
  'Blocks.BlockType': 'LINE',
  'Blocks.Confidnece': 99.7402496338,
  'Blocks.Text': 'Hartmann (1964), presuppose and build themselves',
  'Blocks.Id': '8992ea29-905e-453c-aa43-110052895bc3',
  'Blocks.BoundingBox.Width': 0.3330409825,
  'Blocks.BoundingBox.Height': 0.0128680039,
  'Blocks.BoundingBox.Left': 0.5253229141,
  'Blocks.BoundingBox.Top': 0.4045654833,
  'Blocks.BoundingBox.Bottom': 0.4174334873,
  'Blocks.BoundingBox.Right': 0.8583638966,
  'Quadrant': 'Q4'},
 {'Page': 1,
  'Blocks.BlockType': 'LINE',
  'Blocks.Confidnece': 99.9610061646,
  'Blocks.Text': 'upon each other, and which can be specified each by',
  'Blocks.Id': '81985e80-b3fb-4cdb-9779-8ba91d6959ca',
  'Blocks.BoundingBox.Width': 0.3339730799,
  'Blocks.BoundingBox.Height': 0.0125774248,
  'Blocks.BoundingBox.Left': 0.5251280665,
  'Blocks.BoundingBox.Top': 0.4195641279,
  'Blocks.BoundingBox.Bottom': 0.4321415527,
  'Blocks.BoundingBox.Right': 0.8591011465,
  'Quadrant': 'Q4'},
 {'Page': 1,
  'Blo

In [84]:
for idx, block in enumerate(page['Blocks']):
    if block['BoundingBox']['Top'] < v_midpoint and block['BoundingBox']['Left'] < h_midpoint:
        block['Quadrant'] = 'Q1'
    elif block['BoundingBox']['Top'] < v_midpoint and block['BoundingBox']['Left'] > h_midpoint:
        block['Quadrant'] = 'Q2'
    elif block['BoundingBox']['Top'] > v_midpoint and block['BoundingBox']['Left'] < h_midpoint:
        block['Quadrant'] = 'Q3'
    elif block['BoundingBox']['Top'] > v_midpoint and block['BoundingBox']['Left'] > h_midpoint:
        block['Quadrant'] = 'Q4'

In [1]:
core_q4 = [x for x in page['Blocks'] if x['Quadrant'] in ['Q4']]
core_q4


NameError: name 'page' is not defined

In [94]:
#test Q4
core = page['Blocks']
# core_q4 = [x for x in core if x['Quadrant'] in ['Q4', 'Q2']]
core_q4 = [x for x in core if x['Quadrant'] in ['Q4']]
# core_q4

page_q4 = page.copy()
page_q4['Blocks'] = core_q4
page_q4

{'Page': 1,
 'Blocks': [{'BlockType': 'LINE',
   'Confidnece': 99.88965606689453,
   'Text': 'gods, asking whether the similarities are not caused by un-',
   'BoundingBox': {'Width': 0.3763725757598877,
    'Height': 0.01213967613875866,
    'Left': 0.5279744267463684,
    'Top': 0.41641929745674133},
   'Id': 'a4fbb4e5-236a-43e3-a454-5e929fc0eedc',
   'Quadrant': 'Q4'},
  {'BlockType': 'LINE',
   'Confidnece': 99.810302734375,
   'Text': 'certainty:',
   'BoundingBox': {'Width': 0.057482946664094925,
    'Height': 0.011118726804852486,
    'Left': 0.5284923315048218,
    'Top': 0.4321962594985962},
   'Id': 'e46693fa-2f33-4a11-8475-6a9c82651bf4',
   'Quadrant': 'Q4'},
  {'BlockType': 'LINE',
   'Confidnece': 99.93927764892578,
   'Text': 'But it is also true that where interaction is supposed',
   'BoundingBox': {'Width': 0.33780840039253235,
    'Height': 0.011987521313130856,
    'Left': 0.5472887754440308,
    'Top': 0.46187761425971985},
   'Id': 'bb137a9b-9631-4835-8b44-3e72ca66

In [95]:
height_diff_above_json = append_height_diff_above(page_q4)
height_diff_below_json = append_height_diff_below(height_diff_above_json)
paragraph_number_json = append_paragraph_number(height_diff_below_json)
pg_num_std_json = append_paragraph_number_std(paragraph_number_json)
scaled_json = append_scaled(pg_num_std_json)
clustered_json = append_cluster(scaled_json)
clustered_json

{'Page': 1,
 'Blocks': [{'BlockType': 'LINE',
   'Confidnece': 99.88965606689453,
   'Text': 'gods, asking whether the similarities are not caused by un-',
   'BoundingBox': {'Width': 0.3763725757598877,
    'Height': 0.01213967613875866,
    'Left': 0.5279744267463684,
    'Top': 0.41641929745674133},
   'Id': 'a4fbb4e5-236a-43e3-a454-5e929fc0eedc',
   'Quadrant': 'Q4',
   'height_diff_above': 0,
   'height_diff_below': 0.003637285903096199,
   'paragraph': 0,
   'paragraph_std': 0,
   'top_scl': 0.0,
   'paragraph_scl': 0.0,
   'left_scl': 0.008224235813798941,
   'paragraph_std_scl': 0.0,
   'text_cluster': 0,
   'cluster_probability': 0.054422199999495796},
  {'BlockType': 'LINE',
   'Confidnece': 99.810302734375,
   'Text': 'certainty:',
   'BoundingBox': {'Width': 0.057482946664094925,
    'Height': 0.011118726804852486,
    'Left': 0.5284923315048218,
    'Top': 0.4321962594985962},
   'Id': 'e46693fa-2f33-4a11-8475-6a9c82651bf4',
   'Quadrant': 'Q4',
   'height_diff_above': 0.0

In [96]:
df = pd.DataFrame(clustered_json)
df = df.join(pd.json_normalize(df["Blocks"].tolist()).add_prefix("Blocks.")).drop(["Blocks"], axis=1)
df['grouped_text'] = df.groupby('Blocks.paragraph')['Blocks.Text'].transform(lambda x: ' '.join(x))
df2 = df[['Blocks.paragraph', 'grouped_text']].drop_duplicates()
df3 = df2.sort_values(by='Blocks.paragraph')
df3

Unnamed: 0,Blocks.paragraph,grouped_text
0,0,"gods, asking whether the similarities are not ..."
2,1,But it is also true that where interaction is ...
13,2,A pressing question is whether a real sense of...
21,3,Central area of intellectual inquiry across di...
