In [4]:
from collections import defaultdict
import json
from pprint import pprint
import boto3
import math
import pathlib
from typing import Dict, List


In [5]:
# Source images are read from ./Images; OCR results are written to ./OCR,
# which is created here if it does not exist yet.
image_folder, output_folder = pathlib.Path('Images'), pathlib.Path('OCR')
output_folder.mkdir(exist_ok=True)

In [None]:


def extract_text_from_image(filename: pathlib.Path, client=None) -> str:
    """Run Textract table/form analysis on one document and return the result as JSON text.

    Args:
        filename: Path of the document whose raw bytes are sent to Textract.
        client: Optional Textract client to reuse; any object exposing a
            compatible ``analyze_document`` method works. When omitted, a new
            boto3 client for ``us-west-2`` is created for this call.

    Returns:
        A JSON string (4-space indent, sorted keys) with two keys:
        ``"Pages"``, the page count reported in the response's
        ``DocumentMetadata``, and ``"Blocks"``, the response's block list.
    """
    if client is None:
        client = boto3.client('textract', region_name='us-west-2')

    response = client.analyze_document(
        Document={'Bytes': filename.read_bytes()},
        FeatureTypes=['TABLES', 'FORMS'],
    )

    result = {
        # The page count lives in the metadata; len(response['Blocks']) would
        # count blocks, not pages.
        "Pages": response['DocumentMetadata']['Pages'],
        "Blocks": response['Blocks'],
    }
    # default=str stringifies any value json cannot serialize natively.
    return json.dumps(result, indent=4, sort_keys=True, default=str)

In [None]:

# Run OCR on every file in the image folder and save each result as
# <output_folder>/<image stem>.json.
# sorted() makes the processing order deterministic. Entries that are not
# regular files (e.g. a checkpoint sub-directory) are skipped because
# read_bytes() would fail on them.
for filename in sorted(image_folder.iterdir()):
    if not filename.is_file():
        continue
    # Extract the text from the image
    text = extract_text_from_image(filename)
    # Save the text to a json file
    output_file = output_folder / (filename.stem + '.json')
    output_file.write_text(text)
    

In [None]:
# Single-file sanity check: collect the text of every LINE and WORD block
# from one saved OCR result and print it.
testimg1 = pathlib.Path('OCR/testimage1.json')
with testimg1.open() as f:
    data = json.load(f)

testlist1 = [
    block["Text"]
    for block in data['Blocks']
    if block['BlockType'] in ('LINE', 'WORD')
]

pprint(testlist1)


In [3]:
def delete_leftmost(blocks: List[Dict]) -> List[Dict]:
    """Drop every block that does not lie strictly right of the leftmost WORD block.

    The threshold is the right edge (``Left + Width``) of the WORD block with
    the smallest ``Left``. Blocks of any type are kept only when their own
    ``Left`` is strictly greater than that threshold, so the leftmost word
    itself is always removed.

    Args:
        blocks: Block dicts, each with ``BlockType`` and
            ``Geometry.BoundingBox`` (``Left``, ``Width``).

    Returns:
        A new list with the remaining blocks in their original order. When
        ``blocks`` contains no WORD block there is no column to remove and a
        shallow copy of ``blocks`` is returned.
    """
    words = [block for block in blocks if block['BlockType'] == 'WORD']
    if not words:
        # min() on an empty sequence would raise ValueError.
        return list(blocks)

    leftmost_box = min(
        words, key=lambda block: block['Geometry']['BoundingBox']['Left']
    )['Geometry']['BoundingBox']
    left_threshold = leftmost_box['Left'] + leftmost_box['Width']

    return [
        block for block in blocks
        if block['Geometry']['BoundingBox']['Left'] > left_threshold
    ]
    

In [37]:
def data_by_row(blocks: List[Dict]) -> List[List[Dict]]:
    """Group blocks into visual rows, top to bottom, each row sorted left to right.

    Repeatedly takes the remaining block with the smallest ``Top`` and forms a
    row from it plus every remaining block whose ``Top`` is above that block's
    bottom edge (``Top + Height``).

    Args:
        blocks: Block dicts, each with ``Id`` and ``Geometry.BoundingBox``
            (``Left``, ``Top``, ``Height``). Blocks sharing an ``Id`` collapse
            to the last one.

    Returns:
        A list of rows in top-to-bottom order; each row is a list of blocks
        sorted by ``Left``.
    """
    def box(block: Dict) -> Dict:
        return block['Geometry']['BoundingBox']

    remaining = {block['Id']: block for block in blocks}
    rows: List[List[Dict]] = []

    while remaining:
        # find the key of the block with the minimum top value
        top_most_id = min(remaining, key=lambda block_id: box(remaining[block_id])['Top'])
        top_box = box(remaining[top_most_id])
        row_bottom = top_box['Top'] + top_box['Height']

        # The topmost block is always part of its own row, even with zero
        # height, so every iteration removes at least one block and the loop
        # terminates.
        row_ids = [
            block_id for block_id, block in remaining.items()
            if block_id == top_most_id or box(block)['Top'] < row_bottom
        ]
        row = [remaining.pop(block_id) for block_id in row_ids]
        # sort the row by leftmost bounding box
        rows.append(sorted(row, key=lambda block: box(block)['Left']))

    return rows


In [None]:
# Write a copy of every OCR result with its leftmost column removed.
# The copies go to a separate folder so the raw OCR output in output_folder
# stays intact: overwriting it in place would make every later
# delete_leftmost() pass over those files strip one more column.
filtered_folder = pathlib.Path('OCR_filtered')
filtered_folder.mkdir(exist_ok=True)

#iterate through all the json files in the folder
for filename in sorted(output_folder.glob('*.json')):
    ocr_result = json.loads(filename.read_text())
    #delete the leftmost column (delete_leftmost takes the block list, not a path)
    left_filter = delete_leftmost(ocr_result['Blocks'])
    #save the new json file; write_text needs a string, so serialize first
    output_file = filtered_folder / (filename.stem + '.json')
    output_file.write_text(
        json.dumps({**ocr_result, 'Blocks': left_filter}, indent=4, sort_keys=True, default=str)
    )

In [8]:
# Maps each OCR file name to its table text: a list of rows, each row being
# the texts of its WORD blocks from left to right.
testlists: Dict[str, List[List[str]]] = defaultdict(list)

#make a path to the OCR folder
OCR = pathlib.Path('OCR')

# Only *.json files are read; other directory entries (sub-directories, stray
# files) would make open()/json.load() fail.
for filename in sorted(OCR.glob('*.json')):
    with open(filename) as f:
        data = json.load(f)

    # NOTE(review): rows are grouped using blocks of every type, so a tall
    # non-WORD block can pull several text lines into one row — confirm
    # whether only WORD blocks should be passed to data_by_row.
    result = data_by_row(delete_leftmost(data['Blocks']))

    #replace each block with a list of the text in each block
    testlists[filename.name] = [
        [block['Text'] for block in row if block['BlockType'] == 'WORD']
        for row in result
    ]

#write an output json file containing testlists
with open('testlists.json', 'w') as f:
    json.dump(testlists, f, indent=4, sort_keys=True, default=str)
    
        




KeyboardInterrupt: 