In [27]:
import os
import json
import shutil

import boto3
import io
from io import BytesIO
import sys
from pprint import pprint
import glob2
import requests
import base64
from zipfile import ZipFile
from tqdm import tqdm


def download_url(url):
    '''
    utility funcction which downloads pdf to local environment
    '''
    # data is going to be read as stream
    chunk_size=2000
    r = requests.get(url, stream=True)
    
    # the pdf filename is extracted from the presigned url
    file_name = [el for el in url.split("/") if (".zip" in el)][0]
    os.makedirs('/tmp', exist_ok=True)
    
    # open a file to dump the stream in
    print(r)
    print(file_name)
    
    with open(f'/tmp/{file_name}', 'wb') as fd:
        for chunk in r.iter_content(chunk_size):
            fd.write(chunk)
    print(os.stat(f'/tmp/{file_name}').st_size)
    
    with ZipFile(f'/tmp/{file_name}', 'r') as zip:
        # extracting all the files
        print(zip.namelist())
        os.makedirs(f'/tmp/all_png/{file_name}', exist_ok=True)
        os.chdir('/tmp/all_png')

        for file_zip in zip.namelist():
            zip.extract(file_zip, '/tmp/all_png')
            print('Extracting all the files now...')
        print("double nested :\n")
        print(glob2.glob('/tmp/all_png/*/*.png'))
        
    return '/tmp/all_png'

def get_rows_columns_map(table_result, blocks_map):
    rows = {}
    for relationship in table_result['Relationships']:
        if relationship['Type'] == 'CHILD':
            for child_id in relationship['Ids']:
                cell = blocks_map[child_id]
                if cell['BlockType'] == 'CELL':
                    row_index = cell['RowIndex']
                    col_index = cell['ColumnIndex']
                    if row_index not in rows:
                        # create new row
                        rows[row_index] = {}
                        
                    # get the text value
                    rows[row_index][col_index] = get_text(cell, blocks_map)
    return rows


def get_text(result, blocks_map):
    text = ''
    if 'Relationships' in result:
        for relationship in result['Relationships']:
            if relationship['Type'] == 'CHILD':
                for child_id in relationship['Ids']:
                    word = blocks_map[child_id]
                    if word['BlockType'] == 'WORD':
                        text += word['Text'] + ' '
                    if word['BlockType'] == 'SELECTION_ELEMENT':
                        if word['SelectionStatus'] =='SELECTED':
                            text +=  'X '    
    return text


def get_table_csv_results(file_name):
 
    with open(file_name, 'rb') as file:
        img_test = file.read()
        bytes_test = bytearray(img_test)
        print('Image loaded', file_name)

    # process using image bytes
    # get the results
    client = boto3.client('textract')

    response = client.analyze_document(Document={'Bytes': bytes_test}, FeatureTypes=['TABLES'])

    # Get the text blocks
    blocks=response['Blocks']
    #pprint(blocks)

    blocks_map = {}
    table_blocks = []
    for block in blocks:
        blocks_map[block['Id']] = block
        if block['BlockType'] == "TABLE":
            table_blocks.append(block)

    if len(table_blocks) <= 0:
        return "<b> NO Table FOUND </b>"

    csv_list = []
    for index, table in enumerate(table_blocks):
        csv_list.append(generate_table_csv(table, blocks_map, index +1))
    return csv_list

def generate_table_csv(table_result, blocks_map, table_index):
    rows = get_rows_columns_map(table_result, blocks_map)

    table_id = 'Table_' + str(table_index)
    
    # get cells.
    csv = ''

    for row_index, cols in rows.items():
        
        for col_index, text in cols.items():
            csv += '{}'.format(text) + ","
        csv += '\n'
    return csv

def png_2_csv(file_name):
    table_csv_list = get_table_csv_results(file_name)

    # replace content
    for idx, table_csv in enumerate(table_csv_list):
        print(idx)
        new_file_name = file_name.replace(".png", f"sub_{str(idx)}.csv")
        with open(new_file_name, "wt") as fout:
            fout.write(table_csv)
    
    path_f_name = file_name.split("/")[-1].replace(".png","").split("_")[0]
    #all_csv = glob2.glob(f"{path_f_name}/*.csv")
    
    #for csv_path in all_csv:
    print(path_f_name)
    
    with ZipFile(f'{path_f_name}.zip','a') as zip:
        # writing each file one by one for file in png paths:
        zip.write(new_file_name, arcname=new_file_name.split("/")[-1])
    
    #s3_client = boto3.client('s3')
        
    #try:
    #    clean_folder = file_name.split("/")[-1].replace(".","_")
    #    end_name = new_file_name.split("/")[-1]
    #
    #    object_name = f"job_{clean_folder}/{end_name}"
    #    bucket = "liberta-leasing-ml"
    #    response = s3_client.upload_file(new_file_name, bucket, object_name)
    #    result = object_name
    #
    #except Exception as e:
    #    response = None
    #    result = "failed transaction"
    #    print(str(e))
    #    pass
    #return result
    return path_f_name
            
def parse(f_path):
    all_files = glob2.glob(os.path.join(f_path, "*.png"))
    paths = []
    for file_name in tqdm(all_files):
        paths.append(png_2_csv(file_name))
   
    s3_client = boto3.client('s3')
        
    try:
    #    clean_folder = file_name.split("/")[-1].replace(".","_")
    #    end_name = new_file_name.split("/")[-1]
    #
    #    object_name = f"job_{clean_folder}/{end_name}"
        bucket = "liberta-leasing-ml"
        response = s3_client.upload_file(paths[0], bucket, paths[0])
        result = paths[0]
    #
    except Exception as e:
        response = None
        result = "failed transaction"
        print(str(e))
        pass
    return result




In [28]:
f_path = "/Users/assansanogo/Downloads/tmp/erario"
parse(f_path)






  0%|          | 0/16 [00:00<?, ?it/s][A[A[A[A[A

Image loaded /Users/assansanogo/Downloads/tmp/erario/erario statement_15._cropped.png







  6%|▋         | 1/16 [00:07<01:58,  7.92s/it][A[A[A[A[A

0
erario statement
Image loaded /Users/assansanogo/Downloads/tmp/erario/erario statement_12._cropped.png







 12%|█▎        | 2/16 [00:15<01:47,  7.69s/it][A[A[A[A[A

0
erario statement
Image loaded /Users/assansanogo/Downloads/tmp/erario/erario statement_0._cropped.png







 19%|█▉        | 3/16 [00:24<01:47,  8.29s/it][A[A[A[A[A

0
erario statement
Image loaded /Users/assansanogo/Downloads/tmp/erario/erario statement_7._cropped.png







 25%|██▌       | 4/16 [00:34<01:44,  8.69s/it][A[A[A[A[A

0
erario statement
Image loaded /Users/assansanogo/Downloads/tmp/erario/erario statement_6._cropped.png







 31%|███▏      | 5/16 [00:42<01:34,  8.58s/it][A[A[A[A[A

0
erario statement
Image loaded /Users/assansanogo/Downloads/tmp/erario/erario statement_1._cropped.png







 38%|███▊      | 6/16 [00:45<01:09,  6.91s/it][A[A[A[A[A

0
erario statement
Image loaded /Users/assansanogo/Downloads/tmp/erario/erario statement_13._cropped.png







 44%|████▍     | 7/16 [00:50<00:57,  6.41s/it][A[A[A[A[A

0
erario statement
Image loaded /Users/assansanogo/Downloads/tmp/erario/erario statement_14._cropped.png







 50%|█████     | 8/16 [00:54<00:45,  5.65s/it][A[A[A[A[A

0
erario statement
Image loaded /Users/assansanogo/Downloads/tmp/erario/erario statement_3._cropped.png







 56%|█████▋    | 9/16 [01:04<00:48,  6.97s/it][A[A[A[A[A

0
erario statement
Image loaded /Users/assansanogo/Downloads/tmp/erario/erario statement_4._cropped.png







 62%|██████▎   | 10/16 [01:15<00:48,  8.06s/it][A[A[A[A[A

0
erario statement
Image loaded /Users/assansanogo/Downloads/tmp/erario/erario statement_11._cropped.png







 69%|██████▉   | 11/16 [01:29<00:49,  9.88s/it][A[A[A[A[A

0
erario statement
Image loaded /Users/assansanogo/Downloads/tmp/erario/erario statement_10._cropped.png







 75%|███████▌  | 12/16 [01:41<00:41, 10.35s/it][A[A[A[A[A

0
erario statement
Image loaded /Users/assansanogo/Downloads/tmp/erario/erario statement_5._cropped.png
0







 81%|████████▏ | 13/16 [01:51<00:31, 10.36s/it][A[A[A[A[A

erario statement
Image loaded /Users/assansanogo/Downloads/tmp/erario/erario statement_2._cropped.png







 88%|████████▊ | 14/16 [02:04<00:22, 11.02s/it][A[A[A[A[A

0
erario statement
Image loaded /Users/assansanogo/Downloads/tmp/erario/erario statement_8._cropped.png







 94%|█████████▍| 15/16 [02:14<00:10, 10.94s/it][A[A[A[A[A

0
erario statement
Image loaded /Users/assansanogo/Downloads/tmp/erario/erario statement_9._cropped.png







100%|██████████| 16/16 [02:24<00:00,  9.06s/it][A[A[A[A[A

0
erario statement
[Errno 2] No such file or directory: 'erario statement'





'failed transaction'