In [1]:
### The parsers script is credited to Chirag Rathod (https://github.com/srcecde/aws-tutorial-code/tree/master/lambda); this project could not have been accomplished without his script.
### Thank you Chirag!

# Import necessary libraries
import os
import shutil
import time
from datetime import datetime
# NOTE(review): `upload` is not referenced in this cell, and distutils was removed in Python 3.12.
from distutils.command import upload
from wsgiref.handlers import format_date_time

import boto3
import pandas as pd
from botocore.exceptions import ClientError, NoCredentialsError
from trp import Document

from parsers import extract_text, map_word_id, extract_table_info, get_key_map, get_value_map, get_kv_map, move_file, upload_to_s3

# AWS credentials are read from the standard AWS environment variables
# (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY) instead of being pasted into the
# notebook, so they cannot leak when the file is shared or committed.
# When a variable is unset the value is None, and boto3 then falls back to its
# default credential lookup (shared credentials file, instance role, ...).
# NOTE(review): the same two values are also passed to upload_to_s3() further
# down - confirm that helper tolerates None for both keys.
AWS_ACCESS_KEY = os.environ.get('AWS_ACCESS_KEY_ID')
AWS_SECRET_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY')
REGION_NAME = 'us-west-1'

# Initialize Textract client
textract_client = boto3.client(
    'textract',
    region_name=REGION_NAME,
    aws_access_key_id=AWS_ACCESS_KEY,
    aws_secret_access_key=AWS_SECRET_KEY,
)

# 1. Create your AWS S3 bucket
# 2. Create a folder called Raw_JPG in the same directory as this script
# 3. Create a folder called Processed_JPG
# 4. Create a folder called To_Merge
# 5. Create a folder called To_Review
# 6. Place the scanned JPG files to extract text from in Raw_JPG, then run this script
# 7. Check Processed_JPG for the scans that were handled, and To_Review (or To_Merge) for the CSV output

# Name of the S3 bucket the scans are uploaded to before Textract reads them.
s3_bucket = 'YOUR_S3_BUCKET_NAME'

# Working folders, relative to the directory the notebook runs from.
# Built with os.path.join so no backslash ends up inside a string literal
# ('.\R', '.\P', '.\T' are invalid escape sequences) and so the paths also
# resolve on non-Windows systems. On Windows the values are unchanged.
raw_jpg_folder = os.path.join('.', 'Raw_JPG')
processed_jpg_folder = os.path.join('.', 'Processed_JPG')
to_merge_path = os.path.join('.', 'To_Merge')
to_review_path = os.path.join('.', 'To_Review')

# Seconds to pause between two Textract status polls.
POLL_SECONDS = 2
# Form fields inspected when deciding which folder the CSV is written to.
HEADER_FIELDS = ('Name:', 'Date:', 'Activity:', 'Opt. Dept. :')
# Single characters that get extracted in place of a digit, and the digit meant.
DIGIT_FIXES = {'l': 1, 'I': 1, '|': 1, '/': 1, '\\': 1, 's': 5, 'S': 5}
# Column headers that carry no label and are dropped from every table.
BLANK_HEADERS = ['', ' ']


def has_value(value):
    """Return True when a form field is neither blank nor 'VALUE_NOT_FOUND'."""
    return value not in ('', 'VALUE_NOT_FOUND')


def fix_misread_digit(cell):
    """Return the digit for a lone look-alike character, else the cell unchanged."""
    if isinstance(cell, str):
        return DIGIT_FIXES.get(cell, cell)
    return cell


def wait_for_analysis(client, job_id, poll_seconds=POLL_SECONDS):
    """Wait for a Textract analysis job and return its complete result.

    Polls ``client.get_document_analysis`` until the job is no longer
    'IN_PROGRESS', sleeping ``poll_seconds`` between calls. If the final
    response carries a 'NextToken', the remaining pages are fetched and their
    'Blocks' appended to the first response, so the caller gets one dict.
    The job may still have ended as 'FAILED'; the caller checks 'JobStatus'.
    """
    result = client.get_document_analysis(JobId=job_id)
    while result['JobStatus'] == 'IN_PROGRESS':
        print('Waiting for Textract analysis...')
        time.sleep(poll_seconds)
        result = client.get_document_analysis(JobId=job_id)

    next_token = result.get('NextToken')
    while next_token:
        page = client.get_document_analysis(JobId=job_id, NextToken=next_token)
        result.setdefault('Blocks', []).extend(page.get('Blocks', []))
        next_token = page.get('NextToken')
    return result


try:

    files = os.listdir(raw_jpg_folder)

    # Suffix for exports named after the scan file. Initialised once, outside
    # the loop, so that it actually advances from one scan to the next.
    unknown_count = 0

    # Print the list of files
    print("JPG files in the folder:")
    for file in files:
        if not file.lower().endswith(".jpg"):
            continue
        print(file)

        source_file_path = os.path.join(raw_jpg_folder, file)
        destination_path = os.path.join(processed_jpg_folder, file)
        s3_key_path = file

        try:
            # Upload the scan, then start Textract analysis for both forms and tables
            upload_to_s3(source_file_path, s3_bucket, s3_key_path, REGION_NAME, AWS_ACCESS_KEY, AWS_SECRET_KEY)
            response = textract_client.start_document_analysis(
                DocumentLocation={'S3Object': {'Bucket': s3_bucket, 'Name': s3_key_path}},
                FeatureTypes=['FORMS', 'TABLES']
            )
            job_id = response['JobId']
            result = wait_for_analysis(textract_client, job_id)
        except ClientError as e:
            print(f"Error: {e.response['Error']['Message']}")
            # There is no result to parse for this scan; go on with the next one.
            continue

        if result['JobStatus'] == 'FAILED':
            print(f"Textract analysis failed for {file}: {result.get('StatusMessage', 'no status message')}")
            continue

        # A scan that cannot be parsed or exported is reported and left in
        # Raw_JPG; it must not abort the remaining scans.
        try:
            raw_text = extract_text(result, extract_by="LINE")
            word_map = map_word_id(result)
            table = extract_table_info(result, word_map)
            key_map = get_key_map(result, word_map)
            value_map = get_value_map(result, word_map)
            final_map = get_kv_map(key_map, value_map)

            # One frame per extracted table: the first row is used as the
            # header, the remaining rows as data. All frames are stacked once.
            frames = []
            for index in table.keys():
                column = table[index][0]
                data = table[index][1:]
                df_append = pd.DataFrame(data, columns=column)
                df_append = df_append.drop(columns=[col for col in BLANK_HEADERS if col in df_append.columns])
                frames.append(df_append)
            if not frames:
                raise ValueError("no table was extracted from the scan")
            df_base = pd.concat(frames, ignore_index=True)

            missing_columns = [col for col in ('Item', 'Pieces') if col not in df_base.columns]
            if missing_columns:
                raise ValueError(f"extracted table has no {missing_columns} column(s)")

            for item in final_map:
                print(final_map[item])
                if final_map[item] == 'VALUE_NOT_FOUND':
                    final_map[item] = ''

            # there are some mis-extraction for numbers with the characters, the section below is to replace the possible text with the numbers
            for col in ('Item', 'Pieces'):
                df_base[col] = df_base[col].map(fix_misread_digit)

            # Blank cells become missing values so rows that are entirely empty can be dropped.
            df_base = df_base.replace(['', ' '], pd.NA).dropna(how='all')

            # .get(): a field Textract did not detect at all is treated as blank
            # instead of raising KeyError.
            name = final_map.get('Name:', '')
            date = final_map.get('Date:', '')
            df_base['Name'] = name
            df_base['Date'] = date
            trans_table = str.maketrans({' ': '-', '/': '-', '\\': '-'})
            formatted_date = date.translate(trans_table)
            df_base['Dept'] = final_map.get('Opt. Dept. :', '')
            df_base['Activity'] = final_map.get('Activity:', '')
            df_base['Hours'] = final_map.get('Hours:', '')
            df_base['Total Pieces'] = final_map.get('Total Pieces:', '')
            df_base = df_base[['Name', 'Date', 'Dept', 'Activity', 'Item', 'Pieces', 'Hours', 'Total Pieces']]

            # Routing: a scan with at least one header field filled in goes to
            # To_Review; a scan with none of them goes to To_Merge.
            # NOTE(review): the original tests were `x != '' or x != 'VALUE_NOT_FOUND'`,
            # which is always true, so every CSV landed in To_Review. The tests are
            # corrected to "x is neither value"; confirm this is the intended split.
            has_header_info = any(has_value(final_map.get(field, '')) for field in HEADER_FIELDS)
            output_folder = to_review_path if has_header_info else to_merge_path

            if has_value(name):
                export_name = name
                suffix = formatted_date if has_header_info else ''
            else:
                # splitext also strips an upper-case '.JPG' extension.
                export_name = os.path.splitext(file)[0]
                suffix = str(unknown_count)
                unknown_count += 1
            df_base.to_csv(os.path.join(output_folder, f'{export_name}{suffix}.csv'), index=False)

            # Only after the CSV has been written is the scan moved to the processed JPG folder
            move_file(source_file_path, destination_path)

        except Exception as e:
            print(f"Error processing {file}: {e}")

except Exception as e:
    print(f"Error: {e}")

JPG files in the folder:
table1.jpg
File .\Raw_JPG\table1.jpg uploaded successfully to hankapicall/table1.jpg
Waiting for Textract analysis...
Waiting for Textract analysis...
Waiting for Textract analysis...
Waiting for Textract analysis...
Waiting for Textract analysis...
Waiting for Textract analysis...
Waiting for Textract analysis...
Waiting for Textract analysis...
Waiting for Textract analysis...
Waiting for Textract analysis...
Waiting for Textract analysis...
Waiting for Textract analysis...
Waiting for Textract analysis...
Waiting for Textract analysis...
Waiting for Textract analysis...
Waiting for Textract analysis...
Waiting for Textract analysis...
Waiting for Textract analysis...
Waiting for Textract analysis...
Waiting for Textract analysis...
Waiting for Textract analysis...
Waiting for Textract analysis...
Waiting for Textract analysis...
Waiting for Textract analysis...
Waiting for Textract analysis...
Waiting for Textract analysis...
Waiting for Textract analysis...