In [1]:
%run -i "ocr_template_final.ipynb"

In [2]:
import import_ipynb
from difflib import SequenceMatcher
from faker import Faker
import random
import faker_commerce
from PIL import Image, ImageFont, ImageDraw, ImageFilter
import import_ipynb
import time
# Shared Faker instance used by the data generators in this notebook. The
# faker_commerce provider is registered on it; table_data_generator calls
# fake.ecommerce_name(), which presumably comes from that provider.
fake = Faker()
fake.add_provider(faker_commerce.Provider)

In [3]:
def table_data_generator(headers,max_entries,max_code_len,units,code_type):
    """Generate a random invoice table as a list of rows.

    Parameters
    ----------
    headers : iterable of str
        Column kinds in output order. Handled values are 'index', 'code',
        'desc', 'unit' and 'float'; any other value adds no cell to the row.
    max_entries : int
        Inclusive upper bound for the random number of rows (at least one).
    max_code_len : int
        Length of generated codes: the character count for code_type 'None'
        and the digit count for code_type 'int'.
    units : bool
        When truthy, 'unit' cells are strings such as "42 PCS"; otherwise
        they are plain integers between 1 and 100.
    code_type : str
        'None' (the literal string) -> password-style string without special
        characters; 'int' -> integer with exactly max_code_len digits;
        'special56' -> string built from the pattern '?-??-###-???' with
        upper-case letters. Any other value adds no cell for 'code' columns.

    Returns
    -------
    list of list
        One inner list per row, cells ordered like ``headers``.
    """
    row_count = random.randint(1,max_entries)
    table = []
    for row_number in range(1, row_count + 1):
        row_entry = []
        for kind in headers:
            if kind == 'index':
                # Row numbers are 1-based.
                row_entry.append(row_number)
            elif kind == 'code':
                if code_type == 'None':
                    row_entry.append(fake.password(length=max_code_len,special_chars=False,upper_case=True))
                elif code_type == 'int':
                    # Exactly `digits` digits: 10**(d-1) .. 10**d - 1. The
                    # previous bounds (10**d .. 10**(d+1) - 1) produced one
                    # digit more than max_code_len allows.
                    digits = max(1, max_code_len)
                    row_entry.append(fake.random_int(min=10**(digits-1), max=10**digits-1))
                elif code_type == 'special56':
                    row_entry.append(fake.pystr_format('?-??-###-???','ABCDEFGHIJKLMNOPQRSTUVWXYZ'))
            elif kind == 'desc':
                row_entry.append(fake.ecommerce_name())
            elif kind == 'unit':
                quantity = fake.random_int(min=1, max=100)
                row_entry.append(str(quantity)+" PCS" if units else quantity)
            elif kind == 'float':
                row_entry.append(fake.pyfloat(right_digits = 2,positive=True, min_value=0.1, max_value=100))
        table.append(row_entry)
    return table

In [4]:
def other_data_generator(data_type_arr,max_code_len):
    """Produce random values for the non-table invoice fields.

    Parameters
    ----------
    data_type_arr : iterable
        Requested field names. 'invoice_number', 'total', 'date' and
        'date_n' are generated; every other name is skipped.
    max_code_len : int
        Length passed to the invoice-number generator.

    Returns
    -------
    dict
        Maps each recognised requested field name to its generated value.
    """
    def _long_date():
        # Day, month name and year are drawn in that order and joined by spaces.
        day = fake.day_of_month()
        month = fake.month_name()
        year = fake.year()
        return day + " " + month + " " + year

    # (field name, factory) pairs, checked in order with plain equality.
    makers = (
        ('invoice_number', lambda: fake.password(length=max_code_len,special_chars=False,upper_case=True)),
        ('total', lambda: fake.pyfloat(right_digits = 2,positive=True, min_value=0.1, max_value=100)),
        ('date', _long_date),
        ('date_n', lambda: fake.date()),
    )

    generated = {}
    for requested in data_type_arr:
        for field_name, make_value in makers:
            if requested == field_name:
                generated[field_name] = make_value()
                break
    return generated

In [5]:
def image_generator(data):
    """Render random invoice data onto a template image.

    Parameters
    ----------
    data : dict
        Template configuration. Keys read here: 'headers', 'max_entries',
        'max_code_len', 'units', 'code_type' (forwarded to
        table_data_generator), 'textsize', 'template' (image path),
        'position' (one (x, y) start point per table column), 'spacing'
        (vertical step between rows), 'max_text_len' (cell text is cut to
        this many characters) and 'template_data_loc' (field name -> list of
        corner points; the first corner is where the field value is drawn).

    Returns
    -------
    dict
        'image' (the drawn image), 'rot_angle', 'enhance_level',
        'test_data' (the generated table rows) and 'other_data' (the
        generated non-table field values).

    Side effects: opens the resulting image in a viewer via ``image.show()``.
    """
    table = table_data_generator(headers=data['headers'],max_entries=data['max_entries'],max_code_len=data['max_code_len'],units=data['units'],code_type=data['code_type'])
    # Transpose rows into columns so each column can be drawn top to bottom.
    columns = [[row[col] for row in table] for col in range(len(table[0]))]
    font = ImageFont.truetype(font = 'arial',size = data['textsize'])
    image = Image.open(data['template'])
    canvas = ImageDraw.Draw(image)
    black = (0, 0, 0)
    max_text_len = data['max_text_len']

    for col_index, column in enumerate(columns):
        position = data['position'][col_index]
        for cell in column:
            text = str(cell)
            if len(text) > max_text_len:
                # Cut to the configured limit (previously a hard-coded 20,
                # which left 14-20 character strings untouched).
                text = text[:max_text_len]
            canvas.text(position,text,font=font,fill=black)
            position = (position[0],position[1]+data['spacing'])

    other_data = other_data_generator(list(data['template_data_loc'].keys()),data['max_code_len'])

    for field, corners in data['template_data_loc'].items():
        # "table_data" only marks the table region; it has no single value to draw.
        if field != "table_data":
            canvas.text([int(coord) for coord in corners[0]],str(other_data[field]),font=font,fill=black)

    # Rotation and edge enhancement are currently switched off; the
    # randomised settings were random.uniform(-2,2) and random.randint(0,2).
    rotation_angle = 0
    edge_enhance_level = 0
    # Pick the pivot inside the image: x within the width, y within the
    # height (the old code drew both from uniform(width, height)).
    width, height = image.size
    rotation_center = [random.uniform(0, width), random.uniform(0, height)]
    for _ in range(edge_enhance_level):
        image = image.filter(ImageFilter.EDGE_ENHANCE)
    image = image.rotate(rotation_angle,center = rotation_center,expand=True)
    image.show()
    return {'image':image,'rot_angle':rotation_angle,'enhance_level':edge_enhance_level,'test_data':table,'other_data':other_data}

In [6]:
def _loosely_equal(first, second):
    """Return True when two values match loosely as text.

    Both values are stringified and case-folded; they match when either text
    contains the other or their SequenceMatcher ratio exceeds 0.6.
    """
    left = str(first).casefold()
    right = str(second).casefold()
    return left in right or right in left or SequenceMatcher(None, left, right).ratio() > 0.6


def _append_test_log(rows, log_path="test_log_temp.csv"):
    """Append result rows to the CSV log, writing the header if the file is new.

    Rows go through csv.writer so that fields containing commas or quotes
    (the stringified OCR result does) stay in a single column.
    """
    import csv
    import os

    write_header = not os.path.exists(log_path)
    with open(log_path, "a") as log_file:
        # '\n' terminator keeps the line endings the plain f.write() calls produced.
        writer = csv.writer(log_file, lineterminator='\n')
        if write_header:
            writer.writerow(["Test Number","Rotation Angle","Enhance Level","Table Found","Similarity","Accuracy","Time Taken","Results"])
        writer.writerows(rows)


def run_test(iterations):
    """Run randomised end-to-end OCR checks and log the scores.

    For each iteration a template is picked at random, filled with generated
    data by image_generator, passed to run_ocr_template, and scored:

    * Similarity - percentage of generated cells/fields that loosely match
      some OCR value (substring either way, or SequenceMatcher ratio > 0.6).
    * Accuracy - percentage of compared cells/fields that match exactly.

    Parameters
    ----------
    iterations : int
        Number of test images to generate and evaluate.

    Side effects: prints a report per iteration and appends rows to
    "test_log_temp.csv" (flushed once more than two rows are buffered, and
    on the final iteration).

    NOTE(review): run_ocr_template and np are not defined in this notebook;
    presumably they are provided by the notebook executed with ``%run -i``.
    """
    R_4_17_headers = ['index','desc','unit','float','float']
    R_4_17_position = [(15,460),(120,460),(754,460),(930,460),(1222,460)]
    R_4_17_template = '../templates/R-4-17.jpg'
    R_4_17_template_size = (1356,1696)
    R_4_17_template_data_loc = {"invoice_number":[[1128.0, 139.0], [1284.0, 140.0], [1128.0, 159.0], [1282.0, 495.0]],
                    "total":[[1248.0, 1422.0], [1335.0, 1422.0], [1335.0, 1449.0], [1248.0, 1449.0]],
                    "date":[[1139.0,161.0],[1291.0,161.0],[1291.0,191.0],[1144.0,182.0]],
                    "table_data":[[1.0, 363.0], [1315, 383.0], [1345.0,1283.0], [17.0, 1327.0]]}
    R_4_17 = {'headers':R_4_17_headers,'position':R_4_17_position,'template':R_4_17_template,'template_size':R_4_17_template_size,'template_data_loc':R_4_17_template_data_loc,'textsize':25,'spacing':63,'max_text_len':13,'code_type':'None','max_code_len':8,'max_entries':12,'units':True}

    R_4_43_headers = ['code','desc','unit','float','float','float']
    R_4_43_position = [(137,915),(268,915),(1074,920),(1225,915),(1432,915),(1565,915)]
    R_4_43_template = '../templates/R-4-43.jpg'
    R_4_43_template_size = (1680,2107)
    R_4_43_template_data_loc = {"invoice_number":[[1273.0, 183.0], [1503.0, 187.0], [1497.0, 228.0], [1279.0, 232.0]],
                    "total":[[1556.0, 1939.0], [1661.0, 1938.0], [1672.0, 1986.0], [1564.0, 1978.0]],
                    "date_n":[[358.0,275.0],[536.0,281.0],[538.0,309.0],[352.0,300.0]],
                    "table_data":[[139.0, 782.0], [1661, 784.0], [1637.0,1474.0], [137.0, 1450.0]]}
    R_4_43 = {'headers':R_4_43_headers,'position':R_4_43_position,'template':R_4_43_template,'template_size':R_4_43_template_size,'template_data_loc':R_4_43_template_data_loc,'textsize':30,'spacing':35,'max_code_len':5,'code_type':'None','max_text_len':13,'max_entries':15,'units':True}

    R_4_46_headers = ['index','code','desc','unit','float','float']
    R_4_46_position = [(120,690),(190,690),(417,690),(1082,690),(1175,690),(1340,690)]
    R_4_46_template = '../templates/R-4-46.jpg'
    R_4_46_template_size = (1571,2029)
    R_4_46_template_data_loc = {"invoice_number":[[1283.0, 316.0], [1441.0, 308.0], [1444.0, 345.0], [1285.0, 348.0]],
                    "total":[[1341.0, 1635.0], [1434.0, 1633.0], [1437.0, 1663.0], [1341.0, 1660.0]],
                    "date_n":[[1136.0,605.0],[1423.0,593.0],[1425.0,627.0],[1125.0,632.0]],
                    "table_data":[[105.0, 649.0], [1425, 643.0], [1431.0,1518.0], [99.0, 1512.0]]}
    R_4_46 = {'headers':R_4_46_headers,'position':R_4_46_position,'template':R_4_46_template,'template_size':R_4_46_template_size,'template_data_loc':R_4_46_template_data_loc,'textsize':24,'spacing':40,'max_code_len':13,'code_type':'int','max_text_len':13,'max_entries':15,'units':False}

    R_4_47_headers = ['index','desc','unit','float','float']
    R_4_47_position = [(198,840),(269,840),(1050,840),(1192,840),(1500,840)]
    R_4_47_template = '../templates/R-4-47.jpg'
    R_4_47_template_size = (1658,1988)
    R_4_47_template_data_loc = {"invoice_number":[[1320.0, 267.0], [1505.0, 260.0], [1503.0, 287.0], [1318.0, 294.0]],
                    "total":[[1454.0, 1804.0], [1554.0, 1800.0], [1561.0, 1833.0], [1451.0, 1834.0]],
                    "date":[[1274.0,390.0],[1440.0,379.0],[1446.0,411.0],[1268.0,416.0]],
                    "table_data":[[173.0, 703.0], [1581, 691.0], [1597.0,1601.0], [163.0, 1585.0]]}
    R_4_47 = {'headers':R_4_47_headers,'position':R_4_47_position,'template':R_4_47_template,'template_size':R_4_47_template_size,'template_data_loc':R_4_47_template_data_loc,'textsize':24,'spacing':61,'max_code_len':13,'code_type':'int','max_text_len':13,'max_entries':12,'units':True}

    R_4_56_headers = ['index','code','desc','unit','float','float']
    R_4_56_position = [(53,486),(97,486),(348,486),(1073,486),(1293,486),(1550,486)]
    R_4_56_template = '../templates/R-4-56.jpg'
    R_4_56_template_size = (1746,1976)
    R_4_56_template_data_loc = {"invoice_number":[[1480.0, 337.0], [1695.0, 339.0], [1695.0, 379.0], [1479.0, 374.0]],
                    "total":[[1629.0, 1665.0], [1707.0, 1665.0], [1701.0, 1694.0], [1607.0, 1694.0]],
                    "date":[[1283.0,332.0],[1427.0,340.0],[1432.0,370.0],[1269.0,372.0]],
                    "table_data":[[6.0, 362.0], [1699, 389.0], [1713.0,1364.0], [110.0, 1343.0]]}
    R_4_56 = {'headers':R_4_56_headers,'position':R_4_56_position,'template':R_4_56_template,'template_size':R_4_56_template_size,'template_data_loc':R_4_56_template_data_loc,'textsize':26,'spacing':40,'max_code_len':13,'code_type':'special56','max_text_len':13,'max_entries':20,'units':True}

    template_options = [R_4_17,R_4_43,R_4_46,R_4_47,R_4_56]
    pending_rows = []

    for i in range(iterations):
        start_time = time.time()
        random_pick = random.choice(template_options)
        test = image_generator(random_pick)
        results = run_ocr_template(test['image'],random_pick["template_data_loc"],random_pick['template_size'])
        # Logged duration covers image generation plus OCR, not the scoring below.
        time_taken = time.time() - start_time
        print("Test ",i+1," Parameters")
        print("Rotation Angle: ",test['rot_angle'],"\tEnhance Level: ", test['enhance_level'])
        print("Results")
        if results:
            table_found = 1

            # --- Similarity: order-independent fuzzy matching -------------
            # Flattening is loop-invariant, so it is done once up front.
            expected_cells = np.array(test['test_data']).flatten().tolist()
            # Only the first len(expected_cells) OCR cells are considered.
            ocr_cells = np.array(results['table_data']['row_data']).flatten().tolist()[:len(expected_cells)]
            similarity = 0
            matched = []
            for expected in expected_cells:
                for found in ocr_cells:
                    # `matched` is checked by value, so a repeated OCR value can be credited only once.
                    if found is not None and found not in matched and _loosely_equal(expected, found):
                        similarity+=1
                        matched.append(found)
                        break
            for field, value in test['other_data'].items():
                if field in results and _loosely_equal(results[field], value):
                    similarity+=1
            similarity = 100*similarity/(len(expected_cells)+len(test['other_data']))
            print("Similarity: ",similarity,"%",end='\t')

            # --- Accuracy: exact, position-by-position comparison ---------
            ocr_rows = results['table_data']['row_data']
            count = 0
            accuracy = 0
            for row in range(len(test['test_data'])):
                for col in range(len(test['test_data'][0])):
                    # NOTE(review): `len(...)-1 > index` skips the last OCR row
                    # and the last OCR column; confirm whether that is
                    # intentional or an off-by-one in the bounds check.
                    if len(ocr_rows)-1>row and len(ocr_rows[row])-1>col:
                        count+=1
                        # NOTE(review): generated cells keep their Python types
                        # (int/float/str); if OCR cells are strings, numeric
                        # cells can never compare equal here - confirm.
                        if test['test_data'][row][col] == ocr_rows[row][col]:
                            accuracy+=1
            for field, value in test['other_data'].items():
                if field in results and results[field].casefold() == str(value).casefold():
                    accuracy+=1
            accuracy = 100*accuracy/(count+len(test['other_data']))
            print("Accuracy: ",accuracy,"%")
        else:
            table_found = 0
            similarity = 0
            accuracy = 0
            print("Error. Table not found.")
        print("Time Taken: %s seconds" % (time.time() - start_time))
        print("-----------------------------------------------------")
        print(results)
        print()
        # Stringify up front so every field is written exactly as str() renders it.
        pending_rows.append([str(field) for field in (i, test['rot_angle'], test['enhance_level'], table_found, similarity, accuracy, time_taken, results)])
        if len(pending_rows)>2 or i == iterations-1:
            _append_test_log(pending_rows)
            pending_rows = []
        print("-----------------------------------------------------")
        

In [7]:
# errors = 0
# while(1):
#     try:
#         run_test(1)
#     except:
#         errors+=1
#         continue

In [10]:
# Run a single test iteration (one randomly chosen template) end to end.
run_test(1)

KeyboardInterrupt: 