In [45]:
import pandas as pd
import numpy as np
import pytesseract
from torchmetrics.functional import char_error_rate, word_error_rate
from skimage.transform import rotate
import PIL

In [2]:
from sibigrapi2023.pipelines.processing.utils import accuracyByLevenshteinDistance, cleanText

In [3]:
labels = context.catalog.load("labels")

In [4]:
express_expense = context.catalog.load("express_expense")

In [5]:
l1 = labels['1000-receipt.txt']()

In [6]:
r1 = pytesseract.image_to_string(express_expense['1000-receipt.jpg']())

In [7]:
accuracyByLevenshteinDistance(r1, l1)

0.9433962264150944

In [8]:
import time
start = time.process_time()
# your code here    
print(time.process_time() - start)

0.0002499369999995338


In [53]:
def generate_results(preprocess_image_func):
    results = list()
    for file_name, loader in express_expense.items():
        image = loader()
        image = preprocess_image_func(image)
        label = labels[file_name.replace('.jpg', '.txt')]()
        start = time.process_time()
        ocr_text = pytesseract.image_to_string(image)
        ocr_time = time.process_time() - start
        result = {'image_file': file_name,
                  'ld': accuracyByLevenshteinDistance(ocr_text, label),
                  'wer': float(word_error_rate(preds=cleanText(ocr_text), target=cleanText(label))),
                  'cer': float(char_error_rate(preds=cleanText(ocr_text), target=cleanText(label))),
                  'br': 1 if ocr_text else 0,
                  'ba': int(ocr_text == label),
                  'run_time': ocr_time}
        results.append(result)
    return pd.DataFrame(results)

def rotate5(image):
    return (rotate(np.array(image), 5, resize=True) * 255).astype(np.uint8)
def rotate10(image):
    return (rotate(np.array(image), 10, resize=True) * 255).astype(np.uint8)
def rotate15(image):
    return (rotate(np.array(image), 15, resize=True) * 255).astype(np.uint8)
def rotate20(image):
    return (rotate(np.array(image), 20, resize=True) * 255).astype(np.uint8)
def rotate_m5(image):
    return (rotate(np.array(image), -5, resize=True) * 255).astype(np.uint8)
def rotate_m10(image):
    return (rotate(np.array(image), -10, resize=True) * 255).astype(np.uint8)
def rotate_m15(image):
    return (rotate(np.array(image), -15, resize=True) * 255).astype(np.uint8)
def rotate_m20(image):
    return (rotate(np.array(image), -20, resize=True) * 255).astype(np.uint8)

In [16]:
default_results = list()
for file_name, loader in express_expense.items():
    image = loader()
    label = labels[file_name.replace('.jpg', '.txt')]()
    start = time.process_time()
    ocr_text = pytesseract.image_to_string(image)
    ocr_time = time.process_time() - start
    result = {'image_file': file_name,
              'ld': accuracyByLevenshteinDistance(ocr_text, label),
              'wer': float(word_error_rate(preds=cleanText(ocr_text), target=cleanText(label))),
              'cer': float(char_error_rate(preds=cleanText(ocr_text), target=cleanText(label))),
              'br': 1 if ocr_text else 0,
              'ba': int(ocr_text == label),
              'run_time': ocr_time}
    default_results.append(result)

In [17]:
df = pd.DataFrame(default_results)

In [19]:
df.to_csv("default_results_aggregated.csv")

In [20]:
df

Unnamed: 0,image_file,ld,wer,cer,br,ba,run_time
0,1000-receipt.jpg,0.943396,1.0,0.056604,1,0,0.665724
1,1001-receipt.jpg,0.000000,1.0,1.000000,0,0,0.489725
2,1002-receipt.jpg,0.187166,1.0,0.812834,1,0,0.587938
3,1003-receipt.jpg,0.000000,1.0,1.000000,0,0,0.543879
4,1004-receipt.jpg,0.826087,1.0,0.173913,1,0,0.477913
...,...,...,...,...,...,...,...
186,1195-receipt.jpg,0.652422,1.0,0.347578,1,0,0.463046
187,1196-receipt.jpg,0.673835,1.0,0.326165,1,0,0.092701
188,1197-receipt.jpg,0.819444,1.0,0.180556,1,0,0.104788
189,1198-receipt.jpg,0.792683,1.0,0.207317,1,0,0.137842


In [21]:
df.describe()

Unnamed: 0,ld,wer,cer,br,ba,run_time
count,191.0,191.0,191.0,191.0,191.0,191.0
mean,0.590971,1.0,0.409029,0.973822,0.0,0.443409
std,0.289236,0.0,0.289236,0.160084,0.0,0.726278
min,0.0,1.0,0.015209,0.0,0.0,0.041347
25%,0.3835,1.0,0.17751,1.0,0.0,0.15727
50%,0.671733,1.0,0.328267,1.0,0.0,0.449366
75%,0.82249,1.0,0.6165,1.0,0.0,0.541029
max,0.984791,1.0,1.0,1.0,0.0,9.980534


In [54]:
r5 = generate_results(rotate5)

In [55]:
r5.describe()

Unnamed: 0,ld,wer,cer,br,ba,run_time
count,191.0,191.0,191.0,191.0,191.0,191.0
mean,0.331409,1.0,0.668591,0.853403,0.0,0.59768
std,0.274867,0.0,0.274867,0.354633,0.0,0.647033
min,0.0,1.0,0.038023,0.0,0.0,0.132815
25%,0.071186,1.0,0.452484,1.0,0.0,0.363306
50%,0.309028,1.0,0.690972,1.0,0.0,0.569704
75%,0.547516,1.0,0.928814,1.0,0.0,0.65858
max,0.961977,1.0,1.0,1.0,0.0,9.12189


In [56]:
r10 = generate_results(rotate10)

In [57]:
r10.describe()

Unnamed: 0,ld,wer,cer,br,ba,run_time
count,191.0,191.0,191.0,191.0,191.0,191.0
mean,0.061472,1.0,0.938528,0.481675,0.0,0.586499
std,0.104725,0.0,0.104725,0.500977,0.0,0.354999
min,0.0,1.0,0.305221,0.0,0.0,0.146099
25%,0.0,1.0,0.914526,0.0,0.0,0.318052
50%,0.0,1.0,1.0,0.0,0.0,0.584234
75%,0.085474,1.0,1.0,1.0,0.0,0.739857
max,0.694779,1.0,1.0,1.0,0.0,3.858037
