In [1]:
import pandas as pd
import numpy as np
import glob

In [2]:
# pd.read_csv made some mistakes when reading the tsv files
# this function is able to read the files more reliably
def read_csv(path, sep=',', names=["PAGENAME", "OCR", "GT"]):
    """Read a delimited text file into a DataFrame, keeping only well-formed rows.

    The file is decoded as UTF-8, split into lines on newline characters and
    each line is split on ``sep``. Lines whose number of fields does not match
    the number of column names are dropped silently (this also discards the
    empty string left after a trailing newline). No quoting or escaping is
    interpreted.

    Parameters
    ----------
    path : str
        Path of the file to read.
    sep : str
        Field separator passed to ``str.split``.
    names : sequence of str
        Column names of the resulting frame; its length is the required
        number of fields per line.

    Returns
    -------
    pandas.DataFrame
        One row per well-formed line, every value a string. A file without
        any well-formed line yields an empty frame that still carries the
        requested columns.
    """
    # The original check was a literal 3; keep that for names=None.
    n_fields = 3 if names is None else len(names)
    with open(path, 'r', encoding='utf8') as file:
        lines = file.read().split("\n")
    # Split each line once and keep only rows with the expected field count.
    records = [fields for fields in (line.split(sep) for line in lines)
               if len(fields) == n_fields]
    # Passing the list of rows directly (instead of np.array(rows)) keeps the
    # constructor happy when `records` is empty: np.array([]) is 1-D and does
    # not match a multi-column frame.
    return pd.DataFrame(data=records, columns=names)

In [3]:
# Excel workbook that is loaded into `df_full` further down.
ocr_file_0 = "../ocr_data/nlf_ocr_gt_tescomb5_2017.xlsx"
# Directory that is searched recursively for the sampled .tsv files.
path = '../ocr_data/'
# NOTE: despite its name, `folders` holds the paths of the matching .tsv *files*.
# glob returns matches in arbitrary (filesystem) order; sorting makes the row
# order of the concatenated frame reproducible between runs and machines.
folders = sorted(glob.glob(path + "**/*.tsv", recursive=True))

In [4]:
# Load every sampled .tsv file and stack the rows into a single frame.
# ignore_index=True gives the result one continuous 0..n-1 index; without it
# each file's own 0-based index is kept and row labels repeat across files.
# The loop variable is named `tsv_path` so it does not shadow the module-level
# `path` directory string.
df_sampled = pd.concat(
    [read_csv(tsv_path, sep="\t", names=["PAGENAME", "OCR", "GT"]) for tsv_path in folders],
    ignore_index=True,
)

In [5]:
# reading the excel file takes a while
# header=3 takes the column names from the 4th row of the "Words" sheet; the
# rows above it are not part of the parsed data, so nothing else has to be
# skipped. The former `skip_rows=[0,1,2]` argument was dropped: read_excel has
# no such parameter (the real one is spelled `skiprows`), so it was either
# ignored or rejected as an unexpected keyword, depending on the pandas version.
df_full = pd.read_excel(ocr_file_0, sheet_name="Words", header=3)

In [6]:
# Pull individual columns out by position and convert them to NumPy string arrays.
# Positions 5-8 of the workbook are presumably ground truth, Tesseract, the old
# OCR and FineReader 11 output (inferred from the variable names -- verify
# against the sheet header).
# `np.str` was only an alias of the builtin `str` and has been removed from
# NumPy (deprecated in 1.20, gone in 1.24); `astype(str)` is the exact equivalent.
# Note that astype(str) stringifies every cell, so a non-string value such as a
# NaN from an empty cell becomes the text 'nan'.
ground_truth = df_full.values[:,5].astype(str)
ocr_tess = df_full.values[:,6].astype(str)
ocr_old = df_full.values[:,7].astype(str)
ocr_fr11 = df_full.values[:,8].astype(str)
# Columns 1 and 2 of the sampled frame are "OCR" and "GT" respectively.
ocr_sampled = df_sampled.values[:,1].astype(str)
ocr_sampled_gt = df_sampled.values[:,2].astype(str)

In [7]:
# cross referencing 3 ocr results and marking it correct if two of them have the same result
# correctly marks ~90% of the results, requires at least 3 different ocr results to be used
# in other words the quality estimates can vary at least +-10% from the true values
# if the quality of the ocr results varies a lot between implementations, the best and worst are hard to distinguish

# Element-wise agreement between each pair of OCR outputs, computed once.
tess_eq_old = ocr_tess == ocr_old
tess_eq_fr11 = ocr_tess == ocr_fr11
old_eq_fr11 = ocr_old == ocr_fr11

# A word counts as "predicted correct" for an engine when at least one of the
# other two engines produced the same string; `cross_cmp` requires all three
# to agree.
cross_cmp = tess_eq_old & tess_eq_fr11
cross_cmp_tess = tess_eq_old | tess_eq_fr11
cross_cmp_old = tess_eq_old | old_eq_fr11
cross_cmp_fr11 = tess_eq_fr11 | old_eq_fr11

# 0/1 integer versions of the boolean masks.
cross_cmp_int = cross_cmp.astype(int)
cross_cmp_tess_int = cross_cmp_tess.astype(int)
cross_cmp_old_int = cross_cmp_old.astype(int)
cross_cmp_fr11_int = cross_cmp_fr11.astype(int)

# Columns 9-11 of the workbook are compared against the predictions below;
# presumably they hold the per-word correctness flag of each engine -- verify
# against the sheet header.
full_values = df_full.values
true_tess = full_values[:, 9]
true_old = full_values[:, 10]
true_fr11 = full_values[:, 11]

quality_report = [
    ("error of estimation tess", 1 - np.mean(cross_cmp_tess_int == true_tess)),
    ("error of estimation old ", 1 - np.mean(cross_cmp_old_int == true_old)),
    ("error of estimation fr11", 1 - np.mean(cross_cmp_fr11_int == true_fr11)),
    ("predicted_quality combined", np.mean(cross_cmp_int)),  # all match
    ("predicted_quality tess", np.mean(cross_cmp_tess_int)),  # at least one match
    ("predicted_quality old ", np.mean(cross_cmp_old_int)),
    ("predicted_quality fr11", np.mean(cross_cmp_fr11_int)),
    ("true_quality tess", np.mean(true_tess)),
    ("true_quality old ", np.mean(true_old)),
    ("true_quality fr11", np.mean(true_fr11)),
]
for label, value in quality_report:
    print(label, value)

error of estimation tess 0.13683285394694789
error of estimation old  0.08856663470757431
error of estimation fr11 0.10438438798338123
predicted_quality combined 0.6034096356663471
predicted_quality tess 0.7501138542665389
predicted_quality old  0.7691714605305209
predicted_quality fr11 0.8104705976350272
true_quality tess 0.8135246883988495
true_quality old  0.7041506871204858
true_quality fr11 0.7693092841163311
