In [1]:
import collections
import glob
import math

import numpy as np
import pandas as pd

In [2]:
# pd.read_csv made some mistakes when reading the tsv files
# this function is able to read the files more reliably
def read_csv(path, sep= ',', names=["PAGENAME", "OCR", "GT"]):
    """Read a delimited text file into a DataFrame, keeping only well-formed rows.

    The file content is split on newlines and every line is split on ``sep``.
    No quoting or escaping is interpreted. Lines that do not produce exactly
    ``len(names)`` fields are dropped silently.

    Parameters
    ----------
    path : str
        Path of the file to read; it is decoded as UTF-8.
    sep : str
        Field separator.
    names : list of str
        Column names of the result. Their number is also the number of fields
        a line must have to be kept.

    Returns
    -------
    pandas.DataFrame
        One row per kept line, with the columns ``names``. When no line
        qualifies the frame is empty but still has these columns.
    """
    n_fields = len(names)
    with open(path, 'r', encoding='utf8') as file:
        # Split each line once and reuse the result for the length filter.
        split_rows = (row.split(sep) for row in file.read().split("\n"))
        data = [fields for fields in split_rows if len(fields) == n_fields]
    # A list of rows (rather than np.array) keeps the column count even when
    # `data` is empty; an empty ndarray is 1-D and clashes with `columns`.
    return pd.DataFrame(data=data, columns=list(names))

In [3]:
# Excel workbook that is loaded into df_full further down.
ocr_file_0 = "../ocr_data/nlf_ocr_gt_tescomb5_2017.xlsx"
# Root directory that is searched for the sampled .tsv files.
path = '../ocr_data/'
# "**" together with recursive=True also descends into sub-directories.
# glob returns matches in arbitrary order, so sort them to make the row order
# of everything built from `folders` reproducible between runs.
folders = sorted(glob.glob(path + "**/*.tsv", recursive=True))

In [4]:
# Read every sampled .tsv file and stack the resulting frames on top of each other.
df_sampled = pd.concat(
    read_csv(tsv_path, sep="\t", names=["PAGENAME", "OCR", "GT"])
    for tsv_path in folders
)

In [5]:
# reading the excel file takes a while
# header=3 makes the fourth row of the sheet the header and already ignores the
# rows above it, so no separate row skipping is needed. The former `skip_rows`
# argument is not a read_excel parameter and raises TypeError on current pandas.
df_full = pd.read_excel(ocr_file_0, sheet_name="Words", header=3)

In [6]:
# Pull the individual word columns out as string arrays.
# `np.str` was only an alias of the builtin `str` and has been removed from
# NumPy (1.24+); the builtin gives the same result on every version.
# NOTE(review): the column positions are taken from the variable names
# (5 = ground truth, 6-8 = OCR variants) - confirm against the sheet header.
ground_truth = df_full.values[:,5].astype(str)
ocr_tess = df_full.values[:,6].astype(str)
ocr_old = df_full.values[:,7].astype(str)
ocr_fr11 = df_full.values[:,8].astype(str)
ocr_sampled = df_sampled.values[:,1].astype(str)
ocr_sampled_gt = df_sampled.values[:,2].astype(str)

In [7]:
# join the individual words into pages
def join(array):
    """Return the elements of ``array`` as one string, separated by single spaces.

    Every element is converted with ``str`` first, so non-string items are allowed.
    """
    words = [str(item) for item in array]
    return " ".join(words)
def combine_to_pages(dataframe, column, page_column=3):
    """Concatenate the values of one column into one string per page.

    A page is a run of consecutive rows that share the same value in
    ``page_column``. The rows are split wherever that value changes, so the
    result follows the row order of the frame. (Counting the rows per page with
    ``np.unique`` would return the counts in sorted key order and misplace the
    split points whenever the pages are not stored in sorted order.)

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with one row per word.
    column : int
        Position of the column whose values are joined.
    page_column : int, optional
        Position of the column that identifies the page. Defaults to 3.

    Returns
    -------
    numpy.ndarray
        One string per page; the values are converted with ``str`` and joined
        with single spaces.
    """
    values = dataframe.values
    page_ids = values[:, page_column]
    # Index of the first row of every page except the first one.
    boundaries = np.flatnonzero(page_ids[1:] != page_ids[:-1]) + 1
    pages = np.split(values[:, column], boundaries)
    return np.array([" ".join(map(str, page)) for page in pages])
# One text per page for each word column of the full data set
# (same column positions as used for the word arrays: 5, 6, 7 and 8).
pages_gt, pages_tess, pages_old, pages_fr11 = (
    combine_to_pages(df_full, column) for column in (5, 6, 7, 8)
)
# NOTE(review): this rebinds ocr_sampled to the raw column, without a string cast.
ocr_sampled = df_sampled.values[:, 1]
# Preview: the first 140 characters of the second page.
pages_gt[1][:140]

'että kunnille sallittaisiin walta sulkea kapakat alueellaan, mutta wielä ei ole semmoinen laki tullut toimeen. Parlamentti on tosin kerran h'

In [8]:
def entropy(string):
    """Calculate the Shannon entropy (in bits) of a string.

    Parameters
    ----------
    string : object
        The text to analyse; it is converted with ``str`` first.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the relative frequencies ``p`` of the
        distinct characters. The empty string yields ``0.0``.
    """
    string = str(string)
    if not string:
        # Return a float here as well: np.vectorize infers its output dtype from
        # the first result, and an int 0 would truncate all later values.
        return 0.0

    total = len(string)
    # One pass over the string instead of one str.count() call per distinct character.
    frequencies = collections.Counter(string)

    return -sum(
        (count / total) * math.log2(count / total)
        for count in frequencies.values()
    )
    
def per_char_entropy(string):
    """Return the Shannon entropy of ``string`` divided by its number of characters.

    The divisor never drops below 1, so an empty string does not cause a
    division by zero.
    """
    text = str(string)
    n_chars = len(text) if text else 1
    return entropy(string) / n_chars


def entropy_ideal(length):
    """Calculate the ideal Shannon entropy (in bits) of a string with given length.

    The ideal case is a string of ``length`` pairwise distinct characters, each
    with probability ``1 / length``. The sum
    ``-length * (1 / length) * log2(1 / length)`` reduces to ``log2(length)``,
    which is computed directly to avoid the rounding error of ``length * (1 / length)``.

    Parameters
    ----------
    length : int
        Number of characters.

    Returns
    -------
    float
        ``log2(length)``; ``0.0`` for a length of 0.

    Raises
    ------
    ValueError
        If ``length`` is negative.
    """
    if length == 0:
        # An empty string carries no information; avoids dividing by zero.
        return 0.0
    return math.log2(length)
# Element-wise versions of the entropy functions for use on arrays of tokens.
# otypes is stated explicitly: without it np.vectorize determines the output
# dtype by calling the function on the first element, which fails for empty
# input arrays and lets a single first value decide the dtype of the whole result.
vectorEntropy = np.vectorize(entropy, otypes=[float])
vectorPerCharEntropy = np.vectorize(per_char_entropy, otypes=[float])

In [9]:
# Character frequency table of each text source (all pages / tokens glued together).
counts_tess = collections.Counter("".join(pages_tess))
counts_old = collections.Counter("".join(pages_old))
counts_fr11 = collections.Counter("".join(pages_fr11))
counts_gt = collections.Counter("".join(pages_gt))
counts_sampled = collections.Counter("".join(ocr_sampled))

# Total number of characters per source. The counts of a Counter add up to the
# length of the string it was built from, so the strings need not be rebuilt.
len_tess = sum(counts_tess.values())
len_old = sum(counts_old.values())
len_fr11 = sum(counts_fr11.values())
len_gt = sum(counts_gt.values())
len_sampled = sum(counts_sampled.values())

# The five rarest characters of the old OCR output, most frequent of them first.
counts_old.most_common()[-5:]

[('ü', 5), ('Å', 5), ('#', 4), ('™', 1), ('Ü', 1)]

In [10]:
def KL_divergence(string_ratio_corpus_ratio):
    """Return one term ``p * log2(p / q)`` of the Kullback-Leibler divergence.

    Parameters
    ----------
    string_ratio_corpus_ratio : tuple
        Pair ``(p, q)``: the relative frequency of a character in the string
        and its relative frequency in the reference corpus.

    Returns
    -------
    float
        ``p * log2(p / q)``. The limiting values are used at the borders:
        ``0.0`` when ``p`` is 0 and ``inf`` when ``q`` is 0 while ``p`` is not,
        instead of NaN or a ZeroDivisionError.
    """
    string_ratio, corpus_ratio = string_ratio_corpus_ratio
    if string_ratio == 0:
        # 0 * log(0) is taken as 0 by convention.
        return 0.0
    if corpus_ratio == 0:
        # The character never occurs in the corpus: infinitely surprising.
        return np.inf
    return string_ratio*np.log2(string_ratio / corpus_ratio)
    
def mean_KL_divergence(string, counts_data, len_data):
    """Average the per-character KL terms of a string against corpus statistics.

    For every character position of ``str(string)`` the relative frequency of
    that character in the string is compared with ``counts_data[char] / len_data``
    via ``KL_divergence``. Repeated characters contribute once per occurrence.

    Returns 0 for the empty string, otherwise the mean of the terms.
    """
    text = str(string)
    n_chars = len(text)
    if n_chars == 0:
        return 0
    per_char_terms = [
        KL_divergence((text.count(char) / n_chars, counts_data[char] / len_data))
        for char in text
    ]
    # max and mean have similar results, sum finds longer words
    return np.mean(per_char_terms)
    
    
# Example: score a two-character string against the character statistics of pages_old.
mean_KL_divergence("üö", counts_old, len_old)

6.419403082960818

In [11]:
def map_tess_KL(string):
    """Mean KL divergence of ``string`` measured against the character statistics of pages_tess."""
    return mean_KL_divergence(string, counts_data=counts_tess, len_data=len_tess)
def map_old_KL(string):
    """Mean KL divergence of ``string`` measured against the character statistics of pages_old."""
    return mean_KL_divergence(string, counts_data=counts_old, len_data=len_old)
def map_fr11_KL(string):
    """Mean KL divergence of ``string`` measured against the character statistics of pages_fr11."""
    return mean_KL_divergence(string, counts_data=counts_fr11, len_data=len_fr11)
def map_gt_KL(string):
    """Mean KL divergence of ``string`` measured against the character statistics of pages_gt."""
    return mean_KL_divergence(string, counts_data=counts_gt, len_data=len_gt)
def map_sampled_KL(string):
    """Mean KL divergence of ``string`` measured against the character statistics of ocr_sampled."""
    return mean_KL_divergence(string, counts_data=counts_sampled, len_data=len_sampled)
# Score every token of each source against the character statistics of that same source.
kl_tess = np.array([map_tess_KL(token) for token in ocr_tess])
kl_old = np.array([map_old_KL(token) for token in ocr_old])
kl_fr11 = np.array([map_fr11_KL(token) for token in ocr_fr11])
kl_gt = np.array([map_gt_KL(token) for token in ground_truth])
kl_sampled = np.array([map_sampled_KL(token) for token in ocr_sampled])

In [12]:
# A token is predicted to be correct (1) when its KL score lies below the
# threshold and wrong (0) otherwise.
kl_treshold = 16
kl_treshold_tess_int = 1 * (kl_tess < kl_treshold)
kl_treshold_old_int = 1 * (kl_old < kl_treshold)
kl_treshold_fr11_int = 1 * (kl_fr11 < kl_treshold)
kl_treshold_gt_int = 1 * (kl_gt < kl_treshold)
kl_treshold_sampled_int = 1 * (kl_sampled < kl_treshold)

# Reference labels the predictions are compared with.
# NOTE(review): columns 9-11 are assumed to hold the 0/1 correctness flags of
# the tess / old / fr11 tokens - confirm against the sheet header.
true_tess = df_full.values[:, 9]
true_old = df_full.values[:, 10]
true_fr11 = df_full.values[:, 11]

# Share of tokens on which prediction and reference label disagree.
print("error of estimation tess", 1 - np.mean(kl_treshold_tess_int == true_tess))
print("error of estimation old", 1 - np.mean(kl_treshold_old_int == true_old))
print("error of estimation fr11", 1 - np.mean(kl_treshold_fr11_int == true_fr11))
# Share of tokens predicted to be correct.
print("predicted_quality tess", np.mean(kl_treshold_tess_int))
print("predicted_quality old", np.mean(kl_treshold_old_int))
print("predicted_quality fr11", np.mean(kl_treshold_fr11_int))
print("predicted_quality gt", np.mean(kl_treshold_gt_int))
print("predicted_quality sampled", np.mean(kl_treshold_sampled_int))
# Mean of the reference labels.
print("true_quality tess", np.mean(true_tess))
print("true_quality old", np.mean(true_old))
print("true_quality fr11", np.mean(true_fr11))

error of estimation tess 0.18650327580696713
error of estimation old 0.29585930009587724
error of estimation fr11 0.23069071588366885
predicted_quality tess 0.9999240971556408
predicted_quality old 0.999938079258549
predicted_quality fr11 0.9999960051134548
predicted_quality gt 0.9996524448705657
predicted_quality sampled 1.0
true_quality tess 0.8135246883988495
true_quality old 0.7041506871204858
true_quality fr11 0.7693092841163311


In [13]:
# a lot of the tokens marked wrong are actually correct, just unusual
# Positions of the tokens that the threshold classifies as wrong. The
# prediction treats a token as correct only when its score is strictly below
# kl_treshold, so a score exactly equal to the threshold counts as wrong
# here too (">=" rather than ">").
marked_wrong_tess = np.flatnonzero(kl_tess >= kl_treshold).tolist()
# OCR token first, the ground-truth token at the same positions below it.
print(ocr_tess[marked_wrong_tess])
print(ground_truth[marked_wrong_tess])
marked_wrong_old = np.flatnonzero(kl_old >= kl_treshold).tolist()
print(ocr_old[marked_wrong_old])
print(ground_truth[marked_wrong_old])

['®' 'Q' '+' '+' '\\\\' '*' 'Z' '+' '+' '+' '+' '+' '+' '+' '+' '%' 'X'
 'Q' 'Ö' 'Q' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+'
 '+' 'Q' '+']
['O' 'Helsingissä,' '+' '+' '"' '½' 'Zefanias' '+' '+' '+' '+' '+' '+' '+'
 '+' '%' 'X' 'Korhonen' 'Toiwotan' 'Awaimia' '+' '+' '+' '4' '+' '+' '+'
 '+' '+' '+' '+' '+' '"' '+' '+' '+' 'L.' '+9']
['&' '&' '&' '&' '%' '&' '&' '&' '{' '•' '•' '&' '&' '+' '%' '%' '&' '%'
 '&' '_' '+' '__' '+' '+' '+' 'É' '•' '•' '&' '%' '&']
['&' '&' '&' '&' '%,' 'à' '&' '&' '½' '"' '"' '&' '&' '+' '%' 'laina' 'L'
 '%' '&' '—' '+' '—' '+' '+' '+' '&' '3.738:' 'A.' 'Lignell' '%' '&']


In [14]:
# Confusion matrix of the threshold prediction for the old OCR output.
# Boolean masks: what was predicted, and what the reference label says.
predicted_one = kl_treshold_old_int == 1
predicted_zero = kl_treshold_old_int == 0
label_one = df_full.values[:, 10] == 1
label_zero = df_full.values[:, 10] == 0

# True Positive (TP): we predict a label of 1 (positive), and the true label is 1.
TP = np.sum(predicted_one & label_one)

# True Negative (TN): we predict a label of 0 (negative), and the true label is 0.
TN = np.sum(predicted_zero & label_zero)

# False Positive (FP): we predict a label of 1 (positive), but the true label is 0.
FP = np.sum(predicted_one & label_zero)

# False Negative (FN): we predict a label of 0 (negative), but the true label is 1.
FN = np.sum(predicted_zero & label_one)

print(TP,FP,TN,FN)

352508 148101 13 18
