In [1]:
import os
from tqdm.notebook import tqdm
from pathlib import Path
import pandas as pd
from nltk.lm import Vocabulary
import pickle
import sys
import torch
import importlib
from timeit import default_timer as t
sys.path.append("../../lib")
from metrics import levenshtein
import ocr_correction
from pytorch_decoding import seq2seq

In [2]:
folder = "../../data/ICDAR2019_POCR_competition_dataset/ICDAR2019_POCR_competition_evaluation_4M_without_Finnish/DE/"

In [3]:
files = sorted(os.listdir(folder))
len(files)

7

In [4]:
import glob

files = glob.glob(folder + '/**/*.txt', recursive=True)
len(files)

2028

In [5]:
data = []
for f in tqdm(files):
    with open(f) as file:
        data.append(file.readlines())

  0%|          | 0/2028 [00:00<?, ?it/s]

In [6]:
data = pd.DataFrame(data, 
                    columns = ["ocr_to_input", 
                               "ocr_aligned", 
                               "gs_aligned"])\
.assign(ocr_to_input = lambda df: df.ocr_to_input.str.replace("[OCR_toInput] ", "", regex = False),
        ocr_aligned = lambda df: df.ocr_aligned.str.replace("[OCR_aligned] ", "", regex = False),
        gs_aligned = lambda df: df.gs_aligned.str.replace("[ GS_aligned] ", "", regex = False))

print(data.shape)
data.head()

(2028, 3)


Unnamed: 0,ocr_to_input,ocr_aligned,gs_aligned
0,unfrucddtbare Qwitter unter Dden frudtbaren ba...,unfrucddtbare Qwitter unter Dden fru@dtbaren b...,unfruc@htbare Zwitter unter @den fruchtbaren@h...
1,Pertonen ettwan begafftet toctden / Gar feirie...,Pertonen ettwan begafftet toctden / Gar feirie...,Perſonen e@twan behafftet @werden / hat ſei@ne...
2,Ereffriglith. OSrpbeue vnd Apu— beFonmmen: Dno...,Ereffriglith. OSrpbeue vnd Apu— beFonmmen: Dno...,krefftiglich. O@rpheus vnd Apu@⸗beko@mmen: Vnd...
3,tifd—&cenufß abftecpen. ©tulfeor (den folle. g...,tifd—&cenufß abftecpen. ©tulfeor @(den folle. ...,Tiſch⸗Genu@ß abſtechen.@Stulfeyr ſaͤen ſolle.@...
4,©amen tn den ©cdQértlin/ Alliaria mwirbd eS in...,©amen tn den ©cdQértlin@/ Alliaria mwirbd eS i...,Samen in den Schoͤttlin / Alliaria @wir@d es i...


In [7]:
data.applymap(len).describe()

Unnamed: 0,ocr_to_input,ocr_aligned,gs_aligned
count,2028.0,2028.0,2028.0
mean,1541.066075,1583.443787,1582.443787
std,664.150889,708.503065,708.503065
min,5.0,5.0,4.0
25%,1311.0,1347.0,1346.0
50%,1481.0,1513.0,1512.0
75%,1653.25,1683.25,1682.25
max,12778.0,13452.0,13451.0


In [8]:
vocabulary = Vocabulary(data.ocr_to_input.sum() + data.ocr_aligned.sum() + data.gs_aligned.sum())
print(len(vocabulary))

262


In [9]:
distances = levenshtein(reference = data.gs_aligned.str.replace("@", ""), 
                        hypothesis = data.ocr_to_input)
distances.cer.describe()

count    2028.000000
mean       33.486161
std       193.187307
min         0.978294
25%        23.371109
50%        25.602553
75%        27.965791
max      6486.666667
Name: cer, dtype: float64

In [10]:
distances = levenshtein(reference = data.gs_aligned, 
                        hypothesis = data.ocr_to_input)
distances.cer.describe()

count    2028.000000
mean       24.768287
std         6.611045
min         0.976354
25%        22.188407
50%        24.210260
75%        26.272396
max       125.000000
Name: cer, dtype: float64

In [11]:
data.to_pickle("../../data/de/data/test.pkl")

In [12]:
distances.query("cer > 100")

Unnamed: 0,reference,hypothesis,distance,cer
1867,@@@@,ri'/\n,5,125.0
