In [1]:
import os
from tqdm.notebook import tqdm
from pathlib import Path
import pandas as pd
from nltk.lm import Vocabulary
import sys
sys.path.append("../../lib")
from metrics import levenshtein
import pickle

In [5]:
folder = "../../data/ICDAR2019_POCR_competition_dataset/ICDAR2019_POCR_competition_training_18M_without_Finnish/DE/"

In [6]:
output_folder = Path("../../data/de")

In [7]:
files = sorted(os.listdir(folder))
len(files)

7

In [8]:
import glob

files = glob.glob(folder + '/**/*.txt', recursive=True)
len(files)

8052

In [9]:
from multiprocessing import Pool

def extract(name):
    with open(name) as file:
        return file.readlines()
    
def create_windows(x):
    A, B, window_length = x
    assert len(A) == len(B)
    return [(A[i:i + window_length], B[i:i + window_length]) 
            for i in range(len(A) + 1)]
    
p = Pool(4)
    
data = list(p.imap_unordered(extract, tqdm(files), chunksize = 128))
len(data)

  0%|          | 0/8052 [00:00<?, ?it/s]

8052

In [7]:
# data = []
# for f in tqdm(files):
#     with open(f) as file:
#         data.append(file.readlines())

data = pd.DataFrame(data, 
                    columns = ["ocr_to_input", 
                               "ocr_aligned", 
                               "gs_aligned"])\
.assign(ocr_to_input = lambda df: df.ocr_to_input.str.replace("[OCR_toInput] ", "", regex = False),
        ocr_aligned = lambda df: df.ocr_aligned.str.replace("[OCR_aligned] ", "", regex = False),
        gs_aligned = lambda df: df.gs_aligned.str.replace("[ GS_aligned] ", "", regex = False))

print(data.shape)
data.head()

(8052, 3)


Unnamed: 0,ocr_to_input,ocr_aligned,gs_aligned
0,"fuefcat, Sie difcet latiné ? quid ni? aliis di...","fuefcat, Sie difcet latiné ? quid ni? aliis di...",ſueſcat. Sic diſcet latin@è? quid ni?@aliis di...
1,"Decemvir] Decem primi, Cic. Decaproti, Wp. Dem...","Decemvir] Decem primi, Cic. Decaproti, @Wp. De...","Decemvir] Decem primi, Cic. Decaproti, Vlp. De..."
2,licet altj amplius vel mmus accipfät ab eo +fc...,licet altj amplius vel mmu@s accipfät ab eo @+...,licet alij amplius vel mmuis accipiãt ab eo · ...
3,"nea, Taxb. Concordii res parvz creicunt , difc...","nea, Taxb. Concordii res parvz creicunt , difc...","nea, Taub.@Concordiâ res parvæ creſcunt@, diſc..."
4,"mone cclcbratam,arbitror extitiffe caufam, Ath...","mone cclcbratam,@arbitror extitiffe caufam, At...","mone celebratam, arbitror extitiſſe cauſam, At..."


In [8]:
data.applymap(len).describe()

Unnamed: 0,ocr_to_input,ocr_aligned,gs_aligned
count,8052.0,8052.0,8052.0
mean,1547.265897,1587.07762,1587.07762
std,727.281992,764.207432,764.207432
min,112.0,126.0,126.0
25%,1314.0,1347.0,1347.0
50%,1482.0,1515.0,1515.0
75%,1650.25,1679.0,1679.0
max,14561.0,16187.0,16187.0


In [9]:
levenshtein(reference = data.gs_aligned.str.replace("@", ""), 
            hypothesis = data.ocr_to_input).cer.describe()

count    8052.000000
mean       27.809843
std        85.454787
min         1.039755
25%        23.312188
50%        25.674876
75%        27.886864
max      4747.368421
Name: cer, dtype: float64

In [10]:
levenshtein(reference = data.gs_aligned, 
            hypothesis = data.ocr_aligned).cer.describe()

count    8052.000000
mean       24.523847
std         5.655523
min         1.039120
25%        22.087205
50%        24.184246
75%        26.150900
max        98.043478
Name: cer, dtype: float64

In [11]:
vocabulary = Vocabulary(data.ocr_to_input.sum() + data.ocr_aligned.sum() + data.gs_aligned.sum())
print(len(vocabulary))
with open(output_folder/"data/vocabulary.pkl", "wb") as file:
    pickle.dump(vocabulary, file)

334


In [12]:
dev = data.sample(n = 5, random_state = 1)
dev.to_pickle(output_folder/"data/dev.pkl")
dev.shape

(5, 3)

In [13]:
train = data.drop(dev.index)
train.to_pickle(output_folder/"data/train.pkl")
train.shape

(8047, 3)

In [23]:
train.applymap(len).describe()

Unnamed: 0,ocr_to_input,ocr_aligned,gs_aligned
count,8047.0,8047.0,8047.0
mean,1547.275879,1587.094818,1587.094818
std,727.493432,764.429859,764.429859
min,112.0,126.0,126.0
25%,1314.0,1347.0,1347.0
50%,1482.0,1515.0,1515.0
75%,1650.5,1679.0,1679.0
max,14561.0,16187.0,16187.0


In [14]:
dev.applymap(len).describe()

Unnamed: 0,ocr_to_input,ocr_aligned,gs_aligned
count,5.0,5.0,5.0
mean,1531.2,1559.4,1559.4
std,205.2211,212.435402,212.435402
min,1260.0,1266.0,1266.0
25%,1503.0,1535.0,1535.0
50%,1504.0,1544.0,1544.0
75%,1553.0,1588.0,1588.0
max,1836.0,1864.0,1864.0


In [15]:
levenshtein(reference = dev.gs_aligned.str.replace("@", ""), 
            hypothesis = dev.ocr_to_input).cer.describe()

count     5.000000
mean     23.046394
std       1.032912
min      21.635434
25%      22.736555
50%      23.112583
75%      23.238255
max      24.509140
Name: cer, dtype: float64

In [16]:
levenshtein(reference = dev.gs_aligned, 
            hypothesis = dev.ocr_to_input).cer.describe()

count     5.000000
mean     21.855722
std       1.071343
min      20.439914
25%      21.693811
50%      21.721959
75%      21.977330
max      23.445596
Name: cer, dtype: float64

In [17]:
window_length = 100

In [18]:
df = train#.head(100)
train_aligned = list(p.imap_unordered(create_windows, 
                                      tqdm(zip(df.ocr_aligned, 
                                               df.gs_aligned, 
                                               [window_length for x in df.ocr_aligned]), 
                                           total = len(df.ocr_aligned)),
                                      chunksize = 128))
s = []
for r in tqdm(train_aligned):
    s.extend(r)
train_aligned = pd.DataFrame(s, columns = ["source", "target"])
print(train_aligned.shape)
train_aligned.head()

  0%|          | 0/8047 [00:00<?, ?it/s]

  0%|          | 0/8047 [00:00<?, ?it/s]

(12779399, 2)


Unnamed: 0,source,target
0,Zropffen crtich Dr?albafter ibme@in ben Dtund ...,Tropffen etlich @@Malvaſier ihme in den @Mund ...
1,ropffen crtich Dr?albafter ibme@in ben Dtund g...,ropffen etlich @@Malvaſier ihme in den @Mund g...
2,opffen crtich Dr?albafter ibme@in ben Dtund ge...,opffen etlich @@Malvaſier ihme in den @Mund ge...
3,"pffen crtich Dr?albafter ibme@in ben Dtund ge,...",pffen etlich @@Malvaſier ihme in den @Mund ge@...
4,"ffen crtich Dr?albafter ibme@in ben Dtund ge, ...",ffen etlich @@Malvaſier ihme in den @Mund ge@@...


In [19]:
train_aligned = train_aligned.assign(source = lambda df: df.source.str.replace("@", ""))
train_aligned.head()

Unnamed: 0,source,target
0,Zropffen crtich Dr?albafter ibmein ben Dtund g...,Tropffen etlich @@Malvaſier ihme in den @Mund ...
1,ropffen crtich Dr?albafter ibmein ben Dtund ge...,ropffen etlich @@Malvaſier ihme in den @Mund g...
2,"opffen crtich Dr?albafter ibmein ben Dtund ge,...",opffen etlich @@Malvaſier ihme in den @Mund ge...
3,"pffen crtich Dr?albafter ibmein ben Dtund ge, ...",pffen etlich @@Malvaſier ihme in den @Mund ge@...
4,"ffen crtich Dr?albafter ibmein ben Dtund ge, 3...",ffen etlich @@Malvaſier ihme in den @Mund ge@@...


In [20]:
dev_aligned = dev.apply(lambda r: create_windows((r["ocr_aligned"], r["gs_aligned"], window_length)), 
                            axis = 1).sum()
dev_aligned = pd.DataFrame(dev_aligned, columns = ["source", "target"])
print(dev_aligned.shape)
dev_aligned.head()

(7802, 2)


Unnamed: 0,source,target
0,"der einen Geite den ftrengfken Ölauben, auf bd...","der einen Seite den ſtrengſten Glauben, auf @d..."
1,"er einen Geite den ftrengfken Ölauben, auf bde...","er einen Seite den ſtrengſten Glauben, auf @de..."
2,"r einen Geite den ftrengfken Ölauben, auf bder...","r einen Seite den ſtrengſten Glauben, auf @der..."
3,"einen Geite den ftrengfken Ölauben, auf bder ...","einen Seite den ſtrengſten Glauben, auf @der ..."
4,"einen Geite den ftrengfken Ölauben, auf bder a...","einen Seite den ſtrengſten Glauben, auf @der a..."


In [21]:
dev_aligned = dev_aligned.assign(source = lambda df: df.source.str.replace("@", ""))
dev_aligned.head()

Unnamed: 0,source,target
0,"der einen Geite den ftrengfken Ölauben, auf bd...","der einen Seite den ſtrengſten Glauben, auf @d..."
1,"er einen Geite den ftrengfken Ölauben, auf bde...","er einen Seite den ſtrengſten Glauben, auf @de..."
2,"r einen Geite den ftrengfken Ölauben, auf bder...","r einen Seite den ſtrengſten Glauben, auf @der..."
3,"einen Geite den ftrengfken Ölauben, auf bder ...","einen Seite den ſtrengſten Glauben, auf @der ..."
4,"einen Geite den ftrengfken Ölauben, auf bder a...","einen Seite den ſtrengſten Glauben, auf @der a..."


In [22]:
train_aligned.to_pickle(output_folder/"data/train_aligned.pkl")
dev_aligned.to_pickle(output_folder/"data/dev_aligned.pkl")