In [1]:
import os
from tqdm.notebook import tqdm
from pathlib import Path
import pandas as pd
from nltk.lm import Vocabulary
import sys
sys.path.append("../../lib")
from metrics import levenshtein
import pickle

In [2]:
folder = "../../data/ICDAR2019_POCR_competition_dataset/ICDAR2019_POCR_competition_training_18M_without_Finnish/FR/"

In [3]:
output_folder = Path("../../data/fr")

In [4]:
files = sorted(os.listdir(folder))
len(files)

3

In [5]:
import glob

files = glob.glob(folder + '/**/*.txt', recursive=True)
len(files)

2257

In [6]:
from multiprocessing import Pool

def extract(name):
    with open(name) as file:
        return file.readlines()
    
def create_windows(x):
    A, B, window_length = x
    assert len(A) == len(B)
    return [(A[i:i + window_length], B[i:i + window_length]) 
            for i in range(len(A) + 1)]
    
p = Pool(4)
    
data = list(p.imap_unordered(extract, tqdm(files), chunksize = 128))
len(data)

  0%|          | 0/2257 [00:00<?, ?it/s]

2257

In [7]:
# data = []
# for f in tqdm(files):
#     with open(f) as file:
#         data.append(file.readlines())

data = pd.DataFrame(data, 
                    columns = ["ocr_to_input", 
                               "ocr_aligned", 
                               "gs_aligned"])\
.assign(ocr_to_input = lambda df: df.ocr_to_input.str.replace("[OCR_toInput] ", "", regex = False),
        ocr_aligned = lambda df: df.ocr_aligned.str.replace("[OCR_aligned] ", "", regex = False),
        gs_aligned = lambda df: df.gs_aligned.str.replace("[ GS_aligned] ", "", regex = False))

print(data.shape)
data.head()

(2257, 3)


Unnamed: 0,ocr_to_input,ocr_aligned,gs_aligned
0,"Charles, etc. Savoir faisons à touz, presens e...","Charles, etc. Savoir faisons à touz, presens e...","Charles, etc. Savoir faisons à touz, presens e..."
1,"par la grace de Dieu, roys de France. Savoir f...","@@@@@@@@@@@par la grace de Dieu, roys de Franc...","Philippes, par la grace de Dieu, roys de Franc..."
2,"Jehan ainsné, filz et lieu tenant du roy de Fr...","Jehan@ ainsné, filz et lieu tenant du roy de F...","Jehan, ainsné@ filz et lieu tenant du roy de F..."
3,"Johannes, Dei gratia, Francorum rex. Notum fac...","Johannes, Dei gratia, Francorum rex. Notum fac...","Johannes, Dei gratia, Francorum rex. Notum fac..."
4,umbre de ce que eulx et nostre procureur disoi...,umbre de ce que eulx et nostre procureur disoi...,umbre de ce que eulz et nostre procureur disoi...


In [8]:
data.applymap(len).describe()

Unnamed: 0,ocr_to_input,ocr_aligned,gs_aligned
count,2257.0,2257.0,2257.0
mean,1534.002658,1575.715995,1575.715995
std,1518.163885,1558.757803,1558.757803
min,1.0,2.0,2.0
25%,432.0,439.0,439.0
50%,900.0,919.0,919.0
75%,2166.0,2238.0,2238.0
max,9760.0,10051.0,10051.0


In [9]:
levenshtein(reference = data.gs_aligned.str.replace("@", ""), 
            hypothesis = data.ocr_to_input).cer.describe()

count     2257.000000
mean        15.638400
std        217.453987
min          0.000000
25%          3.469641
50%          5.024051
75%          8.736718
max      10228.571429
Name: cer, dtype: float64

In [10]:
levenshtein(reference = data.gs_aligned, 
            hypothesis = data.ocr_aligned).cer.describe()

count    2257.000000
mean        8.629824
std        11.475876
min         0.000000
25%         3.448276
50%         4.976775
75%         8.593322
max        99.799599
Name: cer, dtype: float64

In [11]:
vocabulary = Vocabulary(data.ocr_to_input.sum() + data.ocr_aligned.sum() + data.gs_aligned.sum())
print(len(vocabulary))
with open(output_folder/"data/vocabulary.pkl", "wb") as file:
    pickle.dump(vocabulary, file)

191


In [12]:
dev = data.sample(n = 5, random_state = 1)
dev.to_pickle(output_folder/"data/dev.pkl")
dev.shape

(5, 3)

In [13]:
train = data.drop(dev.index)
train.to_pickle(output_folder/"data/train.pkl")
train.shape

(2252, 3)

In [14]:
train.applymap(len).describe()

Unnamed: 0,ocr_to_input,ocr_aligned,gs_aligned
count,2252.0,2252.0,2252.0
mean,1536.216252,1577.992451,1577.992451
std,1519.010779,1559.622058,1559.622058
min,1.0,2.0,2.0
25%,432.0,439.75,439.75
50%,901.5,921.0,921.0
75%,2181.0,2246.5,2246.5
max,9760.0,10051.0,10051.0


In [15]:
dev.applymap(len).describe()

Unnamed: 0,ocr_to_input,ocr_aligned,gs_aligned
count,5.0,5.0,5.0
mean,537.0,550.4,550.4
std,433.894572,451.103979,451.103979
min,298.0,298.0,298.0
25%,305.0,326.0,326.0
50%,340.0,340.0,340.0
75%,435.0,436.0,436.0
max,1307.0,1352.0,1352.0


In [16]:
levenshtein(reference = dev.gs_aligned.str.replace("@", ""), 
            hypothesis = dev.ocr_to_input).cer.describe()

count    5.000000
mean     5.547289
std      1.365981
min      4.147465
25%      4.776119
50%      4.844291
75%      6.606607
max      7.361963
Name: cer, dtype: float64

In [17]:
levenshtein(reference = dev.gs_aligned, 
            hypothesis = dev.ocr_to_input).cer.describe()

count    5.000000
mean     5.480630
std      1.381836
min      4.128440
25%      4.697987
50%      4.705882
75%      6.508876
max      7.361963
Name: cer, dtype: float64

In [18]:
window_length = 100

In [19]:
df = train#.head(100)
train_aligned = list(p.imap_unordered(create_windows, 
                                      tqdm(zip(df.ocr_aligned, 
                                               df.gs_aligned, 
                                               [window_length for x in df.ocr_aligned]), 
                                           total = len(df.ocr_aligned)),
                                      chunksize = 128))
s = []
for r in tqdm(train_aligned):
    s.extend(r)
train_aligned = pd.DataFrame(s, columns = ["source", "target"])
print(train_aligned.shape)
train_aligned.head()

  0%|          | 0/2252 [00:00<?, ?it/s]

  0%|          | 0/2252 [00:00<?, ?it/s]

(3555891, 2)


Unnamed: 0,source,target
0,"Karolus, etc. Notum facimus universis, present...","Karolus, etc. Notum facimus universis, present..."
1,"arolus, etc. Notum facimus universis, presenti...","arolus, etc. Notum facimus universis, presenti..."
2,"rolus, etc. Notum facimus universis, presentib...","rolus, etc. Notum facimus universis, presentib..."
3,"olus, etc. Notum facimus universis, presentibu...","olus, etc. Notum facimus universis, presentibu..."
4,"lus, etc. Notum facimus universis, presentibus...","lus, etc. Notum facimus universis, presentibus..."


In [20]:
train_aligned = train_aligned.assign(source = lambda df: df.source.str.replace("@", ""))
train_aligned.head()

Unnamed: 0,source,target
0,"Karolus, etc. Notum facimus universis, present...","Karolus, etc. Notum facimus universis, present..."
1,"arolus, etc. Notum facimus universis, presenti...","arolus, etc. Notum facimus universis, presenti..."
2,"rolus, etc. Notum facimus universis, presentib...","rolus, etc. Notum facimus universis, presentib..."
3,"olus, etc. Notum facimus universis, presentibu...","olus, etc. Notum facimus universis, presentibu..."
4,"lus, etc. Notum facimus universis, presentibus...","lus, etc. Notum facimus universis, presentibus..."


In [21]:
dev_aligned = dev.apply(lambda r: create_windows((r["ocr_aligned"], r["gs_aligned"], window_length)), 
                            axis = 1).sum()
dev_aligned = pd.DataFrame(dev_aligned, columns = ["source", "target"])
print(dev_aligned.shape)
dev_aligned.head()

(2757, 2)


Unnamed: 0,source,target
0,"Philippus, etc. Notam facimus universis, tam p...","Philippus, etc. Notum facimus universis, tam p..."
1,"hilippus, etc. Notam facimus universis, tam pr...","hilippus, etc. Notum facimus universis, tam pr..."
2,"ilippus, etc. Notam facimus universis, tam pre...","ilippus, etc. Notum facimus universis, tam pre..."
3,"lippus, etc. Notam facimus universis, tam pres...","lippus, etc. Notum facimus universis, tam pres..."
4,"ippus, etc. Notam facimus universis, tam prese...","ippus, etc. Notum facimus universis, tam prese..."


In [22]:
dev_aligned = dev_aligned.assign(source = lambda df: df.source.str.replace("@", ""))
dev_aligned.head()

Unnamed: 0,source,target
0,"Philippus, etc. Notam facimus universis, tam p...","Philippus, etc. Notum facimus universis, tam p..."
1,"hilippus, etc. Notam facimus universis, tam pr...","hilippus, etc. Notum facimus universis, tam pr..."
2,"ilippus, etc. Notam facimus universis, tam pre...","ilippus, etc. Notum facimus universis, tam pre..."
3,"lippus, etc. Notam facimus universis, tam pres...","lippus, etc. Notum facimus universis, tam pres..."
4,"ippus, etc. Notam facimus universis, tam prese...","ippus, etc. Notum facimus universis, tam prese..."


In [23]:
train_aligned.to_pickle(output_folder/"data/train_aligned.pkl")
dev_aligned.to_pickle(output_folder/"data/dev_aligned.pkl")