In [1]:
import os
from tqdm.notebook import tqdm
from pathlib import Path
import pandas as pd
from nltk.lm import Vocabulary
import sys
sys.path.append("../../lib")
from metrics import levenshtein
import pickle

In [2]:
folder = "../../data/ICDAR2019_POCR_competition_dataset/ICDAR2019_POCR_competition_training_18M_without_Finnish/SL/"

In [3]:
output_folder = Path("../../data/sl")

In [4]:
files = sorted(os.listdir(folder))
len(files)

1

In [5]:
import glob

files = glob.glob(folder + '/**/*.txt', recursive=True)
len(files)

149

In [6]:
from multiprocessing import Pool

def extract(name):
    with open(name) as file:
        return file.readlines()
    
def create_windows(x):
    A, B, window_length = x
    assert len(A) == len(B)
    return [(A[i:i + window_length], B[i:i + window_length]) 
            for i in range(len(A) + 1)]
    
p = Pool(4)
    
data = list(p.imap_unordered(extract, tqdm(files), chunksize = 128))
len(data)

  0%|          | 0/149 [00:00<?, ?it/s]

149

In [7]:
# data = []
# for f in tqdm(files):
#     with open(f) as file:
#         data.append(file.readlines())

data = pd.DataFrame(data, 
                    columns = ["ocr_to_input", 
                               "ocr_aligned", 
                               "gs_aligned"])\
.assign(ocr_to_input = lambda df: df.ocr_to_input.str.replace("[OCR_toInput] ", "", regex = False),
        ocr_aligned = lambda df: df.ocr_aligned.str.replace("[OCR_aligned] ", "", regex = False),
        gs_aligned = lambda df: df.gs_aligned.str.replace("[ GS_aligned] ", "", regex = False))

print(data.shape)
data.head()

(149, 3)


Unnamed: 0,ocr_to_input,ocr_aligned,gs_aligned
0,"100 Dete Jezus je oteto ino v zavetji, kar mo-...","@100 Dete Jezus je oteto ino v zavetji, kar mo...","100 Dete Jezus je oteto ino v zavetji, kar mo..."
1,65 ino sreča per vojski ropnika storila. Ker n...,@6@@5@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@...,"65 5 40. Goljat velikan. David pride enkrat, ..."
2,"28 zmed njih želi hlapec zvediti, ino je tudi ...","@28 zmed njih želi hlapec zvediti, ino je tudi...","28 zmed njih želi hlapec zvediti, ino je tudi..."
3,88 na desni strani altarja angelja. Strah ga o...,@88 na desni strani altarja angelja. Strah ga ...,88 na desni strani altarja angelja. Strah ga ...
4,"VI začetka sveta do ljudi, ino kako je iz ljub...","@VI začetka sveta do ljudi, ino kako je iz lju...","VI začetka svetá do ljudi, ino kako je iz lju..."


In [8]:
data.applymap(len).describe()

Unnamed: 0,ocr_to_input,ocr_aligned,gs_aligned
count,149.0,149.0,149.0
mean,1385.932886,1445.583893,1445.583893
std,232.593534,257.963624,257.963624
min,35.0,44.0,44.0
25%,1368.0,1393.0,1393.0
50%,1421.0,1459.0,1459.0
75%,1495.0,1537.0,1537.0
max,1586.0,1962.0,1962.0


In [9]:
levenshtein(reference = data.gs_aligned.str.replace("@", ""), 
            hypothesis = data.ocr_to_input).cer.describe()

count    149.000000
mean      12.791041
std       22.672815
min        0.572519
25%        1.450326
50%        1.997337
75%        6.176266
max       78.844765
Name: cer, dtype: float64

In [10]:
levenshtein(reference = data.gs_aligned, 
            hypothesis = data.ocr_aligned).cer.describe()

count    149.000000
mean      11.239569
std       19.448676
min        0.572155
25%        1.449275
50%        1.981506
75%        5.989233
max       72.945892
Name: cer, dtype: float64

In [11]:
vocabulary = Vocabulary(data.ocr_to_input.sum() + data.ocr_aligned.sum() + data.gs_aligned.sum())
print(len(vocabulary))
with open(output_folder/"data/vocabulary.pkl", "wb") as file:
    pickle.dump(vocabulary, file)

127


In [12]:
dev = data.sample(n = 5, random_state = 1)
dev.to_pickle(output_folder/"data/dev.pkl")
dev.shape

(5, 3)

In [13]:
train = data.drop(dev.index)
train.to_pickle(output_folder/"data/train.pkl")
train.shape

(144, 3)

In [14]:
train.applymap(len).describe()

Unnamed: 0,ocr_to_input,ocr_aligned,gs_aligned
count,144.0,144.0,144.0
mean,1384.513889,1442.923611,1442.923611
std,236.332467,260.899147,260.899147
min,35.0,44.0,44.0
25%,1365.25,1392.5,1392.5
50%,1421.0,1459.5,1459.5
75%,1496.0,1537.0,1537.0
max,1586.0,1962.0,1962.0


In [15]:
dev.applymap(len).describe()

Unnamed: 0,ocr_to_input,ocr_aligned,gs_aligned
count,5.0,5.0,5.0
mean,1426.8,1522.2,1522.2
std,52.770257,145.386382,145.386382
min,1370.0,1433.0,1433.0
25%,1384.0,1439.0,1439.0
50%,1431.0,1455.0,1455.0
75%,1447.0,1507.0,1507.0
max,1502.0,1777.0,1777.0


In [16]:
levenshtein(reference = dev.gs_aligned.str.replace("@", ""), 
            hypothesis = dev.ocr_to_input).cer.describe()

count     5.000000
mean     14.370251
std      24.443099
min       1.101170
25%       1.600000
50%       1.744592
75%       9.788167
max      57.617329
Name: cer, dtype: float64

In [17]:
levenshtein(reference = dev.gs_aligned, 
            hypothesis = dev.ocr_to_input).cer.describe()

count     5.000000
mean     11.737540
std      18.854759
min       1.099656
25%       1.592568
50%       1.737318
75%       9.351012
max      44.907147
Name: cer, dtype: float64

In [18]:
window_length = 100

In [19]:
df = train#.head(100)
train_aligned = list(p.imap_unordered(create_windows, 
                                      tqdm(zip(df.ocr_aligned, 
                                               df.gs_aligned, 
                                               [window_length for x in df.ocr_aligned]), 
                                           total = len(df.ocr_aligned)),
                                      chunksize = 128))
s = []
for r in tqdm(train_aligned):
    s.extend(r)
train_aligned = pd.DataFrame(s, columns = ["source", "target"])
print(train_aligned.shape)
train_aligned.head()

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

(207925, 2)


Unnamed: 0,source,target
0,@120@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@...,120 Ljubi moji! povzemimo še enkrat vse te le...
1,120@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@...,120 Ljubi moji! povzemimo še enkrat vse te lep...
2,20@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@...,20 Ljubi moji! povzemimo še enkrat vse te lepe...
3,0@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@...,0 Ljubi moji! povzemimo še enkrat vse te lepe ...
4,@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@...,Ljubi moji! povzemimo še enkrat vse te lepe J...


In [20]:
train_aligned = train_aligned.assign(source = lambda df: df.source.str.replace("@", ""))
train_aligned.head()

Unnamed: 0,source,target
0,120.0,120 Ljubi moji! povzemimo še enkrat vse te le...
1,120.0,120 Ljubi moji! povzemimo še enkrat vse te lep...
2,20.0,20 Ljubi moji! povzemimo še enkrat vse te lepe...
3,0.0,0 Ljubi moji! povzemimo še enkrat vse te lepe ...
4,,Ljubi moji! povzemimo še enkrat vse te lepe J...


In [21]:
dev_aligned = dev.apply(lambda r: create_windows((r["ocr_aligned"], r["gs_aligned"], window_length)), 
                            axis = 1).sum()
dev_aligned = pd.DataFrame(dev_aligned, columns = ["source", "target"])
print(dev_aligned.shape)
dev_aligned.head()

(7616, 2)


Unnamed: 0,source,target
0,@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@...,32 15. Jakop gre v svoj kraj. Ko je Laban vid...
1,@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@...,32 15. Jakop gre v svoj kraj. Ko je Laban vidi...
2,@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@...,2 15. Jakop gre v svoj kraj. Ko je Laban vidil...
3,@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@...,"15. Jakop gre v svoj kraj. Ko je Laban vidil,..."
4,@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@...,"15. Jakop gre v svoj kraj. Ko je Laban vidil, ..."


In [22]:
dev_aligned = dev_aligned.assign(source = lambda df: df.source.str.replace("@", ""))
dev_aligned.head()

Unnamed: 0,source,target
0,,32 15. Jakop gre v svoj kraj. Ko je Laban vid...
1,,32 15. Jakop gre v svoj kraj. Ko je Laban vidi...
2,,2 15. Jakop gre v svoj kraj. Ko je Laban vidil...
3,,"15. Jakop gre v svoj kraj. Ko je Laban vidil,..."
4,,"15. Jakop gre v svoj kraj. Ko je Laban vidil, ..."


In [23]:
train_aligned.to_pickle(output_folder/"data/train_aligned.pkl")
dev_aligned.to_pickle(output_folder/"data/dev_aligned.pkl")