In [1]:
import os
from tqdm.notebook import tqdm
from pathlib import Path
import pandas as pd
from nltk.lm import Vocabulary
import sys
sys.path.append("../../lib")
from metrics import levenshtein
import pickle

In [2]:
folder = "../../data/ICDAR2019_POCR_competition_dataset/ICDAR2019_POCR_competition_training_18M_without_Finnish/EN/"

In [3]:
output_folder = Path("../../data/en")

In [4]:
files = sorted(os.listdir(folder))
len(files)

1

In [5]:
import glob

files = glob.glob(folder + '/**/*.txt', recursive=True)
len(files)

148

In [6]:
from multiprocessing import Pool

def extract(name):
    with open(name) as file:
        return file.readlines()
    
def create_windows(x):
    A, B, window_length = x
    assert len(A) == len(B)
    return [(A[i:i + window_length], B[i:i + window_length]) 
            for i in range(len(A) + 1)]
    
p = Pool(4)
    
data = list(p.imap_unordered(extract, tqdm(files), chunksize = 128))
len(data)

  0%|          | 0/148 [00:00<?, ?it/s]

148

In [7]:
# data = []
# for f in tqdm(files):
#     with open(f) as file:
#         data.append(file.readlines())

data = pd.DataFrame(data, 
                    columns = ["ocr_to_input", 
                               "ocr_aligned", 
                               "gs_aligned"])\
.assign(ocr_to_input = lambda df: df.ocr_to_input.str.replace("[OCR_toInput] ", "", regex = False),
        ocr_aligned = lambda df: df.ocr_aligned.str.replace("[OCR_aligned] ", "", regex = False),
        gs_aligned = lambda df: df.gs_aligned.str.replace("[ GS_aligned] ", "", regex = False))

print(data.shape)
data.head()

(148, 3)


Unnamed: 0,ocr_to_input,ocr_aligned,gs_aligned
0,"10* THE CHEATS OF SCAPIN. Scapin. Well,. Sir, ...","10* THE CHEATS OF SCAPIN. Scapin. Well,. Sir, ...","@@@@@@@@@@@@@@@@@@@@@@@@@ Scapin. Well,@ Sir, ..."
1,"t J 5 ] stayers, for th© 'continuance of such ...","t J 5@ @@] stayers, for th© 'continuance of su...","@ [15] out prayers, for the @continuance of su..."
2,t Jeademia Scienthrum. I $ mltiplicare feu in ...,t Jeademia Scienthrum. I $ mltiplicare feu in ...,@@@@@@@@@@@@@@@@@@@@@@@@@@ mltiplicare seu in ...
3,"Plotting of a Tgvjh- Fichl> &c. Plot, and ther...",@@@@@Plotting of a Tgvjh- Fichl> &c@@@@@.@@@@@...,"106 Plotting of a T@@own‑Field, &c. Fig. 36 t..."
4,r\n,r\n,@\n


In [8]:
data.applymap(len).describe()

Unnamed: 0,ocr_to_input,ocr_aligned,gs_aligned
count,148.0,148.0,148.0
mean,1415.263514,1520.682432,1520.682432
std,947.873153,1057.77948,1057.77948
min,2.0,2.0,2.0
25%,953.5,1014.75,1014.75
50%,1235.0,1267.5,1267.5
75%,1589.75,1707.5,1707.5
max,6767.0,6840.0,6840.0


In [9]:
levenshtein(reference = data.gs_aligned.str.replace("@", ""), 
            hypothesis = data.ocr_to_input).cer.describe()

count     148.000000
mean      277.546633
std      1229.004427
min         1.050175
25%         5.980658
50%        12.703118
75%        40.811855
max      8000.000000
Name: cer, dtype: float64

In [10]:
levenshtein(reference = data.gs_aligned, 
            hypothesis = data.ocr_aligned).cer.describe()

count    148.000000
mean      23.828322
std       24.876417
min        1.048951
25%        5.890478
50%       12.342462
75%       36.548646
max       98.814229
Name: cer, dtype: float64

In [11]:
vocabulary = Vocabulary(data.ocr_to_input.sum() + data.ocr_aligned.sum() + data.gs_aligned.sum())
print(len(vocabulary))
with open(output_folder/"data/vocabulary.pkl", "wb") as file:
    pickle.dump(vocabulary, file)

161


In [12]:
dev = data.sample(n = 5, random_state = 1)
dev.to_pickle(output_folder/"data/dev.pkl")
dev.shape

(5, 3)

In [13]:
train = data.drop(dev.index)
train.to_pickle(output_folder/"data/train.pkl")
train.shape

(143, 3)

In [14]:
train.applymap(len).describe()

Unnamed: 0,ocr_to_input,ocr_aligned,gs_aligned
count,143.0,143.0,143.0
mean,1400.20979,1508.566434,1508.566434
std,837.181925,962.640212,962.640212
min,2.0,2.0,2.0
25%,967.0,1021.0,1021.0
50%,1243.0,1272.0,1272.0
75%,1594.5,1709.0,1709.0
max,4376.0,5488.0,5488.0


In [15]:
dev.applymap(len).describe()

Unnamed: 0,ocr_to_input,ocr_aligned,gs_aligned
count,5.0,5.0,5.0
mean,1845.8,1867.2,1867.2
std,2810.28801,2840.270269,2840.270269
min,18.0,18.0,18.0
25%,66.0,66.0,66.0
50%,1163.0,1178.0,1178.0
75%,1215.0,1234.0,1234.0
max,6767.0,6840.0,6840.0


In [16]:
levenshtein(reference = dev.gs_aligned.str.replace("@", ""), 
            hypothesis = dev.ocr_to_input).cer.describe()

count       5.000000
mean     1642.842686
std      2812.712606
min         3.921569
25%         4.467912
50%         5.823948
75%      1700.000000
max      6500.000000
Name: cer, dtype: float64

In [17]:
levenshtein(reference = dev.gs_aligned, 
            hypothesis = dev.ocr_to_input).cer.describe()

count     5.000000
mean     41.407376
std      50.284914
min       3.904924
25%       4.457050
50%       5.745614
75%      94.444444
max      98.484848
Name: cer, dtype: float64

In [18]:
window_length = 100

In [19]:
df = train#.head(100)
train_aligned = list(p.imap_unordered(create_windows, 
                                      tqdm(zip(df.ocr_aligned, 
                                               df.gs_aligned, 
                                               [window_length for x in df.ocr_aligned]), 
                                           total = len(df.ocr_aligned)),
                                      chunksize = 128))
s = []
for r in tqdm(train_aligned):
    s.extend(r)
train_aligned = pd.DataFrame(s, columns = ["source", "target"])
print(train_aligned.shape)
train_aligned.head()

  0%|          | 0/143 [00:00<?, ?it/s]

  0%|          | 0/143 [00:00<?, ?it/s]

(215868, 2)


Unnamed: 0,source,target
0,"@@@@@@@@@open crye, to which the Lyon being ma...","( 145 ) open crye, to which the Lyon being ma..."
1,"@@@@@@@@open crye, to which the Lyon being mas...","( 145 ) open crye, to which the Lyon being mas..."
2,"@@@@@@@open crye, to which the Lyon being mast...","145 ) open crye, to which the Lyon being mast..."
3,"@@@@@@open crye, to which the Lyon being maste...","145 ) open crye, to which the Lyon being maste..."
4,"@@@@@open crye, to which the Lyon being master...","45 ) open crye, to which the Lyon being master..."


In [20]:
train_aligned = train_aligned.assign(source = lambda df: df.source.str.replace("@", ""))
train_aligned.head()

Unnamed: 0,source,target
0,"open crye, to which the Lyon being master hunt...","( 145 ) open crye, to which the Lyon being ma..."
1,"open crye, to which the Lyon being master hunt...","( 145 ) open crye, to which the Lyon being mas..."
2,"open crye, to which the Lyon being master hunt...","145 ) open crye, to which the Lyon being mast..."
3,"open crye, to which the Lyon being master hunt...","145 ) open crye, to which the Lyon being maste..."
4,"open crye, to which the Lyon being master hunt...","45 ) open crye, to which the Lyon being master..."


In [21]:
dev_aligned = dev.apply(lambda r: create_windows((r["ocr_aligned"], r["gs_aligned"], window_length)), 
                            axis = 1).sum()
dev_aligned = pd.DataFrame(dev_aligned, columns = ["source", "target"])
print(dev_aligned.shape)
dev_aligned.head()

(9341, 2)


Unnamed: 0,source,target
0,"^ ‘ "" -■■■ '■ I I .1 :*T S 4 I S SEVih IT A N ...",@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@...
1,"‘ "" -■■■ '■ I I .1 :*T S 4 I S SEVih IT A N :...",@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@...
2,"‘ "" -■■■ '■ I I .1 :*T S 4 I S SEVih IT A N :c...",@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@...
3,""" -■■■ '■ I I .1 :*T S 4 I S SEVih IT A N :cv...",@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@...
4,""" -■■■ '■ I I .1 :*T S 4 I S SEVih IT A N :cv ...",@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@...


In [22]:
dev_aligned = dev_aligned.assign(source = lambda df: df.source.str.replace("@", ""))
dev_aligned.head()

Unnamed: 0,source,target
0,"^ ‘ "" -■■■ '■ I I .1 :*T S 4 I S SEVih IT A N ...",@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@...
1,"‘ "" -■■■ '■ I I .1 :*T S 4 I S SEVih IT A N :...",@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@...
2,"‘ "" -■■■ '■ I I .1 :*T S 4 I S SEVih IT A N :c...",@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@...
3,""" -■■■ '■ I I .1 :*T S 4 I S SEVih IT A N :cv...",@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@...
4,""" -■■■ '■ I I .1 :*T S 4 I S SEVih IT A N :cv ...",@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@...


In [23]:
train_aligned.to_pickle(output_folder/"data/train_aligned.pkl")
dev_aligned.to_pickle(output_folder/"data/dev_aligned.pkl")