In [1]:
import os
from tqdm.notebook import tqdm
from pathlib import Path
import pandas as pd
from nltk.lm import Vocabulary
import pickle
import sys
import torch
import importlib
from timeit import default_timer as t
sys.path.append("../../lib")
from metrics import levenshtein
import ocr_correction
from pytorch_decoding import seq2seq

In [2]:
folder = "../../data/ICDAR2019_POCR_competition_dataset/ICDAR2019_POCR_competition_evaluation_4M_without_Finnish/FR/"

In [3]:
files = sorted(os.listdir(folder))
len(files)

3

In [4]:
import glob

files = glob.glob(folder + '/**/*.txt', recursive=True)
len(files)

592

In [5]:
data = []
for f in tqdm(files):
    with open(f) as file:
        data.append(file.readlines())

  0%|          | 0/592 [00:00<?, ?it/s]

In [6]:
data = pd.DataFrame(data, 
                    columns = ["ocr_to_input", 
                               "ocr_aligned", 
                               "gs_aligned"])\
.assign(ocr_to_input = lambda df: df.ocr_to_input.str.replace("[OCR_toInput] ", "", regex = False),
        ocr_aligned = lambda df: df.ocr_aligned.str.replace("[OCR_aligned] ", "", regex = False),
        gs_aligned = lambda df: df.gs_aligned.str.replace("[ GS_aligned] ", "", regex = False))

print(data.shape)
data.head()

(592, 3)


Unnamed: 0,ocr_to_input,ocr_aligned,gs_aligned
0,M E S L E? E t. 4dl dislimuloit pas les suites...,M E S L E? E t. 4dl dislimuloit pas les suites...,@@@@@@@@@@@@@@@@@@@ dissimuloit pas les suites...
1,"Discovrs paree que Pensant estant né, totit ce...","@Discovrs paree que @Pensant estant né@, totit...","DISCOVRS parce que l'enfant estant né , to@ut..."
2,ïi 4 Conduite du l’Estre ? ôc d où pourrais je...,ïi 4 Conduite du l’Estre ? ôc d où pourrais je...,@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ @Qu@@@@@@@@...
3,Jugement naturel. S 5 cause ou auec l’effet : ...,Jugement naturel. S 5 cause ou auec l’effet : ...,@@@@@@@@@@@@@@@@@@@@@ cause ou auec l'effet : ...
4,Inìlmciion XJint-troi/iéme. 435 desquelles il ...,Inìlmciion XJint-troi/iéme. 435 desquelles il ...,@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@...


In [7]:
data.applymap(len).describe()

Unnamed: 0,ocr_to_input,ocr_aligned,gs_aligned
count,592.0,592.0,592.0
mean,1471.636824,1521.02027,1520.02027
std,1533.388377,1639.812042,1639.812042
min,2.0,2.0,1.0
25%,403.0,407.0,406.0
50%,775.5,814.0,813.0
75%,1912.5,1967.5,1966.5
max,7057.0,14113.0,14112.0


In [8]:
vocabulary = Vocabulary(data.ocr_to_input.sum() + data.ocr_aligned.sum() + data.gs_aligned.sum())
print(len(vocabulary))

170


In [9]:
distances = levenshtein(reference = data.gs_aligned.str.replace("@", ""), 
                        hypothesis = data.ocr_to_input)
distances.cer.describe()

count      592.000000
mean        35.174741
std        542.861425
min          0.396040
25%          3.625991
50%          5.195593
75%          9.379564
max      13180.000000
Name: cer, dtype: float64

In [10]:
distances = levenshtein(reference = data.gs_aligned, 
                        hypothesis = data.ocr_to_input)
distances.cer.describe()

count    592.000000
mean       9.403462
std       14.434755
min        0.396040
25%        3.568486
50%        5.141836
75%        9.127205
max      200.000000
Name: cer, dtype: float64

In [11]:
data.to_pickle("../../data/fr/data/test.pkl")

In [13]:
distances.query("cer > 100")

Unnamed: 0,reference,hypothesis,distance,cer
8,@,û\n,2,200.0


In [14]:
distances.reference[8]

'@'

In [16]:
distances.hypothesis[8]

'û\n'