In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path

from tqdm.notebook import tqdm

In [None]:
import os
# Set in its own cell, ahead of the cell that imports loguru.
# Presumably loguru reads LOGURU_LEVEL when it sets up its default handler at
# import time, so the order matters — verify against the loguru documentation.
os.environ['LOGURU_LEVEL'] = 'INFO'

In [None]:
import logging

from loguru import logger

class InterceptHandler(logging.Handler):
    """Forward records from the standard ``logging`` module to loguru."""

    def emit(self, record):
        """Re-emit ``record`` through the loguru ``logger``.

        The record's level name is mapped to the loguru level with the same
        name; if loguru does not know that name, the numeric level is used.
        The call depth handed to loguru is computed so that the message is
        attributed to the code that issued the original logging call.
        """
        # Local import so the class does not rely on another cell's imports.
        import inspect

        # Get corresponding Loguru level if it exists
        try:
            level = logger.level(record.levelname).name
        except ValueError:
            level = record.levelno

        # Find the caller the logged message originated from. Start at this
        # method's own frame (depth 0) and walk outwards past every frame that
        # belongs to the logging module. Starting from inspect.currentframe()
        # instead of logging.currentframe() with a fixed offset keeps the depth
        # right on Python 3.11+, where logging.currentframe() returns the
        # caller's frame. The truthiness check stops cleanly at the top of the
        # stack instead of dereferencing None.
        frame, depth = inspect.currentframe(), 0
        while frame and (depth == 0 or frame.f_code.co_filename == logging.__file__):
            frame = frame.f_back
            depth += 1

        logger.opt(depth=depth, exception=record.exc_info).log(level, record.getMessage())

# Route everything logged through the standard logging module to loguru.
# force=True (Python 3.8+) replaces handlers that are already attached to the
# root logger; without it basicConfig is a silent no-op once the root logger
# has been configured, e.g. when this cell is run a second time.
logging.basicConfig(handlers=[InterceptHandler()], level=0, force=True)

In [None]:
from datautils import generate_data

In [None]:
%%time
# Train and val data
# generate_data comes from the project-local datautils module. Judging by its
# use further down, `data` behaves like a mapping from a string key to an
# object exposing a `.tokens` collection. `md` is not used in the visible
# cells — presumably metadata, TODO confirm.
in_dir = Path('../../data/ICDAR2019_POCR_competition_dataset/ICDAR2019_POCR_competition_training_18M_without_Finnish')
data, md = generate_data(in_dir)

In [None]:
# test data
# Same loader, pointed at the evaluation set. NOTE(review): `in_dir` is rebound
# here, and neither `data_test` nor `md_test` is referenced again in the
# visible cells.
in_dir = Path('../../data/ICDAR2019_POCR_competition_dataset/ICDAR2019_POCR_competition_evaluation_4M_without_Finnish')
data_test, md_test = generate_data(in_dir)

In [None]:
# Directory the split CSVs are read from and the task-2 CSVs are written to
# (relative to the notebook's working directory).
out_dir = Path('icdar-dataset-20220207')

In [None]:
# Load the existing train/val/test tables, using the first CSV column as index.
# NOTE(review): X_train, X_val and X_test are not referenced again in the
# visible cells.
X_train = pd.read_csv(out_dir/'train.csv', index_col=0)
X_val = pd.read_csv(out_dir/'val.csv', index_col=0)
X_test = pd.read_csv(out_dir/'test.csv', index_col=0)

In [None]:
import dataclasses

# Build one record per token whose `ocr` text differs from its `gs` text once
# surrounding whitespace is ignored. Each record is the token's dataclass
# fields plus two values derived from the document key: the first two
# characters ('language') and the second '/'-separated component ('subset').
tokens = [
    {
        **dataclasses.asdict(tok),
        'language': doc_key[:2],
        'subset': doc_key.split('/')[1],
    }
    for doc_key, doc in tqdm(data.items())
    for tok in doc.tokens
    if tok.ocr.strip() != tok.gs.strip()
]

In [None]:
# One row per record collected in `tokens`.
# NOTE(review): this rebinds `data` from the mapping returned by generate_data
# to a DataFrame, so the loop that iterates `data.items()` / `d.tokens` will no
# longer work as intended if it is re-run without reloading the data first.
data = pd.DataFrame(tokens)

In [None]:
# Row count of the table built above.
print('data:', data.shape[0], 'samples')

In [None]:
def update_data(tdata):
    """Strip the token texts and add length columns, modifying ``tdata`` in place.

    Args:
        tdata: DataFrame with text columns ``ocr`` and ``gs``.

    Returns:
        The same DataFrame object. ``ocr`` and ``gs`` have leading and
        trailing whitespace removed, and three columns are added:
        ``len_ocr`` and ``len_gs`` (lengths of the stripped texts) and
        ``diff`` (``len_ocr - len_gs``).
    """
    # Vectorised .str methods instead of per-element / row-wise apply: the
    # row-wise form cannot handle a frame without rows and is slow on large
    # frames. Missing values stay missing instead of raising.
    tdata['ocr'] = tdata['ocr'].str.strip()
    tdata['gs'] = tdata['gs'].str.strip()
    tdata['len_ocr'] = tdata['ocr'].str.len()
    tdata['len_gs'] = tdata['gs'].str.len()
    tdata['diff'] = tdata['len_ocr'] - tdata['len_gs']
    return tdata

# Strip the ocr/gs texts and add the len_ocr, len_gs and diff columns.
data = update_data(data)

In [None]:
# Summary statistics of the stripped ocr text lengths.
data.len_ocr.describe()

In [None]:
# Summary statistics of the stripped gs text lengths.
data.len_gs.describe()

In [None]:
# Summary statistics of len_ocr - len_gs.
data['diff'].describe()

In [None]:
# Histogram of the ocr text lengths over the full range, 1000 bins.
data.len_ocr.hist(bins=1000, figsize=(10,5))

In [None]:
# Same histogram with the y-axis cut off at 30, presumably so that sparsely
# populated bins become visible.
data.len_ocr.hist(bins=1000, figsize=(10,5))
plt.ylim(0, 30)

In [None]:
# Same histogram restricted to lengths between 0 and 30 on the x-axis.
data.len_ocr.hist(bins=1000, figsize=(10,5))
plt.xlim(0, 30)

In [None]:
# Persist the whole table. This runs before the length filter in the next
# cell, so the CSV keeps rows of every length.
data.to_csv(out_dir/'task2_training_18M.csv')

In [None]:
# Keep only rows where both the ocr text and the gs text are at most 10
# characters long; copy so later column assignments do not touch a view.
data = data.loc[(data['len_ocr'] <= 10) & (data['len_gs'] <= 10)].copy()

In [None]:
from collections import Counter

# Count how often each ocr/gs combination occurs. The two texts are joined
# with '@@@' into a single string key.
c = Counter(
    f'{ocr_text}@@@{gs_text}'
    for ocr_text, gs_text in tqdm(
        zip(data.ocr.to_list(), data.gs.to_list()),
        total=data.shape[0],
    )
)

In [None]:
# Number of distinct ocr/gs combinations.
len(c)

In [None]:
# Show the 25 most frequent ocr/gs combinations with their counts.
# maxsplit=1 splits at the first '@@@' only, so a text that itself contains
# the separator can no longer break the two-value unpacking.
for k, v in c.most_common(25):
    ocr, gs = k.split('@@@', 1)
    print(repr(ocr), repr(gs), v)


In [None]:
# List every combination whose key ends with the '@@@' separator, i.e. whose
# gs part is empty (or itself ends with the separator), along with its count.
for pair_key, count in c.items():
    if not pair_key.endswith('@@@'):
        continue
    print(pair_key, count)

In [None]:
# NOTE(review): `train` and `val` are not defined in any visible cell before
# this point — the cell that splits `data` into the two sets appears to be
# missing, so on a fresh kernel this raises NameError unless they are defined
# outside the visible part of the notebook. TODO: restore the split cell.
train.to_csv(out_dir/'task2_train.csv')
val.to_csv(out_dir/'task2_val.csv')

In [None]:
# Reload the split from disk. keep_default_na=False stops pandas from turning
# literal texts such as 'NA', 'null', 'None' or 'nan' into missing values,
# which the fillna('') below would then silently replace with empty strings.
# Empty fields are read back as '' directly.
# NOTE(review): a numeric column containing empty fields would now load as
# strings instead of floats — none is expected here; confirm.
train = pd.read_csv(out_dir/'task2_train.csv', index_col=0, keep_default_na=False)
val = pd.read_csv(out_dir/'task2_val.csv', index_col=0, keep_default_na=False)

# Kept as a safeguard; with NA parsing disabled this should be a no-op.
train = train.fillna('')
val = val.fillna('')