In [None]:
# Reload imported modules automatically before each cell runs, so edits to
# local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path

from tqdm.notebook import tqdm

In [None]:
import os
# Configure loguru's default level through its environment variable. This cell
# runs before the cell that imports loguru.
# NOTE(review): loguru is expected to read LOGURU_LEVEL at import time - confirm.
os.environ['LOGURU_LEVEL'] = 'INFO'

In [None]:
import logging

from loguru import logger

class InterceptHandler(logging.Handler):
    """Forward standard-library ``logging`` records to loguru.

    Install an instance as a handler (for example on the root logger) so that
    messages emitted through ``logging`` are re-emitted through the loguru
    ``logger`` with the same level, message and exception information.
    """

    def emit(self, record):
        """Re-emit ``record`` through loguru, attributed to the original caller.

        Args:
            record: The ``logging.LogRecord`` to forward.
        """
        # Function-scope import keeps this definition self-contained.
        import inspect

        # Get corresponding Loguru level if it exists
        try:
            level = logger.level(record.levelname).name
        except ValueError:
            # Unknown level name (e.g. a custom numeric level): pass the number.
            level = record.levelno

        # Find caller from where originated the logged message.
        # Start at this frame (depth 0) and walk outwards past every frame that
        # belongs to the logging module. Starting from logging.currentframe()
        # with a fixed offset is not portable: its meaning changed in
        # Python 3.11, which made loguru report the logging internals as the
        # origin of each message.
        frame, depth = inspect.currentframe(), 0
        while frame:
            filename = frame.f_code.co_filename
            is_logging = filename == logging.__file__
            is_frozen = "importlib" in filename and "_bootstrap" in filename
            if depth > 0 and not (is_logging or is_frozen):
                break
            frame = frame.f_back
            depth += 1

        logger.opt(depth=depth, exception=record.exc_info).log(level, record.getMessage())

# Install the intercept handler on the root logger. level=0 (NOTSET) lets every
# record reach the handler. Note that basicConfig is a no-op when the root
# logger already has handlers.
logging.basicConfig(handlers=[InterceptHandler()], level=0)

In [None]:
# Root of the ICDAR2019 POCR training data, relative to this notebook.
in_dir = Path('../../data/ICDAR2019_POCR_competition_dataset/ICDAR2019_POCR_competition_training_18M_without_Finnish')
# Displays True when the directory exists - a quick check that the data is in place.
in_dir.is_dir()

In [None]:
from datautils import remove_label_and_nl

In [None]:
from datautils import normalized_ed, AlignedToken, InputToken, \
    get_input_tokens, tokenize_aligned

In [None]:
# Sanity check: an aligned token whose OCR text contains a space is expected to
# produce two input tokens, labelled 1 and 2.
# NOTE(review): presumably 0 = correct, 1 = start of an OCR mistake,
# 2 = continuation of one - confirm against datautils.
t = AlignedToken('Long ow.', 'Longhow.', 'Long ow.', 'Longhow.', 24, 8)
print(t)

# Gather (ocr, label, gs) for each generated input token, printing as we go.
collected = []
for inp_tok in get_input_tokens(t):
    print(inp_tok)
    collected.append((inp_tok.ocr, inp_tok.label, inp_tok.gs))

tokens = [ocr for ocr, _, _ in collected]
labels = [label for _, label, _ in collected]
gs = [gold for _, _, gold in collected]

assert tokens == ['Long', 'ow.']
assert labels == [1, 2]
# The gold-standard pieces must concatenate back to the token's full gold standard.
assert ''.join(gs) == t.gs

In [None]:
# Sanity check: a single OCR token ('INEVR') whose gold standard contains a
# space ('I NEVER') is expected to stay one input token, labelled 1.
t = AlignedToken('INEVR', 'I NEVER', 'I@NEV@R', 'I NEVER', 0, 5)
print(t)

# Gather (ocr, label, gs) for each generated input token, printing as we go.
collected = []
for inp_tok in get_input_tokens(t):
    print(inp_tok)
    collected.append((inp_tok.ocr, inp_tok.label, inp_tok.gs))

tokens = [ocr for ocr, _, _ in collected]
labels = [label for _, label, _ in collected]
gs = [gold for _, _, gold in collected]

assert tokens == ['INEVR']
assert labels == [1]
# The gold-standard pieces must concatenate back to the token's full gold standard.
assert ''.join(gs) == t.gs

In [None]:
# Sanity check: a token whose OCR text equals its gold standard is expected to
# yield one input token, labelled 0.
t = AlignedToken('Major', 'Major', 'Major', 'Major', 19, 5)
print(t)

# Gather (ocr, label, gs) for each generated input token, printing as we go.
collected = []
for inp_tok in get_input_tokens(t):
    print(inp_tok)
    collected.append((inp_tok.ocr, inp_tok.label, inp_tok.gs))

tokens = [ocr for ocr, _, _ in collected]
labels = [label for _, label, _ in collected]
gs = [gold for _, _, gold in collected]

assert tokens == ['Major']
assert labels == [0]
# The gold-standard pieces must concatenate back to the token's full gold standard.
assert ''.join(gs) == t.gs

In [None]:
from datautils import window

In [None]:
from datautils import process_text

# Training-data root (same value as assigned earlier in the notebook).
in_dir = Path('../../data/ICDAR2019_POCR_competition_dataset/ICDAR2019_POCR_competition_training_18M_without_Finnish')

# Process a single training file as an example; the following cells inspect
# the result's tokens, input_tokens and score.
text = process_text(in_dir/'NL'/'NL1'/'17.txt')

In [None]:
# Spot-check one entry of the example text's tokens.
text.tokens[29]

In [None]:
# Spot-check one entry of the example text's input tokens.
text.input_tokens[33]

In [None]:
# Score of the example text.
# NOTE(review): presumably a normalised edit distance between OCR and gold
# standard - confirm in datautils.
text.score

In [None]:
from datautils import generate_data

In [None]:
%%time
# Train and val data
# Produces the processed data plus a metadata table (`md`) for the training
# directory; the train/validation split is made from `md` further down.
in_dir = Path('../../data/ICDAR2019_POCR_competition_dataset/ICDAR2019_POCR_competition_training_18M_without_Finnish')
data, md = generate_data(in_dir)

In [None]:
# Display the training metadata table.
md

In [None]:
%%time
# test data
# Same processing as for the training directory, applied to the evaluation
# directory. Note that `in_dir` now points at the evaluation data.
in_dir = Path('../../data/ICDAR2019_POCR_competition_dataset/ICDAR2019_POCR_competition_evaluation_4M_without_Finnish')
data_test, md_test = generate_data(in_dir)

In [None]:
# Display the evaluation (test) metadata table.
md_test

In [None]:
# Summary statistics of num_tokens in the training metadata.
md.num_tokens.describe()

In [None]:
# Histogram of num_tokens in the training metadata (2000 bins).
md.num_tokens.hist(bins=2000, figsize=(10,5))

In [None]:
# Summary statistics of num_input_tokens in the training metadata.
md.num_input_tokens.describe()

In [None]:
# Histogram of num_input_tokens in the training metadata (2000 bins).
md.num_input_tokens.hist(bins=2000, figsize=(10,5))

In [None]:
# Summary statistics of the score column in the training metadata.
md.score.describe()

In [None]:
# Histogram of the score column in the training metadata (50 bins).
md.score.hist(bins=50, figsize=(10,5))

In [None]:
# Token-count statistics restricted to rows with score <= 0.3, the same
# threshold used later in the notebook to filter the dataset.
md.query('score <= 0.3').num_tokens.describe()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training metadata rows for validation, stratified by
# language so both parts keep the same language proportions.
# A fixed seed makes the split reproducible: without it every run of this cell
# writes a different train/val partition to disk.
SPLIT_SEED = 42

# Only the metadata frame is split; no separate target array is needed.
X_train, X_val = train_test_split(md, test_size=0.1, shuffle=True,
                                  stratify=md['language'],
                                  random_state=SPLIT_SEED)

In [None]:
# Output directory for the generated split files; created if it does not
# exist yet, and left alone if it does.
out_dir = Path('icdar-dataset-20220207')
out_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Persist the train and validation metadata so the split can be reloaded later.
for split_frame, csv_name in ((X_train, 'train.csv'), (X_val, 'val.csv')):
    split_frame.to_csv(out_dir / csv_name)

In [None]:
# The evaluation metadata is used as the test split without further changes.
X_test = md_test

X_test.to_csv(out_dir/'test.csv')

In [None]:
# Reload the three splits from the CSV files in out_dir.
# NOTE(review): the files were written with the DataFrame index included, and
# read_csv is called without index_col, so the reloaded frames carry the old
# index as an extra unnamed column - confirm downstream code ignores it.
X_train, X_val, X_test = (
    pd.read_csv(out_dir / csv_name)
    for csv_name in ('train.csv', 'val.csv', 'test.csv')
)

In [None]:
# Generate 'sentences' for the train, val and test sets

from datautils import generate_sentences

# All three splits share the same windowing arguments.
# NOTE(review): presumably size = tokens per window and step = stride between
# windows - confirm in datautils.
window_args = {'size': 35, 'step': 30}

train_data = generate_sentences(X_train, data, **window_args)
val_data = generate_sentences(X_val, data, **window_args)
test_data = generate_sentences(X_test, data_test, **window_args)

In [None]:
# Preview the first generated training sentences.
train_data.head()

In [None]:
# Preview the first generated test sentences.
test_data.head()

In [None]:
# Write each split as JSON Lines: one record per line.
for split_data, jsonl_name in (
    (train_data, 'icdar_train.jsonl'),
    (val_data, 'icdar_val.jsonl'),
    (test_data, 'icdar_test.jsonl'),
):
    split_data.to_json(out_dir / jsonl_name, orient='records', lines=True)

In [None]:
from datasets import load_dataset

out_dir = Path('icdar-dataset-20220207')

# Map each split name to the path of its JSON Lines file (as a string).
split_files = (
    ('train', 'icdar_train.jsonl'),
    ('val', 'icdar_val.jsonl'),
    ('test', 'icdar_test.jsonl'),
)
data_files = {split: str(out_dir / jsonl_name) for split, jsonl_name in split_files}

# Load all three splits with the generic JSON loader.
icdar_dataset = load_dataset("json", data_files=data_files)

In [None]:
# Display the loaded dataset (before filtering on score).
icdar_dataset

In [None]:
# Summary statistics of the score column for the training sentences.
train_data.score.describe()

In [None]:
# Histogram of the score column for the training sentences (50 bins).
train_data.score.hist(bins=50, figsize=(10,5))

In [None]:
# Summary statistics of the score column for the validation sentences.
val_data.score.describe()

In [None]:
# Histogram of the score column for the validation sentences (50 bins).
val_data.score.hist(bins=50, figsize=(10,5))

In [None]:
def _score_at_most_0_3(sample):
    """Return True when the sample's score does not exceed 0.3."""
    return sample['score'] <= 0.3


# Keep only the samples that pass the score threshold, in every split.
icdar_dataset = icdar_dataset.filter(_score_at_most_0_3)

In [None]:
# Display the dataset after filtering on score.
icdar_dataset

In [None]:
# Save the filtered dataset; the '0.3' in the directory name matches the score
# threshold used in the filter.
icdar_dataset.save_to_disk('icdar-0.3')

In [None]:
from datasets import load_from_disk

# Reload the saved dataset from disk, replacing the in-memory one.
icdar_dataset = load_from_disk('icdar-0.3')

In [None]:
# Display the dataset as reloaded from disk.
icdar_dataset