In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path

from tqdm.notebook import tqdm

In [None]:
import os
# Set the log level for loguru through its environment variable.
# NOTE(review): presumably this must run before `loguru` is first imported
# (the import happens in the next cell) — confirm against the loguru docs.
os.environ['LOGURU_LEVEL'] = 'INFO'

In [None]:
import logging

from loguru import logger

class InterceptHandler(logging.Handler):
    """Handler that forwards standard-library ``logging`` records to loguru's ``logger``."""

    def emit(self, record):
        """Re-emit ``record`` through loguru.

        The loguru level with the same name as the record's level is used when
        it exists; otherwise the numeric level of the record is passed on.
        """
        # Get corresponding Loguru level if it exists
        try:
            level = logger.level(record.levelname).name
        except ValueError:
            level = record.levelno

        # Find caller from where originated the logged message
        frame, depth = logging.currentframe(), 2
        # `f_back` is None for the outermost frame, so stop when the stack is
        # exhausted instead of failing on `None.f_code`.
        while frame is not None and frame.f_code.co_filename == logging.__file__:
            frame = frame.f_back
            depth += 1

        logger.opt(depth=depth, exception=record.exc_info).log(level, record.getMessage())

# Install the InterceptHandler as the only handler of the root logger so that
# records from the standard `logging` module end up in loguru.
# level=0 puts no level threshold on the root logger.
logging.basicConfig(handlers=[InterceptHandler()], level=0)

In [None]:
# Directory with the training data (relative path)
in_dir = Path('../../data/ICDAR2019_POCR_competition_dataset/ICDAR2019_POCR_competition_training_18M_without_Finnish')
# Show whether the data directory exists
in_dir.is_dir()

In [None]:
# Read one sample document. The encoding is passed explicitly so that decoding
# (and therefore every character offset computed later) does not depend on the
# platform's default encoding.
with open(in_dir/'NL'/'NL1'/'17.txt', encoding='utf-8') as f:
    lines = f.readlines()

In [None]:
def remove_label_and_nl(line):
    """Return ``line`` without surrounding whitespace and without its first 14 characters.

    The whitespace (including the trailing newline) is stripped first; the
    14 leading characters of the stripped line are then dropped (per the
    function name, this is the label at the start of each line).
    """
    stripped = line.strip()
    return stripped[14:]

# The second line of the file is used as the aligned OCR text and the third line
# as the aligned GS text (presumably the gold standard — confirm against the dataset description).
ocr_aligned = remove_label_and_nl(lines[1])
gs_aligned = remove_label_and_nl(lines[2])

In [None]:
from dataclasses import dataclass, field
import edlib

def normalized_ed(ed, ocr, gs):
    """Normalize an edit distance by the length of the longer of two strings.

    Args:
        ed: edit distance between ``ocr`` and ``gs``.
        ocr: first string.
        gs: second string.

    Returns:
        ``ed`` divided by ``max(len(ocr), len(gs))``, or ``0.0`` when both
        strings are empty.
    """
    longest = max(len(ocr), len(gs))
    return ed / longest if longest > 0 else 0.0


@dataclass
class Token:
    """One token of an aligned OCR/GS text pair, as built by ``tokenize_aligned``.

    ``score`` is not a constructor argument; it is derived from ``ed`` and the
    aligned strings in ``__post_init__``.
    """
    # OCR text of the token without '@' alignment characters
    ocr: str
    # GS text of the token without '@' alignment characters
    gs: str
    # OCR text of the token including alignment characters
    ocr_aligned: str
    # GS text of the token including alignment characters
    gs_aligned: str
    # Start offset of the token (sentence offset + offset within the tokenized string)
    start: int
    # Edit distance between ocr_aligned and gs_aligned
    ed: int
    # Edit distance normalized by the length of the longer aligned string
    score: float = field(init=False)

    def __post_init__(self) -> None:
        # Derive the normalized score once, right after construction
        self.score = normalized_ed(self.ed, self.ocr_aligned, self.gs_aligned)
        


def tokenize_aligned(ocr_aligned, gs_aligned, sentence_start=0):
    """Split two aligned strings into tokens at positions where both contain a space.

    Args:
        ocr_aligned: OCR text; '@' marks alignment padding.
        gs_aligned: GS text of the same alignment; '@' marks alignment padding.
        sentence_start: offset added to the start position of every token.

    Returns:
        A list of ``Token`` objects. A token is emitted for every position where
        both strings have a space, plus one for the remaining characters at the
        end (so the list always holds at least one, possibly empty, token).
        Token start positions count OCR characters other than '@'.
    """
    def build_token(ocr_chars, gs_chars, ocr_chars_al, gs_chars_al, start):
        # Join the collected characters and compute the token's edit distance
        ocr_al = ''.join(ocr_chars_al)
        gs_al = ''.join(gs_chars_al)
        distance = edlib.align(ocr_al, gs_al)['editDistance']
        return Token(''.join(ocr_chars), ''.join(gs_chars), ocr_al, gs_al,
                     sentence_start + start, distance)

    tokens = []

    # Buffers for the token under construction (with and without '@')
    ocr_buf, gs_buf, ocr_buf_al, gs_buf_al = [], [], [], []
    token_start = 0
    # Number of OCR characters other than '@' seen so far
    ocr_pos = 0

    for ocr_char, gs_char in zip(ocr_aligned, gs_aligned):
        ocr_is_padding = ocr_char == '@'
        if not ocr_is_padding:
            ocr_pos += 1

        if ocr_char == ' ' and gs_char == ' ':
            # Token boundary: emit the buffered token and start a new one
            # right after the space.
            tokens.append(build_token(ocr_buf, gs_buf, ocr_buf_al, gs_buf_al, token_start))
            ocr_buf, gs_buf, ocr_buf_al, gs_buf_al = [], [], [], []
            token_start = ocr_pos
            continue

        # TODO: handle # in gs(?)
        ocr_buf_al.append(ocr_char)
        gs_buf_al.append(gs_char)
        if not ocr_is_padding:
            ocr_buf.append(ocr_char)
        if gs_char != '@':
            gs_buf.append(gs_char)

    # Whatever is left after the last boundary forms the final token
    tokens.append(build_token(ocr_buf, gs_buf, ocr_buf_al, gs_buf_al, token_start))

    return tokens

# Tokenize the complete sample document (sentence_start defaults to 0)
tokens = tokenize_aligned(ocr_aligned, gs_aligned)

In [None]:
print(tokens[3])

In [None]:
import nltk.data
import edlib

@dataclass
class Sentence:
    """One sentence of an aligned OCR/GS text pair, as built by ``extract_sentences``.

    ``score`` is not a constructor argument; it is derived from ``ed`` and the
    aligned strings in ``__post_init__``.
    """
    # OCR text of the sentence with '@' and '#' removed
    ocr: str
    # GS text of the sentence with '@' and '#' removed
    gs: str
    # OCR text of the sentence including alignment characters
    ocr_aligned: str
    # GS text of the sentence including alignment characters
    gs_aligned: str
    # Start offset of the sentence in the aligned text
    start: int
    # Tokens of the sentence, as returned by tokenize_aligned
    tokens: list
    # Edit distance between ocr_aligned and gs_aligned
    ed: int
    # Edit distance normalized by the length of the longer aligned string
    score: float = field(init=False)

    def __post_init__(self) -> None:
        # Derive the normalized score once, right after construction
        self.score = normalized_ed(self.ed, self.ocr_aligned, self.gs_aligned)


def clean(string):
    """Return ``string`` with every '@' and '#' character removed."""
    return string.replace('@', '').replace('#', '')

def extract_sentences(in_file):
    """Split the aligned text in ``in_file`` into sentences with token-level data.

    The second and third line of the file are used as the aligned OCR and GS
    text. Sentence boundaries are detected on the aligned GS text and the same
    character spans are cut from the aligned OCR text.

    Args:
        in_file: path of the text file to read (decoded as UTF-8).

    Returns:
        A list of ``Sentence`` objects. ``Sentence.start`` is the offset of the
        sentence in the aligned text; the ``start`` of each token is an offset
        in the OCR text without '@' alignment characters.

    Raises:
        IndexError: if the file has fewer than three lines.
    """
    # Explicit encoding: decoding (and every offset derived from it) must not
    # depend on the platform's default encoding.
    with open(in_file, encoding='utf-8') as f:
        lines = f.readlines()

    ocr_aligned = remove_label_and_nl(lines[1])
    gs_aligned = remove_label_and_nl(lines[2])

    # NOTE(review): the English punkt model is used for every language — confirm this is intended.
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    sentences = []
    for start, end in sent_detector.span_tokenize(gs_aligned):
        gs_sentence = gs_aligned[start: end]
        ocr_sentence = ocr_aligned[start: end]

        ed = edlib.align(ocr_sentence, gs_sentence)

        # `start` is an offset in the aligned text, but tokenize_aligned counts
        # OCR characters without '@'. Subtract the padding that precedes the
        # sentence so token offsets are consistently expressed in OCR text
        # without alignment characters.
        ocr_start = start - ocr_aligned.count('@', 0, start)

        tokens = tokenize_aligned(ocr_sentence, gs_sentence, sentence_start=ocr_start)
        sent = Sentence(clean(ocr_sentence), clean(gs_sentence), ocr_sentence, gs_sentence, start, tokens, ed['editDistance'])
        sentences.append(sent)
    return sentences

In [None]:
# Extract the sentences of the sample document
sentences = extract_sentences(in_dir/'NL'/'NL1'/'17.txt')

In [None]:
sentences[0]

In [None]:
%%time
import os

in_dir = Path('../../data/ICDAR2019_POCR_competition_dataset/ICDAR2019_POCR_competition_training_18M_without_Finnish')

# Extracted sentences per file, keyed by the file path relative to in_dir
data = {}
# df of sentence scores (one entry per sentence)
scores = []
languages = []

# File table (one entry per file)
subsets = []
file_languages = []
file_names = []

# iterdir() and rglob() yield entries in arbitrary order. Sorting makes the
# order of the file table deterministic, which the seeded train/dev split
# further down needs in order to be reproducible.
for language_dir in sorted(in_dir.iterdir()):
    language = language_dir.stem

    for text_file in tqdm(sorted(language_dir.rglob('*.txt')), desc=language):
        relative_path = text_file.relative_to(in_dir)
        key = str(relative_path)
        data[key] = extract_sentences(text_file)
        # The second path component (below the language directory) is the subset
        subsets.append(relative_path.parts[1])
        file_languages.append(language)
        file_names.append(key)
        for s in data[key]:
            scores.append(s.score)
            languages.append(language)

In [None]:
import pickle

# Store the extracted sentences on disk so they can be reloaded without re-running the extraction
with open('train.pickle', 'wb') as f:
    pickle.dump(data, f)

In [None]:
%%time
import pickle

# Reload the sentences from train.pickle.
# Unpickling can execute arbitrary code: only load pickle files you created yourself.
with open('train.pickle', 'rb') as f:
    data = pickle.load(f)

In [None]:
# One row per sentence: its normalized edit distance and the language it belongs to
df = pd.DataFrame({'score': scores,
                   'language': languages})
df.to_csv('train-scores.csv')

In [None]:
# Reload the sentence scores; the first CSV column is the index written by to_csv
df = pd.read_csv('train-scores.csv', index_col=0)
df.head()

In [None]:
df.score.describe()

In [None]:
df.score.hist(figsize=(10, 5))

To test whether the tokenization is correct, we compare each extracted OCR token string with the unaligned OCR input text at the token's start offset. Sometimes the unaligned input text itself contains alignment characters (`@`); once those are removed, the tokens match the input text.

In [None]:
# The first line of the file is the unaligned OCR input; drop any alignment characters it contains
ocr_unaligned = remove_label_and_nl(lines[0])
ocr_unaligned = ocr_unaligned.replace('@', '')

# Print every token whose text differs from the input text at its start offset.
# An explicit comparison is used instead of assert/except AssertionError,
# because assert statements are removed when Python runs with -O.
for t in tokens:
    expected = ocr_unaligned[t.start:t.start+len(t.ocr)]
    if t.ocr != expected:
        print(t)
        print(expected)

Export to flair corpus

In [None]:
# Divide into train and dev set
# Table with one row per input file (path, subset and language); it is the input of the split below
files = pd.DataFrame.from_dict({'file_name': file_names,
                                'subset': subsets,
                                'language': file_languages})
files.to_csv('train-file-data.csv')
files.shape

In [None]:
# Reload the file table; the first CSV column is the index written by to_csv
files = pd.read_csv('train-file-data.csv', index_col=0)
print(files.shape)
files.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the files as dev set, stratified by subset.
# random_state fixes the shuffle, so the split is repeatable for a given row order of `files`.
train_files, dev_files = train_test_split(files, test_size=0.1, random_state=42, stratify=files['subset'])

In [None]:
# test data
# NOTE: this cell rebinds in_dir, subsets, file_languages and file_names, which
# hold the training values before this point. Cells above that read these
# names will see the test values if they are re-run after this cell.
in_dir = Path('../../data/ICDAR2019_POCR_competition_dataset/ICDAR2019_POCR_competition_evaluation_4M_without_Finnish')

# Extracted sentences per test file, keyed by the file path relative to in_dir
test_data = {}

subsets = []
file_languages = []
file_names = []

# iterdir() and rglob() yield entries in arbitrary order; sort them so the
# file table (and the files written from it) have a deterministic order.
for language_dir in sorted(in_dir.iterdir()):
    language = language_dir.stem
    for text_file in tqdm(sorted(language_dir.rglob('*.txt')), desc=language):
        relative_path = text_file.relative_to(in_dir)
        key = str(relative_path)
        test_data[key] = extract_sentences(text_file)
        # The second path component (below the language directory) is the subset
        subsets.append(relative_path.parts[1])
        file_languages.append(language)
        file_names.append(key)

In [None]:
import pickle

# Store the extracted test sentences on disk so they can be reloaded without re-running the extraction
with open('test.pickle', 'wb') as f:
    pickle.dump(test_data, f)

In [None]:
%%time
import pickle

# Reload the test sentences from test.pickle.
# Unpickling can execute arbitrary code: only load pickle files you created yourself.
with open('test.pickle', 'rb') as f:
    test_data = pickle.load(f)

In [None]:
# Table with one row per test file (path, subset and language)
test_files = pd.DataFrame.from_dict({'file_name': file_names,
                                     'subset': subsets,
                                     'language': file_languages})
# Write and display the *test* table. Writing `files` here would store the
# training file list in test-file-data.csv, and the test export would then
# look up training file names in test_data.
test_files.to_csv('test-file-data.csv')
test_files.shape

In [None]:
# Reload the test file table; the first CSV column is the index written by to_csv
test_files = pd.read_csv('test-file-data.csv', index_col=0)
test_files.head()

In [None]:
# TODO: count sentences (to compare with dataset)
def text2columns(text, threshold=1.0):
    """Render the sentences of one text as tab-separated ``token<TAB>label`` lines.

    Args:
        text: iterable of sentences; each has a ``score`` and a list of
            ``tokens`` with ``ocr`` and ``ed`` attributes.
        threshold: only sentences with ``score <= threshold`` are written.

    Returns:
        A string with one line per token that has a non-empty OCR string.
        The label is 0 when the token's edit distance is 0 and 1 otherwise.
        Sentences are separated by an empty line.
    """
    rows = []
    for sent in text:
        if not (sent.score <= threshold):
            continue

        rows.extend(
            f'{tok.ocr}\t{0 if tok.ed == 0 else 1}\n'
            for tok in sent.tokens
            if tok.ocr != ''
        )

        # Separate sentences with an empty line (never two in a row, never at the very start)
        if rows and rows[-1] != '\n':
            rows.append('\n')
    return ''.join(rows)

print(text2columns(data['NL/NL1/17.txt']))

In [None]:
def save_column_data(file_df, data, out_file, threshold=1.0):
    """Write the column-format text of a set of files to ``out_file``.

    Args:
        file_df: table with a ``file_name`` column; every value must be a key of ``data``.
        data: mapping from file name to the sentences of that file.
        out_file: path of the output file; an existing file is overwritten.
        threshold: passed on to ``text2columns``.

    Raises:
        KeyError: if a file name is missing from ``data``.
    """
    # Tokens contain non-ASCII characters; write UTF-8 regardless of the
    # platform's default encoding.
    with open(out_file, 'w', encoding='utf-8') as f:
        for key in tqdm(file_df['file_name']):
            f.write(text2columns(data[key], threshold=threshold))

In [None]:
# all data
# (no threshold argument, so the default of 1.0 is used)

save_column_data(train_files, data, 'train.txt')
save_column_data(dev_files, data, 'dev.txt')
save_column_data(test_files, test_data, 'test.txt')

In [None]:
# high quality data
# (only sentences with a normalized edit distance of at most 0.4 are written)

save_column_data(train_files, data, 'train-0.4.txt', threshold=0.4)
save_column_data(dev_files, data, 'dev-0.4.txt', threshold=0.4)
save_column_data(test_files, test_data, 'test-0.4.txt', threshold=0.4)

In [None]:
# French only
# (rows of the file tables whose language column equals "FR"; default threshold)

save_column_data(train_files.query('language == "FR"'), data, 'train-french.txt')
save_column_data(dev_files.query('language == "FR"'), data, 'dev-french.txt')
save_column_data(test_files.query('language == "FR"'), test_data, 'test-french.txt')

Create competition result

In [None]:
# Collect one entry for every token of the sample document whose OCR text
# differs from its GS text.
result = {}

for t in tokens:
    if t.ocr == t.gs:
        continue
    # Key: '<start offset>:<number of whitespace-separated parts of the OCR token>'
    n_ocr_parts = len(t.ocr.split())
    result[f'{t.start}:{n_ocr_parts}'] = {}

In [None]:
result

In [None]:
# Results are keyed by the path of the input file they belong to
output = {
    'NL/NL1/17.txt': result
}

In [None]:
import json
# Write the result as indented JSON to result.json (an existing file is overwritten)
with open('result.json', 'w') as f:
    json.dump(output, f, indent=2)