In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path

from tqdm.notebook import tqdm

In [4]:
import inspect
import logging

from loguru import logger

class InterceptHandler(logging.Handler):
    """Forward standard-library ``logging`` records to the Loguru ``logger``."""

    def emit(self, record):
        """Re-emit ``record`` through Loguru, attributed to the original caller.

        Args:
            record: The ``logging.LogRecord`` handed over by the logging machinery.
        """
        # Get corresponding Loguru level if it exists
        try:
            level = logger.level(record.levelname).name
        except ValueError:
            # Level name unknown to Loguru: fall back to the numeric level.
            level = record.levelno

        # Find caller from where originated the logged message.
        # Start at this very frame and walk outwards, skipping this frame
        # (depth 0) and every frame that belongs to the logging package (or to
        # frozen importlib bootstrap code). Counting from here, instead of
        # relying on a fixed offset of logging.currentframe(), keeps the depth
        # correct regardless of how many internal frames logging uses. The
        # `while frame` guard also stops cleanly at the top of the stack.
        frame, depth = inspect.currentframe(), 0
        while frame:
            filename = frame.f_code.co_filename
            is_logging = filename == logging.__file__
            is_frozen = "importlib" in filename and "_bootstrap" in filename
            if depth > 0 and not (is_logging or is_frozen):
                break
            frame = frame.f_back
            depth += 1

        logger.opt(depth=depth, exception=record.exc_info).log(level, record.getMessage())

# Route everything logged through the standard library to Loguru: install the
# intercepting handler on the root logger and let every level through.
logging.basicConfig(level=logging.NOTSET, handlers=[InterceptHandler()])

In [6]:
# Folder holding the competition training data (per its name: ICDAR 2019
# post-OCR, without Finnish). The path is relative to the notebook's working
# directory.
in_dir = Path('../../data/ICDAR2019_POCR_competition_dataset/ICDAR2019_POCR_competition_training_18M_without_Finnish')
# Sanity check shown as the cell output: True when the folder exists.
in_dir.is_dir()

True

In [8]:
# Read one sample file as a list of lines. The encoding is given explicitly so
# the decoded characters (and therefore all character offsets computed later)
# do not depend on the platform's default locale.
# NOTE(review): assumes the dataset files are UTF-8 - confirm against the
# dataset documentation.
with open(in_dir/'NL'/'NL1'/'17.txt', encoding='utf-8') as f:
    lines = f.readlines()

In [16]:
def remove_label_and_nl(line, label_length=14):
    """Return the text of a data-file line without its label and line ending.

    Only the trailing line terminator is removed. Trailing spaces are part of
    the text and are kept, so that two aligned lines keep the same length
    after their labels have been cut off.

    Args:
        line: One raw line as read from the data file.
        label_length: Number of leading characters that make up the label.
            The default of 14 presumably matches prefixes such as
            ``[OCR_aligned] `` - verify against the dataset format.

    Returns:
        The line content after the label, without ``\\n`` / ``\\r\\n``.
    """
    return line.rstrip('\r\n')[label_length:]

# Going by the variable names, line 1 of the file holds the aligned OCR text
# and line 2 the aligned GS text; strip label and newline from both.
ocr_aligned, gs_aligned = (remove_label_and_nl(lines[i]) for i in (1, 2))

In [52]:
from dataclasses import dataclass

@dataclass
class Token:
    """One token of an aligned OCR / GS text pair, as produced by tokenize_aligned.

    GS presumably stands for "gold standard" - confirm against the dataset docs.
    """

    # OCR characters of the token with the '@' alignment characters removed.
    ocr: str
    # GS characters of the token with the '@' alignment characters removed.
    gs: str
    # OCR characters of the token exactly as they occur in the aligned string.
    ocr_aligned: str
    # GS characters of the token exactly as they occur in the aligned string.
    gs_aligned: str
    # Number of non-'@' OCR characters that precede the token, i.e. its offset
    # in the OCR text once the alignment characters are removed.
    start: int


def _build_token(ocr_chars, gs_chars, start, alignment_char):
    """Assemble one Token from the aligned characters collected for it."""
    return Token(
        ocr=''.join(c for c in ocr_chars if c != alignment_char),
        gs=''.join(c for c in gs_chars if c != alignment_char),
        ocr_aligned=''.join(ocr_chars),
        gs_aligned=''.join(gs_chars),
        start=start,
    )


def tokenize_aligned(ocr_aligned, gs_aligned, alignment_char='@'):
    """Split a pair of character-aligned OCR / GS strings into tokens.

    The two strings are walked in lockstep. A token boundary is a position
    where BOTH strings contain a space; the boundary space itself belongs to
    no token. A space that occurs in only one of the strings stays inside the
    current token.

    Args:
        ocr_aligned: OCR text padded with ``alignment_char``.
        gs_aligned: GS text padded with ``alignment_char``.
        alignment_char: Padding character inserted by the alignment
            (default ``'@'``).

    Returns:
        A list of ``Token`` objects in text order. ``Token.start`` is the
        number of non-alignment OCR characters before the token, so it indexes
        into the OCR text with the alignment characters removed. The list
        always contains at least one token: the characters after the last
        boundary (possibly none) form the final token, so empty input yields a
        single empty token.

    Note:
        If the inputs differ in length, the surplus tail of the longer one is
        ignored (``zip`` semantics).
    """
    tokens = []
    ocr_chars, gs_chars = [], []  # aligned characters of the token in progress
    ocr_cursor = 0                # position in the OCR text without padding
    token_start = 0               # ocr_cursor value where the current token began

    for ocr_char, gs_char in zip(ocr_aligned, gs_aligned):
        # Padding does not exist in the unaligned OCR text, so it must not
        # advance the cursor; every other character (spaces included) does.
        if ocr_char != alignment_char:
            ocr_cursor += 1

        if ocr_char == ' ' and gs_char == ' ':
            tokens.append(_build_token(ocr_chars, gs_chars, token_start, alignment_char))
            ocr_chars, gs_chars = [], []
            # The cursor already counted the boundary space, so it now points
            # at the first character of the next token.
            token_start = ocr_cursor
        else:
            ocr_chars.append(ocr_char)
            gs_chars.append(gs_char)

    # Whatever follows the last boundary is the final token.
    tokens.append(_build_token(ocr_chars, gs_chars, token_start, alignment_char))

    return tokens

# Tokenize the aligned OCR / GS strings extracted from the sample file.
tokens = tokenize_aligned(ocr_aligned, gs_aligned)

In [53]:
# Show only the first token as a quick sanity check.
for t in tokens[:1]:
    print(t)

Token(ocr='Europijcbe', gs=' Europische', ocr_aligned='@Europijcbe', gs_aligned=' Europische', start=0)


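To test whether the tokenization is correct, we compare each extracted OCR token with the slice of the unaligned OCR input text that begins at the token's `start` offset. The unaligned text sometimes contains literal alignment characters (`@`). Because the tokenizer drops every `@` from the token text and does not count it in the offsets, we also remove these characters from the unaligned text before comparing; once they are removed, the tokens line up with the text.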

In [51]:
# The tokenizer drops every '@' and does not count it in the offsets, so remove
# the literal '@' characters from the unaligned OCR text as well.
ocr_unaligned = remove_label_and_nl(lines[0]).replace('@', '')

# Report (rather than abort on) every token whose OCR text is not found at its
# recorded start offset. A plain comparison is used instead of `assert`, so the
# check is not silently skipped when Python runs with -O.
for t in tokens:
    expected = ocr_unaligned[t.start:t.start + len(t.ocr)]
    if t.ocr != expected:
        print(t)
        print(expected)