In [71]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path

from tqdm.notebook import tqdm

In [4]:
import logging

from loguru import logger

class InterceptHandler(logging.Handler):
    def emit(self, record):
        # Get corresponding Loguru level if it exists
        try:
            level = logger.level(record.levelname).name
        except ValueError:
            level = record.levelno

        # Find caller from where originated the logged message
        frame, depth = logging.currentframe(), 2
        while frame.f_code.co_filename == logging.__file__:
            frame = frame.f_back
            depth += 1

        logger.opt(depth=depth, exception=record.exc_info).log(level, record.getMessage())

logging.basicConfig(handlers=[InterceptHandler()], level=0)

In [6]:
in_dir = Path('../../data/ICDAR2019_POCR_competition_dataset/ICDAR2019_POCR_competition_training_18M_without_Finnish')
in_dir.is_dir()

True

In [8]:
with open(in_dir/'NL'/'NL1'/'17.txt') as f:
    lines = f.readlines()

In [16]:
def remove_label_and_nl(line):
    return line.strip()[14:]

ocr_aligned = remove_label_and_nl(lines[1])
gs_aligned = remove_label_and_nl(lines[2])

In [106]:
from dataclasses import dataclass
import edlib

@dataclass
class Token:
    ocr: str
    gs: str
    ocr_aligned: str
    gs_aligned: str
    start: int
    ed: int


def tokenize_aligned(ocr_aligned, gs_aligned, sentence_start=0):

    ocr_cursor = 0

    ocr_token_chars = []
    gs_token_chars = []
    ocr_token_chars_aligned = []
    gs_token_chars_aligned = []
    start_char = 0

    tokens = []

    for ocr_aligned_char, gs_aligned_char in zip(ocr_aligned, gs_aligned):
        #print(ocr_aligned_char, gs_aligned_char, ocr_cursor)
        if ocr_aligned_char != '@':
            ocr_cursor += 1

        if ocr_aligned_char == ' ' and gs_aligned_char == ' ':
            #print('TOKEN')
            #print('OCR:', repr(''.join(ocr_token_chars)))
            #print(' GS:', repr(''.join(gs_token_chars)))
            #print('start:', start_char)

            ed = edlib.align(''.join(ocr_token_chars_aligned), ''.join(gs_token_chars_aligned))

            tokens.append(Token(''.join(ocr_token_chars), 
                                ''.join(gs_token_chars), 
                                ''.join(ocr_token_chars_aligned), 
                                ''.join(gs_token_chars_aligned), 
                                sentence_start+start_char,
                                ed['editDistance']))

            ocr_token_chars = []
            gs_token_chars = []
            ocr_token_chars_aligned = []
            gs_token_chars_aligned = []
            start_char = ocr_cursor
        else:
            # TODO: handle # in gs(?)
            ocr_token_chars_aligned.append(ocr_aligned_char)
            gs_token_chars_aligned.append(gs_aligned_char)
            if ocr_aligned_char != '@':
                ocr_token_chars.append(ocr_aligned_char)
            if gs_aligned_char != '@':
                gs_token_chars.append(gs_aligned_char)
    ed = edlib.align(''.join(ocr_token_chars_aligned), ''.join(gs_token_chars_aligned))
    tokens.append(Token(''.join(ocr_token_chars), 
                        ''.join(gs_token_chars), 
                        ''.join(ocr_token_chars_aligned), 
                        ''.join(gs_token_chars_aligned), 
                        sentence_start+start_char,
                        ed['editDistance']))

    return tokens

tokens = tokenize_aligned(ocr_aligned, gs_aligned)

In [108]:
import nltk.data
import edlib

@dataclass
class Sentence:
    ocr: str
    gs: str
    ocr_aligned: str
    gs_aligned: str
    start: int
    tokens: list
    ed: int

def clean(string):
    string = string.replace('@', '')
    string = string.replace('#', '')

    return string


sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

for i, (start, end) in enumerate(sent_detector.span_tokenize(gs_aligned)):
    gs_sentence = gs_aligned[start: end]
    ocr_sentence = ocr_aligned[start: end]
    print(start, end)
    print(gs_sentence)
    print(ocr_sentence)

    ed = edlib.align(ocr_sentence, gs_sentence)
    

    tokens = tokenize_aligned(ocr_sentence, gs_sentence, sentence_start=start)
    sent = Sentence(clean(ocr_sentence), clean(gs_sentence), ocr_sentence, gs_sentence, start, tokens, ed['editDistance'])
    #for token in tokens:
    #    print(token)
    print(sent)
    print('---')
    if i == 2:
        break

0 37
 Europische Donderdaeghs Courant @N°.
@Europijcbe Donderdaeghs Courant S\£.
Sentence(ocr='Europijcbe Donderdaeghs Courant S\\£.', gs=' Europische Donderdaeghs Courant N°.', ocr_aligned='@Europijcbe Donderdaeghs Courant S\\£.', gs_aligned=' Europische Donderdaeghs Courant @N°.', start=0, tokens=[Token(ocr='Europijcbe', gs=' Europische', ocr_aligned='@Europijcbe', gs_aligned=' Europische', start=0, ed=3), Token(ocr='Donderdaeghs', gs='Donderdaeghs', ocr_aligned='Donderdaeghs', gs_aligned='Donderdaeghs', start=11, ed=0), Token(ocr='Courant', gs='Courant', ocr_aligned='Courant', gs_aligned='Courant', start=24, ed=0), Token(ocr='S\\£.', gs='N°.', ocr_aligned='S\\£.', gs_aligned='@N°.', start=32, ed=3)], ed=6)
---
38 41
44.
44.
Sentence(ocr='44.', gs='44.', ocr_aligned='44.', gs_aligned='44.', start=38, tokens=[Token(ocr='44.', gs='44.', ocr_aligned='44.', gs_aligned='44.', start=38, ed=0)], ed=0)
---
41 66
@ Wt Romen den 8 October.
' Wc Romen den 8 Oftober.
Sentence(ocr="' Wc Romen den

In [107]:
for t in tokens:
    print(t)
    break

Token(ocr='Europijcbe', gs=' Europische', ocr_aligned='@Europijcbe', gs_aligned=' Europische', start=0, ed=3)


To test whether the tokenization is correct, we compare the extracted token strings with the unaligned OCR input text. Sometimes, this text contains alignment characters, and if we remove those, the alignment is correct.

In [51]:
ocr_unaligned = remove_label_and_nl(lines[0])
ocr_unaligned = ocr_unaligned.replace('@', '')

for t in tokens:
    try:
        assert t.ocr == ocr_unaligned[t.start:t.start+len(t.ocr)]
    except AssertionError:
        print(t)
        print(ocr_unaligned[t.start:t.start+len(t.ocr)])

In [63]:
result = {}

for i, t in enumerate(tokens):
    if t.ocr != t.gs:
        #print(t)
        #print(t.start)
        #print(len(t.ocr.split()))
        task1_result = f'{t.start}:{len(t.ocr.split())}'
        #print(task1_result)
        result[task1_result] = {}

In [64]:
result

{'0:1': {},
 '32:1': {},
 '37:1': {},
 '42:1': {},
 '57:1': {},
 '66:1': {},
 '69:1': {},
 '78:1': {},
 '88:2': {},
 '95:1': {},
 '105:1': {},
 '113:1': {},
 '122:2': {},
 '146:2': {},
 '154:1': {},
 '161:1': {},
 '168:1': {},
 '172:1': {},
 '178:1': {},
 '184:2': {},
 '196:1': {},
 '199:1': {},
 '206:1': {},
 '213:3': {},
 '221:1': {},
 '230:1': {},
 '236:1': {},
 '246:1': {},
 '255:1': {},
 '269:1': {},
 '280:1': {},
 '290:1': {},
 '304:2': {},
 '309:1': {},
 '315:1': {},
 '319:1': {},
 '323:1': {},
 '335:2': {},
 '365:1': {},
 '371:1': {},
 '375:1': {},
 '385:1': {},
 '395:1': {},
 '398:2': {},
 '417:1': {},
 '422:1': {},
 '429:1': {},
 '442:1': {},
 '456:1': {},
 '459:1': {},
 '463:1': {},
 '471:1': {},
 '480:1': {},
 '491:1': {},
 '495:1': {},
 '506:1': {},
 '509:1': {},
 '513:1': {},
 '518:1': {},
 '521:1': {},
 '528:1': {},
 '541:2': {},
 '548:1': {},
 '552:1': {},
 '555:1': {},
 '576:1': {},
 '587:1': {},
 '592:1': {},
 '606:1': {},
 '613:1': {},
 '624:1': {},
 '632:1': {},
 '6

In [69]:
output = {
    'NL/NL1/17.txt': result
}

In [70]:
import json
with open('result.json', 'w') as f:
    json.dump(output, f, indent=2)