## Preparation

## Preparation

### Resource Paths

In [34]:
# Repository root (relative to this notebook) and the scratch directories below it.
PROJECT_ROOT_PATH = '../../../..'
TEMP_PATH = '/'.join([PROJECT_ROOT_PATH, 'tmp'])
DATA_PATH = '/'.join([TEMP_PATH, 'detect', 'data'])
MODEL_PATH = '/'.join([TEMP_PATH, 'detect', 'model'])

# Token tables consumed by this notebook and the re-aligned tables it writes.
WORDS_TRAIN_PATH = '/'.join([DATA_PATH, 'words.train.tsv'])
WORDS_TEST_PATH = '/'.join([DATA_PATH, 'words.test.tsv'])
OUT_WORDS_TRAIN_PATH = '/'.join([DATA_PATH, 'words.train.fix.tsv'])
OUT_WORDS_TEST_PATH = '/'.join([DATA_PATH, 'words.test.fix.tsv'])

# Directory holding the OCR text files that the tokens are aligned against.
MIBIO_PATH = '/'.join([PROJECT_ROOT_PATH,
                       'ocrrect-experiment', 'src', 'main', 'resources', 'mibio-ocr'])
OCR_TEXT_PATH = '/'.join([MIBIO_PATH, 'ocr'])

### Data Load

In [2]:
import glob

# Concatenate every file under OCR_TEXT_PATH into one string. Each file is read
# inside a `with` block so its handle is closed as soon as the content is in memory.
# NOTE(review): glob.glob() does not promise any ordering, yet the concatenation
# order determines every character offset computed later — confirm the files come
# back in the same order the token tables were generated from.
_ocr_chunks = []
for _ocr_path in glob.glob(OCR_TEXT_PATH + '/*'):
    with open(_ocr_path, 'r') as _ocr_file:
        _ocr_chunks.append(_ocr_file.read())
OCR_TEXT = ''.join(_ocr_chunks)

In [35]:
def _read_words(path):
    words = []
    with open(path, 'r') as word_file:
        for line in word_file:
            line = line.rstrip()
            _, _, _, _, word, _, _, _, pos = line.split('\t')
            words.append((word, pos))
    return words
            
# Parse the training table once and reuse it both for the split marker and for
# the combined token list (it used to be read from disk twice).
_TRAIN_WORDS = _read_words(WORDS_TRAIN_PATH)

# Print out the position recorded on the last training word before any ad-hoc
# modification; it is used later to split the output into train and test again.
SPLIT_POS = int(_TRAIN_WORDS[-1][1])
print(SPLIT_POS)

# Training tokens followed by test tokens. `+` builds a new list, so later
# in-place edits of UNALIGNED_WORDS leave _TRAIN_WORDS untouched.
UNALIGNED_WORDS = _TRAIN_WORDS + _read_words(WORDS_TEST_PATH)

407369


In [None]:
# Ad-hoc modification on words.
# Each row is (list index, placeholder position) for an extra `'s` token. The rows
# are applied in order, so every index refers to the list as it stands after the
# earlier insertions. This mutates UNALIGNED_WORDS in place: running the cell again
# without reloading the words inserts the tokens a second time.
# NOTE(review): the placeholder -5 appears twice; align() never reads the position
# of an unaligned word, so this looks harmless — confirm.
for _insert_at, _placeholder_pos in (
        (12222, -1),
        (40541, -2),
        (44896, -3),
        (66548, -4),
        (89249, -5),
        (99066, -5),
):
    UNALIGNED_WORDS.insert(_insert_at, ('\'s', _placeholder_pos))

# Two more tokens are appended at the very end of the list.
UNALIGNED_WORDS += [('Sussex', -6), ('.', -7)]

## Tokenization Alignment

Align the tokens to the original OCR content.

In [4]:
import re

def _match(c1, c2):
    return ord(c1) == ord(c2)

def align(text, unaligned_words):
    """Locate every token of ``unaligned_words`` inside ``text``.

    The tokens are walked in order while a cursor moves forward through
    ``text``; whitespace in the text is skipped before each character is
    compared. Two deviations between tokens and text are tolerated:

    * a ``'s`` (optionally followed by spaces) that sits in the text right
      before the start of a token but is missing from the token list — it is
      emitted as an extra ``'s`` token;
    * an end-of-line hyphenation (``'-\\n'``) in the text, which is skipped
      when the character after it is the expected one.

    Args:
        text: the full text the tokens were taken from.
        unaligned_words: list of ``(word, pos)`` pairs; ``pos`` is ignored.

    Returns:
        A list of ``(word, offset)`` pairs where ``offset`` is the index in
        ``text`` of the token's first character.

    Raises:
        ValueError: if a character cannot be matched, or if ``text`` ends
            before all tokens have been consumed.
    """
    aligned_words = []
    text_len = len(text)
    tpos = 0
    for widx, (word, _pos) in enumerate(unaligned_words):
        for wpos, char in enumerate(word):
            # Whitespace never belongs to a token; stop at the end of the text
            # instead of indexing past it.
            while tpos < text_len and text[tpos].isspace():
                tpos += 1
            if tpos >= text_len:
                raise ValueError('Text exhausted before word %d: %s'
                                 % (widx, str(unaligned_words[max(0, widx - 2): widx + 3])))
            if char != text[tpos]:
                # One observed bug is that tokenization accidentally drops the `'s` token,
                # so detect whether the upcoming token is missing. A dropped token can
                # only sit in front of a token start, hence the check on wpos.
                mo = re.match(r'(\'s *)(.)', text[tpos:tpos + 10]) if wpos == 0 else None
                if mo and char == mo.group(2):
                    aligned_words.append(('\'s', tpos))
                    tpos += len(mo.group(1))
                    aligned_words.append((word, tpos))
                elif text[tpos:tpos + 2] == '-\n' and text[tpos + 2:tpos + 3] == char:
                    # Skip the hyphenation; slicing keeps this safe at the end of text.
                    tpos += 2
                    if wpos == 0:
                        # The token starts right after the line break; record it here,
                        # otherwise it would be missing from the result.
                        aligned_words.append((word, tpos))
                else:
                    # Slice starts are clamped so the context stays meaningful near
                    # the beginning of the word list and of the text.
                    raise ValueError('Character mismatched: (%d) %s, (%d) %s\nat %d:  %s\nand: %s'
                                     % (ord(char), word[wpos + 1: wpos + 5],
                                        ord(text[tpos]), text[tpos + 1: tpos + 5],
                                        widx, str(unaligned_words[max(0, widx - 2): widx + 3]),
                                        text[max(0, tpos - 10): tpos + 10]
                                       ))
            elif wpos == 0:
                aligned_words.append((word, tpos))
            tpos += 1
    return aligned_words

# Pair every token with the character offset at which it starts in OCR_TEXT.
ALIGNED_WORDS = align(OCR_TEXT, UNALIGNED_WORDS)

### Verify Alignment

In [33]:
def varify(text, aligned_words):
    """Check that ``aligned_words`` can be replayed against ``text``.

    The tokens are walked in order while a cursor moves forward through
    ``text``. Whitespace in the text is skipped, and an end-of-line
    hyphenation (``'-\\n'``) is skipped when the character after it is the
    expected one. At the first character of every token the recorded offset
    is additionally compared with the cursor.

    Args:
        text: the full text the tokens were aligned against.
        aligned_words: list of ``(word, pos)`` pairs; ``pos`` is expected to
            be the integer index in ``text`` of the token's first character.

    Returns:
        None. The function only raises when the check fails.

    Raises:
        ValueError: if a character does not match, a recorded offset differs
            from the position found, or ``text`` ends too early.
    """
    text_len = len(text)
    tpos = 0
    for widx, (word, pos) in enumerate(aligned_words):
        for wpos, char in enumerate(word):
            # Stop at the end of the text instead of indexing past it.
            while tpos < text_len and text[tpos].isspace():
                tpos += 1
            if tpos >= text_len:
                raise ValueError('Text exhausted before word %d: %s'
                                 % (widx, str(aligned_words[max(0, widx - 2): widx + 3])))
            if char != text[tpos]:
                if text[tpos:tpos + 2] == '-\n' and text[tpos + 2:tpos + 3] == char:
                    # Skip the hyphenation; slicing keeps this safe at the end of text.
                    tpos += 2
                else:
                    # Slice starts are clamped so the context stays meaningful near
                    # the beginning of the word list and of the text.
                    raise ValueError('Character mismatched: (%d) %s, (%d) %s\nat %d:  %s\nand: %s'
                                     % (ord(char), word[wpos + 1: wpos + 5],
                                        ord(text[tpos]), text[tpos + 1: tpos + 5],
                                        widx, str(aligned_words[max(0, widx - 2): widx + 3]),
                                        text[max(0, tpos - 10): tpos + 10]
                                       ))
            if wpos == 0 and pos != tpos:
                # Matching characters alone do not prove the stored offset is right.
                raise ValueError('Offset mismatched: recorded %s, found %d\nat %d:  %s'
                                 % (pos, tpos, widx,
                                    str(aligned_words[max(0, widx - 2): widx + 3])))
            tpos += 1

# Replay the aligned tokens against the OCR text; a ValueError here means the
# alignment does not match the text.
varify(OCR_TEXT, ALIGNED_WORDS)

### Output Aligned Words

In [25]:
def _to_word_obj(words, i):
    l = len(words)
    return [words[i - 4][0] if i > 3 else '',
            words[i - 3][0] if i > 2 else '',
            words[i - 2][0] if i > 1 else '',
            words[i - 1][0] if i > 0 else '',
            words[i][0],
            words[i + 1][0] if i < l - 1 else '',
            words[i + 2][0] if i < l - 2 else '',
            words[i + 3][0] if i < l - 3 else '',
            str(words[i][1])
           ]
    
def to_word_objs(words):
    """Build one output row per token by calling ``_to_word_obj`` for each index.

    Args:
        words: list of ``(word, pos)`` pairs.

    Returns:
        A list with one row per entry of ``words``, in the same order.
    """
    rows = []
    for index in range(len(words)):
        rows.append(_to_word_obj(words, index))
    return rows

# One row per aligned token: neighbouring tokens as context plus the token's offset.
WORDS_OBJS = to_word_objs(ALIGNED_WORDS)

In [18]:
# Show the last ten rows to eyeball the context windows at the end of the list.
WORDS_OBJS[-10:]

[['(', 'Kent', ')', ',', '1906', ',', 'and', 'altogether', '497230'],
 ['Kent', ')', ',', '1906', ',', 'and', 'altogether', 'over', '497234'],
 [')', ',', '1906', ',', 'and', 'altogether', 'over', 'a', '497236'],
 [',', '1906', ',', 'and', 'altogether', 'over', 'a', 'score', '497240'],
 ['1906', ',', 'and', 'altogether', 'over', 'a', 'score', 'in', '497251'],
 [',', 'and', 'altogether', 'over', 'a', 'score', 'in', 'Sussex', '497256'],
 ['and', 'altogether', 'over', 'a', 'score', 'in', 'Sussex', '.', '497258'],
 ['altogether', 'over', 'a', 'score', 'in', 'Sussex', '.', '', '497264'],
 ['over', 'a', 'score', 'in', 'Sussex', '.', '', '', '497267'],
 ['a', 'score', 'in', 'Sussex', '.', '', '', '', '497273']]

In [31]:
# Index of the first row whose offset lies beyond SPLIT_POS; rows before it form
# the training split. When no row lies beyond SPLIT_POS the index falls back to
# len(WORDS_OBJS), i.e. every row is treated as training data, instead of leaving
# split_idx undefined (or stale from an earlier run of this cell).
split_idx = next(
    (i for i, w in enumerate(WORDS_OBJS) if int(w[-1]) > SPLIT_POS),
    len(WORDS_OBJS),
)
print(SPLIT_POS)
print(split_idx)

def write(word_objs, file_path):
    """Write rows to ``file_path`` as tab-separated text.

    The fields of a row are joined with tabs and the rows with newlines; no
    newline is written after the last row. An existing file is overwritten.

    Args:
        word_objs: iterable of rows, each an iterable of strings.
        file_path: destination path.
    """
    with open(file_path, 'w') as sink:
        rows = map('\t'.join, word_objs)
        sink.write('\n'.join(rows))
        
# Rows before split_idx go to the training output, the remaining rows to the test output.
write(WORDS_OBJS[:split_idx], OUT_WORDS_TRAIN_PATH)
write(WORDS_OBJS[split_idx:], OUT_WORDS_TEST_PATH)

407369
83132
