In [135]:
# Resources for evaluation.
PROJECT_ROOT_PATH    = '../../../..'
DETECT_DATA_PATH     = PROJECT_ROOT_PATH + '/tmp/detect/data'
TOKENIZE_DATA_PATH   = PROJECT_ROOT_PATH + '/tmp/tokenize/data'
RESOURCE_PATH        = PROJECT_ROOT_PATH + '/ocrrect-experiment/src/main/resources'
WORDS_TRAIN_PATH     = DETECT_DATA_PATH + '/words.train.tsv'
WORDS_TEST_PATH      = DETECT_DATA_PATH + '/words.test.tsv'
WORDS_PTB_PATH       = DETECT_DATA_PATH + '/words.ptb.tsv'
WORDS_WS_PATH        = DETECT_DATA_PATH + '/words.ws.tsv'
TOKENS_ELEPHANT_PATH = TOKENIZE_DATA_PATH + '/tokens.elephant.txt'
TEXT_OCR_PATH        = RESOURCE_PATH + '/mibio-ocr/ocr'
ERROR_GT_PATH        = RESOURCE_PATH + '/mibio-ocr/error.gt.tsv'
TOKEN_GT_PATH        = RESOURCE_PATH + '/mibio-ocr/ocr.token.tsv'
TOKEN_GT_HYPHEN_PATH = RESOURCE_PATH + '/mibio-ocr/ocr.token.hyphen.tsv'

EXP_RES_PATH     = PROJECT_ROOT_PATH + '/ocrrect-experiment/src/main/resources'
WASTE_BNC_PATH   = EXP_RES_PATH + '/waste.bnc.tokenize.fcgi.mod.txt'
WASTE_BROWN_PATH = EXP_RES_PATH + '/waste.brown.tokenize.fcgi.txt'
WASTE_WSJ_PATH   = EXP_RES_PATH + '/waste.wsj.tokenize.fcgi.txt'

### Token generation

In [74]:
""" Concat OCR text. """

TEXT_PATH = TOKENIZE_DATA_PATH + '/text.ocr.txt' 

import os
import glob

if not os.path.exists(TOKENIZE_DATA_PATH):
    os.makedirs(TOKENIZE_DATA_PATH)

with open(TEXT_PATH, 'w') as file:
    txt = ''.join([open(f, 'r').read() for f in glob.glob(TEXT_OCR_PATH + '/*')])
    file.write(txt)

In [106]:
def gen_elephant_dict():
    tks = [l[:-1] for l in open(TOKENS_ELEPHANT_PATH, 'r')]
    tk_idx, tk_pt = 0, 0 # current reading position

    # c_pos = 0
    positioned = {}
    for k, v in TOKENS_GT_DICT.items():
        #for c in v.replace('↵', ''):
        for i, c in enumerate(v):
            # skip newline and whitespace char
            if ord(c) == 8629 or c == ' ':
                continue

            tk_c = tks[tk_idx][tk_pt]

            # if matches.
            if ord(c) == ord(tk_c):
                # if it is the first character in the token, then record this position.
                if tk_pt == 0:
                    positioned[k + i] = tks[tk_idx]
                    # print(str(k + i) + ' ' + tks[tk_idx])

            else:
                raise ValueError(str(k) + ' ' + v + ' ' + str(i) + 
                                 ' (' + c + ',' + tk_c + ') ' + 
                                 ' (' + str(ord(c)) + ',' + str(ord(tk_c)) + ') ' + 
                                 tks[tk_idx] + ' ' + str(tk_pt)
                                )

            # move the pointer to the next char in tks
            tk_pt += 1
            if tk_pt == len(tks[tk_idx]):
                tk_idx += 1; tk_pt = 0

    return positioned

In [245]:
def gen_waste_dict(path):
    tks = []
    with open(path, 'r') as file:
        for lidx, line in enumerate(file):
            if lidx == 0 or len(line) == 1: continue
            tk, _, _ = line.split('\t', 3)
            tks.append(tk)
            
    tk_idx, tk_pt = 0, 0 # current reading position

    # c_pos = 0
    positioned = {}
    for k, v in TOKENS_GT_DICT.items():
        #for c in v.replace('↵', ''):
        v = v.replace('-↵', '').replace('ﬃ', 'ffi').replace('ﬂ', 'fl')
        for i, c in enumerate(v):
            # skip newline and whitespace char
            if ord(c) == 8629 or c == ' ':
                continue

            tk_c = tks[tk_idx][tk_pt]

            # if matches.
            if ord(c) == ord(tk_c):
                # if it is the first character in the token, then record this position.
                if tk_pt == 0:
                    positioned[k + i] = tks[tk_idx]
                    # print(str(k + i) + ' ' + tks[tk_idx])
                    
            # ignore the missing hyphen
            elif c is '-':
                continue

            else:
                print(c is '-')
                print(len(v) > i + 1)
                raise ValueError(str(k) + ' ' + v + ' ' + str(i) + 
                                 ' (' + c + ',' + tk_c + ') ' + 
                                 ' (' + str(ord(c)) + ',' + str(ord(tk_c)) + ') ' + 
                                 tks[tk_idx] + ' ' + str(tk_pt)
                                )

            # move the pointer to the next char in tks
            tk_pt += 1
            if tk_pt == len(tks[tk_idx]):
                tk_idx += 1; tk_pt = 0
            
    return positioned

dic = gen_waste_dict(WASTE_BNC_PATH)
# print('\n'.join(str(k) + ' ' + v for k, v in sorted(dic.items())))

In [107]:
""" Evaluate tokenization methods. """

print('elephant'); print_eval_method(dic=gen_elephant_dict())

elephant
tp:  94685  fp:   4895  fn:   7042  prec: 0.9508  recall: 0.9308  f: 0.9407  err: 0.1120


### Tokenization Accuracy Evaluation

We evalute the accuracy of the proposed tokenization on **MiBio** dataset.

In [212]:
""" Generate the ground truth token map. """

import csv
import pandas as pd


def read_token_tsv(path):
    """ Read a token TSV file as a dictionary that maps the starting position to
        the token string name.
    """
    col_names = ['name', 'pos_str', 'pos_end']
    df = pd.read_table(open(path, 'r'),
                       header=None,
                       quoting=csv.QUOTE_NONE,
                       names=col_names)
    return dict((pos, name) for _, (name, pos, _) in df.iterrows())


TOKENS_GT_DICT = read_token_tsv(TOKEN_GT_PATH) 


# print('\n'.join(str(k) + ' ' + v for k, v in sorted(TOKENS_GT_DICT.items())))

In [257]:
""" Evaluation metrics. """


def read_word_tsv(path):
    """ Read a word TSV file as a dictionary that maps the starting position to
        the word string name.
    """
    col_names = ['w-4', 'w-3', 'w-2', 'w-1', 'w', 'w+1', 'w+2', 'w+3', 'pos']
    df = pd.read_table(open(path, 'r'),
                       header=None,
                       quoting=csv.QUOTE_NONE,
                       names=col_names)
    return dict((pos, name) for _, (_, _, _, _, name, _, _, _, pos)
                in df.iterrows())


def _match_tokens(model_dict):
    """ Evaluate the accuracy of the model generated words.
    """
    def get_gt_matches(pos):
        """ Look the position and 4 character ahead for valid words as the
            ground truth mapping words.
        """
        gt_matches = []
        for offset in range(4):
            try:
                gt_matches.append(TOKENS_GT_DICT[pos + offset])
            except KeyError:
                pass
        return gt_matches
    
    def match_gt(name, gt_name):
        """ Returns true if a model generated word string matches with the
            ground truth.
        """
        #print('(' + name + ',' + gt_name + ')', end='')
        if gt_name == name:
            return True
        if '↵' in gt_name:
            p1, p2 = re.split(r'-\s*↵', gt_name)
            #print('(' + (p1 + p2) + ',' + gt_name + ')', end='')
            if p1 + p2 == name or p1 + '-' + p2 == name:
                return True
        return False
        
    corr, err = 0, 0
    err_records = []  # cache of the errors.
    for pos, name in model_dict.items():
        curr = [pos, name]
        gt_matches = get_gt_matches(pos)
        curr.append(gt_matches)
        if any(match_gt(name, gt) for gt in get_gt_matches(pos)):
            corr += 1
        else:
            err_records.append(curr)
#             if len(err_records) < 100:
#             print(err_records[-1])
            err += 1
    return corr, err, err_records


def eval_method(path=None, dic=None):
    if not dic:
        dic = read_word_tsv(path)
    tp, fp, _ = _match_tokens(dic)
    fn = len(TOKENS_GT_DICT) - tp
    prec = tp / (tp + fp)
    recall = tp / (tp + fn)
    f = 2 * prec * recall / (prec + recall)
    err = (fp + fn) / (tp + fp + fn)
    return tp + fp, tp, fp, fn, prec, recall, f, err


def print_eval_method(path=None, dic=None):
    print('total: %6d  tp: %6d  fp: %6d  fn: %6d  prec: %.4f  recall: %.4f  f: %.4f  err: %.4f'
          % eval_method(path, dic))
    

In [250]:
""" Evaluate tokenization methods. """

print('ptb'); print_eval_method(path=WORDS_PTB_PATH)
print('ws');  print_eval_method(path=WORDS_WS_PATH)
print('prop'); print_eval_method(dic={**read_word_tsv(WORDS_TRAIN_PATH),
                                     **read_word_tsv(WORDS_TEST_PATH)})

ptb
total: 101291  tp:  95546  fp:   5745  fn:   6163  prec: 0.9433  recall: 0.9394  f: 0.9413  err: 0.1108
ws
total:  87002  tp:  74388  fp:  12614  fn:  27321  prec: 0.8550  recall: 0.7314  f: 0.7884  err: 0.3493
prop
total: 101645  tp: 100671  fp:    974  fn:   1038  prec: 0.9904  recall: 0.9898  f: 0.9901  err: 0.0196


In [256]:
""" Evaluate tokenization methods. """

print('elephant'); print_eval_method(dic=gen_elephant_dict())

elephant
[336, 'so-called', ['so', '-', 'called']]
[413, 'Hang-nests', ['Hang']]
[1150, 'classifi-', ['classifi-↵cation']]
[1160, 'cation', []]
[1496, 'salmon-pink', ['salmon']]
[1530, 'blackish-brown', ['blackish']]
[1567, 'lilacine-greyisli', ['lilacine']]
[1585, 'shell-markings', ['shell']]
[1606, 'call-notes', ['call']]
[1951, 'Familx-', ['Familx']]
[1959, 'ORIOL', ['ORIOL ID^E']]
[1965, 'ID^E', []]
[2209, 'south-eastwards', ['south']]
[2536, 'south-eastern', ['south']]
[2964, 'unsuc-', ['unsuc-↵cessful']]
[2971, 'cessful', []]
[3167, 'H', ['H.', 'W.']]
[3168, '.', ['W.']]
[3170, 'W', ['W.', 'Feildeu']]
[3171, '.', ['Feildeu']]
[3456, 'gamboge-yellow', ['gamboge']]
[3489, '(excepting', ['(', 'excepting']]
[3526, 'primary-coverts', ['primary']]
[3682, 'j^ellowish-white', ['j^ellowish']]
[3717, 'tail-feathers', ['tail']]
[3891, 'reddish-ochreous', ['reddish']]
[3915, 'leaden-grey', ['leaden']]
[4580, 'Thrush-like', ['Thrush']]
[5410, 'monkey-nature', ['monkey']]
[5473, 'j', ['j ust']

[47371, 'buff-brownish', ['buff']]
[47672, 'wing-coverts', ['wing']]
[47703, 'tail-coverts', ['tail']]
[47758, 'Flj', ["Flj'catcher"]]
[47761, "'catcher", []]
[47936, 'usuall}-placing', ['usuall}-']]
[48066, 'fruit-tree', ['fruit']]
[48180, 'trellis-work', ['trellis']]
[48320, 'ni}', ["ni}'"]]
[48323, "'", ['collection']]
[48516, 'fruit-branch', ['fruit']]
[48694, 'var}', ["var}'"]]
[48698, "'", ['somewhat']]
[49028, 'pea-green', ['pea', '-']]
[49053, 'bluish-white', ['bluish']]
[49139, 'red-brown', ['red', '-']]
[49982, 'hazel-nuts', ['hazel']]
[50143, 'hazel-nuts', ['hazel']]
[50180, 'egg-', ['egg', '-']]
[50232, 'sa3', ["sa3'S"]]
[50235, "'S", [':']]
[50486, 'awa}', ["awa}'"]]
[50490, "'", ['.', 'It']]
[50687, 'call-note', ['call']]
[50794, 'perching-place', ['perching']]
[50929, 'call-note', ['call']]
[50963, 'zt-chick', ['zt', '-', 'chick']]
[51165, 'zt-chick', ['zt', '-', 'chick']]
[51420, 'mountain-ash', ['mountain']]
[52198, 'destro3', ["destro3'ed"]]
[52205, "'ed", []]
[52733,

[90522, '}', ['cabinet']]
[90523, "'", ['cabinet']]
[90681, 'best-formed', ['best']]
[90835, 'gloss', ['gloss}-']]
[90840, '}', [',']]
[90841, '-', [',', 'and']]
[90953, 'Sand-Martins', ['Sand']]
[91010, 'birds', ["birds'"]]
[91015, "'-nester", ['-', 'nester']]
[91094, 'ever}-year', ['ever}-']]
[91360, 'land-slip', ['land']]
[91377, 'temporar}', ["temporar}'"]]
[91386, "'", ['block']]
[91520, 'Sand-', ['Sand']]
[91583, 'frequentl}', ["frequentl}'"]]
[91593, "'", ['be']]
[91921, 'call-note', ['call']]
[92066, 'vaguel', ["vaguel}'"]]
[92072, '}', ['resembling']]
[92073, "'", ['resembling']]
[92110, 'Reed-Warbler', ['Reed']]
[92128, 'usuall', ["usuall}'"]]
[92134, '}', ['uttered']]
[92135, "'", ['uttered']]
[92399, 'wide-open', ['wide']]
[92475, '3', ["3'oung"]]
[92476, "'oung", []]
[92508, 'dragon-flies', ['dragon']]
[92650, 'b', ["b}'"]]
[92651, '}', ['such']]
[92652, "'", ['such']]
[92841, 'dragon-fly', ['dragon']]
[92865, 'Sand-Martin', ['Sand']]
[93145, 'cage-bird', ['cage']]
[93159,

[130187, '}-', ['imported']]
[130222, 'cage-bird', ['cage']]
[130275, 'Jul', ["Jul}'"]]
[130278, '}', ['1896']]
[130279, "'", ['1896']]
[130517, 'insect-eating', ['insect']]
[130601, 'sunflower-seed', ['sunflower']]
[130703, 'aviar', ["aviar}'"]]
[130708, '}', ['consisted']]
[130709, "'", ['consisted']]
[130918, 'H', ['H.', 'St.']]
[130919, '.', ['St.']]
[130921, 'St', ['St.']]
[130923, '.', ['Quintin']]
[130969, 'sec', ['sec.']]
[130972, '.', ['Vol.']]
[130974, 'Vol', ['Vol.']]
[130977, '.', ['V', ',']]
[130982, 'PP-', ['PP- .']]
[130986, '.55-56', ['55', '-']]
[131001, 'Family-', ['Family']]
[131024, 'Subfamily-', ['Subfamily']]
[131066, 'Py}', ['Py})hula']]
[131069, ')hula', []]
[131093, 'DR', ['DR.']]
[131095, '.', ['SHARPB']]
[131421, 'pyrrhulay', ['pyrrhula']]
[131876, 'well-wooded', ['well']]
[131987, 'e3', ["e3'e"]]
[131989, "'e", ['and']]
[132048, 'blue-black', ['blue']]
[132088, 'ash-grey', ['ash', '-']]
[132127, 'wing-coverts', ['wing']]
[132456, 'salmon-red', ['salmon']]
[1

[173155, '.7.', ['.7', '.']]
[173159, 'hoi', ['hoi iiema)i):i']]
[173163, 'iiema', []]
[173168, ')i', []]
[173170, ')', []]
[173171, ':i', ['as']]
[173193, 'A', ['A.', 'exiiipe^']]
[173194, '.', ['exiiipe^']]
[173370, 'closelj', ["closelj'"]]
[173377, "'-related", ['-related']]
[173550, 'usuall^', ['usuall^^']]
[173557, '^', ['met']]
[173988, 'principall', ["principall}'"]]
[173998, '}', ['along']]
[173999, "'", ['along']]
[174423, 'co', ['co.']]
[174425, '.', ['Kildare']]
[174479, 'p', ['p.', '182']]
[174480, '.', ['182']]
[174526, 'co', ['co.']]
[174528, '.', ['Mayo']]
[174580, 'p', ['p.', '15']]
[174581, '.', ['15']]
[174614, 'breeding-plumage', ['breeding']]
[174720, 'blackish-brown', ['blackish']]
[174798, 'rose-pink', ['rose']]
[175242, 'rose-pink', ['rose']]
[175279, 'bufEsh-white', ['bufEsh']]
[175327, 'tail-coverts', ['tail']]
[175363, 'blackish-brown', ['blackish']]
[175396, 'horn-colour', ['horn']]
[175598, 'prominentl', ["prominentl}'"]]
[175608, '}', ['streaked']]
[175609,

[205393, 'tiirra', ['tiirra!']]
[205399, '!', ["''"]]
[205745, 'soap-suds', ['soap']]
[205833, 'c/ii', ["c/ii'ck"]]
[205837, "'ck", [',']]
[205865, 'chicka-chick', ['chicka']]
[205899, "'", ["''", ';']]
[205900, "'", [';']]
[206051, 'ia', ['ia)i)ial)i)i(t']]
[206053, ')i', []]
[206055, ')ial', []]
[206059, ')i)i(t', []]
[206581, 'capacit}', ["capacit}'"]]
[206589, "'", ['for']]
[206608, 'an', ["an}'"]]
[206610, '}', ['of']]
[206611, "'", ['of']]
[206660, 'ni}', ["ni}'"]]
[206663, "'", ['possession']]
[206983, 'p', ['p.', '170']]
[206984, '.', ['170']]
[206995, 'nt', ['nt)t']]
[206997, ')t', ['remarkable']]
[207489, 'etc', ['etc.']]
[207492, '.', [',', 'and']]
[207714, 'last-mentioned', ['last']]
[207852, 'hard-billed', ['hard']]
[207948, 'seed-eating', ['seed']]
[208016, 'hand-reared', ['hand']]
[208761, '(nine', ['(', 'nine']]
[208799, 'egg-food', ['egg', '-']]
[209049, 'bird-catchers', ['bird']]
[209171, 'p', ['p.', '220']]
[209172, '.', ['220']]
[209203, 'Vol', ['Vol.']]
[209206, '.

[239916, 'wide-gaping', ['wide']]
[240315, 'eye-lashes', ['eye', '-']]
[240506, 'above-mentioned', ['above']]
[240834, 'flight-cage', ['flight']]
[241244, 'Canary', ['Canary/']]
[241250, '/', ['laid']]
[241564, 'Fringillida', ['Fringillida:']]
[241575, ':', ['in']]
[241823, 'Family-', ['Family']]
[241831, 'FR', ['FR I X(UL L ID. ¥.']]
[241834, 'I', []]
[241836, 'X(UL', []]
[241841, 'L', []]
[241843, 'ID.', []]
[241847, '¥', ['.']]
[241862, '-^FRIXGIL', ['-', '^FRIXGIL L IN^E']]
[241872, 'L', []]
[241874, 'IN^E', []]
[241897, 'Frinoii', ['Frinoii/d']]
[241904, '/d', ['iiii>iitijriii<^illii']]
[241937, 'OI', ['OI""']]
[241939, '"', ['tlie']]
[241940, '"', ['tlie']]
[242105, 'sub-Arctic', ['sub', '-']]
[242159, 'valle', ["valle}'"]]
[242164, '}', ['of']]
[242165, "'", ['of']]
[242521, 'P3', ["P3'renees"]]
[242523, "'renees", []]
[242584, '(Manual', ['(', 'Manual']]
[242610, 'p', ['p.', '177']]
[242611, '.', ['177']]
[243004, 'else-where', ['else']]
[243087, 'beech-woods', ['beech']]
[2431

[278271, "'", ['built']]
[278300, '.irbor-vita', ['.irbor']]
[278356, 'Hedge-Sparrow', ['Hedge']]
[278506, 'building-site', ['building']]
[278602, 'marsh-flag', ['marsh']]
[278884, 'insect-food', ['insect']]
[279401, 'Family-', ['Family']]
[279409, 'FRlXi', ['FRlXi UL L ID. F']]
[279415, 'UL', []]
[279418, 'L', []]
[279420, 'ID', []]
[279424, 'F', ['.']]
[279428, 'Suhjaiiiily-FHIBFRIZIA', ['Suhjaiiiily']]
[279451, '>F.', ['.']]
[279460, 'Yellow-Breasted', ['Yellow']]
[279544, 'Clc}', ['Clc}-']]
[279548, '-', [',', 'Norfolk']]
[279626, 'south-eastern', ['south']]
[279649, 'Family-', ['Family']]
[279657, 'FRFXGILLID.F.', ['FRFXGILLID.F']]
[279672, 'Suhfam', ['Suhfam ily']]
[279679, 'ily-EMBERIZIN', ['-']]
[279692, '\\F', ['.']]
[280165, 'North-west', ['North']]
[280535, 'reeogni/', ['reeogni/.ed']]
[280543, '.ed', []]
[280756, 'Zoologist"', ['Zoologist']]
[280811, 'appear-', ['appear-↵ance']]
[280819, 'ance', []]
[280854, 'i<S8(S.', ['i<S8(S']]
[281196, 'olive-green', ['olive']]
[281270,

[316821, '.', ['132']]
[316916, 'Snow-Bunting', ['Snow']]
[317072, '3', ["3'ears"]]
[317073, "'ears", []]
[317262, 'Snow-', ['Snow']]
[317327, 'flute-like', ['flute']]
[317470, 'Snow-', ['Snow']]
[318018, 'pp', ['pp.']]
[318020, '.', ['385']]
[318022, '385-386.', ['385', '-']]
[318077, 'cage-bird', ['cage']]
[318402, 'J', ['J.', 'H.']]
[318403, '.', ['H.']]
[318405, 'H', ['H.', 'Gurne}-']]
[318406, '.', ['Gurne}-']]
[318408, 'Gurne}', ['Gurne}-']]
[318414, '-', [',', 'this']]
[318527, 'over-feeding', ['over']]
[318544, 'probabl', ["probabl}'"]]
[318551, '}', ['the']]
[318552, "'", ['the']]
[318692, 'certainl', ["certainl}'"]]
[318700, '}', ['a']]
[318701, "'", ['a']]
[318791, 'Vol', ['Vol.']]
[318794, '.', ['I', ',']]
[318799, 'p', ['p.', '181']]
[318800, '.', ['181']]
[319098, 'cage-bird', ['cage']]
[319110, 'Family-', ['Family']]
[319133, 'SiihJaniily-EMBERIZIN^F:', ['SiihJaniily']]
[319164, 'Snow-Bunting', ['Snow']]
[319193, 'niva', ['niva/is']]
[319197, '/is', [',']]
[319432, 'sea-

[355596, "'", ['mollusca']]
[355665, 'wholl', ["wholl}'"]]
[355670, '}', ['inaccessible']]
[355671, "'", ['inaccessible']]
[355839, 'church-towers', ['church']]
[355865, 'lime-kilns', ['lime']]
[355904, 'plant-stems', ['plant']]
[356091, 'greenish-white', ['greenish']]
[356109, 'ground-colour', ['ground']]
[356382, 'generally', ["generally'"]]
[356391, "'", ['commences']]
[356522, 'Red-legged', ['Red', '-']]
[356719, 'roosting-place', ['roosting']]
[356816, 'some-times', ['some']]
[357302, 'Crj', ["Crj'Stal"]]
[357305, "'Stal", []]
[357466, 'twelve-month', ['twelve']]
[357510, 'l', ['l)een']]
[357511, ')een', []]
[357904, 'p', ['p.', '431']]
[357905, '.', ['431']]
[357977, 'fP', ['f', 'P.']]
[357979, '.', ["'^raculns"]]
[358062, 'stoke-hole', ['stoke']]
[358074, 'l', ['l)ut']]
[358075, ')ut', []]
[358702, 'well-known', ['well']]
[358956, 'Familv-CORl', ['Familv']]
[358967, "'ID.E", []]
[359036, 'pine-forests', ['pine']]
[359117, 'Norwaj', ["Norwaj'"]]
[359123, "'", [',', 'Sweden']]
[35

[398892, 'leaf-like', ['leaf']]
[399304, 'flight-cage', ['flight']]
[399386, 'Familx-', ['Familx']]
[399394, 'COR', ['COR I ID. E']]
[399398, 'I', []]
[399400, 'ID.', []]
[399404, 'E', ['.']]
[399867, 'l', ['l)een']]
[399868, ')een', []]
[400001, 'south-western', ['south']]
[400259, '1885-6', ['1885']]
[400307, 'ni}', ["ni}'"]]
[400310, "'", ['garden']]
[401308, '3', ["3'et"]]
[401309, "'et", [',']]
[401343, 'pla}', ["pla}'"]]
[401347, "'", [',', 'gives']]
[401515, '\\', ["\\'ol"]]
[401516, "'ol", ['.']]
[401528, '212-213', ['212', '-']]
[401968, 'j', ["j'ear"]]
[401969, "'ear", []]
[402144, 'Februarj', ["Februarj'"]]
[402152, "'", [',', 'and']]
[402257, 'Ma}', ["Ma}'"]]
[402260, "'", ['.', 'The']]
[402321, 'look-out', ['look']]
[402965, 'gunshot-range', ['gunshot']]
[403523, 'tolerabl', ["tolerabl}'"]]
[403531, '}', ['rapid']]
[403532, "'", ['rapid']]
[403684, 'heather-stems', ['heather']]
[403862, 'ground-colour', ['ground']]
[403899, 'greenish-blue', ['greenish']]
[403916, 'olive-gr

[451838, 'saj', ["saj'S"]]
[451841, "'S", [':']]
[452009, 'snow-storm', ['snow']]
[452471, '245-8', ['245', '-']]
[452493, 'B', ['B.', 'Farn']]
[452494, '.', ['Farn']]
[452544, '250-4', ['250', '-']]
[452576, 'Family^', ['Family']]
[452584, 'ALA', ['ALA UDID. F']]
[452588, 'UDID', []]
[452594, 'F', ['.']]
[452602, 'Wood-Lark', ['Wood']]
[452639, '"', ['" T N']]
[452641, 'T', []]
[452643, 'N', ['summer']]
[452656, 'Wood-Lark', ['Wood']]
[452742, 'N.', ['N', '.', 'lat.']]
[452745, 'lat', ['lat.']]
[452748, '.', [',', 'as']]
[453153, '"', ['"^', '-']]
[453154, '^', ['-']]
[453199, 'Wood-Lark', ['Wood']]
[453891, 'Wood-Lark', ['Wood']]
[453922, 'Sky-Lark', ['Sky', '-']]
[454197, 'tail-coverts', ['tail']]
[454215, 'primary-coverts', ['primary']]
[454257, 'tail-feathers', ['tail']]
[454272, 'reddish-brown', ['reddish']]
[454480, 'buffish-white', ['buffish']]
[454548, 'ear-coverts', ['ear', '-']]
[454631, 'distinctl', ["distinctl}'"]]
[454640, '}', ['yellowish']]
[454641, "'", ['yellowish']]


[495477, 'Family-', ['Family']]
[495485, 'MO', ['MO TA CIL L ID^']]
[495488, 'TA', []]
[495491, 'CIL', []]
[495495, 'L', []]
[495497, 'ID^', ['.']]
[495507, 'Black-Headed', ['Black']]
[495641, 'flaz', ["flaz'a"]]
[495645, "'a", [',']]
[495676, 'Fa', ['Fa mi ly']]
[495679, 'mi', []]
[495682, 'ly-', ['-']]
[495686, 'MO', ['MO TA CILL ID. F']]
[495689, 'TA', []]
[495692, 'CILL', []]
[495697, 'ID.', []]
[495701, 'F', ['.']]
[495815, 'Family-', ['Family']]
[495823, 'MO', ['MO TA CILLlDzE']]
[495826, 'TA', []]
[495829, 'CILLlDzE', []]
[495844, 'AvShy-Headed', ['AvShy']]
[495893, 'Sa', ["Sa\\'I"]]
[495895, '\\', ['.']]
[495896, "'I", ['.']]
[495998, 'May?', ['May', '?']]
[496084, 'sub-species', ['sub-']]
[496103, 'M', ['M.', '/lava']]
[496104, '.', ['/lava']]
[496153, 'nowadaj', ["nowadaj'S"]]
[496160, "'S", ['is']]
[496221, 'Tree-Pipit', ['Tree']]
[496317, 'W', ['W.', 'E.']]
[496318, '.', ['E.']]
[496320, 'E', ['E.', 'Clarke']]
[496321, '.', ['Clarke']]
[496393, 'Red-throated', ['Red', '-']]

In [258]:
""" Evaluate tokenization methods. """

print('waste-bnc'); print_eval_method(dic=gen_waste_dict(WASTE_BNC_PATH))
# print('waste-brown'); print_eval_method(dic=gen_waste_dict(WASTE_BROWN_PATH))
# print('waste-wsj'); print_eval_method(dic=gen_waste_dict(WASTE_WSJ_PATH))

waste-bnc
total:  99531  tp:  94735  fp:   4796  fn:   6974  prec: 0.9518  recall: 0.9314  f: 0.9415  err: 0.1105


## Evaluate on Penn Treebank

In [272]:
import re
import nltk
from nltk.corpus import treebank as ptb


def build_sent(words):
    sent = words[0]
    for i in range(1, len(words)):
        if re.search(r'[a-zA-Z]', words[i]):
            sent += ' '
        sent += words[i]
    return sent
    
def build_txt(sents):
    return ' '.join(build_sent(words) for words in sents)

print(dir(ptb))
print(len(ptb.fileids()))
for f in ptb.fileids():
    print(ptb.words(f))
    print(build_txt(ptb.sents(f)))
    break

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__unicode__', '__weakref__', '_comment_char', '_detect_blocks', '_encoding', '_fileids', '_get_root', '_normalize', '_parse', '_read_block', '_read_parsed_sent_block', '_read_sent_block', '_read_tagged_sent_block', '_read_tagged_word_block', '_read_word_block', '_root', '_tag', '_tagset', '_unload', '_word', 'abspath', 'abspaths', 'citation', 'encoding', 'ensure_loaded', 'fileids', 'license', 'open', 'parsed_sents', 'raw', 'readme', 'root', 'sents', 'tagged_sents', 'tagged_words', 'unicode_repr', 'words']
199
['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', ...]
Pierre Vinken,61 years old, will join the board as a nonexecutive director Nov.29. Mr. Vinken is