# It's a Machine and Natural Language Tagger

In [1]:
from src.IaMaN.base import LM
from src.utils.data import load_ud
from src.utils.munge import stick_spaces
from pprint import pprint as pprint
from collections import defaultdict
from collections import Counter
from tqdm import tqdm
import numpy as np
import os, re

# --- Run configuration --------------------------------------------------------
# Reproducibility and data limits.
seed = 691                # shared seed: np.random, load_ud, LM(...) and model.interpret(...)
max_char = 200_000_000    # gold documents whose text is longer than this are dropped

# Options forwarded to LM(...), model.fit(...) and model.post_train(...).
m = 10
space = True              # also forwarded to load_ud; when False, coverings go through stick_spaces
fine_tune = False
num_posttrain = 0         # number of post-training files to sample; 0 disables post-training
noise = 0.001

positional = 'dependent'
positionally_encode = True
bits = 50
update = True
btype = 'f'
ms_init = 'waiting_time'

runners = 10
gpu = False
tokenizer = 'hr-bpe'
decode_method = 'argmax'  # forwarded to model.interpret(...) in the full test-set evaluations

# --- Post-training (unlabeled) data ---------------------------------------------
print("Loading post-training data...")
posttrain_path = '/data/newstweet/week_2019-40_article_texts/'
# List the directory once and keep only names of the form "<digits>.txt".
# The pattern is a raw string with the dot escaped: the previous "^\d+.txt$" relied on an
# invalid string escape and its bare "." also accepted names such as "123xtxt".
# Sorting matters because os.listdir() returns entries in arbitrary order, which would make
# the seeded sample below differ between machines/filesystems.
all_posttrain_files = sorted(posttrain_file for posttrain_file in os.listdir(posttrain_path)
                             if re.search(r"^\d+\.txt$", posttrain_file))
total_posttrain = len(all_posttrain_files)
if num_posttrain:
    # Seed immediately before sampling so the selection only depends on `seed`.
    np.random.seed(seed)
    posttrain_files = np.random.choice(all_posttrain_files, size=num_posttrain, replace=False)
else:
    posttrain_files = np.array([])
# Each post-training document is wrapped as [[text]] (one document holding one text chunk).
# Files are read inside a `with` block so every handle is closed deterministically.
ptdocs = []
for posttrain_file in tqdm(posttrain_files):
    with open(posttrain_path + posttrain_file) as posttrain_handle:
        ptdocs.append([[posttrain_handle.read()]])

# --- Gold-tagged UD data ----------------------------------------------------------
print("Loading gold-tagged UDs data...")
load_set = "GUM"
all_docs = load_ud("English", num_articles = 0, seed = seed, load_set = load_set, rebuild = True, space = space)
# Documents whose id contains 'test' form the test split; everything else is training data.
# Over-long documents (text longer than max_char) are dropped from both splits.
# (For quick debugging, slice these lists, e.g. test_docs[:2] / train_docs[:4].)
test_docs = [doc for doc in all_docs if 'test' in doc['id'] and len(doc['text']) <= max_char]
train_docs = [doc for doc in all_docs if 'test' not in doc['id'] and len(doc['text']) <= max_char]
nsamp = len(test_docs)
print('Avail. post-train, total post-train, Avail. gold, total gold-train, total test-gold: ', 
      total_posttrain, len(ptdocs), len(all_docs), len(train_docs), len(test_docs))

Loading post-training data...


0it [00:00, ?it/s]

Loading gold-tagged UDs data...





Avail. post-train, total post-train, Avail. gold, total gold-train, total test-gold:  14198 0 150 132 18


In [2]:
# The train and test inputs are built by the same helpers so the two splits cannot drift
# apart (previously every comprehension was written out twice).

def _token_forms(doc):
    """Return row[1] of every row in `doc['conllu']`, grouped per sentence.

    NOTE(review): row[1] is presumably the CoNLL-U FORM column -- confirm against load_ud.
    """
    return [[row[1] for row in sent] for sent in doc['conllu']]


def _relative_head(row):
    """Return the 'sup' tag for one row.

    A non-zero row[6] is replaced by the string offset ``int(row[6]) - int(row[0])``;
    a zero row[6] is returned unchanged.
    NOTE(review): row[0]/row[6] look like the CoNLL-U ID/HEAD columns -- confirm.
    """
    head = int(row[6])
    return str(head - int(row[0])) if head else row[6]


def _tag_layers(doc):
    """Build the per-token tag layers ('sty', 'pos', 'sup', 'dep') for one document.

    Every layer is a list of sentences, each a list with one tag per row. The sentence
    type `doc['s_type'][s_i]` is repeated for every row of sentence `s_i`.
    """
    sents = doc['conllu']
    return {
        'sty': [[doc['s_type'][s_i] for _ in sent] for s_i, sent in enumerate(sents)],
        # 'lem': [[row[2] for row in sent] for sent in sents], # note: for speed, remove lemma layer
        'pos': [[row[3] for row in sent] for sent in sents],
        'sup': [[_relative_head(row) for row in sent] for sent in sents],
        'dep': [[row[7] for row in sent] for sent in sents],
    }


# Gold token coverings (lists of forms) and the raw sentence strings obtained by joining them.
covering = [_token_forms(doc) for doc in train_docs]
tcovering = [_token_forms(doc) for doc in test_docs]
docs = [["".join(forms) for forms in doc_forms] for doc_forms in covering]
tdocs = [["".join(forms) for forms in doc_forms] for doc_forms in tcovering]
if not space:
    # Applied only after docs/tdocs were joined from the untouched forms.
    covering = [[stick_spaces(forms) for forms in doc_forms] for doc_forms in covering]
    tcovering = [[stick_spaces(forms) for forms in doc_forms] for doc_forms in tcovering]
covering_vocab = {form for doc_forms in covering for forms in doc_forms for form in forms}

# Layers are keyed by the document's position within its split.
train_layers = {doc_index: _tag_layers(doc) for doc_index, doc in enumerate(train_docs)}
test_layers = {doc_index: _tag_layers(doc) for doc_index, doc in enumerate(test_docs)}

model = LM(m = m, tokenizer = tokenizer, noise = noise, seed = seed, space = space, positional = positional,
           positionally_encode = positionally_encode, runners = runners, gpu = gpu, bits = bits, 
           btype = btype, ms_init = ms_init)
# The second argument is a run label built from the dataset name and the number of test documents.
data_streams = model.fit(docs, f'{load_set}-{nsamp}', covering = covering, all_layers = train_layers, fine_tune = fine_tune)

Training tokenizer...


Initializing: 100%|██████████| 6503/6503 [00:01<00:00, 4466.59it/s]
Fitting:  20%|██        | 20/100 [00:42<02:49,  2.12s/it]


Built a vocabulary of 15327 types
Tokenizing documents...


100%|██████████| 132/132 [00:14<00:00,  8.85it/s]


Pre-processing data...


100%|██████████| 132/132 [00:05<00:00, 24.87it/s]


Counting documents and aggregating counts...


5082533it [03:39, 23171.26it/s] 


Counting tag-tag transition frequencies...


100%|██████████| 132/132 [00:00<00:00, 157.87it/s]


Encoding parameters...


100%|██████████| 5082533/5082533 [00:55<00:00, 91682.97it/s]


Building target vocabularies...


100%|██████████| 24/24 [00:00<00:00, 1015.01it/s]


Pre-computing BOW probabilities... done.
Pre-computing wave amplitudes... done.
Stacking output vocabularies for decoders...


100%|██████████| 11/11 [00:00<00:00, 83886.08it/s]


Building dense output heads...


100%|██████████| 23/23 [00:00<00:00, 32.55it/s]


Building transition matrices for tag-sequence decoding...


  self._trXs[ltype] = (lambda M: M/M.sum(axis = 1)[:,None])(self._trXs[ltype])
100%|██████████| 7/7 [00:00<00:00, 2147.94it/s]

Done.
Model params, types, encoding size, contexts, vec dim, max sent, and % capacity used: 886984 10568 50 1071 353 258 9.681





In [3]:
# Display the fitted model's private `_ms` mapping: one integer per layer name
# (the tag layers passed to fit() plus further entries such as 'form' and 'bits').
# NOTE(review): presumably the per-layer sizes derived from `m`/`ms_init` -- confirm in LM.
model._ms

{'sty': 2,
 'pos': 6,
 'sup': 7,
 'dep': 7,
 'nov': 2,
 'iat': 2,
 'bot': 2,
 'eot': 2,
 'eos': 2,
 'eod': 2,
 'form': 8,
 'bits': 10}

In [4]:
# Smoke test: interpret a single raw string (no gold annotations supplied), then walk the
# resulting document -> sentence -> token structure and print the predicted tags together
# with the pieces stored in each token's `_whatevers`.
model.interpret([[' Gone again!']], seed=seed)

for document in model._documents:
    print('opening next doc:')
    for sentence in document._sentences:
        print(f'opening next sent: {sentence._sty}')
        for token in sentence._tokens:
            print('opening next token: ' + ', '.join(
                f'{value}' for value in (token._form, token._lem, token._sep,
                                         token._pos, token._sup, token._dep)))
            print(['"' + piece._form + '"' for piece in token._whatevers])

Tokenizing documents...


100%|██████████| 1/1 [00:00<00:00, 2425.86it/s]


Pre-processing data...


100%|██████████| 1/1 [00:00<00:00, 13025.79it/s]
100%|██████████| 1/1 [00:00<00:00, 1569.72it/s]


Interpreting documents...


100%|██████████| 1/1 [00:00<00:00, 114.47it/s]


opening next doc:
opening next sent: decl
opening next token:  , None, False, SPACE, 1, space
['" "']
opening next token: Gone, None, False, PROPN, 0, root
['"Gon"', '"e"']
opening next token:  , None, False, SPACE, -1, space
['" "']
opening next token: again, None, False, X, -1, dep
['"again"']
opening next token: !, None, True, PUNCT, -1, punct
['"!"']





In [5]:
# Sanity check on training data: interpret the first `s_max` sentences of training document
# `d_i` and evaluate against the matching slices of its gold layers and gold covering.
d_i = 3
s_max = 2
interpret_docs = [docs[d_i][:s_max]]
print(interpret_docs)
model.interpret(
    interpret_docs,
    # The single interpreted document is index 0, so its gold layers are keyed by 0.
    eval_layers={0: {tag: layer[:s_max] for tag, layer in train_layers[d_i].items()}},
    eval_covering=[covering[d_i][:s_max]],
    seed=seed,
)

# Print the predicted analysis, one line per sentence and per token.
for document in model._documents:
    print('opening next doc:')
    for sentence in document._sentences:
        print(f'opening next sent: {sentence._sty}')
        for token in sentence._tokens:
            print('opening next token: ' + ', '.join(
                f'{value}' for value in (token._form, token._lem, token._sep,
                                         token._pos, token._sup, token._dep)))

[[' Emperor Norton ', ' Joshua Abraham Norton (c. 1818 – January 8, 1880), known as Emperor Norton, was a citizen of San Francisco, California, who in 1859 proclaimed himself "Norton I, Emperor of the United States". ']]
Tokenizing documents...


100%|██████████| 1/1 [00:00<00:00, 124.92it/s]


Pre-processing data...


100%|██████████| 1/1 [00:00<00:00, 5882.61it/s]
100%|██████████| 1/1 [00:00<00:00, 181.36it/s]


Interpreting documents...


100%|██████████| 1/1 [00:00<00:00,  1.29it/s]

Document 0's Sentence segmentation performance:  {('F', 1.0), ('R', 1.0), ('P', 1.0)}
Document 0's STY accuracy:  0.5
Document 0's Token segmentation performance without space:  {('P', 0.9782608695652174), ('R', 0.9782608695652174), ('F', 0.9782608695652174)}
Document 0's POS accuracy with/out space: 0.7654320987654321 0.5777777777777777
Document 0's SUP:DEP accuracy with/out space:  0.5555555555555556 0.2

Overall sentence segmentation performance:  {('F', 1.0), ('R', 1.0), ('P', 1.0)}
Overall STY accuracy:  0.5
Overall Token segmentation performance without space:  {('P', 0.9782608695652174), ('R', 0.9782608695652174), ('F', 0.9782608695652174)}
Overall POS accuracy with/out space: 0.7654320987654321 0.5777777777777777
Overall SUP:DEP accuracy with/out space:  0.5555555555555556 0.2
opening next doc:
opening next sent: decl
opening next token:  , None, False, SPACE, 1, space
opening next token: Emperor, None, False, PROPN, 0, root
opening next token:  , None, False, SPACE, 1, space
o




In [6]:
# Held-out check: interpret the first `s_max` sentences of test document `d_i` and evaluate
# against the matching slices of its gold layers and gold covering.
d_i = 0
s_max = 1
interpret_docs = [tdocs[d_i][:s_max]]
print(interpret_docs)
model.interpret(
    interpret_docs,
    # The single interpreted document is index 0, so its gold layers are keyed by 0.
    eval_layers={0: {tag: layer[:s_max] for tag, layer in test_layers[d_i].items()}},
    eval_covering=[tcovering[d_i][:s_max]],
    seed=seed,
)

# Print the predicted analysis, one line per sentence and per token.
for document in model._documents:
    print('opening next doc:')
    for sentence in document._sentences:
        print(f'opening next sent: {sentence._sty}')
        for token in sentence._tokens:
            print('opening next token: ' + ', '.join(
                f'{value}' for value in (token._form, token._lem, token._sep,
                                         token._pos, token._sup, token._dep)))

[[' The prevalence of discrimination across racial groups in contemporary America: ']]
Tokenizing documents...


100%|██████████| 1/1 [00:00<00:00, 325.64it/s]


Pre-processing data...


100%|██████████| 1/1 [00:00<00:00, 12192.74it/s]
100%|██████████| 1/1 [00:00<00:00, 575.98it/s]


Interpreting documents...


100%|██████████| 1/1 [00:00<00:00, 14.49it/s]

Document 0's Sentence segmentation performance:  {('F', 1.0), ('R', 1.0), ('P', 1.0)}
Document 0's STY accuracy:  0.0
Document 0's Token segmentation performance without space:  {('F', 1.0), ('R', 1.0), ('P', 1.0)}
Document 0's POS accuracy with/out space: 0.7272727272727273 0.45454545454545453
Document 0's SUP:DEP accuracy with/out space:  0.5 0.09090909090909091

Overall sentence segmentation performance:  {('F', 1.0), ('R', 1.0), ('P', 1.0)}
Overall STY accuracy:  0.0
Overall Token segmentation performance without space:  {('F', 1.0), ('R', 1.0), ('P', 1.0)}
Overall POS accuracy with/out space: 0.7272727272727273 0.45454545454545453
Overall SUP:DEP accuracy with/out space:  0.5 0.09090909090909091
opening next doc:
opening next sent: decl
opening next token:  , None, False, SPACE, 1, space
opening next token: The, None, False, PROPN, 0, root
opening next token:  , None, False, SPACE, 1, space
opening next token: prevalence, None, False, PROPN, -2, flat
opening next token:  , None, F




In [7]:
# Held-out check on a second test document: interpret its first `s_max` sentences and
# evaluate against the gold layers, passing the gold covering as `eval_covering`.
d_i = 1
s_max = 2
interpret_docs = [tdocs[d_i][:s_max]]
print(interpret_docs)
model.interpret(
    interpret_docs,
    # The single interpreted document is index 0, so its gold layers are keyed by 0.
    eval_layers={0: {tag: layer[:s_max] for tag, layer in test_layers[d_i].items()}},
    eval_covering=[tcovering[d_i][:s_max]],
    seed=seed,
)
# Print the predicted analysis, one line per sentence and per token.
for document in model._documents:
    print('opening next doc:')
    for sentence in document._sentences:
        print(f'opening next sent: {sentence._sty}')
        for token in sentence._tokens:
            print('opening next token: ' + ', '.join(
                f'{value}' for value in (token._form, token._lem, token._sep,
                                         token._pos, token._sup, token._dep)))

[[' 2. GUJJOLAAY EEGIMAA, ITS SPEAKERS AND THEIR NEIGHBOURS ', ' This section briefly presents the Gújjolaay Eegimaa (Eegimaa for short; Ethnologue code: ISO 639-3: bqj), its speakers and its varieties. ']]
Tokenizing documents...


100%|██████████| 1/1 [00:00<00:00, 194.40it/s]


Pre-processing data...


100%|██████████| 1/1 [00:00<00:00, 5159.05it/s]
100%|██████████| 1/1 [00:00<00:00, 95.30it/s]


Interpreting documents...


100%|██████████| 1/1 [00:00<00:00,  1.49it/s]

Document 0's Sentence segmentation performance:  {('F', 1.0), ('R', 1.0), ('P', 1.0)}
Document 0's STY accuracy:  0.5
Document 0's Token segmentation performance without space:  {('P', 0.5507246376811594), ('R', 0.76), ('F', 0.638655462184874)}
Document 0's POS accuracy with/out space: 0.6666666666666666 0.3888888888888889
Document 0's SUP:DEP accuracy with/out space:  0.3484848484848485 0.027777777777777776

Overall sentence segmentation performance:  {('F', 1.0), ('R', 1.0), ('P', 1.0)}
Overall STY accuracy:  0.5
Overall Token segmentation performance without space:  {('P', 0.5507246376811594), ('R', 0.76), ('F', 0.638655462184874)}
Overall POS accuracy with/out space: 0.6666666666666666 0.3888888888888889
Overall SUP:DEP accuracy with/out space:  0.3484848484848485 0.027777777777777776
opening next doc:
opening next sent: decl
opening next token:  , None, False, SPACE, 1, space
opening next token: 2, None, False, PROPN, 5, vocative
opening next token: ., None, False, PROPN, 0, root





In [8]:
# Same sentences as the previous cell, but the gold covering is passed as `covering`
# instead of `eval_covering`. In the recorded output this run reports only tag accuracies
# (no segmentation scores).
# NOTE(review): presumably `covering` makes the model reuse the gold tokenization -- confirm in LM.
d_i = 1
s_max = 2
interpret_docs = [tdocs[d_i][:s_max]]
print(interpret_docs)
model.interpret(
    interpret_docs,
    # The single interpreted document is index 0, so its gold layers are keyed by 0.
    eval_layers={0: {tag: layer[:s_max] for tag, layer in test_layers[d_i].items()}},
    covering=[tcovering[d_i][:s_max]],
    seed=seed,
)
# Print the predicted analysis, one line per sentence and per token.
for document in model._documents:
    print('opening next doc:')
    for sentence in document._sentences:
        print(f'opening next sent: {sentence._sty}')
        for token in sentence._tokens:
            print('opening next token: ' + ', '.join(
                f'{value}' for value in (token._form, token._lem, token._sep,
                                         token._pos, token._sup, token._dep)))

[[' 2. GUJJOLAAY EEGIMAA, ITS SPEAKERS AND THEIR NEIGHBOURS ', ' This section briefly presents the Gújjolaay Eegimaa (Eegimaa for short; Ethnologue code: ISO 639-3: bqj), its speakers and its varieties. ']]
Tokenizing documents...


100%|██████████| 1/1 [00:00<00:00, 218.20it/s]


Pre-processing data...


100%|██████████| 1/1 [00:00<00:00, 762.74it/s]
100%|██████████| 1/1 [00:00<00:00, 115.33it/s]


Interpreting documents...


100%|██████████| 1/1 [00:00<00:00,  2.75it/s]

Document 0's STY accuracy:  0.5
Document 0's POS accuracy with/out space: 0.7121212121212122 0.4722222222222222
Document 0's SUP:DEP accuracy with/out space:  0.36363636363636365 0.027777777777777776

Overall STY accuracy:  0.5
Overall POS accuracy with/out space: 0.7121212121212122 0.4722222222222222
Overall SUP:DEP accuracy with/out space:  0.36363636363636365 0.027777777777777776
opening next doc:
opening next sent: decl
opening next token:  , None, False, SPACE, 1, space
opening next token: 2., None, False, PROPN, 0, root
opening next token:  , None, False, SPACE, -1, space
opening next token: GUJJOLAAY, None, False, PROPN, 2, nsubj
opening next token:  , None, False, SPACE, 1, space
opening next token: EEGIMAA, None, False, VERB, -4, punct
opening next token: ,, None, False, PUNCT, -1, punct
opening next token:  , None, False, SPACE, 1, space
opening next token: ITS, None, False, VERB, -3, nsubj
opening next token:  , None, False, SPACE, -1, space
opening next token: SPEAKERS, Non




In [9]:
# Full test-set evaluation with the gold covering supplied as `eval_covering`;
# per-document and overall scores are printed by interpret() itself.
model.interpret(
    tdocs,
    eval_layers=test_layers,
    eval_covering=tcovering,
    seed=seed,
    verbose_result=False,
    decode_method=decode_method,
)

Tokenizing documents...


100%|██████████| 18/18 [00:01<00:00,  9.12it/s]


Pre-processing data...


100%|██████████| 18/18 [00:00<00:00, 558.81it/s]
100%|██████████| 18/18 [00:02<00:00,  7.21it/s]


Interpreting documents...


  6%|▌         | 1/18 [00:14<04:03, 14.35s/it]

Document 0's Sentence segmentation performance:  {('F', 1.0), ('R', 1.0), ('P', 1.0)}
Document 0's STY accuracy:  0.6851851851851852
Document 0's Token segmentation performance without space:  {('P', 0.9112688553682342), ('R', 0.9227313566936208), ('F', 0.9169642857142857)}
Document 0's POS accuracy with/out space: 0.7686116700201208 0.5642245480494766
Document 0's SUP:DEP accuracy with/out space:  0.56841046277666 0.21027592768791628



 11%|█         | 2/18 [00:32<04:26, 16.65s/it]

Document 1's Sentence segmentation performance:  {('F', 1.0), ('R', 1.0), ('P', 1.0)}
Document 1's STY accuracy:  0.8611111111111112
Document 1's Token segmentation performance without space:  {('P', 0.7702371218315618), ('R', 0.8618481244281794), ('F', 0.8134715025906736)}
Document 1's POS accuracy with/out space: 0.7435597189695551 0.5150166852057843
Document 1's SUP:DEP accuracy with/out space:  0.5392271662763466 0.1557285873192436



 17%|█▋        | 3/18 [00:43<03:30, 14.04s/it]

Document 2's Sentence segmentation performance:  {('F', 1.0), ('R', 1.0), ('P', 1.0)}
Document 2's STY accuracy:  0.9310344827586207
Document 2's Token segmentation performance without space:  {('P', 0.8073394495412844), ('R', 0.8585365853658536), ('F', 0.8321513002364067)}
Document 2's POS accuracy with/out space: 0.7543726235741445 0.5373563218390804
Document 2's SUP:DEP accuracy with/out space:  0.5422053231939163 0.16810344827586207



 22%|██▏       | 4/18 [00:59<03:29, 14.95s/it]

Document 3's Sentence segmentation performance:  {('F', 1.0), ('R', 1.0), ('P', 1.0)}
Document 3's STY accuracy:  0.9
Document 3's Token segmentation performance without space:  {('P', 0.8631950573698146), ('R', 0.8874773139745916), ('F', 0.8751677852348994)}
Document 3's POS accuracy with/out space: 0.7841880341880342 0.590030518819939
Document 3's SUP:DEP accuracy with/out space:  0.5710470085470085 0.20142421159715157



 28%|██▊       | 5/18 [01:05<02:32, 11.70s/it]

Document 4's Sentence segmentation performance:  {('F', 1.0), ('R', 1.0), ('P', 1.0)}
Document 4's STY accuracy:  0.46078431372549017
Document 4's Token segmentation performance without space:  {('P', 0.9296875), ('R', 0.9133771929824561), ('F', 0.9214601769911503)}
Document 4's POS accuracy with/out space: 0.7850985221674877 0.6
Document 4's SUP:DEP accuracy with/out space:  0.6059113300492611 0.2755813953488372



 33%|███▎      | 6/18 [01:17<02:19, 11.63s/it]

Document 5's Sentence segmentation performance:  {('F', 1.0), ('R', 1.0), ('P', 1.0)}
Document 5's STY accuracy:  0.7894736842105263
Document 5's Token segmentation performance without space:  {('P', 0.857872340425532), ('R', 0.8865435356200527), ('F', 0.8719723183391004)}
Document 5's POS accuracy with/out space: 0.7490755414685684 0.5335305719921104
Document 5's SUP:DEP accuracy with/out space:  0.5467511885895404 0.19230769230769232



 39%|███▉      | 7/18 [01:32<02:19, 12.70s/it]

Document 6's Sentence segmentation performance:  {('F', 1.0), ('R', 1.0), ('P', 1.0)}
Document 6's STY accuracy:  0.9148936170212766
Document 6's Token segmentation performance without space:  {('P', 0.8862642169728784), ('R', 0.8964601769911504), ('F', 0.8913330400351956)}
Document 6's POS accuracy with/out space: 0.75 0.5241379310344828
Document 6's SUP:DEP accuracy with/out space:  0.5447530864197531 0.15369458128078817



 44%|████▍     | 8/18 [01:39<01:50, 11.01s/it]

Document 7's Sentence segmentation performance:  {('F', 1.0), ('R', 1.0), ('P', 1.0)}
Document 7's STY accuracy:  0.5344827586206896
Document 7's Token segmentation performance without space:  {('P', 0.8984547461368654), ('R', 0.9105145413870246), ('F', 0.9044444444444445)}
Document 7's POS accuracy with/out space: 0.7786211258697027 0.5687732342007435
Document 7's SUP:DEP accuracy with/out space:  0.592662871600253 0.22057001239157373



 50%|█████     | 9/18 [01:52<01:43, 11.55s/it]

Document 8's Sentence segmentation performance:  {('F', 1.0), ('R', 1.0), ('P', 1.0)}
Document 8's STY accuracy:  0.5952380952380952
Document 8's Token segmentation performance without space:  {('P', 0.8648373983739838), ('R', 0.8986272439281943), ('F', 0.8814085965820818)}
Document 8's POS accuracy with/out space: 0.7422934648581998 0.508274231678487
Document 8's SUP:DEP accuracy with/out space:  0.5591861898890259 0.1773049645390071



 56%|█████▌    | 10/18 [02:12<01:53, 14.18s/it]

Document 9's Sentence segmentation performance:  {('F', 1.0), ('R', 1.0), ('P', 1.0)}
Document 9's STY accuracy:  0.72
Document 9's Token segmentation performance without space:  {('P', 0.8479890933878664), ('R', 0.8693221523410203), ('F', 0.8585231193926847)}
Document 9's POS accuracy with/out space: 0.7546699875466999 0.5347551342812006
Document 9's SUP:DEP accuracy with/out space:  0.5466998754669987 0.15955766192733017



 61%|██████    | 11/18 [02:20<01:27, 12.46s/it]

Document 10's Sentence segmentation performance:  {('F', 1.0), ('R', 1.0), ('P', 1.0)}
Document 10's STY accuracy:  0.5428571428571428
Document 10's Token segmentation performance without space:  {('P', 0.8435754189944135), ('R', 0.8640915593705293), ('F', 0.8537102473498234)}
Document 10's POS accuracy with/out space: 0.7390939597315436 0.5056
Document 10's SUP:DEP accuracy with/out space:  0.5385906040268457 0.1488



 67%|██████▋   | 12/18 [02:34<01:15, 12.64s/it]

Document 11's Sentence segmentation performance:  {('F', 1.0), ('R', 1.0), ('P', 1.0)}
Document 11's STY accuracy:  0.8181818181818182
Document 11's Token segmentation performance without space:  {('P', 0.8804554079696395), ('R', 0.9106967615309126), ('F', 0.8953207911239749)}
Document 11's POS accuracy with/out space: 0.7720670391061453 0.5562841530054645
Document 11's SUP:DEP accuracy with/out space:  0.5754189944134078 0.1825136612021858



 72%|███████▏  | 13/18 [02:46<01:02, 12.47s/it]

Document 12's Sentence segmentation performance:  {('F', 1.0), ('R', 1.0), ('P', 1.0)}
Document 12's STY accuracy:  0.6545454545454545
Document 12's Token segmentation performance without space:  {('P', 0.8933209647495362), ('R', 0.9084905660377358), ('F', 0.9008419083255378)}
Document 12's POS accuracy with/out space: 0.7588075880758808 0.5462012320328542
Document 12's SUP:DEP accuracy with/out space:  0.5382113821138211 0.15503080082135523



 78%|███████▊  | 14/18 [02:56<00:47, 11.89s/it]

Document 13's Sentence segmentation performance:  {('F', 1.0), ('R', 1.0), ('P', 1.0)}
Document 13's STY accuracy:  0.4418604651162791
Document 13's Token segmentation performance without space:  {('P', 0.9328358208955224), ('R', 0.9171907756813418), ('F', 0.9249471458773786)}
Document 13's POS accuracy with/out space: 0.751015670342426 0.5244444444444445
Document 13's SUP:DEP accuracy with/out space:  0.546720835751596 0.15



 83%|████████▎ | 15/18 [03:17<00:43, 14.51s/it]

Document 14's Sentence segmentation performance:  {('F', 1.0), ('R', 1.0), ('P', 1.0)}
Document 14's STY accuracy:  0.8918918918918919
Document 14's Token segmentation performance without space:  {('P', 0.8849630238290879), ('R', 0.8952618453865336), ('F', 0.8900826446280992)}
Document 14's POS accuracy with/out space: 0.7688995215311005 0.5624430264357339
Document 14's SUP:DEP accuracy with/out space:  0.5464114832535886 0.15679124886052873



 89%|████████▉ | 16/18 [03:24<00:24, 12.43s/it]

Document 15's Sentence segmentation performance:  {('F', 1.0), ('R', 1.0), ('P', 1.0)}
Document 15's STY accuracy:  0.8421052631578947
Document 15's Token segmentation performance without space:  {('P', 0.8428184281842819), ('R', 0.8735955056179775), ('F', 0.8579310344827586)}
Document 15's POS accuracy with/out space: 0.7687296416938111 0.5456
Document 15's SUP:DEP accuracy with/out space:  0.5749185667752443 0.1936



 94%|█████████▍| 17/18 [03:32<00:10, 10.99s/it]

Document 16's Sentence segmentation performance:  {('F', 1.0), ('R', 1.0), ('P', 1.0)}
Document 16's STY accuracy:  0.32558139534883723
Document 16's Token segmentation performance without space:  {('P', 0.9100968188105117), ('R', 0.9228611500701263), ('F', 0.9164345403899722)}
Document 16's POS accuracy with/out space: 0.774781919111816 0.5672333848531684
Document 16's SUP:DEP accuracy with/out space:  0.5900079302141158 0.2241112828438949



100%|██████████| 18/18 [03:41<00:00, 12.31s/it]

Document 17's Sentence segmentation performance:  {('F', 1.0), ('R', 1.0), ('P', 1.0)}
Document 17's STY accuracy:  0.4153846153846154
Document 17's Token segmentation performance without space:  {('P', 0.9175068744271311), ('R', 0.9242843951985226), ('F', 0.9208831646734131)}
Document 17's POS accuracy with/out space: 0.7970118495620814 0.6044176706827309
Document 17's SUP:DEP accuracy with/out space:  0.5744461617722824 0.1897590361445783

Overall sentence segmentation performance:  {('F', 1.0), ('R', 1.0), ('P', 1.0)}
Overall STY accuracy:  0.6588366890380313
Overall Token segmentation performance without space:  {('P', 0.8734375845462908), ('R', 0.895683054045056), ('F', 0.8844204585924444)}
Overall POS accuracy with/out space: 0.7636820730999807 0.5501356684755797
Overall SUP:DEP accuracy with/out space:  0.5603687230065106 0.18296743956586087





In [10]:
# Full test-set evaluation with the gold covering supplied as `covering`;
# in the recorded output this mode reports tag accuracies only (no segmentation scores).
model.interpret(
    tdocs,
    eval_layers=test_layers,
    covering=tcovering,
    seed=seed,
    verbose_result=False,
    decode_method=decode_method,
)

Tokenizing documents...


100%|██████████| 18/18 [00:01<00:00,  9.21it/s]


Pre-processing data...


100%|██████████| 18/18 [00:00<00:00, 46.37it/s]
100%|██████████| 18/18 [00:02<00:00,  7.68it/s]


Interpreting documents...


  6%|▌         | 1/18 [00:13<03:48, 13.44s/it]

Document 0's STY accuracy:  0.6851851851851852
Document 0's POS accuracy with/out space: 0.7892354124748491 0.6032350142721218
Document 0's SUP:DEP accuracy with/out space:  0.5714285714285714 0.2131303520456708



 11%|█         | 2/18 [00:25<03:26, 12.91s/it]

Document 1's STY accuracy:  0.8611111111111112
Document 1's POS accuracy with/out space: 0.7640515222482436 0.5539488320355951
Document 1's SUP:DEP accuracy with/out space:  0.5345433255269321 0.14015572858731926



 17%|█▋        | 3/18 [00:34<02:44, 10.98s/it]

Document 2's STY accuracy:  0.9310344827586207
Document 2's POS accuracy with/out space: 0.7749049429657795 0.5761494252873564
Document 2's SUP:DEP accuracy with/out space:  0.5444866920152092 0.17672413793103448



 22%|██▏       | 4/18 [00:48<02:50, 12.18s/it]

Document 3's STY accuracy:  0.9
Document 3's POS accuracy with/out space: 0.8018162393162394 0.6236012207527976
Document 3's SUP:DEP accuracy with/out space:  0.5737179487179487 0.20549338758901323



 28%|██▊       | 5/18 [00:54<02:08,  9.85s/it]

Document 4's STY accuracy:  0.46078431372549017
Document 4's POS accuracy with/out space: 0.8146551724137931 0.6558139534883721
Document 4's SUP:DEP accuracy with/out space:  0.625615763546798 0.3127906976744186



 33%|███▎      | 6/18 [01:04<01:57,  9.77s/it]

Document 5's STY accuracy:  0.7894736842105263
Document 5's POS accuracy with/out space: 0.7844690966719493 0.5996055226824457
Document 5's SUP:DEP accuracy with/out space:  0.5541468568409932 0.21005917159763313



 39%|███▉      | 7/18 [01:16<01:58, 10.77s/it]

Document 6's STY accuracy:  0.9148936170212766
Document 6's POS accuracy with/out space: 0.7844650205761317 0.5901477832512315
Document 6's SUP:DEP accuracy with/out space:  0.5509259259259259 0.17142857142857143



 44%|████▍     | 8/18 [01:23<01:34,  9.41s/it]

Document 7's STY accuracy:  0.5344827586206896
Document 7's POS accuracy with/out space: 0.7988614800759013 0.6084262701363073
Document 7's SUP:DEP accuracy with/out space:  0.5989879822896901 0.23915737298636927



 50%|█████     | 9/18 [01:34<01:28,  9.83s/it]

Document 8's STY accuracy:  0.5952380952380952
Document 8's POS accuracy with/out space: 0.7651048088779285 0.5520094562647754
Document 8's SUP:DEP accuracy with/out space:  0.5635018495684341 0.18557919621749408



 56%|█████▌    | 10/18 [01:51<01:36, 12.03s/it]

Document 9's STY accuracy:  0.72
Document 9's POS accuracy with/out space: 0.7804068078040681 0.5837282780410743
Document 9's SUP:DEP accuracy with/out space:  0.5541718555417185 0.17377567140600317



 61%|██████    | 11/18 [01:58<01:14, 10.69s/it]

Document 10's STY accuracy:  0.5428571428571428
Document 10's POS accuracy with/out space: 0.7709731543624161 0.5664
Document 10's SUP:DEP accuracy with/out space:  0.5444630872483222 0.16



 67%|██████▋   | 12/18 [02:09<01:05, 10.84s/it]

Document 11's STY accuracy:  0.8181818181818182
Document 11's POS accuracy with/out space: 0.7905027932960894 0.5923497267759563
Document 11's SUP:DEP accuracy with/out space:  0.5849162011173185 0.1989071038251366



 72%|███████▏  | 13/18 [02:20<00:54, 10.87s/it]

Document 12's STY accuracy:  0.6545454545454545
Document 12's POS accuracy with/out space: 0.775609756097561 0.5780287474332649
Document 12's SUP:DEP accuracy with/out space:  0.5398373983739837 0.15092402464065707



 78%|███████▊  | 14/18 [02:30<00:42, 10.64s/it]

Document 13's STY accuracy:  0.4418604651162791
Document 13's POS accuracy with/out space: 0.7782936738247244 0.5766666666666667
Document 13's SUP:DEP accuracy with/out space:  0.5536854323853744 0.16555555555555557



 83%|████████▎ | 15/18 [02:48<00:38, 12.85s/it]

Document 14's STY accuracy:  0.8918918918918919
Document 14's POS accuracy with/out space: 0.7904306220095694 0.6034639927073838
Document 14's SUP:DEP accuracy with/out space:  0.5550239234449761 0.1731996353691887



 89%|████████▉ | 16/18 [02:55<00:21, 10.89s/it]

Document 15's STY accuracy:  0.8421052631578947
Document 15's POS accuracy with/out space: 0.8127035830618893 0.632
Document 15's SUP:DEP accuracy with/out space:  0.5724755700325733 0.1888



 94%|█████████▍| 17/18 [03:02<00:09,  9.66s/it]

Document 16's STY accuracy:  0.32558139534883723
Document 16's POS accuracy with/out space: 0.7930214115781126 0.6027820710973725
Document 16's SUP:DEP accuracy with/out space:  0.591593973037272 0.2302936630602782



100%|██████████| 18/18 [03:10<00:00, 10.61s/it]

Document 17's STY accuracy:  0.4153846153846154
Document 17's POS accuracy with/out space: 0.824317362184441 0.6576305220883534
Document 17's SUP:DEP accuracy with/out space:  0.583204533745492 0.20983935742971888

Overall STY accuracy:  0.6588366890380313
Overall POS accuracy with/out space: 0.7885966608650808 0.5978046373951653
Overall SUP:DEP accuracy with/out space:  0.5659124605169857 0.19394425259003453





In [11]:
# Post-train on the unlabeled documents (`ptdocs` is empty when num_posttrain is 0).
model.post_train(
    ptdocs,
    update=update,
    fine_tune=fine_tune,
    bits=bits,
)
# Fine-tune on the gold training data only when requested and post-training data exists.
if fine_tune and ptdocs:
    model.fine_tune(
        docs,
        covering=covering,
        all_layers=train_layers,
        streams=data_streams,
    )

In [12]:
# Re-run the full test-set evaluation (gold covering as `eval_covering`), but only when
# post-training documents were actually loaded.
if ptdocs:
    model.interpret(
        tdocs,
        eval_layers=test_layers,
        eval_covering=tcovering,
        seed=seed,
        verbose_result=False,
        decode_method=decode_method,
    )

In [13]:
# Re-run the full test-set evaluation (gold covering as `covering`), but only when
# post-training documents were actually loaded.
if ptdocs:
    model.interpret(
        tdocs,
        eval_layers=test_layers,
        covering=tcovering,
        seed=seed,
        verbose_result=False,
        decode_method=decode_method,
    )