# It's a Machine and Natural Language Tagger

In [1]:
from src.IaMaN.base import LM
from src.utils.data import load_ud
from collections import defaultdict
from collections import Counter
from tqdm import tqdm
import numpy as np
import os, re

seed = 691

print("Loading pre-training data...")
pretrain_path = '/data/newstweet/week_2019-40_article_texts/'
total_pretrain = len([pretrain_file for pretrain_file in os.listdir(pretrain_path) if re.search("^\d+.txt$", pretrain_file)])
num_pretrain = 5000 # total_pretrain

all_pretrain_files = [pretrain_file for pretrain_file in os.listdir(pretrain_path) if re.search("^\d+.txt$", pretrain_file)]
if num_pretrain:
    np.random.seed(seed)
    pretrain_files = np.random.choice(all_pretrain_files, size=num_pretrain, replace=False)
else:
    pretrain_files = np.array([])

ptdocs = [[[open(pretrain_path+pretrain_file).read()]] for pretrain_file in tqdm(pretrain_files)]

print("Loading gold-tagged UDs data...")
max_char = 200_000_000
load_set = 'GUM'; fine_tune = True; do_ife = False; update_ife = False; runners = 10
all_docs = load_ud("English", num_articles = 0, seed = 691, load_set = load_set, rebuild = True)
test_docs = [doc for doc in all_docs if 'test' in doc['id'] and len(doc['text']) <= max_char]# [:1]
train_docs = [doc for doc in all_docs if 'test' not in doc['id'] and len(doc['text']) <= max_char]# [:4]
nsamp = len(test_docs)
print('Avail. pre-train, total pre-train, Avail. gold, total gold-train, total test-gold: ', 
      total_pretrain, len(ptdocs), len(all_docs), len(train_docs), len(test_docs))

Loading pre-training data...


100%|██████████| 5000/5000 [00:18<00:00, 273.07it/s]


Loading gold-tagged UDs data...
Avail. pre-train, total pre-train, Avail. gold, total gold-train, total test-gold:  14198 5000 150 132 18


In [None]:
docs = [["".join([row[1] for row in s]) for s in d['conllu']] for d in train_docs]
tdocs = [["".join([row[1] for row in s]) for s in d['conllu']] for d in test_docs]
covering = [[[row[1] for row in s] for s in d['conllu']] for d in train_docs]
covering_vocab = set([t for d in covering for s in d for t in s])

all_layers = {d_i: {'pos': [[row[3] for row in s] for s in d['conllu']], 
                    'sup': [[(str(int(row[6]) - int(row[0])) if int(row[6]) else row[6]) for row in s] for s in d['conllu']], 
                    'dep': [[row[7] for row in s] for s in d['conllu']],
                    'sty': [[d['s_type'][s_i] for row in s] for s_i, s in enumerate(d['conllu'])]}
              for d_i, d in enumerate(train_docs)}

model = LM(covering_vocab = covering_vocab)
model.init(m = 10, noise = 0.001, positional = True, seed = seed, do_ife = do_ife, runners = runners)
model.fit(docs, f'{load_set}-{nsamp}', covering = covering, all_layers = all_layers)
model.pre_train(ptdocs, update_ife = update_ife)
if fine_tune:
    model.fine_tune(docs, covering = covering, all_layers = all_layers)

0it [00:00, ?it/s]


Training tokenizer...


Initializing: 100%|██████████| 6503/6503 [00:01<00:00, 3738.49it/s]
Fitting:  18%|█▊        | 18/100 [00:41<03:07,  2.29s/it]


Built a vocabulary of 10703 types
Tokenizing documents...


100%|██████████| 132/132 [00:17<00:00,  7.51it/s]


Counting documents and aggregating counts...


4296952it [03:00, 23824.65it/s] 


Collecting metadata...


100%|██████████| 132/132 [00:11<00:00, 11.30it/s]


Aggregating metadata...


100%|██████████| 132/132 [00:00<00:00, 460.47it/s]


Encoding parameters...


100%|██████████| 4296952/4296952 [00:20<00:00, 205131.09it/s]


Computing marginal statistics...


100%|██████████| 5137899/5137899 [00:21<00:00, 239584.10it/s]


Building dense output heads...


100%|██████████| 10/10 [01:51<00:00, 11.18s/it]


Done.
Model params, types, encoding size, contexts, vec dim, max sent, and % capacity used: 4296952 10704 10704 224784 11000 177 0.179
Processing pre-training documents...
Tokenizing documents...


100%|██████████| 5000/5000 [23:41<00:00,  3.52it/s]


Counting documents and aggregating counts...


8958836it [30:30, 4893.06it/s] 


Collecting metadata...


100%|██████████| 5000/5000 [00:45<00:00, 109.04it/s]


Aggregating metadata...


100%|██████████| 5000/5000 [00:02<00:00, 2070.01it/s]


Encoding parameters...


100%|██████████| 11951248/11951248 [00:56<00:00, 211578.83it/s]


Re-computing marginal statistics...


100%|██████████| 17135672/17135672 [00:57<00:00, 296228.41it/s]


Re-building dense output heads...


100%|██████████| 10/10 [02:20<00:00, 14.03s/it]


Done.
Model params, types, encoding size, contexts, vec dim, max sent, and % capacity used: 11951248 10704 10704 224784 11000 177 0.497
Fine-tuning dense output heads...


 30%|██▉       | 39/132 [22:38:06<50:19:25, 1948.02s/it]

__Currently__: ordering for the current fine tuning process:
1. train tokenizer and fit model to GUM
2. process NewsTweet documents to integrate sparse post-training statistics (requires mr implementation and updates to the vocabularies/indices)
3. update the ife and dense model, i.e., produce new statistics and dimensionalities
4. fine tune output heads to GUM, and _combine_ them with the dense model from (3), i.e., don't just replace as is current.

__Preliminarily__: this does seem to present performance benefits, but as is usual will require 'big data' statistics to become competitive. In particular, the (tokenization, least of all), counting, sorting, and aggregation of co-occurrence counts must all be distributed for the statistical resolution required to approach performance gains aking to more-advanced systems. Currently, a spark-based MR system is implemented for these (all but tokenization).

In [None]:
interpret_docs = list([docs[3][0:1]])
print(interpret_docs)
model.interpret(interpret_docs, seed = 691)
for doc in model._documents:
    print('opening next doc:')
    for s in doc._sentences:
        print('opening next sent:')
        for t in s._tokens:
            print('opening next token:')
            print((str(t), t._pos, t._sep, t._sup, t._dep, s._sty))

In [None]:
interpret_docs = list([docs[3][0:1]])
print(interpret_docs)
model.interpret(interpret_docs, seed = 691, dense_predict = True)
for doc in model._documents:
    print('opening next doc:')
    for s in doc._sentences:
        print('opening next sent:')
        for t in s._tokens:
            print('opening next token:')
            print((str(t), t._pos, t._sep, t._sup, t._dep, s._sty))

In [None]:
[[[(int(row[6])-int(row[0]) if int(row[6]) else int(row[6]), row[7]) for row in s] for s in d['conllu']] for d in train_docs][3][:1]

In [None]:
interpret_docs = list([tdocs[0][1:2]])
print(interpret_docs)
model.interpret(interpret_docs, seed = 691)
for doc in model._documents:
    print('opening next doc:')
    for s in doc._sentences:
        print('opening next sent:')
        for t in s._tokens:
            print('opening next token:')
            print((str(t), t._pos, t._sep, t._sup, t._dep, s._sty))

In [None]:
interpret_docs = list([tdocs[0][1:2]])
print(interpret_docs)
interpret_covering =  [[[row[1] for row in s] for s in d['conllu']][1:2] for d in test_docs[0:1]]
print(interpret_covering)
model.interpret(interpret_docs, seed = 691, covering = interpret_covering)

accuracy = defaultdict(list)
accuracy_nsp = defaultdict(list)
accuracy_all, accuracy_all_nsp, = [], []
sup_accuracy, sup_accuracy_nsp, = 0, 0
accuracy_sty = defaultdict(list)
accuracy_all_sty = []

for d_i, doc in enumerate(model._documents):
    for s_i, s in enumerate(doc._sentences):
        result = s._sty == test_docs[0:1][d_i]['s_type'][s_i+1]
        accuracy_sty[test_docs[0:1][d_i]['s_type'][s_i+1]].append(result)
        accuracy_all_sty.append(result)

pred_toks = [t._form for doc in model._documents for s in doc._sentences for t in s._tokens]
pred_arcs = set([(ix, str(t._sup), t._dep) for doc in model._documents for s in doc._sentences for ix, t in enumerate(s._tokens)])
pred_spans = list(np.cumsum([len(t) for t in pred_toks]))
pred_stream = [t._pos for doc in model._documents for s in doc._sentences for t in s._tokens]
pred_spans = {(sh-len(gt), sh): (gl, gt)
              for sh, gt, gl in zip(pred_spans, pred_toks, pred_stream)}

gold_toks = [row[1] for d in test_docs[:1] for s in d['conllu'][1:2] for row in s]
gold_arcs = set([(ix, (str(int(row[6]) - int(row[0])) if int(row[6]) else row[6]),
                  row[7]) for d in test_docs[:1] for s in d['conllu'][1:2] for ix, row in enumerate(s)])
gold_spans = list(np.cumsum([len(t) for t in gold_toks]))
gold_stream = [row[3] for d in test_docs[:1] for s in d['conllu'][1:2] for row in s]
gold_spans = {(sh-len(gt), sh): (gl, gt)
              for sh, gt, gl in zip(gold_spans, gold_toks, gold_stream)}

for gold_span in gold_spans:
    if gold_span in pred_spans:
        result = gold_spans[gold_span] == pred_spans[gold_span]
    else:
        result = False
    accuracy[gold_spans[gold_span][0]].append(result)
    accuracy_all.append(result)
    if gold_spans[gold_span][1] != ' ':
        accuracy_nsp[gold_spans[gold_span][0]].append(result)
        accuracy_all_nsp.append(result)
        
for ptok, parc in zip(pred_toks, pred_arcs):
    if parc in gold_arcs:
        sup_accuracy += 1
        if ptok != ' ':
            sup_accuracy_nsp += 1
sup_accuracy /= len(pred_toks)
sup_accuracy_nsp /= len([x for x in pred_toks if x != ' '])

print("Tag-wise POS accuracy with/out space", {tag: sum(accuracy[tag])/len(accuracy[tag]) for tag in accuracy}, 
                                          {tag: sum(accuracy_nsp[tag])/len(accuracy_nsp[tag]) for tag in accuracy_nsp})
print("Overall POS accuracy with/out space", sum(accuracy_all)/len(accuracy_all), sum(accuracy_all_nsp)/len(accuracy_all_nsp))
print("Overall SUP:DEP accuracy with/out space", sup_accuracy, sup_accuracy_nsp)
print("Overall s_type accuracy: ", sum(accuracy_all_sty)/len(accuracy_all_sty))
print("Tag-wise accuracy", list(Counter({tag: (sum(accuracy_sty[tag])/len(accuracy_sty[tag]), len(accuracy_sty[tag])) 
                                         for tag in accuracy_sty}).most_common()))

In [None]:
for doc in model._documents:
    print('opening next doc:')
    for s in doc._sentences:
        print('opening next sent:')
        for t in s._tokens:
            print('opening next token:')
            print((str(t), t._pos, t._sep, t._sup, t._dep, s._sty))

In [None]:
interpret_docs = list([tdocs[0][:2]])
print(interpret_docs)
interpret_covering =  [[[row[1] for row in s] for s in d['conllu']][:2] for d in test_docs[0:1]]
print(interpret_covering)
model.interpret(interpret_docs, seed = 691, covering = interpret_covering)

accuracy = defaultdict(list)
accuracy_nsp = defaultdict(list)
accuracy_all, accuracy_all_nsp, = [], []
sup_accuracy, sup_accuracy_nsp, = 0, 0
accuracy_sty = defaultdict(list)
accuracy_all_sty = []

for d_i, doc in enumerate(model._documents):
    for s_i, s in enumerate(doc._sentences):
        result = s._sty == test_docs[0:1][d_i]['s_type'][s_i]
        accuracy_sty[test_docs[0:1][d_i]['s_type'][s_i]].append(result)
        accuracy_all_sty.append(result)

pred_toks = [t._form for doc in model._documents for s in doc._sentences for t in s._tokens]
pred_arcs = set([(ix, str(t._sup), t._dep, s_i, d_i) for d_i, doc in enumerate(model._documents) for s_i, s in enumerate(doc._sentences) for ix, t in enumerate(s._tokens)])
pred_spans = list(np.cumsum([len(t) for t in pred_toks]))
pred_stream = [t._pos for doc in model._documents for s in doc._sentences for t in s._tokens]
pred_spans = {(sh-len(gt), sh): (gl, gt)
              for sh, gt, gl in zip(pred_spans, pred_toks, pred_stream)}

gold_toks = [row[1] for d in test_docs[:1] for s in d['conllu'][:2] for row in s]
gold_arcs = set([(ix, (str(int(row[6]) - int(row[0])) if int(row[6]) else row[6]), row[7], s_i, d_i) 
                 for d_i, d in enumerate(test_docs[:1]) for s_i, s in enumerate(d['conllu'][:2]) for ix, row in enumerate(s)])
gold_spans = list(np.cumsum([len(t) for t in gold_toks]))
gold_stream = [row[3] for d in test_docs[:1] for s in d['conllu'][:2] for row in s]
gold_spans = {(sh-len(gt), sh): (gl, gt)
              for sh, gt, gl in zip(gold_spans, gold_toks, gold_stream)}

for gold_span in gold_spans:
    if gold_span in pred_spans:
        result = gold_spans[gold_span] == pred_spans[gold_span]
    else:
        result = False
    accuracy[gold_spans[gold_span][0]].append(result)
    accuracy_all.append(result)
    if gold_spans[gold_span][1] != ' ':
        accuracy_nsp[gold_spans[gold_span][0]].append(result)
        accuracy_all_nsp.append(result)
        
for ptok, parc in zip(pred_toks, pred_arcs):
    if parc in gold_arcs:
        sup_accuracy += 1
        if ptok != ' ':
            sup_accuracy_nsp += 1
sup_accuracy /= len(pred_toks)
sup_accuracy_nsp /= len([x for x in pred_toks if x != ' '])

print("Tag-wise POS accuracy with/out space", {tag: sum(accuracy[tag])/len(accuracy[tag]) for tag in accuracy}, 
                                          {tag: sum(accuracy_nsp[tag])/len(accuracy_nsp[tag]) for tag in accuracy_nsp})
print("Overall POS accuracy with/out space", sum(accuracy_all)/len(accuracy_all), sum(accuracy_all_nsp)/len(accuracy_all_nsp))
print("Overall SUP:DEP accuracy with/out space", sup_accuracy, sup_accuracy_nsp)
print("Overall s_type accuracy: ", sum(accuracy_all_sty)/len(accuracy_all_sty))
print("Tag-wise accuracy", list(Counter({tag: (sum(accuracy_sty[tag])/len(accuracy_sty[tag]), len(accuracy_sty[tag])) 
                                         for tag in accuracy_sty}).most_common()))

In [None]:
for doc in model._documents:
    print('opening next doc:')
    for s in doc._sentences:
        print('opening next sent:')
        for t in s._tokens:
            print('opening next token:')
            print((str(t), t._pos, t._sep, t._sup, t._dep, s._sty))

In [None]:
model._documents[0]._sentences[0]

In [None]:
print(model._documents[0]._sentences[0], 
      [t_i for  t_i in model._documents[0]._sentences[0].yield_branch(13)],
      [model._documents[0]._sentences[0]._tokens[t_i] for  t_i in sorted(model._documents[0]._sentences[0].yield_branch(13))])

In [None]:
list(zip(sorted(pred_arcs, key = lambda x: (x[3],x[4],x[0])), sorted(gold_arcs, key = lambda x: (x[3],x[4],x[0]))))

In [None]:
model.interpret(interpret_docs, seed = 691, covering = interpret_covering, predict_tags = True)

In [None]:
import numpy as np
print("model vector dimension: ", model._documents[0]._sentences[0]._tokens[0]._whatevers[0]._vec.shape[0])
model._documents[0]._sentences[0]._tokens[0]._whatevers[0]._vec.shape[0]
nspv = model._documents[0]._sentences[0]._tokens[1]._whatevers[-1]._vec; nspvn = np.linalg.norm(nspv)
spv1 = model._documents[0]._sentences[0]._tokens[2]._whatevers[-1]._vec; spv1n = np.linalg.norm(spv1)
spv2 = model._documents[0]._sentences[0]._tokens[4]._whatevers[-1]._vec; spv2n = np.linalg.norm(spv2)

print("vector similarity of non-space to space whatevers (first two), \nvs. similarity of separate space tokens (last one): \n\n",
      nspv.dot(spv1)/(nspvn*spv1n), nspv.dot(spv2)/(nspvn*spv2n), spv1.dot(spv2)/(spv1n*spv2n))

In [None]:
model.interpret(interpret_docs, seed = 691, covering = interpret_covering, predict_tags = False)

In [None]:
import numpy as np
print("model vector dimension: ", model._documents[0]._sentences[0]._tokens[0]._whatevers[0]._vec.shape[0])
model._documents[0]._sentences[0]._tokens[0]._whatevers[0]._vec.shape[0]
nspv = model._documents[0]._sentences[0]._tokens[1]._whatevers[-1]._vec; nspvn = np.linalg.norm(nspv)
spv1 = model._documents[0]._sentences[0]._tokens[2]._whatevers[-1]._vec; spv1n = np.linalg.norm(spv1)
spv2 = model._documents[0]._sentences[0]._tokens[4]._whatevers[-1]._vec; spv2n = np.linalg.norm(spv2)

print("vector similarity of non-space to space whatevers (first two), \nvs. similarity of separate space tokens (last one): \n\n",
      nspv.dot(spv1)/(nspvn*spv1n), nspv.dot(spv2)/(nspvn*spv2n), spv1.dot(spv2)/(spv1n*spv2n))

In [None]:
(doc._vec.shape, [doc._atn, s._atn, t._atn, t._whatevers[0]._atn], 
 [doc._nrm, s._nrm, t._nrm, t._whatevers[0]._nrm], 
 [doc._vec[:10], s._vec[:10], t._vec[:10], t._whatevers[0]._vec[:10]])

In [None]:
from tqdm import tqdm
accuracy = defaultdict(list)
accuracy_nsp = defaultdict(list)
accuracy_all, accuracy_all_nsp, = [], []
sup_accuracy, sup_accuracy_nsp, = 0, 0
accuracy_sty = defaultdict(list)
accuracy_all_sty = []

model.interpret(tdocs, seed = 691, covering = [[[row[1] for row in s] for s in d['conllu']] for d in test_docs]) 
                # ltypes = ['lem'],  layers = [[[[row[2] for row in s] for s in d['conllu']] for d in test_docs]])

for d_i, doc in enumerate(model._documents):
    for s_i, s in enumerate(doc._sentences):
        result = s._sty == test_docs[d_i]['s_type'][s_i]
        accuracy_sty[test_docs[d_i]['s_type'][s_i]].append(result)
        accuracy_all_sty.append(result)

pred_toks = [t._form for doc in model._documents for s in doc._sentences for t in s._tokens]
pred_arcs = set([(ix, str(t._sup), t._dep, d_i, s_i) for d_i, doc in enumerate(model._documents) 
                 for s_i, s in enumerate(doc._sentences) for ix, t in enumerate(s._tokens)])
pred_spans = list(np.cumsum([len(t) for t in pred_toks]))
pred_stream = [t._pos for doc in model._documents for s in doc._sentences for t in s._tokens]
pred_spans = {(sh-len(gt), sh): (gl, gt)
              for sh, gt, gl in zip(pred_spans, pred_toks, pred_stream)}

gold_toks = [row[1] for d in test_docs for s in d['conllu'] for row in s]
gold_arcs = set([(ix, (str(int(row[6]) - int(row[0])) if int(row[6]) else row[6]), row[7], d_i, s_i) 
                 for d_i, d in enumerate(test_docs) for s_i, s in enumerate(d['conllu']) for ix, row in enumerate(s)])
gold_spans = list(np.cumsum([len(t) for t in gold_toks]))
gold_stream = [row[3] for d in test_docs for s in d['conllu'] for row in s]
gold_spans = {(sh-len(gt), sh): (gl, gt)
              for sh, gt, gl in zip(gold_spans, gold_toks, gold_stream)}

for gold_span in gold_spans:
    if gold_span in pred_spans:
        result = gold_spans[gold_span] == pred_spans[gold_span]
    else:
        result = False
    accuracy[gold_spans[gold_span][0]].append(result)
    accuracy_all.append(result)
    if gold_spans[gold_span][1] != ' ':
        accuracy_nsp[gold_spans[gold_span][0]].append(result)
        accuracy_all_nsp.append(result)
        
for ptok, parc in zip(pred_toks, pred_arcs):
    if parc in gold_arcs:
        sup_accuracy += 1
        if ptok != ' ':
            sup_accuracy_nsp += 1
sup_accuracy /= len(pred_toks)
sup_accuracy_nsp /= len([x for x in pred_toks if x != ' '])

print("Overall POS accuracy with/out space", sum(accuracy_all)/len(accuracy_all), sum(accuracy_all_nsp)/len(accuracy_all_nsp))
print("Overall SUP:DEP accuracy with/out space", sup_accuracy, sup_accuracy_nsp)
print("Overall s_type accuracy: ", sum(accuracy_all_sty)/len(accuracy_all_sty))
print("Tag-wise accuracy", list(Counter({tag: (sum(accuracy_sty[tag])/len(accuracy_sty[tag]), len(accuracy_sty[tag])) 
                                         for tag in accuracy_sty}).most_common()))

In [None]:
"Tag-wise accuracy with space", list(Counter({tag: (sum(accuracy[tag])/len(accuracy[tag]), len(accuracy[tag])) for tag in accuracy}).most_common())

In [None]:
"Tag-wise accuracy without space", list(Counter({tag: (sum(accuracy_nsp[tag])/len(accuracy_nsp[tag]), len(accuracy_nsp[tag])) for tag in accuracy_nsp}).most_common())

- Model params, types, encoding size, contexts, vec dim, max sent, and % capacity used: 4296952 10704 10704 224784 11000 177 0.179
- Train = 132; post-train = 0; fine-tune = 132; test = 18; IFE = False

```
Overall POS accuracy with/out space 0.8729921677950352 0.7639985199802664
('Tag-wise accuracy with space',
 [('PUNCT', (0.9917376064096144, 15976)),
  ('PRON', (0.861652739090065, 1077)),
  ('DET', (0.8605150214592274, 1398)),
  ('ADP', (0.8532002348796242, 1703)),
  ('NOUN', (0.852911477616964, 2971)),
  ('CCONJ', (0.7448979591836735, 588)),
  ('VERB', (0.7441441441441441, 1665)),
  ('AUX', (0.7142857142857143, 721)),
  ('PART', (0.6507462686567164, 335)),
  ('SCONJ', (0.6147540983606558, 244)),
  ('ADJ', (0.566003616636528, 1106)),
  ('NUM', (0.565597667638484, 343)),
  ('ADV', (0.564437194127243, 613)),
  ('INTJ', (0.5113636363636364, 88)),
  ('PROPN', (0.4561544650040225, 1243)),
  ('SYM', (0.11428571428571428, 35)),
  ('X', (0.038461538461538464, 26))])
Overall SUP:DEP accuracy with/out space 0.6265944724953495 0.6232394366197183
Overall s_type accuracy:  0.7259507829977628
Tag-wise accuracy [('decl', (0.9830220713073005, 589)), 
                   ('intj', (0.5, 26)), 
                   ('q', (0.5, 16)), 
                   ('imp', (0.3877551020408163, 49)), 
                   ('frag', (0.25806451612903225, 93)), 
                   ('wh', (0.19047619047619047, 21)), 
                   ('multiple', (0.03125, 32)), 
                   ('sub', (0.024390243902439025, 41)), 
                   ('other', (0.0, 14)), 
                   ('ger', (0.0, 8)), 
                   ('inf', (0.0, 5))]
```

- Model params, types, encoding size, contexts, vec dim, max sent, and % capacity used: 11951248 10704 10704 224784 11000 177 0.497
- Train = 132; post-train = 5000; fine-tune = 132; test = 18; IFE = False

```

```