# It's a Machine and Natural Language Model

In [1]:
from src.IaMaN.base import LM
from src.utils.data import load_wikitext
from src.utils.munge import stick_spaces
from collections import defaultdict
from collections import Counter
from tqdm import tqdm
import numpy as np
import os, re

# --- Experiment configuration ------------------------------------------------
seed = 691               # forwarded to the data loader, LM(...) and model.generate
max_char = 200_000_000   # documents whose text is longer than this are dropped below

# Options forwarded to LM(...), model.fit, model.post_train and
# model.build_base_model in later cells.
m = 10
space = True
fine_tune = False
# 0 skips post-training entirely; 'all' is mapped to num_articles = 0, the same
# value used for the full WikiText-2 loads below.
num_posttrain = 0
noise = 0.001
positional = 'dependent'
positionally_encode = True
bits = 50
update = True
btype = 'df'
ms_init = 'waiting_time'
runners = 10
gpu = False
tokenizer = 'hr-bpe'     # alternative: 'sentokenizer'

# --- Optional post-training data (WikiText-103) -------------------------------
total_posttrain = 0      # documents returned by the loader, before empty ones are dropped
ptdocs = []              # per-document lists of sentence strings, empties removed
if num_posttrain:
    num_posttrain = 0 if num_posttrain == 'all' else num_posttrain
    print("Loading WikiText-103 as post-training data...")
    # list(...) so the result can be both counted and iterated, whatever the loader returns.
    posttrain_docs = list(load_wikitext(v = '103', split = "train", num_articles = num_posttrain,
                                        seed = seed, space = space, rebuild = True))
    # Previously this counter stayed at 0, so the "Avail. post-train" column of the
    # summary line never reflected the loaded data.
    total_posttrain = len(posttrain_docs)
    ptdocs = [["".join(s) for s in d['document']] for d in posttrain_docs]
    ptdocs = [d for d in ptdocs if len(d)]

# --- Gold language-modelling data (WikiText-2) --------------------------------
print("Loading WikiText-2 LM'ing data...")
train_docs = load_wikitext(v = '2', split = "train", num_articles = 0, seed = seed, space = space, rebuild = True)
test_docs = load_wikitext(v = '2', split = "test", num_articles = 0, seed = seed, space = space, rebuild = True)
# Keep only non-empty documents no longer than max_char.
# (Append a slice such as [:2] / [:4] to these two lines for a quick smoke run.)
test_docs = [doc for doc in test_docs if len(doc['text']) and (len(doc['text']) <= max_char)]
train_docs = [doc for doc in train_docs if len(doc['text']) and (len(doc['text']) <= max_char)]
print('Avail. post-train, total post-train, Avail. gold, total gold-train, total test-gold: ', 
      total_posttrain, len(ptdocs), len(train_docs) + len(test_docs), len(train_docs), len(test_docs))

Loading WikiText-2 LM'ing data...
Avail. post-train, total post-train, Avail. gold, total gold-train, total test-gold:  0 0 672 611 61


In [2]:
# Sentence strings per document: the pieces of each sentence in d['document']
# are concatenated into a single string.
docs = [["".join(sent) for sent in doc['document']] for doc in train_docs]
tdocs = [["".join(sent) for sent in doc['document']] for doc in test_docs]

# "Covering" segmentation of the same sentences: split on single spaces, with the
# capture group keeping each space as its own element.
covering = [[re.split('( )', ''.join(sent)) for sent in doc['document']] for doc in train_docs]
tcovering = [[re.split('( )', ''.join(sent)) for sent in doc['document']] for doc in test_docs]

if not space:
    # Without explicit space tokens, every split sentence is passed through stick_spaces.
    covering = [[stick_spaces(sent) for sent in doc] for doc in covering]
    tcovering = [[stick_spaces(sent) for sent in doc] for doc in tcovering]

# Distinct tokens appearing anywhere in the training covering.
covering_vocab = {tok for doc in covering for sent in doc for tok in sent}

# Build the model from the configuration cell and fit it on the training documents.
model = LM(
    m = m,
    tokenizer = tokenizer,
    noise = noise,
    seed = seed,
    space = space,
    positional = positional,
    positionally_encode = positionally_encode,
    bits = bits,
    runners = runners,
    gpu = gpu,
    btype = btype,
    ms_init = ms_init,
)
data_streams = model.fit(docs, f'WT2-{len(test_docs)}', fine_tune = fine_tune, covering = covering)

Training tokenizer...


Initializing: 100%|██████████| 27707/27707 [00:24<00:00, 1147.86it/s]
Fitting:   7%|▋         | 7/100 [02:20<31:05, 20.05s/it]


Built a vocabulary of 34914 types
Tokenizing documents...


100%|██████████| 611/611 [08:08<00:00,  1.25it/s]


Pre-processing data...


100%|██████████| 611/611 [00:37<00:00, 16.31it/s]


Counting documents and aggregating counts...


13411524it [26:05, 8568.78it/s] 


Counting tag-tag transition frequencies...


100%|██████████| 611/611 [00:06<00:00, 98.26it/s]


Encoding parameters...


100%|██████████| 13411524/13411524 [03:22<00:00, 66147.37it/s]


Building target vocabularies...


100%|██████████| 16/16 [00:00<00:00, 262.05it/s]


Pre-computing BOW probabilities... done.
Pre-computing wave amplitudes... done.
Stacking output vocabularies for decoders...


100%|██████████| 7/7 [00:00<00:00, 57795.53it/s]


Building dense output heads...


100%|██████████| 15/15 [00:01<00:00, 11.13it/s]


Building transition matrices for tag-sequence decoding...


  self._trXs[ltype] = (lambda M: M/M.sum(axis = 1)[:,None])(self._trXs[ltype])
100%|██████████| 6/6 [00:00<00:00, 8322.03it/s]

Done.
Model params, types, encoding size, contexts, vec dim, max sent, and % capacity used: 5080629 24042 50 1071 63 1647 31.874





In [3]:
# Inspect the fitted model's `_ms` attribute (displayed below as a dict of small
# integers keyed by 'nov', 'iat', 'bot', 'eot', 'eos', 'eod', 'form', 'bits').
# NOTE(review): presumably derived from ms_init = 'waiting_time' passed to LM(...) — confirm in src.IaMaN.base.
model._ms

{'nov': 2,
 'iat': 2,
 'bot': 2,
 'eot': 2,
 'eos': 2,
 'eod': 2,
 'form': 6,
 'bits': 9}

In [4]:
# Generate a text sample from the fitted model, reusing the experiment seed.
# NOTE(review): m = 50 here is presumably the length of the sample and is separate from the
# config value m = 10 given to LM(...) — confirm against LM.generate.
model.generate(m = 50, seed = seed)

McCorduckGertrudeMcCorduckernick North , <unk> tongue whom . Public filmeum day , though a University in the downfall w the 1960s dropped performance the


```
document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  1 2.0 27.74 236.2 1696 982
```

In [5]:
# Run the model over the held-out test documents (with their coverings) and keep the
# returned per-document results in `out` for the summary cell that follows.
# This is the slow step of the notebook: the progress log below shows ~46 minutes for 61 documents.
out = model.stencil(docs = tdocs, return_output = True, covering = tcovering)

Tokenizing documents...


100%|██████████| 61/61 [00:52<00:00,  1.16it/s]


Pre-processing data...


100%|██████████| 61/61 [00:04<00:00, 14.30it/s]
100%|██████████| 61/61 [00:33<00:00,  1.81it/s]
  2%|▏         | 1/61 [00:08<08:39,  8.66s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  1 2.0 27.18 226.85 1696 982


  3%|▎         | 2/61 [01:02<34:23, 34.97s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  2 3.0 34.88 333.32 10440 5984


  5%|▍         | 3/61 [01:24<28:18, 29.29s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  3 5.0 39.84 449.67 4428 2487


  7%|▋         | 4/61 [02:44<46:53, 49.37s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  4 7.000000000000001 32.89 282.5 15789 9118


  8%|▊         | 5/61 [03:05<36:20, 38.94s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  5 8.0 30.28 312.56 4020 2263


 10%|▉         | 6/61 [03:29<31:15, 34.11s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  6 10.0 31.49 284.36 4902 2827


 11%|█▏        | 7/61 [05:24<54:31, 60.59s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  7 11.0 32.4 327.35 22655 12815


 13%|█▎        | 8/61 [05:48<42:57, 48.64s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  8 13.0 38.21 428.21 4554 2570


 15%|█▍        | 9/61 [07:48<1:01:30, 70.97s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  9 15.0 39.18 421.63 23719 13556


 16%|█▋        | 10/61 [08:06<46:26, 54.64s/it] 

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  10 16.0 32.42 320.38 3552 2028


 18%|█▊        | 11/61 [08:39<40:10, 48.21s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  11 18.0 35.66 371.07 6415 3669


 20%|█▉        | 12/61 [10:12<50:19, 61.63s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  12 20.0 30.78 254.96 16777 9687


 21%|██▏       | 13/61 [10:21<36:42, 45.88s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  13 21.0 32.24 310.98 1725 990


 23%|██▎       | 14/61 [10:45<30:42, 39.19s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  14 23.0 44.79 452.42 4299 2497


 25%|██▍       | 15/61 [10:49<21:58, 28.67s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  15 25.0 35.66 322.06 781 456


 26%|██▌       | 16/61 [11:08<19:17, 25.72s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  16 26.0 34.31 349.8 3447 1967


 28%|██▊       | 17/61 [12:25<30:05, 41.03s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  17 28.000000000000004 33.26 349.7 13866 7807


 30%|██▉       | 18/61 [12:36<23:04, 32.20s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  18 30.0 34.46 305.57 2054 1190


 31%|███       | 19/61 [13:21<25:12, 36.00s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  19 31.0 35.95 364.69 8114 4654


 33%|███▎      | 20/61 [14:02<25:39, 37.55s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  20 33.0 33.8 311.17 7457 4286


 34%|███▍      | 21/61 [14:54<27:55, 41.89s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  21 34.0 34.43 365.15 9450 5374


 36%|███▌      | 22/61 [15:09<21:54, 33.71s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  22 36.0 32.72 218.0 2676 1589


 38%|███▊      | 23/61 [15:50<22:42, 35.86s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  23 38.0 26.13 221.82 7461 4205


 39%|███▉      | 24/61 [17:57<38:53, 63.06s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  24 39.0 31.23 322.59 22928 12916


 41%|████      | 25/61 [18:18<30:17, 50.48s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  25 41.0 33.04 335.38 3794 2147


 43%|████▎     | 26/61 [19:36<34:22, 58.92s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  26 43.0 32.6 330.2 14315 8106


 44%|████▍     | 27/61 [21:37<43:49, 77.34s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  27 44.0 36.21 415.37 21750 12246


 46%|████▌     | 28/61 [21:52<32:18, 58.73s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  28 46.0 37.65 317.43 2749 1618


 48%|████▊     | 29/61 [21:55<22:21, 41.91s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  29 48.0 40.63 310.39 472 284


 49%|████▉     | 30/61 [22:46<23:04, 44.68s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  30 49.0 32.97 333.6 9314 5282


 51%|█████     | 31/61 [23:03<18:09, 36.33s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  31 51.0 42.68 428.93 3059 1750


 52%|█████▏    | 32/61 [24:13<22:27, 46.47s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  32 52.0 28.79 249.78 12786 7326


 54%|█████▍    | 33/61 [24:31<17:43, 38.00s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  33 54.0 36.91 413.71 3312 1864


 56%|█████▌    | 34/61 [26:20<26:43, 59.40s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  34 56.00000000000001 35.61 324.77 19852 11463


 57%|█████▋    | 35/61 [27:44<28:54, 66.71s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  35 56.99999999999999 37.88 425.54 15108 8498


 59%|█████▉    | 36/61 [28:03<21:50, 52.41s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  36 59.0 44.57 535.35 3396 1923


 61%|██████    | 37/61 [30:57<35:32, 88.87s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  37 61.0 38.88 405.0 31496 17961


 62%|██████▏   | 38/61 [31:46<29:27, 76.85s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  38 62.0 36.4 375.29 8795 5027


 64%|██████▍   | 39/61 [33:37<31:58, 87.21s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  39 64.0 31.41 281.46 20247 11558


 66%|██████▌   | 40/61 [35:39<34:09, 97.61s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  40 66.0 38.37 434.16 22055 12480


 67%|██████▋   | 41/61 [36:06<25:26, 76.31s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  41 67.0 34.47 258.66 4783 2821


 69%|██████▉   | 42/61 [36:17<17:57, 56.71s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  42 69.0 41.45 484.5 1993 1133


 70%|███████   | 43/61 [37:15<17:09, 57.22s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  43 70.0 36.41 327.4 10623 6136


 72%|███████▏  | 44/61 [37:39<13:23, 47.27s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  44 72.0 33.26 338.59 4392 2463


 74%|███████▍  | 45/61 [37:51<09:44, 36.54s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  45 74.0 43.96 425.84 2089 1209


 75%|███████▌  | 46/61 [38:05<07:27, 29.80s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  46 75.0 39.82 470.87 2534 1431


 77%|███████▋  | 47/61 [39:18<10:00, 42.90s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  47 77.0 28.7 279.17 13007 7301


 79%|███████▊  | 48/61 [39:30<07:17, 33.66s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  48 79.0 24.05 215.96 2211 1239


 80%|████████  | 49/61 [39:46<05:39, 28.26s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  49 80.0 41.51 475.05 2853 1617


 82%|████████▏ | 50/61 [40:15<05:15, 28.66s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  50 82.0 34.79 381.66 5383 3050


 84%|████████▎ | 51/61 [40:45<04:50, 29.02s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  51 84.0 40.96 473.69 5455 3126


 85%|████████▌ | 52/61 [40:56<03:31, 23.45s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  52 85.0 67.96 1057.22 1899 1084


 87%|████████▋ | 53/61 [41:06<02:36, 19.52s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  53 87.0 35.93 364.49 1888 1078


 89%|████████▊ | 54/61 [41:17<01:58, 16.90s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  54 89.0 37.87 371.09 1952 1127


 90%|█████████ | 55/61 [41:26<01:26, 14.44s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  55 90.0 31.43 318.44 1588 903


 92%|█████████▏| 56/61 [42:46<02:51, 34.36s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  56 92.0 34.73 328.34 14749 8501


 93%|█████████▎| 57/61 [42:51<01:41, 25.47s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  57 93.0 25.81 233.69 863 480


 95%|█████████▌| 58/61 [43:24<01:22, 27.59s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  58 95.0 36.92 419.84 5844 3302


 97%|█████████▋| 59/61 [44:25<01:15, 37.68s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  59 97.0 35.9 358.56 11172 6381


 98%|█████████▊| 60/61 [45:11<00:40, 40.22s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  60 98.0 30.2 273.01 8388 4807


100%|██████████| 61/61 [45:52<00:00, 45.12s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  61 100.0 40.85 440.31 7335 4164





In [6]:
def _inverse_geometric_mean(values):
    """Return 1 / 10**mean(log10(values)), i.e. the reciprocal of the geometric mean."""
    return 1/(10**(np.log10(values).mean()))

# Per-document value lists taken from `out`: element 0 of every entry, and element 1
# of every entry where it is not None (reported under the "w/out space" labels).
# NOTE(review): entries are presumably per-token probabilities from model.stencil — confirm.
_doc_vals = [[p[0] for p in ps] for ps in out]
_doc_vals_nsp = [[p[1] for p in ps if p[1] is not None] for ps in out]

# micro: pool every value across documents; macro: average the per-document scores.
print('micro: ', round(_inverse_geometric_mean([v for vals in _doc_vals for v in vals]), 3),
      '; macro: ', round(np.mean([_inverse_geometric_mean(vals) for vals in _doc_vals]), 3),
      '; micro (w/out space): ', round(_inverse_geometric_mean([v for vals in _doc_vals_nsp for v in vals]), 3),
      '; macro (w/out space): ', round(np.mean([_inverse_geometric_mean(vals) for vals in _doc_vals_nsp]), 3))
# Reference figures kept from the original notebook; they do not match this run's output,
# so presumably they come from a different run or configuration:
# micro:  25.921 ; macro:  26.308 ; micro (w/out space):  243.428 ; macro (w/out space):  248.857

micro:  34.719 ; macro:  35.668 ; micro (w/out space):  346.333 ; macro (w/out space):  362.616


In [7]:
# Post-train on the WikiText-103 documents gathered in the configuration cell
# (ptdocs is an empty list when num_posttrain is 0).
model.post_train(
    ptdocs,
    update = update,
    fine_tune = fine_tune,
    bits = bits,
)
# Fine-tune on the gold training documents only when post-training data exist
# and fine-tuning is switched on.
if ptdocs and fine_tune:
    model.fine_tune(docs, covering = covering, streams = data_streams)

In [8]:
# Re-run the test-set evaluation only when post-training data were loaded;
# with an empty ptdocs, `out` keeps the results of the earlier stencil call.
if ptdocs:
    out = model.stencil(docs = tdocs, return_output = True, covering = tcovering)

In [9]:
# Same summary as after the first evaluation, printed only when post-training data were loaded.
if ptdocs:
    # Values pooled over all documents: element 0 of every entry, and element 1 where it is not None.
    pooled_all = [p[0] for ps in out for p in ps]
    pooled_nsp = [p[1] for ps in out for p in ps if p[1] is not None]

    # micro: reciprocal geometric mean over the pooled values.
    micro_all = 1/(10**(np.log10(pooled_all).mean()))
    micro_nsp = 1/(10**(np.log10(pooled_nsp).mean()))

    # macro: the same score computed per document, then averaged.
    macro_all = np.mean([1/(10**(np.log10([p[0] for p in ps]).mean())) for ps in out])
    macro_nsp = np.mean([1/(10**(np.log10([p[1] for p in ps if p[1] is not None]).mean())) for ps in out])

    print('micro: ', round(micro_all, 3),
          '; macro: ', round(macro_all, 3),
          '; micro (w/out space): ', round(micro_nsp, 3),
          '; macro (w/out space): ', round(macro_nsp, 3))

In [10]:
# Rebuild the base model with btype = 'nf' instead of the btype = 'df' given to LM(...)
# in the fitting cell; the cells below repeat generation and evaluation for comparison.
# NOTE(review): the meaning of the 'df' / 'nf' btype values is defined in src.IaMaN.base — not visible here.
model.build_base_model(fine_tune = fine_tune, btype = 'nf')

Encoding parameters...


100%|██████████| 13411524/13411524 [03:22<00:00, 66290.59it/s]


Building target vocabularies...


100%|██████████| 16/16 [00:00<00:00, 251.79it/s]


Pre-computing BOW probabilities... done.
Pre-computing wave amplitudes... done.
Stacking output vocabularies for decoders...


100%|██████████| 7/7 [00:00<00:00, 51781.53it/s]


Building dense output heads...


100%|██████████| 15/15 [00:01<00:00, 12.84it/s]


Building transition matrices for tag-sequence decoding...


100%|██████████| 6/6 [00:00<00:00, 11881.88it/s]

Done.
Model params, types, encoding size, contexts, vec dim, max sent, and % capacity used: 5080629 24042 50 1071 63 1647 31.874





In [11]:
# Generate a sample again with the same arguments and seed, now from the model rebuilt with btype = 'nf'.
model.generate(m = 50, seed = seed)

McCorduckMcCorduckMcCorduckHerff Olivier the original favor order of mark Timeeum G to produc tropical story to be approach T <unk> 's Sun held ,


```
document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  1 2.0 27.26 227.12 1696 982
```

In [12]:
# Re-evaluate the test documents with the rebuilt (btype = 'nf') model; this overwrites `out`.
# As with the first evaluation, the progress log below shows roughly 46 minutes for 61 documents.
out = model.stencil(docs = tdocs, return_output = True, covering = tcovering)

Tokenizing documents...


100%|██████████| 61/61 [00:52<00:00,  1.16it/s]


Pre-processing data...


100%|██████████| 61/61 [00:04<00:00, 14.06it/s]
100%|██████████| 61/61 [00:31<00:00,  1.91it/s]
  2%|▏         | 1/61 [00:08<08:35,  8.59s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  1 2.0 27.74 236.2 1696 982


  3%|▎         | 2/61 [01:00<33:46, 34.34s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  2 3.0 35.18 339.82 10440 5984


  5%|▍         | 3/61 [01:23<27:50, 28.80s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  3 5.0 39.7 450.77 4428 2487


  7%|▋         | 4/61 [02:42<46:21, 48.80s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  4 7.000000000000001 32.79 281.63 15789 9118


  8%|▊         | 5/61 [03:02<35:56, 38.51s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  5 8.0 30.6 319.95 4020 2263


 10%|▉         | 6/61 [03:29<31:28, 34.33s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  6 10.0 30.53 269.98 4902 2827


 11%|█▏        | 7/61 [05:33<57:24, 63.79s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  7 11.0 31.84 318.27 22655 12815


 13%|█▎        | 8/61 [05:58<45:27, 51.47s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  8 13.0 38.34 433.79 4554 2570


 15%|█▍        | 9/61 [08:09<1:05:59, 76.14s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  9 15.0 40.11 441.1 23719 13556


 16%|█▋        | 10/61 [08:28<49:53, 58.69s/it] 

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  10 16.0 32.98 331.37 3552 2028


 18%|█▊        | 11/61 [09:03<42:56, 51.53s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  11 18.0 35.99 378.46 6415 3669


 20%|█▉        | 12/61 [10:35<52:08, 63.84s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  12 20.0 31.4 264.96 16777 9687


 21%|██▏       | 13/61 [10:45<37:54, 47.39s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  13 21.0 32.93 325.73 1725 990


 23%|██▎       | 14/61 [11:09<31:29, 40.21s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  14 23.0 45.32 461.68 4299 2497


 25%|██▍       | 15/61 [11:13<22:31, 29.38s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  15 25.0 34.03 299.29 781 456


 26%|██▌       | 16/61 [11:32<19:39, 26.22s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  16 26.0 33.36 335.94 3447 1967


 28%|██▊       | 17/61 [12:48<30:13, 41.21s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  17 28.000000000000004 33.11 347.21 13866 7807


 30%|██▉       | 18/61 [12:59<23:04, 32.21s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  18 30.0 34.19 303.05 2054 1190


 31%|███       | 19/61 [13:44<25:07, 35.89s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  19 31.0 36.32 372.31 8114 4654


 33%|███▎      | 20/61 [14:24<25:33, 37.41s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  20 33.0 34.06 316.42 7457 4286


 34%|███▍      | 21/61 [15:16<27:49, 41.73s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  21 34.0 34.8 373.88 9450 5374


 36%|███▌      | 22/61 [15:31<21:52, 33.64s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  22 36.0 32.72 220.73 2676 1589


 38%|███▊      | 23/61 [16:12<22:40, 35.80s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  23 38.0 25.86 219.08 7461 4205


 39%|███▉      | 24/61 [18:18<38:49, 62.95s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  24 39.0 30.19 304.63 22928 12916


 41%|████      | 25/61 [18:39<30:12, 50.35s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  25 41.0 34.07 358.46 3794 2147


 43%|████▎     | 26/61 [19:58<34:19, 58.85s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  26 43.0 33.3 344.26 14315 8106


 44%|████▍     | 27/61 [21:58<43:48, 77.30s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  27 44.0 36.68 427.87 21750 12246


 46%|████▌     | 28/61 [22:14<32:19, 58.78s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  28 46.0 38.65 336.1 2749 1618


 48%|████▊     | 29/61 [22:16<22:22, 41.96s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  29 48.0 41.13 319.89 472 284


 49%|████▉     | 30/61 [23:08<23:09, 44.81s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  30 49.0 33.51 343.97 9314 5282


 51%|█████     | 31/61 [23:25<18:12, 36.41s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  31 51.0 41.16 402.1 3059 1750


 52%|█████▏    | 32/61 [24:35<22:29, 46.52s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  32 52.0 29.05 253.93 12786 7326


 54%|█████▍    | 33/61 [24:53<17:44, 38.01s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  33 54.0 37.88 435.71 3312 1864


 56%|█████▌    | 34/61 [26:42<26:39, 59.26s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  34 56.00000000000001 35.49 323.55 19852 11463


 57%|█████▋    | 35/61 [28:05<28:50, 66.57s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  35 56.99999999999999 38.2 433.07 15108 8498


 59%|█████▉    | 36/61 [28:24<21:46, 52.27s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  36 59.0 45.83 566.96 3396 1923


 61%|██████    | 37/61 [31:18<35:25, 88.57s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  37 61.0 39.61 418.62 31496 17961


 62%|██████▏   | 38/61 [32:07<29:23, 76.69s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  38 62.0 37.12 389.93 8795 5027


 64%|██████▍   | 39/61 [33:58<31:55, 87.07s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  39 64.0 31.84 289.91 20247 11558


 66%|██████▌   | 40/61 [35:59<34:04, 97.36s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  40 66.0 38.85 446.17 22055 12480


 67%|██████▋   | 41/61 [36:26<25:22, 76.11s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  41 67.0 34.75 263.14 4783 2821


 69%|██████▉   | 42/61 [36:37<17:54, 56.58s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  42 69.0 41.53 486.39 1993 1133


 70%|███████   | 43/61 [37:37<17:20, 57.80s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  43 70.0 36.62 331.88 10623 6136


 72%|███████▏  | 44/61 [38:01<13:30, 47.67s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  44 72.0 33.25 340.07 4392 2463


 74%|███████▍  | 45/61 [38:13<09:48, 36.81s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  45 74.0 44.48 435.05 2089 1209


 75%|███████▌  | 46/61 [38:27<07:29, 29.94s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  46 75.0 39.75 473.54 2534 1431


 77%|███████▋  | 47/61 [39:38<09:53, 42.37s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  47 77.0 28.8 282.18 13007 7301


 79%|███████▊  | 48/61 [39:50<07:12, 33.30s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  48 79.0 24.09 217.42 2211 1239


 80%|████████  | 49/61 [40:06<05:35, 28.00s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  49 80.0 41.89 485.75 2853 1617


 82%|████████▏ | 50/61 [40:36<05:13, 28.46s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  50 82.0 35.15 390.93 5383 3050


 84%|████████▎ | 51/61 [41:06<04:49, 28.93s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  51 84.0 40.39 464.17 5455 3126


 85%|████████▌ | 52/61 [41:16<03:30, 23.38s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  52 85.0 68.64 1095.57 1899 1084


 87%|████████▋ | 53/61 [41:26<02:35, 19.47s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  53 87.0 34.63 343.07 1888 1078


 89%|████████▊ | 54/61 [41:37<01:57, 16.83s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  54 89.0 38.11 378.01 1952 1127


 90%|█████████ | 55/61 [41:46<01:26, 14.39s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  55 90.0 31.29 317.75 1588 903


 92%|█████████▏| 56/61 [43:07<02:51, 34.39s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  56 92.0 34.7 329.56 14749 8501


 93%|█████████▎| 57/61 [43:11<01:41, 25.49s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  57 93.0 26.25 240.75 863 480


 95%|█████████▌| 58/61 [43:43<01:22, 27.44s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  58 95.0 35.24 391.06 5844 3302


 97%|█████████▋| 59/61 [44:45<01:15, 37.59s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  59 97.0 36.17 364.87 11172 6381


 98%|█████████▊| 60/61 [45:31<00:40, 40.11s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  60 98.0 30.74 283.11 8388 4807


100%|██████████| 61/61 [46:11<00:00, 45.43s/it]

document no., pct. complete, 1/<P>, 1/<P> (nsp), M, M (nsp):  61 100.0 41.47 451.31 7335 4164





In [13]:
# Summary for the btype = 'nf' evaluation. Each spec row names the labels to print, which
# element of every entry in `out` to read, and whether None values are skipped for it.
_summary_spec = (
    ('micro: ', '; macro: ', 0, False),
    ('; micro (w/out space): ', '; macro (w/out space): ', 1, True),
)
_summary_fields = []
for _micro_label, _macro_label, _col, _drop_none in _summary_spec:
    # Per-document value lists for this column.
    _per_doc = [[p[_col] for p in ps if not (_drop_none and p[_col] is None)] for ps in out]
    _pooled = [v for vals in _per_doc for v in vals]
    # micro: reciprocal geometric mean over all pooled values.
    _summary_fields.append(_micro_label)
    _summary_fields.append(round(1/(10**(np.log10(_pooled).mean())), 3))
    # macro: mean of the per-document reciprocal geometric means.
    _summary_fields.append(_macro_label)
    _summary_fields.append(round(np.mean([1/(10**(np.log10(vals).mean())) for vals in _per_doc]), 3))
print(*_summary_fields)

micro:  34.893 ; macro:  35.809 ; micro (w/out space):  350.773 ; macro (w/out space):  367.252
