# It's a Machine and Natural Language Model

In [1]:
from src.IaMaN.base import LM
from src.utils.data import load_ud
from collections import defaultdict
from collections import Counter
from tqdm import tqdm
import numpy as np
import os, re

seed = 691

print("Loading pre-training data...")
pretrain_path = '/data/newstweet/week_2019-40_article_texts/'
total_pretrain = len([pretrain_file for pretrain_file in os.listdir(pretrain_path) if re.search("^\d+.txt$", pretrain_file)])
num_pretrain = 5000 # total_pretrain

all_pretrain_files = [pretrain_file for pretrain_file in os.listdir(pretrain_path) if re.search("^\d+.txt$", pretrain_file)]
if num_pretrain:
    np.random.seed(seed)
    pretrain_files = np.random.choice(all_pretrain_files, size=num_pretrain, replace=False)
else:
    pretrain_files = np.array([])

ptdocs = [[[open(pretrain_path+pretrain_file).read()]] for pretrain_file in tqdm(pretrain_files)]

print("Loading gold-tagged UDs data...")
max_char = 200_000_000
load_set = 'GUM'; fine_tune = False; do_ife = True; update_ife = False; runners = 10
all_docs = load_ud("English", num_articles = 0, seed = 691, load_set = load_set, rebuild = True)
test_docs = [doc for doc in all_docs if 'test' in doc['id'] and len(doc['text']) <= max_char]# [:1]
train_docs = [doc for doc in all_docs if 'test' not in doc['id'] and len(doc['text']) <= max_char]# [:4]
nsamp = len(test_docs)
print('Avail. pre-train, total pre-train, Avail. gold, total gold-train, total test-gold: ', 
      total_pretrain, len(ptdocs), len(all_docs), len(train_docs), len(test_docs))

Loading pre-training data...


100%|██████████| 5000/5000 [00:01<00:00, 2758.39it/s]


Loading gold-tagged UDs data...
Avail. pre-train, total pre-train, Avail. gold, total gold-train, total test-gold:  14198 5000 150 132 18


In [2]:
docs = [["".join([row[1] for row in s]) for s in d['conllu']] for d in train_docs]
tdocs = [["".join([row[1] for row in s]) for s in d['conllu']] for d in test_docs]
covering = [[[row[1] for row in s] for s in d['conllu']] for d in train_docs]
covering_vocab = set([t for d in covering for s in d for t in s])

all_layers = {d_i: {'lem': [[row[2] for row in s] for s in d['conllu']], 
                    'pos': [[row[3] for row in s] for s in d['conllu']], 
                    'sup': [[(str(int(row[6]) - int(row[0])) if int(row[6]) else row[6]) for row in s] for s in d['conllu']], 
                    'dep': [[row[7] for row in s] for s in d['conllu']],
                    'sty': [[d['s_type'][s_i] for row in s] for s_i, s in enumerate(d['conllu'])]}
              for d_i, d in enumerate(train_docs)}

model = LM(covering_vocab = covering_vocab)
model.init(m = 10, noise = 0.001, positional = True, seed = seed, do_ife = do_ife, runners = runners)
model.fit(docs, f'{load_set}-{nsamp}', covering = covering, all_layers = all_layers)
model.pre_train(ptdocs, update_ife = update_ife)
if fine_tune:
    model.fine_tune(docs, covering = covering, all_layers = all_layers)

0it [00:00, ?it/s]


Training tokenizer...


Initializing: 100%|██████████| 6503/6503 [00:02<00:00, 2853.10it/s]
Fitting:  18%|█▊        | 18/100 [00:47<03:38,  2.66s/it]


Built a vocabulary of 10703 types
Tokenizing documents...


100%|██████████| 132/132 [00:20<00:00,  6.52it/s]


Counting documents and aggregating counts...


5787912it [07:03, 13652.06it/s]


Collecting metadata...


100%|██████████| 132/132 [00:10<00:00, 12.33it/s]


Aggregating metadata...


100%|██████████| 132/132 [00:00<00:00, 398.00it/s]


Encoding parameters...


100%|██████████| 5787912/5787912 [00:39<00:00, 145149.47it/s]


Computing marginal statistics...


100%|██████████| 3096777/3096777 [00:16<00:00, 190081.92it/s]


Building dense output heads...


100%|██████████| 11/11 [00:21<00:00,  1.94s/it]


Done.
Model params, types, encoding size, contexts, vec dim, max sent, and % capacity used: 5787912 10704 485 10185 12054 177 5.309
Processing pre-training documents...
Tokenizing documents...


100%|██████████| 5000/5000 [28:04<00:00,  2.97it/s]


Counting documents and aggregating counts...


8958836it [29:52, 4998.71it/s] 


Collecting metadata...


100%|██████████| 5000/5000 [00:29<00:00, 169.52it/s]


Aggregating metadata...


100%|██████████| 5000/5000 [00:01<00:00, 2548.07it/s]


Encoding parameters...


100%|██████████| 13442208/13442208 [01:14<00:00, 181547.59it/s]


Re-computing marginal statistics...


100%|██████████| 8564400/8564400 [00:26<00:00, 326886.45it/s]


Re-building dense output heads...


100%|██████████| 11/11 [00:27<00:00,  2.54s/it]

Done.
Model params, types, encoding size, contexts, vec dim, max sent, and % capacity used: 13442208 10704 485 10185 12054 177 12.33





__Currently__: ordering for the current fine tuning process:
1. train tokenizer and fit model to GUM
2. process NewsTweet documents to integrate sparse post-training statistics (requires mr implementation and updates to the vocabularies/indices)
3. update the ife and dense model, i.e., produce new statistics and dimensionalities
4. fine tune output heads to GUM, and _combine_ them with the dense model from (3), i.e., don't just replace as is current.

__Preliminarily__: this does seem to present performance benefits, but as is usual will require 'big data' statistics to become competitive. In particular, the (tokenization, least of all), counting, sorting, and aggregation of co-occurrence counts must all be distributed for the statistical resolution required to approach performance gains aking to more-advanced systems. Currently, a spark-based MR system is implemented for these (all but tokenization).

In [3]:
output = model.generate(m = 25, seed = seed, return_output = True)

 who have describeded by al health carefirem.”SinceStill


In [4]:
output[0][1].most_common(5)

[((' ', 'form'), 0.32389457628790824),
 (('e', 'form'), 0.07128479788751822),
 (('t', 'form'), 0.04682393813109191),
 (('s', 'form'), 0.04024055275105154),
 (('o', 'form'), 0.029840061056482355)]

In [5]:
model.generate(m = 25, seed = seed, top = 0.05)

 this season growth of the post."This"We'rethinknothappensurprisingrevolutionizingide5


In [6]:
model.generate(m = 25, seed = seed, top = 0.95)

 the first half of the season.S. I think that’s why


In [7]:
model.generate(m = 25, seed = seed, top = [0.05, 0.95])

 the season in the UnitedeStates."Our W said that would 


In [8]:
model.generate(prompt = " In the", m = 25, seed = seed)

 In the 2016, accordingsto the unique for document season?outHefive 


In [9]:
model.generate(prompt = " Hey, what are you thinking?", seed = seed, revise = [19,24])

 Hey, what are you know?


In [10]:
model.generate(prompt = " In the", m = 25, seed = seed, rhyme = 0.000000001)

 In the most of pregnancy season and impeachment communityearein NewandYorkattemptcar,


In [11]:
model.generate(prompt = " In the", m = 25, seed = seed, prose = 0.75)

 In the NFL, whichingwill be tor the nexterefluidene 


In [12]:
model.generate(prompt = " In the", m = 25, seed = seed, style = 0.25)

 In the U.S.Ftroops TEXSYRYININYINYTY#ASY


In [13]:
model.generate(prompt = " In the", m = 25, seed = seed, chunk = 0.9)

 In the 2016, accordingsto the unique for document season?outHefive 


In [14]:
model.generate(prompt = " In the", m = 25, seed = seed, punct = 0.999)

 In the 2016, accordingsto the campaign her two touchs, according 


In [15]:
model.generate(prompt = " In the", m = 50, seed = seed, 
               prose = 0.75, style = 0.25, chunk = 0.9, punct = 0.999)

 In the NFL, whichashed blood,"ashe add dirtail."For example temperature not athletic headst d by the UnitedoStates.


In [16]:
model.interpret([["".join(model._s)]], seed = seed)
for doc in model._documents:
    print('opening next doc:')
    for s in doc._sentences:
        print('opening next sent:')
        for t in s._tokens:
            print('opening next token:')
            print((str(t), t._lem, t._pos, t._sep, t._sup, t._dep, s._sty))

100%|██████████| 1/1 [00:04<00:00,  4.23s/it]

opening next doc:
opening next sent:
opening next token:
(' ', ' ', 'PUNCT', False, '1', 'punct', 'decl')
opening next token:
('In', 'in', 'ADP', False, '4', 'case', 'decl')
opening next token:
(' ', ' ', 'PUNCT', False, '1', 'punct', 'decl')
opening next token:
('the', 'the', 'DET', False, '2', 'det', 'decl')
opening next token:
(' ', ' ', 'PUNCT', False, '1', 'punct', 'decl')
opening next token:
('NFL', 'world', 'PROPN', False, '5', 'obl', 'decl')
opening next token:
(',', ',', 'PUNCT', False, '-1', 'punct', 'decl')
opening next token:
(' ', ' ', 'PUNCT', False, '1', 'punct', 'decl')
opening next token:
('whi', 'and', 'PROPN', False, '5', 'appos', 'decl')
opening next token:
('cha', ' ', 'PUNCT', False, '1', 'punct', 'decl')
opening next token:
('sh', 'she', 'PRON', False, '4', 'nsubj', 'decl')
opening next token:
('e', 'she', 'VERB', False, '3', 'nsubj', 'decl')
opening next token:
('d', 'die', 'VERB', False, '2', 'compound', 'decl')
opening next token:
(' ', ' ', 'PUNCT', False, '1




In [17]:
model.generate(docs = tdocs, m = 50, seed = seed)

Tokenizing documents..


100%|██████████| 18/18 [00:02<00:00,  8.80it/s]


Evaluating language model..


100%|██████████| 2664/2664 [07:23<00:00,  6.01it/s]


document no., pct. complete, 1/<P>, M:  1 6.0 101.64 2664


100%|██████████| 2540/2540 [06:57<00:00,  6.08it/s]


document no., pct. complete, 1/<P>, M:  2 11.0 1224.9 2540


100%|██████████| 1869/1869 [05:16<00:00,  5.91it/s]


document no., pct. complete, 1/<P>, M:  3 17.0 869.15 1869


100%|██████████| 2596/2596 [07:07<00:00,  6.07it/s]


document no., pct. complete, 1/<P>, M:  4 22.0 198.22 2596


100%|██████████| 2014/2014 [05:30<00:00,  6.09it/s]


document no., pct. complete, 1/<P>, M:  5 28.000000000000004 59.24 2014


100%|██████████| 2610/2610 [07:21<00:00,  5.92it/s]


document no., pct. complete, 1/<P>, M:  6 33.0 218.71 2610


100%|██████████| 2894/2894 [07:56<00:00,  6.08it/s]


document no., pct. complete, 1/<P>, M:  7 39.0 82.13 2894


100%|██████████| 2099/2099 [05:48<00:00,  6.03it/s]


document no., pct. complete, 1/<P>, M:  8 44.0 99.2 2099


100%|██████████| 2299/2299 [06:30<00:00,  5.89it/s]


document no., pct. complete, 1/<P>, M:  9 50.0 280.71 2299


100%|██████████| 3463/3463 [09:31<00:00,  6.06it/s]


document no., pct. complete, 1/<P>, M:  10 56.00000000000001 164.21 3463


100%|██████████| 1656/1656 [04:49<00:00,  5.72it/s]


document no., pct. complete, 1/<P>, M:  11 61.0 194.34 1656


100%|██████████| 2576/2576 [08:26<00:00,  5.09it/s]


document no., pct. complete, 1/<P>, M:  12 67.0 89.3 2576


100%|██████████| 2726/2726 [07:50<00:00,  5.80it/s]


document no., pct. complete, 1/<P>, M:  13 72.0 147.42 2726


100%|██████████| 2154/2154 [06:19<00:00,  5.67it/s]


document no., pct. complete, 1/<P>, M:  14 78.0 38.17 2154


100%|██████████| 2945/2945 [10:30<00:00,  4.67it/s]


document no., pct. complete, 1/<P>, M:  15 83.0 117.08 2945


100%|██████████| 1804/1804 [12:28<00:00,  2.41it/s]


document no., pct. complete, 1/<P>, M:  16 89.0 142.28 1804


100%|██████████| 1683/1683 [06:10<00:00,  4.54it/s]


document no., pct. complete, 1/<P>, M:  17 94.0 100.67 1683


100%|██████████| 2749/2749 [08:16<00:00,  5.54it/s]

document no., pct. complete, 1/<P>, M:  18 100.0 78.67 2749





The below models are summarized as:
```
- Model params, types, encoding size, contexts, vec dim, max sent, and % capacity used: 5787912 10704 485 10185 12054 177 5.309
- Model params, types, encoding size, contexts, vec dim, max sent, and % capacity used: 13442208 10704 485 10185 12054 177 12.33
- Model params, types, encoding size, contexts, vec dim, max sent, and % capacity used: 15964670 10704 485 10185 12054 177 14.644
- Model params, types, encoding size, contexts, vec dim, max sent, and % capacity used: 20152225 10704 485 10185 12054 177 18.485
```

- With IFE, trained on 132 GUM train documents, then stenciled on all 18 GUM test documents:

```
Tokenizing documents..
100%|██████████| 18/18 [00:02<00:00,  8.81it/s]
Evaluating language model..
100%|██████████| 2664/2664 [04:52<00:00,  9.12it/s]
document no., pct. complete, 1/<P>, M:  1 6.0 435.18 2664
100%|██████████| 2540/2540 [04:27<00:00,  9.50it/s]
document no., pct. complete, 1/<P>, M:  2 11.0 2654.38 2540
100%|██████████| 1869/1869 [03:15<00:00,  9.55it/s]
document no., pct. complete, 1/<P>, M:  3 17.0 1822.74 1869
100%|██████████| 2596/2596 [04:33<00:00,  9.48it/s]
document no., pct. complete, 1/<P>, M:  4 22.0 415.06 2596
100%|██████████| 2014/2014 [03:43<00:00,  9.01it/s]
document no., pct. complete, 1/<P>, M:  5 28.000000000000004 81.67 2014
100%|██████████| 2610/2610 [04:36<00:00,  9.43it/s]
document no., pct. complete, 1/<P>, M:  6 33.0 579.85 2610
100%|██████████| 2894/2894 [05:05<00:00,  9.46it/s]
document no., pct. complete, 1/<P>, M:  7 39.0 180.72 2894
100%|██████████| 2099/2099 [03:41<00:00,  9.50it/s]
document no., pct. complete, 1/<P>, M:  8 44.0 369.72 2099
100%|██████████| 2299/2299 [04:03<00:00,  9.43it/s]
document no., pct. complete, 1/<P>, M:  9 50.0 825.7 2299
100%|██████████| 3463/3463 [06:19<00:00,  9.14it/s]
document no., pct. complete, 1/<P>, M:  10 56.00000000000001 745.92 3463
100%|██████████| 1656/1656 [02:55<00:00,  9.42it/s]
document no., pct. complete, 1/<P>, M:  11 61.0 649.48 1656
100%|██████████| 2576/2576 [04:33<00:00,  9.42it/s]
document no., pct. complete, 1/<P>, M:  12 67.0 263.56 2576
100%|██████████| 2726/2726 [04:49<00:00,  9.42it/s]
document no., pct. complete, 1/<P>, M:  13 72.0 285.6 2726
100%|██████████| 2154/2154 [04:01<00:00,  8.91it/s]
document no., pct. complete, 1/<P>, M:  14 78.0 75.1 2154
100%|██████████| 2945/2945 [05:13<00:00,  9.40it/s]
document no., pct. complete, 1/<P>, M:  15 83.0 328.98 2945
100%|██████████| 1804/1804 [03:12<00:00,  9.37it/s]
document no., pct. complete, 1/<P>, M:  16 89.0 324.12 1804
100%|██████████| 1683/1683 [02:59<00:00,  9.40it/s]
document no., pct. complete, 1/<P>, M:  17 94.0 251.48 1683
100%|██████████| 2749/2749 [05:04<00:00,  9.02it/s]
document no., pct. complete, 1/<P>, M:  18 100.0 150.55 2749
```

- With IFE, trained on 132 GUM train documents, then 5,000 NewsTweet Pre/Post-training articles, then stenciled on all 18 GUM test documents:

```
Tokenizing documents..
100%|██████████| 18/18 [00:02<00:00,  8.80it/s]
Evaluating language model..
100%|██████████| 2664/2664 [07:23<00:00,  6.01it/s]
document no., pct. complete, 1/<P>, M:  1 6.0 101.64 2664
100%|██████████| 2540/2540 [06:57<00:00,  6.08it/s]
document no., pct. complete, 1/<P>, M:  2 11.0 1224.9 2540
100%|██████████| 1869/1869 [05:16<00:00,  5.91it/s]
document no., pct. complete, 1/<P>, M:  3 17.0 869.15 1869
100%|██████████| 2596/2596 [07:07<00:00,  6.07it/s]
document no., pct. complete, 1/<P>, M:  4 22.0 198.22 2596
100%|██████████| 2014/2014 [05:30<00:00,  6.09it/s]
document no., pct. complete, 1/<P>, M:  5 28.000000000000004 59.24 2014
100%|██████████| 2610/2610 [07:21<00:00,  5.92it/s]
document no., pct. complete, 1/<P>, M:  6 33.0 218.71 2610
100%|██████████| 2894/2894 [07:56<00:00,  6.08it/s]
document no., pct. complete, 1/<P>, M:  7 39.0 82.13 2894
100%|██████████| 2099/2099 [05:48<00:00,  6.03it/s]
document no., pct. complete, 1/<P>, M:  8 44.0 99.2 2099
100%|██████████| 2299/2299 [06:30<00:00,  5.89it/s]
document no., pct. complete, 1/<P>, M:  9 50.0 280.71 2299
100%|██████████| 3463/3463 [09:31<00:00,  6.06it/s]
document no., pct. complete, 1/<P>, M:  10 56.00000000000001 164.21 3463
100%|██████████| 1656/1656 [04:49<00:00,  5.72it/s]
document no., pct. complete, 1/<P>, M:  11 61.0 194.34 1656
100%|██████████| 2576/2576 [08:26<00:00,  5.09it/s]
document no., pct. complete, 1/<P>, M:  12 67.0 89.3 2576
100%|██████████| 2726/2726 [07:50<00:00,  5.80it/s]
document no., pct. complete, 1/<P>, M:  13 72.0 147.42 2726
100%|██████████| 2154/2154 [06:19<00:00,  5.67it/s]
document no., pct. complete, 1/<P>, M:  14 78.0 38.17 2154
100%|██████████| 2945/2945 [10:30<00:00,  4.67it/s]
document no., pct. complete, 1/<P>, M:  15 83.0 117.08 2945
100%|██████████| 1804/1804 [12:28<00:00,  2.41it/s]
document no., pct. complete, 1/<P>, M:  16 89.0 142.28 1804
100%|██████████| 1683/1683 [06:10<00:00,  4.54it/s]
document no., pct. complete, 1/<P>, M:  17 94.0 100.67 1683
100%|██████████| 2749/2749 [08:16<00:00,  5.54it/s]
document no., pct. complete, 1/<P>, M:  18 100.0 78.67 2749
```

- With IFE, trained on 132 GUM train documents, then 10,000 NewsTweet Pre/Post-training articles, then stenciled on all 18 GUM test documents:

```
Tokenizing documents..
100%|██████████| 18/18 [00:02<00:00,  8.85it/s]
Evaluating language model..
100%|██████████| 2664/2664 [05:43<00:00,  7.75it/s]
document no., pct. complete, 1/<P>, M:  1 6.0 153.41 2664
100%|██████████| 2540/2540 [05:41<00:00,  7.44it/s]
document no., pct. complete, 1/<P>, M:  2 11.0 1324.68 2540
100%|██████████| 1869/1869 [04:02<00:00,  7.70it/s]
document no., pct. complete, 1/<P>, M:  3 17.0 977.94 1869
100%|██████████| 2596/2596 [05:36<00:00,  7.70it/s]
document no., pct. complete, 1/<P>, M:  4 22.0 224.7 2596
100%|██████████| 2014/2014 [04:34<00:00,  7.34it/s]
document no., pct. complete, 1/<P>, M:  5 28.000000000000004 60.83 2014
100%|██████████| 2610/2610 [05:38<00:00,  7.71it/s]
document no., pct. complete, 1/<P>, M:  6 33.0 270.45 2610
100%|██████████| 2894/2894 [06:15<00:00,  7.70it/s]
document no., pct. complete, 1/<P>, M:  7 39.0 98.84 2894
100%|██████████| 2099/2099 [04:33<00:00,  7.67it/s]
document no., pct. complete, 1/<P>, M:  8 44.0 121.84 2099
100%|██████████| 2299/2299 [05:13<00:00,  7.33it/s]
document no., pct. complete, 1/<P>, M:  9 50.0 343.57 2299
100%|██████████| 3463/3463 [07:31<00:00,  7.67it/s]
document no., pct. complete, 1/<P>, M:  10 56.00000000000001 239.94 3463
100%|██████████| 1656/1656 [03:36<00:00,  7.66it/s]
document no., pct. complete, 1/<P>, M:  11 61.0 251.67 1656
100%|██████████| 2576/2576 [05:48<00:00,  7.39it/s]
document no., pct. complete, 1/<P>, M:  12 67.0 114.0 2576
100%|██████████| 2726/2726 [05:56<00:00,  7.64it/s]
document no., pct. complete, 1/<P>, M:  13 72.0 171.13 2726
100%|██████████| 2154/2154 [04:40<00:00,  7.69it/s]
document no., pct. complete, 1/<P>, M:  14 78.0 44.69 2154
100%|██████████| 2945/2945 [06:38<00:00,  7.39it/s]
document no., pct. complete, 1/<P>, M:  15 83.0 151.02 2945
100%|██████████| 1804/1804 [03:55<00:00,  7.65it/s]
document no., pct. complete, 1/<P>, M:  16 89.0 160.74 1804
100%|██████████| 1683/1683 [03:40<00:00,  7.63it/s]
document no., pct. complete, 1/<P>, M:  17 94.0 118.86 1683
100%|██████████| 2749/2749 [05:59<00:00,  7.65it/s]
document no., pct. complete, 1/<P>, M:  18 100.0 90.08 2749
```

- With IFE, trained on 132 GUM train documents, then 14,198 (all) NewsTweet Pre/Post-training articles, then stenciled on all 18 GUM test documents:

```
Tokenizing documents..
100%|██████████| 18/18 [00:02<00:00,  8.94it/s]
Evaluating language model..
100%|██████████| 2664/2664 [07:51<00:00,  5.65it/s]
document no., pct. complete, 1/<P>, M:  1 6.0 87.5 2664
100%|██████████| 2540/2540 [08:46<00:00,  4.82it/s]
document no., pct. complete, 1/<P>, M:  2 11.0 1246.43 2540
100%|██████████| 1869/1869 [06:26<00:00,  4.84it/s]
document no., pct. complete, 1/<P>, M:  3 17.0 907.98 1869
100%|██████████| 2596/2596 [09:09<00:00,  4.72it/s]
document no., pct. complete, 1/<P>, M:  4 22.0 185.52 2596
100%|██████████| 2014/2014 [06:57<00:00,  4.82it/s]
document no., pct. complete, 1/<P>, M:  5 28.000000000000004 60.31 2014
100%|██████████| 2610/2610 [09:11<00:00,  4.73it/s]
document no., pct. complete, 1/<P>, M:  6 33.0 211.75 2610
100%|██████████| 2894/2894 [10:00<00:00,  4.82it/s]
document no., pct. complete, 1/<P>, M:  7 39.0 77.63 2894
100%|██████████| 2099/2099 [07:28<00:00,  4.68it/s]
document no., pct. complete, 1/<P>, M:  8 44.0 94.97 2099
100%|██████████| 2299/2299 [07:57<00:00,  4.81it/s]
document no., pct. complete, 1/<P>, M:  9 50.0 275.11 2299
100%|██████████| 3463/3463 [12:12<00:00,  4.73it/s]
document no., pct. complete, 1/<P>, M:  10 56.00000000000001 153.55 3463
100%|██████████| 1656/1656 [05:43<00:00,  4.81it/s]
document no., pct. complete, 1/<P>, M:  11 61.0 181.12 1656
100%|██████████| 2576/2576 [08:58<00:00,  4.78it/s]
document no., pct. complete, 1/<P>, M:  12 67.0 79.46 2576
100%|██████████| 2726/2726 [09:38<00:00,  4.71it/s]
document no., pct. complete, 1/<P>, M:  13 72.0 134.32 2726
100%|██████████| 2154/2154 [07:31<00:00,  4.77it/s]
document no., pct. complete, 1/<P>, M:  14 78.0 36.48 2154
100%|██████████| 2945/2945 [10:24<00:00,  4.71it/s]
document no., pct. complete, 1/<P>, M:  15 83.0 105.72 2945
100%|██████████| 1804/1804 [06:16<00:00,  4.79it/s]
document no., pct. complete, 1/<P>, M:  16 89.0 138.55 1804
100%|██████████| 1683/1683 [05:59<00:00,  4.68it/s]
document no., pct. complete, 1/<P>, M:  17 94.0 95.61 1683
100%|██████████| 2749/2749 [09:35<00:00,  4.78it/s]
document no., pct. complete, 1/<P>, M:  18 100.0 76.1 2749
```

In [18]:
model.generate(prompt = " Hey, what are you thinking? ", seed = seed, m = 25)

 Hey, what are you thinking? I thought y'ar know aboutYouwhatou'rewantthinking?weepingrankings600?theoriesAM


In [19]:
model.generate(prompt = " Hey, what are you thinking? I have beon waiting since,”2019 More L f-c it ", seed = seed, 
               revise = [35, 65], prose = 0.75, chunk = 0.15)

 Hey, what are you thinking? I have fon cou, who has L f-c it 


In [20]:
model.generate(prompt = " A portable surgical visualization kit comprising: ", seed = seed, m = 25)

 A portable surgical visualization kit comprising: h Jerryllesein her e f,"It’s


In [21]:
model.generate(prompt = " A portable surgical visualization kit comprising: The Yankee Press ", seed = seed, 
               revise = [12, 19], prose = 0.75, chunk = 0.15)

 A portable ton visualization kit comprising: The Yankee Press 
