In [1]:
import torch
import re
from tqdm.notebook import tqdm
import importlib
from pprint import pprint

In [2]:
import sys
# Make the directory two levels above the working directory importable so the
# local `autoregressive` module can be found (presumably it lives there — the
# path is relative, so this only works when the notebook runs from its own folder).
sys.path.append("../../")
import autoregressive

In [3]:
# Read the whole corpus into memory.
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, which mis-decodes the book's curly quotes on
# systems whose default is not UTF-8. Plain "utf-8" (rather than "utf-8-sig")
# is deliberate — it keeps the leading byte-order mark as a '\ufeff'
# character, so the text and its length are exactly what they were before.
with open("alice.txt", encoding = "utf-8") as file:
    text = file.read()
len(text)

164047

In [4]:
# Peek at the first 5000 characters of the corpus.
text[:5000]

'\ufeffThe Project Gutenberg eBook of Alice’s Adventures in Wonderland, by Lewis Carroll\n\nThis eBook is for the use of anyone anywhere in the United States and\nmost other parts of the world at no cost and with almost no restrictions\nwhatsoever. You may copy it, give it away or re-use it under the terms\nof the Project Gutenberg License included with this eBook or online at\nwww.gutenberg.org. If you are not located in the United States, you\nwill have to check the laws of the country where you are located before\nusing this eBook.\n\nTitle: Alice’s Adventures in Wonderland\n\nAuthor: Lewis Carroll\n\nRelease Date: January, 1991 [eBook #11]\n[Most recently updated: October 12, 2020]\n\nLanguage: English\n\nCharacter set encoding: UTF-8\n\nProduced by: Arthur DiBianca and David Widger\n\n*** START OF THE PROJECT GUTENBERG EBOOK ALICE’S ADVENTURES IN WONDERLAND ***\n\n[Illustration]\n\n\n\n\nAlice’s Adventures in Wonderland\n\nby Lewis Carroll\n\nTHE MILLENNIUM FULCRUM EDITION 3.0\n\n

In [5]:
# Split the corpus into tokens: each maximal run of ASCII letters is one token,
# and every other single character (space, newline, punctuation, digit, ...)
# is a token of its own.
#
# The text begins with a byte-order mark ('\ufeff'). It is stripped by value
# instead of discarding "whatever the first token is", so a text without a BOM
# does not silently lose its first word.
#
# Variants that were tried: lower-casing the text before tokenizing, and
# excluding whitespace from the single-character alternative ([^a-zA-Z\s]).
tokens = re.findall(r"[a-zA-Z]+|[^a-zA-Z]", text.lstrip("\ufeff"))
print(len(tokens))
tokens[:40]

71004


['The',
 ' ',
 'Project',
 ' ',
 'Gutenberg',
 ' ',
 'eBook',
 ' ',
 'of',
 ' ',
 'Alice',
 '’',
 's',
 ' ',
 'Adventures',
 ' ',
 'in',
 ' ',
 'Wonderland',
 ',',
 ' ',
 'by',
 ' ',
 'Lewis',
 ' ',
 'Carroll',
 '\n',
 '\n',
 'This',
 ' ',
 'eBook',
 ' ',
 'is',
 ' ',
 'for',
 ' ',
 'the',
 ' ',
 'use',
 ' ']

In [6]:
# Length of each training window, in tokens.
steps = 15

# Every run of `steps` consecutive tokens, one window per start offset (stride 1).
# NOTE(review): the range ends at len(tokens) - steps (exclusive), so the window
# that finishes on the very last token is not produced — confirm this is intended.
X = [tokens[start:start + steps] for start in tqdm(range(len(tokens) - steps))]

  0%|          | 0/70989 [00:00<?, ?it/s]

In [7]:
# The distinct tokens of the corpus; this collection is handed to the model constructor.
# NOTE(review): a set of strings has no stable iteration order across interpreter
# runs (string hashing is randomized per process). If `autoregressive` derives token
# ids from iteration order, ids could differ between sessions — confirm.
vocabulary = {token for token in tokens}

# Other vocabulary containers that were experimented with:
# from collections import Counter
# vocabulary = Counter(tokens)

# from nltk.lm import Vocabulary
# vocabulary = Vocabulary(tokens)

len(vocabulary)

3421

In [8]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

device

device(type='cuda')

# fit

In [9]:
# Pick up any edits made to the autoregressive module since it was first imported.
importlib.reload(autoregressive)    
# Alternative model, currently disabled:
# net = autoregressive.LSTM(vocabulary)
net = autoregressive.TransformerEncoder(vocabulary)
net.to(device)

# Convert the token windows into a tensor; the printed shape is
# (number of windows, steps).
encoded = net.text2tensor(X)
print("encoded", encoded.shape)

# Train the model. `performance` is the per-epoch training log displayed in the
# next cell; the file given as save_path is what later cells read back with
# torch.load + load_state_dict.
performance = net.fit(encoded, save_path = "word_model.pt")
# Store the model definition separately so it can be rebuilt with
# autoregressive.load_architecture.
net.save_architecture("word_model.arch")

Model: Autoregressive Transformer Encoder
Tokens in the vocabulary: 3,421
Max sequence length: 16
Embedding dimension: 32
Feedforward dimension: 128
Layers: 2
Attention heads: 2
Activation: relu
Dropout: 0.0
Trainable parameters: 261,184

encoded torch.Size([70989, 15])


  0%|          | 0/5 [00:00<?, ?it/s]

Training started
Epochs: 5
Learning rate: 0.0001
Weight decay: 0
Epoch | Train                 | Minutes
      | Loss     | Error Rate |
---------------------------------------


  0%|          | 0/710 [00:00<?, ?it/s]

    1 |   5.7163 |     65.718 |     0.2


  0%|          | 0/710 [00:00<?, ?it/s]

    2 |   4.0689 |     61.193 |     0.5


  0%|          | 0/710 [00:00<?, ?it/s]

    3 |   3.6327 |     58.894 |     0.7


  0%|          | 0/710 [00:00<?, ?it/s]

    4 |   3.4652 |     57.896 |     0.9


  0%|          | 0/710 [00:00<?, ?it/s]

    5 |   3.3597 |     56.890 |     1.2


In [10]:
# Per-epoch training log returned by net.fit: loss, error rate and elapsed minutes,
# alongside the hyperparameters and model configuration — useful for reproducibility.

performance

Unnamed: 0,epoch,train_loss,train_error_rate,training_minutes,learning_rate,weight_decay,model,max_sequence_length,embedding_dimension,feedforward_dimension,layers,attention_heads,activation,dropout,parameters
0,1,5.716263,65.718331,0.236554,0.0001,0,Autoregressive Transformer Encoder,16,32,128,2,2,relu,0.0,261184
1,2,4.068888,61.192579,0.471183,0.0001,0,Autoregressive Transformer Encoder,16,32,128,2,2,relu,0.0,261184
2,3,3.632661,58.894235,0.703376,0.0001,0,Autoregressive Transformer Encoder,16,32,128,2,2,relu,0.0,261184
3,4,3.465171,57.896294,0.935484,0.0001,0,Autoregressive Transformer Encoder,16,32,128,2,2,relu,0.0,261184
4,5,3.359733,56.890404,1.167737,0.0001,0,Autoregressive Transformer Encoder,16,32,128,2,2,relu,0.0,261184


In [11]:
# The first five encoded windows decoded back to text; these same five rows are
# the inputs given to the prediction and search cells below.

net.tensor2text(encoded[:5])

['The Project Gutenberg eBook of Alice’s Adventures',
 ' Project Gutenberg eBook of Alice’s Adventures ',
 'Project Gutenberg eBook of Alice’s Adventures in',
 ' Gutenberg eBook of Alice’s Adventures in ',
 'Gutenberg eBook of Alice’s Adventures in Wonderland']

# predict

In [12]:
# Reload the module and rebuild the trained model from disk, so this cell works
# from the saved artifacts rather than from the in-memory training result.
importlib.reload(autoregressive)
net = autoregressive.load_architecture("word_model.arch")
# map_location remaps the stored tensors onto the device chosen for this session;
# without it a checkpoint written from a GPU run cannot be loaded on a CPU-only machine.
net.load_state_dict(torch.load("word_model.pt", map_location = device))
net.to(device)
# Continue each of the first five windows; returns the generated token indexes and
# the associated log-probabilities (progress bars disabled).
idx, log_probabilities = net.predict(encoded[:5], main_progress_bar = False, progress_bar = 0)

net.tensor2text(idx)

Model: Autoregressive Transformer Encoder
Tokens in the vocabulary: 3,421
Max sequence length: 16
Embedding dimension: 32
Feedforward dimension: 128
Layers: 2
Attention heads: 2
Activation: relu
Dropout: 0.0
Trainable parameters: 261,184





['The Project Gutenberg eBook of Alice’s Adventures,” the the” the” the”\n\n\n\n“I ',
 ' Project Gutenberg eBook of Alice’s Adventures the” the”\n“I said the the” the”\n“',
 'Project Gutenberg eBook of Alice’s Adventures in,” the” the”\n“I said the”\n“I ',
 ' Gutenberg eBook of Alice’s Adventures in the the” the”\n“I said the” the”\n“',
 'Gutenberg eBook of Alice’s Adventures in Wonderland,” the”\n“I said the” the”\n“I ']

In [13]:
# Log-probabilities returned by net.predict in the previous cell: a 5 x 5 tensor
# for the 5 inputs.
# NOTE(review): each row looks like one input's candidate continuations, best first
# (values within a row are in descending order) — confirm against net.predict.
log_probabilities

tensor([[-30.1911, -31.1859, -31.7390, -31.7955, -31.9300],
        [-32.0567, -32.1917, -32.3972, -32.5625, -32.6471],
        [-29.4588, -30.3732, -30.5793, -31.2369, -31.2391],
        [-33.0850, -33.4887, -33.5350, -33.5808, -33.6737],
        [-30.1437, -30.6507, -31.1363, -31.2395, -31.4437]], device='cuda:0')

# greedy_search

In [14]:
# Reload the module and rebuild the trained model from the saved artifacts.
importlib.reload(autoregressive)
net = autoregressive.load_architecture("word_model.arch")
# map_location remaps the stored tensors onto the device chosen for this session;
# without it a checkpoint written from a GPU run cannot be loaded on a CPU-only machine.
net.load_state_dict(torch.load("word_model.pt", map_location = device))
net.to(device)
# Greedy decoding of the first five windows: returns the extended sequences and
# one log-probability per input.
indexes, log_probabilities = net.greedy_search(encoded[:5], progress_bar = False)

print(log_probabilities)
net.tensor2text(indexes)

Model: Autoregressive Transformer Encoder
Tokens in the vocabulary: 3,421
Max sequence length: 16
Embedding dimension: 32
Feedforward dimension: 128
Layers: 2
Attention heads: 2
Activation: relu
Dropout: 0.0
Trainable parameters: 261,184

tensor([-43.1224, -43.5087, -42.7997, -43.6141, -42.6298], device='cuda:0')


['The Project Gutenberg eBook of Alice’s Adventures the the the the the the the the the the',
 ' Project Gutenberg eBook of Alice’s Adventures the the the the the the the the the the ',
 'Project Gutenberg eBook of Alice’s Adventures in the the the the the the the the the the',
 ' Gutenberg eBook of Alice’s Adventures in the the the the the the the the the the ',
 'Gutenberg eBook of Alice’s Adventures in Wonderland the the the the the the the the the the']

# sample

In [15]:
# Reload the module and rebuild the trained model from the saved artifacts.
importlib.reload(autoregressive)
net = autoregressive.load_architecture("word_model.arch")
# map_location remaps the stored tensors onto the device chosen for this session;
# without it a checkpoint written from a GPU run cannot be loaded on a CPU-only machine.
net.load_state_dict(torch.load("word_model.pt", map_location = device))
net.to(device)
# Sampled continuations of the first five windows, with one log-probability per input.
# NOTE(review): sampling is presumably random, so the text below will differ from
# run to run unless a seed is fixed first — confirm how net.sample draws tokens.
indexes, log_probabilities = net.sample(encoded[:5], progress_bar = False)

print(log_probabilities)
net.tensor2text(indexes)

Model: Autoregressive Transformer Encoder
Tokens in the vocabulary: 3,421
Max sequence length: 16
Embedding dimension: 32
Feedforward dimension: 128
Layers: 2
Attention heads: 2
Activation: relu
Dropout: 0.0
Trainable parameters: 261,184

tensor([-83.7754, -65.8632, -86.6725, -80.6308, -75.7290], device='cuda:0')


['The Project Gutenberg eBook of Alice’s Adventures readat tothe:tell\ntea, being Queen with lessen.',
 ' Project Gutenberg eBook of Alice’s Adventures Alice so that\ngive,”\nspoke the put speaker I ',
 'Project Gutenberg eBook of Alice’s Adventures in—said“Theyedition\ntwinkle\nthought\nII bleeds\nstop said as ',
 ' Gutenberg eBook of Alice’s Adventures in likeyou,” ways\n _I. last givenpepper ““They',
 'Gutenberg eBook of Alice’s Adventures in Wonderland immediately“they seemed on the go was for employeeofpaused']

# beam_search

In [16]:
# Reload the module and rebuild the trained model from the saved artifacts.
importlib.reload(autoregressive)
net = autoregressive.load_architecture("word_model.arch")
# map_location remaps the stored tensors onto the device chosen for this session;
# without it a checkpoint written from a GPU run cannot be loaded on a CPU-only machine.
net.load_state_dict(torch.load("word_model.pt", map_location = device))
net.to(device)
# Beam search over the first five windows: `indexes` holds a group of candidate
# sequences per input, and `log_probabilities` has one row per input.
indexes, log_probabilities = net.beam_search(encoded[:5], progress_bar = False)

print(log_probabilities)
# Decode each input's group of candidates separately so the nesting is preserved.
pprint([net.tensor2text(t) for t in indexes])

Model: Autoregressive Transformer Encoder
Tokens in the vocabulary: 3,421
Max sequence length: 16
Embedding dimension: 32
Feedforward dimension: 128
Layers: 2
Attention heads: 2
Activation: relu
Dropout: 0.0
Trainable parameters: 261,184

tensor([[-30.1911, -31.1859, -31.7390, -31.7955, -31.9300],
        [-32.0567, -32.1917, -32.3972, -32.5625, -32.6471],
        [-29.4588, -30.3732, -30.5793, -31.2369, -31.2391],
        [-33.0850, -33.4887, -33.5350, -33.5808, -33.6737],
        [-30.1437, -30.6507, -31.1363, -31.2395, -31.4437]], device='cuda:0')
[['The Project Gutenberg eBook of Alice’s Adventures,” the the” the” the”\n'
  '\n'
  '\n'
  '\n'
  '“I ',
  'The Project Gutenberg eBook of Alice’s Adventures,” the the” the” the” said '
  'said said ',
  'The Project Gutenberg eBook of Alice’s Adventures,” the the” the” the”\n'
  '“I I the',
  'The Project Gutenberg eBook of Alice’s Adventures,” the the” the” the”\n'
  '“I said the',
  'The Project Gutenberg eBook of Alice’s Adventures,”