In [1]:
import torch
import re
from tqdm.notebook import tqdm
import importlib
from pprint import pprint

In [2]:
import sys
sys.path.append("../../")
import autoregressive

In [3]:
# Read the whole book into memory as one string.
# The encoding is passed explicitly: the file is UTF-8 (its header says so and
# it contains typographic quotes such as ’), while open() without an encoding
# falls back to the platform's locale encoding and can mis-decode or fail.
with open("alice_in_wonderland.txt", encoding="utf-8") as file:
    text = file.read()
len(text)

164046

In [4]:
# Peek at the first 5000 characters of the file. The book starts with the
# Project Gutenberg header (licence notice, title, author, release date).
text[:5000]

'The Project Gutenberg eBook of Alice’s Adventures in Wonderland, by Lewis Carroll\n\nThis eBook is for the use of anyone anywhere in the United States and\nmost other parts of the world at no cost and with almost no restrictions\nwhatsoever. You may copy it, give it away or re-use it under the terms\nof the Project Gutenberg License included with this eBook or online at\nwww.gutenberg.org. If you are not located in the United States, you\nwill have to check the laws of the country where you are located before\nusing this eBook.\n\nTitle: Alice’s Adventures in Wonderland\n\nAuthor: Lewis Carroll\n\nRelease Date: January, 1991 [eBook #11]\n[Most recently updated: October 12, 2020]\n\nLanguage: English\n\nCharacter set encoding: UTF-8\n\nProduced by: Arthur DiBianca and David Widger\n\n*** START OF THE PROJECT GUTENBERG EBOOK ALICE’S ADVENTURES IN WONDERLAND ***\n\n[Illustration]\n\n\n\n\nAlice’s Adventures in Wonderland\n\nby Lewis Carroll\n\nTHE MILLENNIUM FULCRUM EDITION 3.0\n\nConten

In [5]:
# Tokenise the raw text: each maximal run of ASCII letters is one token and
# every other single character (space, newline, punctuation, digit, ...) is a
# token of its own. Case is preserved.
# Variants that were tried and are left disabled:
#   lower-case everything first -> re.findall(r"[a-zA-Z]+|[^a-zA-Z]", text.lower())
#   drop whitespace tokens      -> re.findall(r"[a-zA-Z]+|[^a-zA-Z\s]", text)
token_pattern = re.compile(r"[a-zA-Z]+|[^a-zA-Z]")
tokens = token_pattern.findall(text)
print(len(tokens))
tokens[:40]

71004


['The',
 ' ',
 'Project',
 ' ',
 'Gutenberg',
 ' ',
 'eBook',
 ' ',
 'of',
 ' ',
 'Alice',
 '’',
 's',
 ' ',
 'Adventures',
 ' ',
 'in',
 ' ',
 'Wonderland',
 ',',
 ' ',
 'by',
 ' ',
 'Lewis',
 ' ',
 'Carroll',
 '\n',
 '\n',
 'This',
 ' ',
 'eBook',
 ' ',
 'is',
 ' ',
 'for',
 ' ',
 'the',
 ' ',
 'use',
 ' ']

In [6]:
# Window length: every example is a run of `steps` consecutive tokens.
steps = 15

# Slide the window over the corpus one token at a time; the start positions
# are range(len(tokens) - steps), and tqdm shows progress while building.
X = [
    tokens[start:start + steps]
    for start in tqdm(range(len(tokens) - steps))
]

  0%|          | 0/70989 [00:00<?, ?it/s]

In [7]:
# The vocabulary is the set of distinct tokens that occur in the corpus.
# Disabled alternatives kept for reference:
#   from collections import Counter
#   vocabulary = Counter(tokens)
#
#   from nltk.lm import Vocabulary
#   vocabulary = Vocabulary(tokens)
vocabulary = {token for token in tokens}

len(vocabulary)

3421

In [8]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

device

device(type='cuda')

# fit

In [9]:
# Build a fresh LSTM language model, train it on the token windows, and
# persist both the weights and the architecture for the cells that follow.
importlib.reload(autoregressive)  # pick up local edits to the module without restarting the kernel

# Seed PyTorch's global RNG before the model is created so that weight
# initialisation (and any torch-based shuffling inside fit) starts from the
# same state on every run.
# NOTE(review): GPU kernels may still introduce small run-to-run differences.
SEED = 0
torch.manual_seed(SEED)

net = autoregressive.LSTM(vocabulary)
# net = autoregressive.TransformerEncoder(vocabulary)
net.to(device)

# Encode the list of token windows as a tensor and report its shape.
encoded = net.text2tensor(X)
print("encoded", encoded.shape)

# fit returns the per-epoch performance log.
# NOTE(review): save_path is presumably where the weights are written; later
# cells read them back with torch.load("model.pt").
performance = net.fit(encoded, save_path = "model.pt")
net.save_architecture("model.arch")

Model: Autoregressive LSTM
Tokens in the vocabulary: 3,421
Embedding dimension: 32
Hidden units: 128
Layers: 2
Dropout: 0.0
Trainable parameters: 765,982

encoded torch.Size([70989, 15])


  0%|          | 0/5 [00:00<?, ?it/s]

Training started
Epochs: 5
Learning rate: 0.0001
Weight decay: 0
Epoch | Train                 | Minutes
      | Loss     | Error Rate |
---------------------------------------


  0%|          | 0/710 [00:00<?, ?it/s]

    1 |   4.9840 |     64.702 |     0.2


  0%|          | 0/710 [00:00<?, ?it/s]

    2 |   4.3385 |     61.875 |     0.5


  0%|          | 0/710 [00:00<?, ?it/s]

    3 |   4.2324 |     61.592 |     0.7


  0%|          | 0/710 [00:00<?, ?it/s]

    4 |   4.0424 |     61.437 |     1.0


  0%|          | 0/710 [00:00<?, ?it/s]

    5 |   3.9016 |     61.376 |     1.2


In [10]:
# Training log returned by net.fit: one row per epoch with the training loss,
# error rate and elapsed minutes, plus the hyperparameters and model
# description repeated on every row, which is useful for reproducibility.

performance

Unnamed: 0,epoch,train_loss,train_error_rate,training_minutes,learning_rate,weight_decay,model,embedding_dimension,hidden_units,layers,dropout,parameters
0,1,4.983996,64.702479,0.244938,0.0001,0,Autoregressive LSTM,32,128,2,0.0,765982
1,2,4.338495,61.87518,0.488944,0.0001,0,Autoregressive LSTM,32,128,2,0.0,765982
2,3,4.232397,61.591534,0.733566,0.0001,0,Autoregressive LSTM,32,128,2,0.0,765982
3,4,4.042367,61.437084,0.978441,0.0001,0,Autoregressive LSTM,32,128,2,0.0,765982
4,5,3.901643,61.37641,1.222836,0.0001,0,Autoregressive LSTM,32,128,2,0.0,765982


In [11]:
# Decode the first five encoded windows back to text. These five windows are
# the input passed to predict, greedy_search, sample and beam_search.

net.tensor2text(encoded[:5])

['The Project Gutenberg eBook of Alice’s Adventures',
 ' Project Gutenberg eBook of Alice’s Adventures ',
 'Project Gutenberg eBook of Alice’s Adventures in',
 ' Gutenberg eBook of Alice’s Adventures in ',
 'Gutenberg eBook of Alice’s Adventures in Wonderland']

# predict

In [12]:
# Rebuild the model from the saved architecture and weights, then generate
# continuations for the first five windows with net.predict.
importlib.reload(autoregressive)
net = autoregressive.load_architecture("model.arch")
# map_location=device lets a checkpoint that was saved from a GPU be loaded on
# a CPU-only machine, matching the CPU fallback used when `device` is chosen.
net.load_state_dict(torch.load("model.pt", map_location = device))
net.to(device)
idx, log_probabilities = net.predict(encoded[:5], main_progress_bar = False, progress_bar = 0)

net.tensor2text(idx)

Model: Autoregressive LSTM
Tokens in the vocabulary: 3,421
Embedding dimension: 32
Hidden units: 128
Layers: 2
Dropout: 0.0
Trainable parameters: 765,982



['The Project Gutenberg eBook of Alice’s Adventures                    ',
 ' Project Gutenberg eBook of Alice’s Adventures                     ',
 'Project Gutenberg eBook of Alice’s Adventures in                    ',
 ' Gutenberg eBook of Alice’s Adventures in                     ',
 'Gutenberg eBook of Alice’s Adventures in Wonderland                    ']

In [13]:
# Scores returned by net.predict for the five prompts; in the recorded run
# this is a 5x5 tensor with one row per prompt.
# NOTE(review): presumably each column is one candidate continuation, best
# first; the values equal those printed by beam_search. Confirm in autoregressive.
log_probabilities

tensor([[-22.3473, -23.5690, -23.6661, -23.8738, -24.1139],
        [-22.9469, -24.1696, -24.2632, -24.4664, -24.5354],
        [-22.3464, -23.5681, -23.6656, -23.8748, -24.1129],
        [-22.9464, -24.1691, -24.2631, -24.4677, -24.5352],
        [-22.3486, -23.5703, -23.6663, -23.8733, -24.1151]], device='cuda:0')

# greedy_search

In [14]:
# Rebuild the model from the saved architecture and weights, then decode the
# first five windows with net.greedy_search.
importlib.reload(autoregressive)
net = autoregressive.load_architecture("model.arch")
# map_location=device lets a checkpoint that was saved from a GPU be loaded on
# a CPU-only machine, matching the CPU fallback used when `device` is chosen.
net.load_state_dict(torch.load("model.pt", map_location = device))
net.to(device)
indexes, log_probabilities = net.greedy_search(encoded[:5], progress_bar = False)

# One score per prompt, followed by the decoded continuations.
print(log_probabilities)
net.tensor2text(indexes)

Model: Autoregressive LSTM
Tokens in the vocabulary: 3,421
Embedding dimension: 32
Hidden units: 128
Layers: 2
Dropout: 0.0
Trainable parameters: 765,982

tensor([-22.3473, -22.9469, -22.3464, -22.9464, -22.3486], device='cuda:0')


['The Project Gutenberg eBook of Alice’s Adventures                    ',
 ' Project Gutenberg eBook of Alice’s Adventures                     ',
 'Project Gutenberg eBook of Alice’s Adventures in                    ',
 ' Gutenberg eBook of Alice’s Adventures in                     ',
 'Gutenberg eBook of Alice’s Adventures in Wonderland                    ']

# sample

In [15]:
# Rebuild the model from the saved architecture and weights, then generate
# continuations for the first five windows with net.sample.
# NOTE(review): sampling is presumably random, so the text and scores are
# expected to differ between runs unless the RNG is seeded.
importlib.reload(autoregressive)
net = autoregressive.load_architecture("model.arch")
# map_location=device lets a checkpoint that was saved from a GPU be loaded on
# a CPU-only machine, matching the CPU fallback used when `device` is chosen.
net.load_state_dict(torch.load("model.pt", map_location = device))
net.to(device)
indexes, log_probabilities = net.sample(encoded[:5], progress_bar = False)

# One score per prompt, followed by the decoded continuations.
print(log_probabilities)
net.tensor2text(indexes)

Model: Autoregressive LSTM
Tokens in the vocabulary: 3,421
Embedding dimension: 32
Hidden units: 128
Layers: 2
Dropout: 0.0
Trainable parameters: 765,982

tensor([-65.1489, -68.3802, -49.9237, -70.7739, -70.6281], device='cuda:0')


['The Project Gutenberg eBook of Alice’s Adventures any\n   headit  and nearhave in””  ',
 ' Project Gutenberg eBook of Alice’s Adventures “ “ down  herself ,” therat itHowyou  ',
 'Project Gutenberg eBook of Alice’s Adventures in     was   she\n cried or Alice enough ',
 ' Gutenberg eBook of Alice’s Adventures in a “. leave*!,\nwith\n Queen, could.  ',
 'Gutenberg eBook of Alice’s Adventures in Wonderland as down\ntwo thought Rabbitthe she as lifenow  ']

# beam_search

In [16]:
# Rebuild the model from the saved architecture and weights, then decode the
# first five windows with net.beam_search.
importlib.reload(autoregressive)
net = autoregressive.load_architecture("model.arch")
# map_location=device lets a checkpoint that was saved from a GPU be loaded on
# a CPU-only machine, matching the CPU fallback used when `device` is chosen.
net.load_state_dict(torch.load("model.pt", map_location = device))
net.to(device)
indexes, log_probabilities = net.beam_search(encoded[:5], progress_bar = False)

# beam_search yields several candidates per prompt: print the score matrix,
# then decode each prompt's group of candidates separately.
print(log_probabilities)
pprint([net.tensor2text(t) for t in indexes])

Model: Autoregressive LSTM
Tokens in the vocabulary: 3,421
Embedding dimension: 32
Hidden units: 128
Layers: 2
Dropout: 0.0
Trainable parameters: 765,982

tensor([[-22.3473, -23.5690, -23.6661, -23.8738, -24.1139],
        [-22.9469, -24.1696, -24.2632, -24.4664, -24.5354],
        [-22.3464, -23.5681, -23.6656, -23.8748, -24.1129],
        [-22.9464, -24.1691, -24.2631, -24.4677, -24.5352],
        [-22.3486, -23.5703, -23.6663, -23.8733, -24.1151]], device='cuda:0')
[['The Project Gutenberg eBook of Alice’s Adventures                    ',
  'The Project Gutenberg eBook of Alice’s Adventures                  \n ',
  'The Project Gutenberg eBook of Alice’s Adventures the                  ',
  'The Project Gutenberg eBook of Alice’s Adventures   the                ',
  'The Project Gutenberg eBook of Alice’s Adventures                   \n'],
 [' Project Gutenberg eBook of Alice’s Adventures                     ',
  ' Project Gutenberg eBook of Alice’s Adventures                   \n '