In [1]:
import torch
import re
from collections import Counter
from tqdm.notebook import tqdm

In [2]:
import sys
sys.path.append("../../")
import autoregressive

In [3]:
# Read the whole book into memory as a single string.
# The encoding is spelled out because open() otherwise falls back to the
# platform's locale encoding, while this file is UTF-8 (it contains curly
# quotes and starts with a byte-order mark). "utf-8" rather than "utf-8-sig"
# is used on purpose: the BOM stays in `text` as '\ufeff' and is removed
# later, when the text is tokenised.
with open("alice.txt", encoding="utf-8") as file:
    text = file.read()
len(text)

164047

In [4]:
text[:5000]

'\ufeffThe Project Gutenberg eBook of Alice’s Adventures in Wonderland, by Lewis Carroll\n\nThis eBook is for the use of anyone anywhere in the United States and\nmost other parts of the world at no cost and with almost no restrictions\nwhatsoever. You may copy it, give it away or re-use it under the terms\nof the Project Gutenberg License included with this eBook or online at\nwww.gutenberg.org. If you are not located in the United States, you\nwill have to check the laws of the country where you are located before\nusing this eBook.\n\nTitle: Alice’s Adventures in Wonderland\n\nAuthor: Lewis Carroll\n\nRelease Date: January, 1991 [eBook #11]\n[Most recently updated: October 12, 2020]\n\nLanguage: English\n\nCharacter set encoding: UTF-8\n\nProduced by: Arthur DiBianca and David Widger\n\n*** START OF THE PROJECT GUTENBERG EBOOK ALICE’S ADVENTURES IN WONDERLAND ***\n\n[Illustration]\n\n\n\n\nAlice’s Adventures in Wonderland\n\nby Lewis Carroll\n\nTHE MILLENNIUM FULCRUM EDITION 3.0\n\n

In [5]:
# Tokenise the lower-cased text: every run of ASCII letters becomes one token
# and every other character (space, newline, punctuation, digit, ...) becomes
# a single-character token, so "".join(tokens) gives back the lower-cased text.
#
# The file begins with a byte-order mark ('\ufeff'). It is stripped by value
# rather than by dropping the first token, so a file without a BOM does not
# silently lose its first word.
#
# Alternative considered: r"[a-zA-Z]+|[^a-zA-Z\s]" on the original-case text,
# which keeps capitalisation and discards whitespace tokens.
tokens = re.findall(r"[a-zA-Z]+|[^a-zA-Z]", text.lower().lstrip("\ufeff"))
print(len(tokens))
tokens[:40]

71004


['the',
 ' ',
 'project',
 ' ',
 'gutenberg',
 ' ',
 'ebook',
 ' ',
 'of',
 ' ',
 'alice',
 '’',
 's',
 ' ',
 'adventures',
 ' ',
 'in',
 ' ',
 'wonderland',
 ',',
 ' ',
 'by',
 ' ',
 'lewis',
 ' ',
 'carroll',
 '\n',
 '\n',
 'this',
 ' ',
 'ebook',
 ' ',
 'is',
 ' ',
 'for',
 ' ',
 'the',
 ' ',
 'use',
 ' ']

In [6]:
# Tally how many times each distinct token occurs; the number of keys is the
# vocabulary size, and most_common() lists the most frequent tokens first.
vocabulary = Counter()
vocabulary.update(tokens)
print(len(vocabulary))
vocabulary.most_common(20)

3043


[(' ', 27431),
 ('\n', 3761),
 (',', 2571),
 ('the', 1839),
 ('.', 1222),
 ('“', 1118),
 ('”', 1114),
 ('and', 942),
 ('to', 811),
 ('’', 710),
 ('a', 695),
 ('of', 638),
 ('it', 610),
 ('she', 553),
 ('i', 546),
 ('you', 486),
 ('said', 462),
 ('!', 452),
 ('_', 440),
 ('in', 435)]

In [7]:
# Give every distinct token an integer id, in sorted token order, and build
# the inverse lookup from id back to token. The vocabulary is sorted once and
# both mappings are derived from that same ordering.
sorted_words = sorted(vocabulary)
word2i = {word: index for index, word in enumerate(sorted_words)}
i2word = dict(enumerate(sorted_words))

In [8]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [9]:
# Window length: each training example is `steps` consecutive tokens.
steps = 20

# Encode every token once up front, rather than once for each window that
# happens to contain it.
token_ids = [word2i[w] for w in tokens]

# Slide a window of `steps` ids over the sequence with a stride of one token.
# NOTE(review): range(len(tokens) - steps) stops one start position short of
# the last full window; the count is kept unchanged so X.shape stays the same.
X = [token_ids[i:i + steps] for i in tqdm(range(len(tokens) - steps))]

# One row per window, placed directly on the selected device.
X = torch.tensor(X, device = device)
X.shape

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=70984.0), HTML(value='')))




torch.Size([70984, 20])

In [10]:
# Seed PyTorch's global RNG before the model is created so that repeated runs
# of this cell start from the same random state.
# NOTE(review): assumes autoregressive.LSTM takes its randomness from torch's
# global generator -- confirm. Some CUDA kernels may still be nondeterministic.
torch.manual_seed(0)

# Build the model from the token<->id mappings and move it to the chosen device.
model_1 = autoregressive.LSTM(word2i, i2word)
model_1.to(device)

# Train on the token windows for 50 epochs; whatever fit() returns is kept
# in `log`.
log = model_1.fit(X, epochs = 50, progress_bar = 1)

Model: Autoregressive LSTM
Tokens in the in vocabulary: 3,043
Tokens in the out vocabulary: 3,043
Embedding dimension: 32
Hidden units: 128
Layers: 2
Dropout: 0.0
Trainable parameters: 704,963



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=50.0), HTML(value='')))

Epoch | Train                 | Minutes
      | Loss     | Error Rate |
---------------------------------------
    1 |   3.7176 |     59.619 |     0.3
    2 |   3.1794 |     56.612 |     0.7
    3 |   2.9368 |     54.685 |     1.0
    4 |   2.7088 |     52.025 |     1.4
    5 |   2.5320 |     50.409 |     1.7
    6 |   2.3827 |     49.011 |     2.0
    7 |   2.2485 |     47.656 |     2.4
    8 |   2.1242 |     46.372 |     2.7
    9 |   2.0041 |     44.998 |     3.1
   10 |   1.8862 |     43.501 |     3.4
   11 |   1.7708 |     41.839 |     3.7
   12 |   1.6608 |     40.108 |     4.1
   13 |   1.5571 |     38.269 |     4.4
   14 |   1.4631 |     36.561 |     4.8
   15 |   1.3768 |     34.865 |     5.1
   16 |   1.2975 |     33.228 |     5.4
   17 |   1.2241 |     31.702 |     5.8
   18 |   1.1574 |     30.236 |     6.1
   19 |   1.0964 |     28.849 |     6.4
   20 |   1.0411 |     27.587 |     6.8
   21 |   0.9908 |     26.451 |     7.1
   22 |   0.9448 |     25.386 |     7.5
   23 | 

In [11]:
# Sanity check: decode the first training window back into text.
# Slicing with [:1] (instead of indexing with [0]) keeps the leading batch
# dimension, so a one-row 2-D tensor is passed.
first_window = X[:1]
model_1.tensor2text(first_window, separator = "")

['the project gutenberg ebook of alice’s adventures in wonderland,']

In [12]:
# Continue the first window with beam search, requesting 200 predictions.
# predict() returns a pair; the first element holds token ids and is decoded
# below, the second is kept in `probs_1`.
idx_1, probs_1 = model_1.predict(
    X[:1],
    method = "beam_search",
    max_predictions = 200,
)

# Show the size of the generated id tensor before decoding it to text.
print(idx_1.size())

model_1.tensor2text(idx_1, separator = "")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=199.0), HTML(value='')))



torch.Size([1, 220])


['the project gutenberg ebook of alice’s adventures in wonderland, by lewis carroll. it was all ridges and furrows; the balls were live at last, and managed to swallow a morsel of the sea, “and in that case i can go back by the whole thing, and longed to change the subject.” she said aloud. “i must be shutting up like a telescope! i think i could, if i only knew it was a little door about fifteen inches high: she tried the middle of her favourite word ‘moral,’ and the arm that was in the middle of her favourite word ‘']