### In this exercise, the full novel of moby dick is train using LSTMs to predict the next word given a sequence of words.

In [1]:
def read_file(filepath):
    with open(filepath) as f:
        str_text = f.read()
    return str_text

In [3]:
# read_file('melville-moby_dick.txt')

In [4]:
import spacy

In [5]:
nlp = spacy.load('en_core_web_sm')

In [6]:
nlp.max_length = 1198623

In [7]:
def separate_punctuation(doc_text):
    return [token.text.lower() for token in nlp(doc_text) if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

In [8]:
doc = read_file('melville-moby_dick.txt')
tokens = separate_punctuation(doc)

In [9]:
tokens

['chapter',
 '1',
 'loomings',
 'call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to',
 'interest',
 'me',
 'on',
 'shore',
 'i',
 'thought',
 'i',
 'would',
 'sail',
 'about',
 'a',
 'little',
 'and',
 'see',
 'the',
 'watery',
 'part',
 'of',
 'the',
 'world',
 'it',
 'is',
 'a',
 'way',
 'i',
 'have',
 'of',
 'driving',
 'off',
 'the',
 'spleen',
 'and',
 'regulating',
 'the',
 'circulation',
 'whenever',
 'i',
 'find',
 'myself',
 'growing',
 'grim',
 'about',
 'the',
 'mouth',
 'whenever',
 'it',
 'is',
 'a',
 'damp',
 'drizzly',
 'november',
 'in',
 'my',
 'soul',
 'whenever',
 'i',
 'find',
 'myself',
 'involuntarily',
 'pausing',
 'before',
 'coffin',
 'warehouses',
 'and',
 'bringing',
 'up',
 'the',
 'rear',
 'of',
 'every',
 'funeral',
 'i',
 'meet',
 'and',
 'especially',
 'whenever',
 'my',
 'hypos',
 'get',
 'such

In [10]:
len(tokens)

214708

## Creating sequence of tokens

In [11]:
# the sequence length plus 1 where 1 is the label
train_len = 25 + 1

text_sequence =[]

for i in range(train_len, len(tokens)):
    seq = tokens[i-train_len:i]
    text_sequence.append(seq)

In [12]:
len(text_sequence)

214682

In [13]:
len(text_sequence[0])

26

In [14]:
from keras.preprocessing.text import Tokenizer

In [15]:
tokenizer = Tokenizer()
# Updates internal vocabulary based on a list of texts.
tokenizer.fit_on_texts(text_sequence)
# Transforms each text in texts to a sequence of integers.
sequences = tokenizer.texts_to_sequences(text_sequence)

In [16]:
tokenizer.index_word

{1: 'the',
 2: 'of',
 3: 'and',
 4: 'a',
 5: 'to',
 6: 'in',
 7: 'that',
 8: 'his',
 9: 'it',
 10: 'i',
 11: 'he',
 12: 'but',
 13: "'s",
 14: 'as',
 15: 'with',
 16: 'is',
 17: 'was',
 18: 'for',
 19: 'all',
 20: 'this',
 21: 'at',
 22: 'not',
 23: 'by',
 24: 'whale',
 25: 'from',
 26: 'so',
 27: 'him',
 28: 'on',
 29: 'be',
 30: 'one',
 31: 'you',
 32: 'there',
 33: 'now',
 34: 'had',
 35: 'have',
 36: 'or',
 37: 'were',
 38: 'they',
 39: 'like',
 40: 'which',
 41: 'then',
 42: 'me',
 43: 'some',
 44: 'their',
 45: 'what',
 46: 'when',
 47: 'an',
 48: 'are',
 49: 'my',
 50: 'no',
 51: 'upon',
 52: 'out',
 53: 'man',
 54: 'into',
 55: 'ship',
 56: 'up',
 57: 'more',
 58: 'ahab',
 59: 'if',
 60: 'them',
 61: 'old',
 62: 'we',
 63: 'sea',
 64: 'would',
 65: "'",
 66: 'ye',
 67: 'do',
 68: 'other',
 69: 'been',
 70: 'over',
 71: 'these',
 72: 'will',
 73: 'though',
 74: 'only',
 75: 'its',
 76: 'down',
 77: 'such',
 78: 'who',
 79: 'yet',
 80: 'head',
 81: 'time',
 82: 'long',
 83: 'boat

In [17]:
sequences[0]

[158,
 9443,
 17526,
 402,
 42,
 1043,
 43,
 247,
 659,
 140,
 296,
 116,
 82,
 787,
 347,
 113,
 36,
 50,
 1788,
 6,
 49,
 3028,
 3,
 218,
 442,
 5]

Let's see how each word corresponds to a unique index

In [18]:
for i in sequences[0]:
    print(f'{i} : {tokenizer.index_word[i]}')

158 : chapter
9443 : 1
17526 : loomings
402 : call
42 : me
1043 : ishmael
43 : some
247 : years
659 : ago
140 : never
296 : mind
116 : how
82 : long
787 : precisely
347 : having
113 : little
36 : or
50 : no
1788 : money
6 : in
49 : my
3028 : purse
3 : and
218 : nothing
442 : particular
5 : to


In [19]:
tokenizer.word_counts

OrderedDict([('chapter', 4447),
             ('1', 28),
             ('loomings', 3),
             ('call', 1382),
             ('me', 16095),
             ('ishmael', 500),
             ('some', 15789),
             ('years', 2400),
             ('ago', 815),
             ('never', 5262),
             ('mind', 2039),
             ('how', 6330),
             ('long', 8567),
             ('precisely', 690),
             ('having', 1679),
             ('little', 6412),
             ('or', 17879),
             ('no', 14916),
             ('money', 305),
             ('in', 105799),
             ('my', 15231),
             ('purse', 178),
             ('and', 164029),
             ('nothing', 2936),
             ('particular', 1273),
             ('to', 117832),
             ('interest', 442),
             ('on', 26910),
             ('shore', 572),
             ('i', 53430),
             ('thought', 3874),
             ('would', 11232),
             ('sail', 2522),
             ('about', 

In [20]:
len(tokenizer.word_counts)

17526

In [21]:
vocab_size = len(tokenizer.word_counts)

In [22]:
type(sequences)

list

In [23]:
import numpy as np

In [24]:
sequences = np.array(sequences)

In [25]:
import keras
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding

In [26]:
def run_model(vocab_size, seq_len):
    model = Sequential()

    #Embedding - Turns positive integers (indexes) into dense vectors of fixed size. This layer can only be used as the first layer in a model
    model.add(Embedding(vocab_size, 25, input_length=seq_len))
    model.add(LSTM(150, return_sequences=True))
    model.add(LSTM(150))
    model.add(Dense(150, activation='relu'))

    model.add(Dense(vocab_size, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.summary()
    
    return model

In [27]:
from tensorflow.keras.utils import to_categorical

In [28]:
len(sequences[0])

26

In [29]:
sequences

array([[  158,  9443, 17526, ...,   218,   442,     5],
       [ 9443, 17526,   402, ...,   442,     5,  1165],
       [17526,   402,    42, ...,     5,  1165,    42],
       ...,
       [  240,   938,   351, ...,  1419,  1313,    74],
       [  938,   351,  1418, ...,  1313,    74,   219],
       [  351,  1418,     3, ...,    74,   219,   222]])

In [30]:
# grabbing the first 25 words
sequences[:, :-1]

array([[  158,  9443, 17526, ...,     3,   218,   442],
       [ 9443, 17526,   402, ...,   218,   442,     5],
       [17526,   402,    42, ...,   442,     5,  1165],
       ...,
       [  240,   938,   351, ...,    84,  1419,  1313],
       [  938,   351,  1418, ...,  1419,  1313,    74],
       [  351,  1418,     3, ...,  1313,    74,   219]])

In [31]:
# grabbing the last word
sequences[:, -1]

array([   5, 1165,   42, ...,   74,  219,  222])

In [32]:
X = sequences[:, :-1]

In [33]:
y = sequences[:, -1]

In [34]:
len(y)

214682

In [35]:
y = to_categorical(y, num_classes=vocab_size+1)

In [36]:
vocab_size

17526

In [37]:
len(y[0])

17527

In [38]:
X.shape

(214682, 25)

In [39]:
seq_len = X.shape[1]

## training the model

In [40]:
model = run_model(vocab_size+1, seq_len)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 25)            438175    
                                                                 
 lstm (LSTM)                 (None, 25, 150)           105600    
                                                                 
 lstm_1 (LSTM)               (None, 150)               180600    
                                                                 
 dense (Dense)               (None, 150)               22650     
                                                                 
 dense_1 (Dense)             (None, 17527)             2646577   
                                                                 
Total params: 3,393,602
Trainable params: 3,393,602
Non-trainable params: 0
_________________________________________________________________


In [41]:
from pickle import dump,load

In [42]:
model.fit(X, y, batch_size=128, epochs=450,verbose=1)

Epoch 1/450
Epoch 2/450
Epoch 3/450
Epoch 4/450
Epoch 5/450
Epoch 6/450
Epoch 7/450
Epoch 8/450
Epoch 9/450
Epoch 10/450
Epoch 11/450
Epoch 12/450
Epoch 13/450
Epoch 14/450
Epoch 15/450
Epoch 16/450
Epoch 17/450
Epoch 18/450
Epoch 19/450
Epoch 20/450
Epoch 21/450
Epoch 22/450
Epoch 23/450
Epoch 24/450
Epoch 25/450
Epoch 26/450
Epoch 27/450
Epoch 28/450
Epoch 29/450
Epoch 30/450
Epoch 31/450
Epoch 32/450
Epoch 33/450
Epoch 34/450
Epoch 35/450
Epoch 36/450
Epoch 37/450
Epoch 38/450
Epoch 39/450
Epoch 40/450
Epoch 41/450
Epoch 42/450
Epoch 43/450
Epoch 44/450
Epoch 45/450
Epoch 46/450
Epoch 47/450
Epoch 48/450
Epoch 49/450
Epoch 50/450
Epoch 51/450
Epoch 52/450
Epoch 53/450
Epoch 54/450
Epoch 55/450
Epoch 56/450
Epoch 57/450
Epoch 58/450
Epoch 59/450
Epoch 60/450
Epoch 61/450
Epoch 62/450
Epoch 63/450
Epoch 64/450
Epoch 65/450
Epoch 66/450
Epoch 67/450
Epoch 68/450
Epoch 69/450
Epoch 70/450
Epoch 71/450
Epoch 72/450
Epoch 73/450
Epoch 74/450
Epoch 75/450
Epoch 76/450


Epoch 77/450
Epoch 78/450
Epoch 79/450
Epoch 80/450
Epoch 81/450
Epoch 82/450
Epoch 83/450
Epoch 84/450
Epoch 85/450
Epoch 86/450
Epoch 87/450
Epoch 88/450
Epoch 89/450
Epoch 90/450
Epoch 91/450
Epoch 92/450
Epoch 93/450
Epoch 94/450
Epoch 95/450
Epoch 96/450
Epoch 97/450
Epoch 98/450
Epoch 99/450
Epoch 100/450
Epoch 101/450
Epoch 102/450
Epoch 103/450
Epoch 104/450
Epoch 105/450
Epoch 106/450
Epoch 107/450
Epoch 108/450
Epoch 109/450
Epoch 110/450
Epoch 111/450
Epoch 112/450
Epoch 113/450
Epoch 114/450
Epoch 115/450
Epoch 116/450
Epoch 117/450
Epoch 118/450
Epoch 119/450
Epoch 120/450
Epoch 121/450
Epoch 122/450
Epoch 123/450
Epoch 124/450
Epoch 125/450
Epoch 126/450
Epoch 127/450
Epoch 128/450
Epoch 129/450
Epoch 130/450
Epoch 131/450
Epoch 132/450
Epoch 133/450
Epoch 134/450
Epoch 135/450
Epoch 136/450
Epoch 137/450
Epoch 138/450
Epoch 139/450
Epoch 140/450
Epoch 141/450
Epoch 142/450
Epoch 143/450
Epoch 144/450
Epoch 145/450
Epoch 146/450
Epoch 147/450
Epoch 148/450
Epoch 149/450
E

Epoch 153/450
Epoch 154/450
Epoch 155/450
Epoch 156/450
Epoch 157/450
Epoch 158/450
Epoch 159/450
Epoch 160/450
Epoch 161/450
Epoch 162/450
Epoch 163/450
Epoch 164/450
Epoch 165/450
Epoch 166/450
Epoch 167/450
Epoch 168/450
Epoch 169/450
Epoch 170/450
Epoch 171/450
Epoch 172/450
Epoch 173/450
Epoch 174/450
Epoch 175/450
Epoch 176/450
Epoch 177/450
Epoch 178/450
Epoch 179/450
Epoch 180/450
Epoch 181/450
Epoch 182/450
Epoch 183/450
Epoch 184/450
Epoch 185/450
Epoch 186/450
Epoch 187/450
Epoch 188/450
Epoch 189/450
Epoch 190/450
Epoch 191/450
Epoch 192/450
Epoch 193/450
Epoch 194/450
Epoch 195/450
Epoch 196/450
Epoch 197/450
Epoch 198/450
Epoch 199/450
Epoch 200/450
Epoch 201/450
Epoch 202/450
Epoch 203/450
Epoch 204/450
Epoch 205/450
Epoch 206/450
Epoch 207/450
Epoch 208/450
Epoch 209/450
Epoch 210/450
Epoch 211/450
Epoch 212/450
Epoch 213/450
Epoch 214/450
Epoch 215/450
Epoch 216/450
Epoch 217/450
Epoch 218/450
Epoch 219/450
Epoch 220/450
Epoch 221/450
Epoch 222/450
Epoch 223/450
Epoch 

Epoch 228/450
Epoch 229/450
Epoch 230/450
Epoch 231/450
Epoch 232/450
Epoch 233/450
Epoch 234/450
Epoch 235/450
Epoch 236/450
Epoch 237/450
Epoch 238/450
Epoch 239/450
Epoch 240/450
Epoch 241/450
Epoch 242/450
Epoch 243/450
Epoch 244/450
Epoch 245/450
Epoch 246/450
Epoch 247/450
Epoch 248/450
Epoch 249/450
Epoch 250/450
Epoch 251/450
Epoch 252/450
Epoch 253/450
Epoch 254/450
Epoch 255/450
Epoch 256/450
Epoch 257/450
Epoch 258/450
Epoch 259/450
Epoch 260/450
Epoch 261/450
Epoch 262/450
Epoch 263/450
Epoch 264/450
Epoch 265/450
Epoch 266/450
Epoch 267/450
Epoch 268/450
Epoch 269/450
Epoch 270/450
Epoch 271/450
Epoch 272/450
Epoch 273/450
Epoch 274/450
Epoch 275/450
Epoch 276/450
Epoch 277/450
Epoch 278/450
Epoch 279/450
Epoch 280/450
Epoch 281/450
Epoch 282/450
Epoch 283/450
Epoch 284/450
Epoch 285/450
Epoch 286/450
Epoch 287/450
Epoch 288/450
Epoch 289/450
Epoch 290/450
Epoch 291/450
Epoch 292/450
Epoch 293/450
Epoch 294/450
Epoch 295/450
Epoch 296/450
Epoch 297/450
Epoch 298/450
Epoch 

Epoch 303/450
Epoch 304/450
Epoch 305/450
Epoch 306/450
Epoch 307/450
Epoch 308/450
Epoch 309/450
Epoch 310/450
Epoch 311/450
Epoch 312/450
Epoch 313/450
Epoch 314/450
Epoch 315/450
Epoch 316/450
Epoch 317/450
Epoch 318/450
Epoch 319/450
Epoch 320/450
Epoch 321/450
Epoch 322/450
Epoch 323/450
Epoch 324/450
Epoch 325/450
Epoch 326/450
Epoch 327/450
Epoch 328/450
Epoch 329/450
Epoch 330/450
Epoch 331/450
Epoch 332/450
Epoch 333/450
Epoch 334/450
Epoch 335/450
Epoch 336/450
Epoch 337/450
Epoch 338/450
Epoch 339/450
Epoch 340/450
Epoch 341/450
Epoch 342/450
Epoch 343/450
Epoch 344/450
Epoch 345/450
Epoch 346/450
Epoch 347/450
Epoch 348/450
Epoch 349/450
Epoch 350/450
Epoch 351/450
Epoch 352/450
Epoch 353/450
Epoch 354/450
Epoch 355/450
Epoch 356/450
Epoch 357/450
Epoch 358/450
Epoch 359/450
Epoch 360/450
Epoch 361/450
Epoch 362/450
Epoch 363/450
Epoch 364/450
Epoch 365/450
Epoch 366/450
Epoch 367/450
Epoch 368/450
Epoch 369/450
Epoch 370/450
Epoch 371/450
Epoch 372/450
Epoch 373/450
Epoch 

Epoch 378/450
Epoch 379/450
Epoch 380/450
Epoch 381/450
Epoch 382/450
Epoch 383/450
Epoch 384/450
Epoch 385/450
Epoch 386/450
Epoch 387/450
Epoch 388/450
Epoch 389/450
Epoch 390/450
Epoch 391/450
Epoch 392/450
Epoch 393/450
Epoch 394/450
Epoch 395/450
Epoch 396/450
Epoch 397/450
Epoch 398/450
Epoch 399/450
Epoch 400/450
Epoch 401/450
Epoch 402/450
Epoch 403/450
Epoch 404/450
Epoch 405/450
Epoch 406/450
Epoch 407/450
Epoch 408/450
Epoch 409/450
Epoch 410/450
Epoch 411/450
Epoch 412/450
Epoch 413/450
Epoch 414/450
Epoch 415/450
Epoch 416/450
Epoch 417/450
Epoch 418/450
Epoch 419/450
Epoch 420/450
Epoch 421/450
Epoch 422/450
Epoch 423/450
Epoch 424/450
Epoch 425/450
Epoch 426/450
Epoch 427/450
Epoch 428/450
Epoch 429/450
Epoch 430/450
Epoch 431/450
Epoch 432/450
Epoch 433/450
Epoch 434/450
Epoch 435/450
Epoch 436/450
Epoch 437/450
Epoch 438/450
Epoch 439/450
Epoch 440/450
Epoch 441/450
Epoch 442/450
Epoch 443/450
Epoch 444/450
Epoch 445/450
Epoch 446/450
Epoch 447/450
Epoch 448/450
Epoch 

<keras.callbacks.History at 0x1a7ac269430>

In [43]:
model.save('mobydick_full.h5')
dump(tokenizer, open('mobydick_full', 'wb'))

## Generating New Text

In [44]:
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

In [57]:
'''
INPUTS:
model : model that was trained on text data
tokenizer : tokenizer that was fit on text data
seq_len : length of training sequence
seed_text : raw string text to serve as the seed
num_gen_words : number of words to be generated by model
'''

'\nINPUTS:\nmodel : model that was trained on text data\ntokenizer : tokenizer that was fit on text data\nseq_len : length of training sequence\nseed_text : raw string text to serve as the seed\nnum_gen_words : number of words to be generated by model\n'

In [58]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    output_text = []
    input_text = seed_text
    for i in range(num_gen_words):
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        pred_word_ind = np.argmax(model.predict(pad_encoded, verbose=0), axis=1)[0]
        pred_word = tokenizer.index_word[pred_word_ind] 
        input_text += ' ' + pred_word
        output_text.append(pred_word)
    return ' '.join(output_text)

In [53]:
' '.join(text_sequence[25])

'to interest me on shore i thought i would sail about a little and see the watery part of the world it is a way i'

In [55]:
' '.join(text_sequence[30])

'i thought i would sail about a little and see the watery part of the world it is a way i have of driving off the'

In [56]:
generate_text(model,tokenizer,seq_len,seed_text=text_sequence[25],num_gen_words=30)

"have sir consider 's a convenient 's evermore aloft.--thunder present murmured nor wife 's the comes my you old expandingly ropes the them porus thin willed also truth smooth compare"

In [57]:
' '.join(text_sequence[25])

"to interest me on shore i thought i would sail about a little and see the watery part of the world it is a way i   h a v e   s i r   c o n s i d e r   ' s   a   c o n v e n i e n t   ' s   e v e r m o r e   a l o f t . - - t h u n d e r   p r e s e n t   m u r m u r e d   n o r   w i f e   ' s   t h e   c o m e s   m y   y o u   o l d   e x p a n d i n g l y   r o p e s   t h e   t h e m   p o r u s   t h i n   w i l l e d   a l s o   t r u t h   s m o o t h   c o m p a r e"