In [1]:
# Adapted from Udemy lecture

In [2]:
def read_file(filepath, encoding='utf-8'):
    """Return the entire contents of a text file as a single string.

    Parameters
    ----------
    filepath : str or path-like
        Location of the text file to read.
    encoding : str, optional
        Text encoding used to decode the file. Defaults to ``'utf-8'`` so the
        result does not depend on the platform's locale encoding.

    Returns
    -------
    str
        The full decoded text of the file.
    """
    # The context manager closes the handle even if decoding fails.
    with open(filepath, encoding=encoding) as f:
        return f.read()

In [3]:
# read_file('moby_dick_four_chapters.txt')

In [4]:
import spacy

# Load the English pipeline with the parser, tagger and NER components switched
# off; only the tokens produced by nlp(...) are used further down.
# NOTE(review): the 'en' shortcut name is not accepted by spaCy 3+, which expects
# a full package name such as 'en_core_web_sm' — confirm the installed version.
nlp = spacy.load('en',disable=['parser','tagger','ner'])

In [5]:
# Set spaCy's limit on how many characters a single nlp() call will accept.
# NOTE(review): 1198623 presumably matches the size of a larger source text — confirm.
nlp.max_length = 1198623 

In [6]:
def separate_punc(doc_text):
    """Tokenise ``doc_text`` and return its lowercase word tokens.

    The text is run through the module-level ``nlp`` pipeline. A token is
    dropped when its text occurs as a substring of the punctuation/whitespace
    string below; every remaining token's text is lowercased and collected in
    document order.
    """
    unwanted = '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n '

    kept = []
    for token in nlp(doc_text):
        # Substring test on purpose: multi-character runs such as '--' or
        # repeated newlines are filtered as well as single characters.
        if token.text in unwanted:
            continue
        kept.append(token.text.lower())
    return kept

In [7]:
d = read_file('moby_dick_four_chapters.txt')

In [8]:
tokens = separate_punc(d)

In [9]:
tokens[0:10]

['call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long']

In [10]:
len(tokens)

11394

In [11]:
# Feed the network 25 words and have it predict the 26th
train_len = 25 + 1

# Slide a window of train_len consecutive tokens over the corpus, moving one
# token at a time; each window is one training example.
# NOTE(review): range() stops at len(tokens) - 1, so the window that ends on the
# very last token is never produced.
text_sequences = [tokens[i-train_len:i] for i in range(train_len,len(tokens))]

In [12]:
# Sanity check of the slicing above:
# on the first iteration i = train_len = 26, so
#   i - train_len --> 0
#   i             --> 26
# and the first window is tokens[0:26].

In [13]:
text_sequences[0]

['call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to',
 'interest',
 'me',
 'on']

In [14]:
# One word over
' '.join(text_sequences[1])

'me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore'

In [15]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)

In [16]:
sequences = tokenizer.texts_to_sequences(text_sequences)

In [17]:
sequences[0]

[964,
 14,
 265,
 51,
 263,
 416,
 87,
 222,
 129,
 111,
 962,
 262,
 50,
 43,
 37,
 321,
 7,
 23,
 555,
 3,
 150,
 261,
 6,
 2704,
 14,
 24]

In [23]:
# Translating above first sequence into the words
for i in sequences[0]:
    print(f"{i} : {tokenizer.index_word[i]}")

964 : call
14 : me
265 : ishmael
51 : some
263 : years
416 : ago
87 : never
222 : mind
129 : how
111 : long
962 : precisely
262 : having
50 : little
43 : or
37 : no
321 : money
7 : in
23 : my
555 : purse
3 : and
150 : nothing
261 : particular
6 : to
2704 : interest
14 : me
24 : on


In [26]:
vocabulary_size = len(tokenizer.word_counts)
print(vocabulary_size)

2709


In [27]:
import numpy as np
sequences = np.array(sequences)
sequences # now as numpy array instead of lists

array([[ 964,   14,  265, ..., 2704,   14,   24],
       [  14,  265,   51, ...,   14,   24,  965],
       [ 265,   51,  263, ...,   24,  965,    5],
       ...,
       [ 960,   12,  168, ...,  264,   53,    2],
       [  12,  168, 2703, ...,   53,    2, 2709],
       [ 168, 2703,    3, ...,    2, 2709,   26]])

In [28]:
from keras.utils import to_categorical

sequences[:,:-1] # all rows and all columns except last one

array([[ 964,   14,  265, ...,    6, 2704,   14],
       [  14,  265,   51, ..., 2704,   14,   24],
       [ 265,   51,  263, ...,   14,   24,  965],
       ...,
       [ 960,   12,  168, ...,   11,  264,   53],
       [  12,  168, 2703, ...,  264,   53,    2],
       [ 168, 2703,    3, ...,   53,    2, 2709]])

In [29]:
sequences[:,-1]

array([  24,  965,    5, ...,    2, 2709,   26])

In [30]:
X = sequences[:,:-1]
y = sequences[:,-1]

In [31]:
# One-hot encode the target word ids. Reading them straight from `sequences`
# (rather than re-assigning y from itself) keeps this cell safe to re-run.
# num_classes is vocabulary_size + 1 because the largest word id equals
# vocabulary_size, with index 0 left unused by the tokenizer.
y = to_categorical(sequences[:,-1],num_classes=vocabulary_size+1)

In [32]:
seq_len = X.shape[1]

In [33]:
X.shape

(11368, 25)

In [34]:
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding

def create_model(vocabulary_size,seq_len):
    """Build, compile and summarise the next-word prediction network.

    Parameters
    ----------
    vocabulary_size : int
        Number of distinct input ids; also the width of the softmax output.
    seq_len : int
        Number of word ids per input sequence. It is also used as the
        embedding dimension.

    Returns
    -------
    The compiled Sequential model. Its summary is printed as a side effect.
    """
    layers = [
        # Word ids -> dense vectors of width seq_len.
        Embedding(input_dim=vocabulary_size,output_dim=seq_len,input_length=seq_len),
        # The first LSTM returns the full sequence so the second one can consume it.
        LSTM(100,return_sequences=True),
        LSTM(100),
        Dense(100,activation='relu'),
        # One probability per vocabulary entry.
        Dense(vocabulary_size,activation='softmax'),
    ]
    model = Sequential(layers)
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    model.summary()
    return model

In [35]:
model = create_model(vocabulary_size+1,seq_len)

2022-08-30 16:43:09.623270: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 25)            67750     
                                                                 
 lstm (LSTM)                 (None, 25, 100)           50400     
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 100)               10100     
                                                                 
 dense_1 (Dense)             (None, 2710)              273710    
                                                                 
Total params: 482,360
Trainable params: 482,360
Non-trainable params: 0
_________________________________________________________________


### Increased training period to 30 epochs

In [39]:
from pickle import dump,load

model.fit(X,y,batch_size=128,epochs=30,verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fcc201ebed0>

In [40]:
model.save('my_mobydick_model_08302022.h5')

In [41]:
# Persist the fitted tokenizer alongside the saved model. The context manager
# guarantees the file is flushed and closed once pickling finishes.
with open('my_simpletokenizer_08302022','wb') as tokenizer_file:
    dump(tokenizer,tokenizer_file)

In [43]:
from keras_preprocessing.sequence import pad_sequences

In [92]:
def generate_text(model,tokenizer,seq_len,seed_text,num_gen_words):
    """Greedily extend ``seed_text`` by ``num_gen_words`` predicted words.

    Parameters
    ----------
    model
        Trained network whose ``predict`` returns one probability per word id
        for a batch of id sequences of length ``seq_len``.
    tokenizer
        Fitted tokenizer providing ``texts_to_sequences`` and ``index_word``.
    seq_len : int
        Number of word ids the model expects per input sequence.
    seed_text : str
        Text used as the initial context.
    num_gen_words : int
        How many words to generate.

    Returns
    -------
    str
        The generated words joined by single spaces (the seed is not included).
    """
    output_text = []

    # Running context: the seed plus every word generated so far.
    input_text = seed_text

    for _ in range(num_gen_words):

        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        # Left-pad short contexts and keep only the last seq_len ids of long ones.
        pad_encoded = pad_sequences([encoded_text],maxlen=seq_len,truncating='pre')

        probabilities = model.predict(pad_encoded,verbose=0)[0]

        # Index 0 is the padding id and has no entry in tokenizer.index_word,
        # so it is excluded from the argmax to avoid a KeyError.
        pred_word_ind = int(np.argmax(probabilities[1:], axis=-1)) + 1

        pred_word = tokenizer.index_word[pred_word_ind]

        input_text += ' '+pred_word # feed the prediction back in as context

        output_text.append(pred_word) # collected separately for the return value

    return ' '.join(output_text)

In [85]:
text_sequences[0]

['call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to',
 'interest',
 'me',
 'on']

In [86]:
import random
random.seed(101)
# random.randint includes both endpoints, so the upper bound must be the last
# valid index; passing len(text_sequences) could yield an out-of-range pick.
random_pick = random.randint(0,len(text_sequences)-1)

In [87]:
random_seed_text = text_sequences[random_pick]

In [88]:
seed_text = ' '.join(random_seed_text)
seed_text

'and throwing the clothes to one side he really did this in not only a civil but a really kind and charitable way i stood looking'

In [93]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=25)

'up the counterpane and a harpooneer and a whale and be be be be be be be be be be be be be be be'

In [94]:
from keras.models import load_model

In [95]:
model = load_model('epochBIG.h5') # larger model

In [96]:
# NOTE(review): pickle can execute arbitrary code while loading — only unpickle
# tokenizer files that come from a trusted source.
# The context manager closes the file as soon as the tokenizer is loaded.
with open('epochBIG','rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)

In [97]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=25)

"at that stubb ' my frame roman eyes of his own power for the whale 's grain to wrenched progeny for a fever drawn up"

In [98]:
# going back to the previous model

model = load_model('my_mobydick_model_08302022.h5') # the 30-epoch model saved earlier in this notebook
# NOTE(review): pickle can execute arbitrary code while loading — only unpickle
# tokenizer files that come from a trusted source.
# The context manager closes the file as soon as the tokenizer is loaded.
with open('my_simpletokenizer_08302022','rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)

In [99]:
seed_text = 'It is better to fail in originality than to succeed in' # imitation is what the actual word is 
# this is a quote by Herman Melville

generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=25)

'a bed and a harpooneer and a whale and be be be be be be be be be be be be be be be be'