# Training the model

## Data preprocessing

In [1]:
def read_file(filepath):
    with open(filepath) as f:
        str_text = f.read()
    return str_text

In [2]:
import spacy

In [3]:
#nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner'])
nlp = spacy.load('en_core_web_sm')

In [4]:
nlp.max_length = 1198623

In [5]:
def separate_punc(doc_text):
    return [token.text.lower() for token in nlp(doc_text) if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

In [6]:
d = read_file('moby_dick_four_chapters.txt')

In [7]:
tokens = separate_punc(d)

In [8]:
len(tokens)

11338

take 25 words and predict 26th word

In [9]:
train_len = 25 + 1

text_sequence = []

for i in range(train_len, len(tokens)):
    seq = tokens[i-train_len:i]
    text_sequence.append(seq)

In [10]:
' '.join(text_sequence[0])

'call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on'

In [11]:
' '.join(text_sequence[1])

'me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore'

In [12]:
from keras.preprocessing.text import Tokenizer

In [13]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequence)

In [14]:
sequences = tokenizer.texts_to_sequences(text_sequence)

In [15]:
# sequences[0]

In [16]:
# tokenizer.index_word

In [17]:
for i in sequences[0]:
    print(f'{i:{5}}. {tokenizer.index_word[i]}')

  956. call
   14. me
  263. ishmael
   51. some
  261. years
  408. ago
   87. never
  219. mind
  129. how
  111. long
  954. precisely
  260. having
   50. little
   43. or
   38. no
  314. money
    7. in
   23. my
  546. purse
    3. and
  150. nothing
  259. particular
    6. to
 2713. interest
   14. me
   24. on


In [18]:
# tokenizer.word_counts

In [19]:
vocabulary_size = len(tokenizer.word_counts)

In [20]:
vocabulary_size

2718

In [21]:
type(sequences)

list

In [22]:
import numpy as np
sequences = np.array(sequences)

In [23]:
sequences

array([[ 956,   14,  263, ..., 2713,   14,   24],
       [  14,  263,   51, ...,   14,   24,  957],
       [ 263,   51,  261, ...,   24,  957,    5],
       ...,
       [ 952,   12,  166, ...,  262,   53,    2],
       [  12,  166, 2712, ...,   53,    2, 2718],
       [ 166, 2712,    3, ...,    2, 2718,   26]])

In [24]:
X = sequences[:,:-1]

In [25]:
y = sequences[:,-1]

In [26]:
seq_len = X.shape[1]

In [27]:
seq_len

25

In [28]:
from keras.utils.np_utils import to_categorical
y = to_categorical(y, num_classes=vocabulary_size+1)

## LSTM model

In [29]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

In [30]:
def create_model(vocabulary_size, seq_len):
    model = Sequential()
    model.add(Embedding(vocabulary_size, seq_len, input_length = seq_len))
    # Embedding(input_dim, output_dim, input_length=None)
    model.add(LSTM(50, return_sequences=True))
    model.add(LSTM(50))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(vocabulary_size, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

In [31]:
model = create_model(vocabulary_size+1, seq_len)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 25, 25)            67975     
_________________________________________________________________
lstm (LSTM)                  (None, 25, 50)            15200     
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense (Dense)                (None, 50)                2550      
_________________________________________________________________
dense_1 (Dense)              (None, 2719)              138669    
Total params: 244,594
Trainable params: 244,594
Non-trainable params: 0
_________________________________________________________________


In [32]:
from pickle import dump, load

In [33]:
model.fit(X, y, batch_size=128, epochs=2, verbose=1)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1d368555f10>

In [34]:
# save the model to file
model.save('my_moby_dick_model.h5')
# save the tokenizer
dump(tokenizer, open('my_simple_tokenizer', 'wb'))

# Predicting

In [35]:
from keras.preprocessing.sequence import pad_sequences

In [58]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    '''
    INPUTS:
    model : model that was trained on text data
    tokenizer : tokenizer that was fit on text data
    seq_len : length of training sequence
    seed_text : raw string text to serve as the seed
    num_gen_words : number of words to be generated by model
    '''
    
    # Final Output
    output_text = []
    
    # Intial Seed Sequence
    input_text = seed_text
    
    # Create num_gen_words
    for i in range(num_gen_words):
        
        # Take the input text string and encode it to a sequence
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        
        # Pad sequences to our trained rate (50 words in the video)
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        
        # Predict Class Probabilities for each word
        # pred_word_ind = model.predict_classes(pad_encoded, verbose=0)[0]
        pred_word_ind = np.argmax(model.predict(pad_encoded), axis=-1)[0]
        # Grab word
        pred_word = tokenizer.index_word[pred_word_ind] 
        
        # Update the sequence of input text (shifting one over with the new word)
        input_text += ' ' + pred_word
        
        output_text.append(pred_word)
        
    # Make it look like a sentence.
    return ' '.join(output_text)

In [39]:
import random
random.seed(101)
random_pick = random.randint(0,len(text_sequence))

In [41]:
random_seed_text = text_sequence[random_pick]

In [45]:
seed_text = ' '.join(random_seed_text)

In [46]:
seed_text

"thought i to myself the man 's a human being just as i am he has just as much reason to fear me as i have"

In [59]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=25)

'the the the the the the the the the the the the the the the the the the the the the the the the the'

## Load a pre trained model

In [60]:
from keras.models import load_model

In [61]:
new_model = load_model('epochBIG.h5')

In [62]:
new_tokenizer = load(open('epochBIG','rb'))

In [63]:
generate_text(new_model,new_tokenizer,seq_len,seed_text=seed_text,num_gen_words=25)

"to be seen there was no bad olfactories my own letter was cheerily listening over his hearers who 's more can go have a wearing"