# Next Word Prediction:

### Import libraries

In [6]:
import pickle
import re

import numpy as np
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, TensorBoard
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

### Clean data

In [9]:
"""
    Dataset: http://www.gutenberg.org/cache/epub/5200/pg5200.txt
    Remove the Project Gutenberg header and footer text and save the remaining story as metamorphosis.txt.
    The starting and ending lines should be as follows.
    The First Line: One morning, when Gregor Samsa woke from troubled dreams, he found
    The Last Line:  first to get up and stretch out her young body.
"""


# Read the whole cleaned book into memory. The context manager closes the
# file handle as soon as the text has been read instead of leaving it open
# for the lifetime of the kernel.
with open('metamorphosis.txt', 'r', encoding='utf8') as text_file:
    file = text_file.read()

# Split the text into word tokens. `\w+` keeps runs of letters, digits and
# underscores and drops punctuation/whitespace; the raw string avoids the
# invalid-escape-sequence warning that '\w' triggers in a normal string.
data = re.findall(r'\w+', file)

### Tokenize

In [11]:
# Build the vocabulary. `data` is already a list of words, so it is wrapped
# in a list and handed to the tokenizer as a single pre-split document.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# Saving the tokenizer for the predict function. Using a context manager
# guarantees the pickle is fully flushed and the file handle closed before
# the notebook moves on.
with open('tokenizer1.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# Integer id of every word in the book, in reading order.
sequence_data = tokenizer.texts_to_sequences([data])[0]

# One more than the number of distinct words, so the highest word index is
# still a valid class / embedding row.
vocab_size = len(tokenizer.word_index) + 1
vocab_size

2573

In [12]:
# Slide a two-token window over the encoded text: every row is
# [current word id, following word id].
sequences = [
    sequence_data[start:start + 2]
    for start in range(len(sequence_data) - 1)
]

print("The Length of sequences are: ", len(sequences))
sequences = np.array(sequences)
sequences[:10]

The Length of sequences are:  22374


array([[  54,  142],
       [ 142,   57],
       [  57,   12],
       [  12,   95],
       [  95,  920],
       [ 920,   29],
       [  29, 1295],
       [1295,  921],
       [ 921,    4],
       [   4,  242]])

In [13]:
# Column 0 of each pair is the input word id, column 1 is the word id that
# follows it (the prediction target).
X = np.array([pair[0] for pair in sequences])
y = np.array([pair[1] for pair in sequences])

In [14]:
# Show the first five inputs and targets; the targets should be the inputs
# shifted by one position.
for label, values in (("The Data is: ", X), ("The responses are: ", y)):
    print(label, values[:5])

The Data is:  [ 54 142  57  12  95]
The responses are:  [142  57  12  95 920]


In [15]:
# One-hot encode the target word ids so they match the softmax output layer
# (one column per vocabulary entry).
#
# The guard keeps this cell idempotent: `y` is overwritten in place, so
# re-running the cell without it would one-hot encode the already encoded
# matrix and try to allocate a (samples, vocab_size, vocab_size) array.
if y.ndim == 1:
    y = to_categorical(y, num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

### Model

In [None]:
# Network: word id -> 10-d embedding -> two stacked 1000-unit LSTMs ->
# 1000-unit ReLU layer -> softmax over the whole vocabulary.
model = Sequential([
    Embedding(vocab_size, 10, input_length=1),
    # The first LSTM returns its full sequence so the second LSTM receives
    # a 3-D input.
    LSTM(1000, return_sequences=True),
    LSTM(1000),
    Dense(1000, activation="relu"),
    Dense(vocab_size, activation="softmax"),
])

In [17]:
# Print the layer-by-layer output shapes and parameter counts as a sanity
# check of the architecture before training.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1, 10)             25730     
_________________________________________________________________
lstm (LSTM)                  (None, 1, 1000)           4044000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 1000)              8004000   
_________________________________________________________________
dense (Dense)                (None, 1000)              1001000   
_________________________________________________________________
dense_1 (Dense)              (None, 2573)              2575573   
Total params: 15,650,303
Trainable params: 15,650,303
Non-trainable params: 0
_________________________________________________________________


### Callbacks

In [19]:
# ModelCheckpoint, ReduceLROnPlateau and TensorBoard are imported in the
# import cell at the top of the notebook, so all dependencies are visible in
# one place and this cell only configures the callbacks.

# Write the model to disk whenever the monitored training loss improves, so
# "nextword1.h5" always holds the best epoch seen so far.
checkpoint = ModelCheckpoint("nextword1.h5", monitor='loss', verbose=1,
    save_best_only=True, mode='auto')

# Multiply the learning rate by 0.2 after 3 epochs without improvement in the
# training loss, but never go below 1e-4.
reduce = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=3, min_lr=0.0001, verbose=1)

# Directory the TensorBoard callback writes its event files to.
logdir='logsnextword1'
tensorboard_Visualization = TensorBoard(log_dir=logdir)

### Compile The Model:

In [21]:
# Categorical cross-entropy pairs with the one-hot targets and the softmax
# output layer; Adam starts at a learning rate of 1e-3.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="categorical_crossentropy",
)

### Fit The Model:

In [22]:
# Train for 10 epochs in mini-batches of 64, with checkpointing,
# learning-rate reduction and TensorBoard logging attached.
model.fit(
    X,
    y,
    epochs=10,
    batch_size=64,
    callbacks=[checkpoint, reduce, tensorboard_Visualization],
)

Epoch 1/10

Epoch 00001: loss improved from inf to 6.21581, saving model to nextword1.h5
Epoch 2/10

Epoch 00002: loss improved from 6.21581 to 5.78984, saving model to nextword1.h5
Epoch 3/10

Epoch 00003: loss improved from 5.78984 to 5.54408, saving model to nextword1.h5
Epoch 4/10

Epoch 00004: loss improved from 5.54408 to 5.32169, saving model to nextword1.h5
Epoch 5/10

Epoch 00005: loss improved from 5.32169 to 5.12535, saving model to nextword1.h5
Epoch 6/10

Epoch 00006: loss improved from 5.12535 to 4.98037, saving model to nextword1.h5
Epoch 7/10

Epoch 00007: loss improved from 4.98037 to 4.86819, saving model to nextword1.h5
Epoch 8/10

Epoch 00008: loss improved from 4.86819 to 4.76670, saving model to nextword1.h5
Epoch 9/10

Epoch 00009: loss improved from 4.76670 to 4.66701, saving model to nextword1.h5
Epoch 10/10

Epoch 00010: loss improved from 4.66701 to 4.56329, saving model to nextword1.h5


<keras.callbacks.History at 0x164de2e1760>