# Next Word Prediction Using LSTM

https://github.com/Vishwaaaah/Next_word_prediction_using_LSTM


In [48]:
import os
import nltk
import pickle
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


In [49]:
# One-time setup, kept commented out: download the NLTK Gutenberg corpus and
# write the raw text of Hamlet to "hamlet.txt" in the current working directory.
# NOTE(review): the loading cell reads hamlet.txt from a sub-directory of the
# working directory, so the file written here may have to be moved — confirm.
# nltk.download("gutenberg")
# from nltk.corpus import gutenberg
# data = gutenberg.raw("shakespeare-hamlet.txt")
# with open("hamlet.txt", "w") as file:
#     file.write(data)

In [91]:
# Locate the corpus relative to the directory the kernel was started from.
baseDir = os.getcwd()
curDir = os.path.join(baseDir, "T10 - LLM", "S02 - Recurrent")
filePath = os.path.join(curDir, "hamlet.txt")
print(filePath)

# Fail early, and name the offending path, so that a wrong working directory
# is easy to diagnose. isfile() also rejects a directory of the same name,
# which exists() would let through to open().
if not os.path.isfile(filePath):
    raise FileNotFoundError(f"File not found: {filePath}")

# Read the whole corpus and lower-case it up front.
# NOTE(review): no explicit encoding is passed, so the platform default applies.
with open(filePath) as file:
    text = file.read().lower()

C:\Users\nr\Coding\ai-class\ai-2567-2\T10 - LLM\S02 - Recurrent\hamlet.txt


In [51]:
# Preview the first 12 lines of the corpus.
for preview_line in text.split("\n")[:12]:
    print(preview_line)

[the tragedie of hamlet by william shakespeare 1599]


actus primus. scoena prima.

enter barnardo and francisco two centinels.

  barnardo. who's there?
  fran. nay answer me: stand & vnfold
your selfe

   bar. long liue the king


In [52]:
# Fit a word-level tokenizer on the whole corpus, passed as a single document.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
# +1 because word indices start at 1; index 0 is not assigned to any word and
# is the value used for padding later in the notebook.
total_words = len(tokenizer.word_index) + 1
print(total_words)

4818


In [53]:
# Show the first 22 vocabulary entries as "word -> index".
for vocab_word, vocab_id in list(tokenizer.word_index.items())[:22]:
    print(f"{vocab_word:5s} -> {vocab_id:5d}")

the   ->     1
and   ->     2
to    ->     3
of    ->     4
i     ->     5
you   ->     6
a     ->     7
my    ->     8
it    ->     9
in    ->    10
that  ->    11
ham   ->    12
is    ->    13
not   ->    14
his   ->    15
this  ->    16
with  ->    17
your  ->    18
but   ->    19
for   ->    20
me    ->    21
lord  ->    22


In [54]:
# Build the training n-grams: for every line of the corpus, emit each prefix
# of its token sequence that holds at least two tokens (context + next word).
input_sequences = [
    line_tokens[:prefix_end]
    for line_tokens in tokenizer.texts_to_sequences(text.split("\n"))
    for prefix_end in range(2, len(line_tokens) + 1)
]

input_sequences[:5]

[[1, 687],
 [1, 687, 4],
 [1, 687, 4, 45],
 [1, 687, 4, 45, 41],
 [1, 687, 4, 45, 41, 1886]]

In [55]:
# Length of the longest n-gram; used as the common length when padding.
max_sequence_length = max(map(len, input_sequences))
max_sequence_length

14

In [56]:
# Left-pad every n-gram with zeros so all rows share one length and the most
# recent tokens sit at the end of each row.
input_sequences = np.asarray(
    pad_sequences(input_sequences, padding="pre", maxlen=max_sequence_length)
)
input_sequences

array([[   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       [   0,    0,    0, ...,  687,    4,   45],
       ...,
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4],
       [   0,    0,    0, ..., 1047,    4,  193]])

In [71]:
# Split each padded row into predictors and label:
# everything except the last token is the model input ...
X = input_sequences[:, :-1]
# ... and the last token is the word index to predict.
yt = input_sequences[:, -1]

In [77]:
# Shape of the predictor matrix followed by its first five rows.
print(X.shape, X[:5], sep="\n")

(25732, 13)
[[  0   0   0   0   0   0   0   0   0   0   0   0   1]
 [  0   0   0   0   0   0   0   0   0   0   0   1 687]
 [  0   0   0   0   0   0   0   0   0   0   1 687   4]
 [  0   0   0   0   0   0   0   0   0   1 687   4  45]
 [  0   0   0   0   0   0   0   0   1 687   4  45  41]]


In [73]:
# Peek at the first five label word indices.
yt[:5]

array([ 687,    4,   45,   41, 1886])

In [79]:
# One-hot encode the label indices over the whole vocabulary, giving one
# column per word index.
y = tf.keras.utils.to_categorical(yt, num_classes=total_words)
print(y.shape, y[:10], sep="\n")

(25732, 4818)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [80]:
# Hold out 20% of the samples. A fixed random_state makes the shuffle, and
# therefore the split, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import (
    LSTM,
    BatchNormalization,
    Bidirectional,
    Dense,
    Dropout,
    Embedding,
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

# Reset Keras' global state before (re)building the model in this cell.
tf.keras.backend.clear_session()

# Model 1: embedding -> bidirectional LSTM -> softmax over the vocabulary.
model = Sequential(
    [
        Embedding(total_words, 100),
        Bidirectional(LSTM(150)),
        Dense(total_words, activation="softmax"),
    ]
)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
#  accuracy: 0.8373 - loss: 0.6894

# Model 2: deeper alternative, kept commented out.
# model = Sequential()
# model.add(Embedding(total_words, 100, input_length=max_sequence_length-1))
# model.add(Bidirectional(LSTM(units=128, return_sequences=True)))
# model.add(Dropout(0.3))
# model.add(Bidirectional(LSTM(units=64)))
# model.add(Dropout(0.3))
# model.add(BatchNormalization())
# model.add(Dense(64, activation='relu'))
# model.add(Dense(total_words, activation='softmax'))
# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# The input is one row of (max_sequence_length - 1) context tokens per sample;
# building explicitly lets summary() report concrete shapes before training.
model.build(input_shape=(None, max_sequence_length - 1))
model.summary()

In [89]:
# Train the model
# Active call: a single epoch on the training split only (no validation data).
history = model.fit(x_train, y_train, epochs=1, verbose=1)
# Alternative, kept commented out: 100 epochs, validating on the held-out split.
# history=model.fit(x_train,y_train,epochs=100,validation_data=(x_test,y_test),verbose=1)


[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 17ms/step - accuracy: 0.0326 - loss: 7.0962


In [64]:
def predict_next_word(model, tokenizer, text, max_sequence_length):
    """Predict the most likely next word for ``text``.

    Args:
        model: Trained model whose ``predict`` accepts a batch of shape
            ``(1, max_sequence_length - 1)`` and returns one row of scores,
            one score per vocabulary index.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index`` (``index_word`` is used when it is available).
        text: Seed text. Only its last ``max_sequence_length - 1`` tokens are
            fed to the model.
        max_sequence_length: Length of the padded training sequences, i.e. the
            model's input length plus one for the predicted word.

    Returns:
        The predicted word, or ``None`` when no word maps to the predicted
        index (for example index 0, which is the padding value).
    """
    context_length = max_sequence_length - 1
    token_list = tokenizer.texts_to_sequences([text])[0]

    # Keep exactly as many trailing tokens as the model consumes. The slice
    # start is clamped at 0 so short inputs are passed through unchanged.
    token_list = token_list[max(len(token_list) - context_length, 0):]
    padded = pad_sequences([token_list], maxlen=context_length, padding="pre")

    predicted = model.predict(padded, verbose=0)
    # The batch holds a single row; take its arg-max as a plain int so the
    # lookups below compare int with int instead of int with ndarray.
    predicted_word_index = int(np.argmax(predicted, axis=1)[0])

    # Prefer the tokenizer's reverse mapping (a dict lookup) over a scan of
    # the whole vocabulary.
    index_word = getattr(tokenizer, "index_word", None)
    if index_word:
        return index_word.get(predicted_word_index)

    # Fallback for tokenizer objects that only expose word_index.
    for word, index in tokenizer.word_index.items():
        if index == predicted_word_index:
            return word
    return None


In [92]:
# Persist the trained network in the same directory as the corpus.
model.save(os.path.join(curDir, "my_model.keras"))

# Persist the fitted tokenizer alongside it so the same word <-> index
# mapping can be restored together with the model.
with open(os.path.join(curDir, "tokenizer.pkl"), "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

In [66]:
# Single-prompt demo: predict the word that follows the prompt.
input_text = "With mirth the king"
print(f"Input text: {input_text}")
# Padded sequence length = model input length + 1 (the predicted word).
max_sequence_length = model.input_shape[1] + 1
next_word = predict_next_word(
    model=model,
    tokenizer=tokenizer,
    text=input_text,
    max_sequence_length=max_sequence_length,
)
print(f"Next word: {next_word}")

Input text: With mirth the king
Next word: of


In [67]:
# Single-prompt demo: predict the word that follows the prompt.
input_text = "In the"
print(f"Input text: {input_text}")
# Padded sequence length = model input length + 1 (the predicted word).
max_sequence_length = model.input_shape[1] + 1
next_word = predict_next_word(
    model=model,
    tokenizer=tokenizer,
    text=input_text,
    max_sequence_length=max_sequence_length,
)
print(f"Next word: {next_word}")

Input text: In the
Next word: king


In [68]:
# Single-prompt demo: predict the word that follows the prompt.
input_text = "Taken to"
print(f"Input text: {input_text}")
# Padded sequence length = model input length + 1 (the predicted word).
max_sequence_length = model.input_shape[1] + 1
next_word = predict_next_word(
    model=model,
    tokenizer=tokenizer,
    text=input_text,
    max_sequence_length=max_sequence_length,
)
print(f"Next word: {next_word}")

Input text: Taken to
Next word: the


In [69]:
# Single-prompt demo: predict the word that follows the prompt.
input_text = "I am"
# print("original text: Laertes, and his sister")
print(f"Input text: {input_text}")
# Padded sequence length = model input length + 1 (the predicted word).
max_sequence_length = model.input_shape[1] + 1
next_word = predict_next_word(
    model=model,
    tokenizer=tokenizer,
    text=input_text,
    max_sequence_length=max_sequence_length,
)
print(f"Next word: {next_word}")

Input text: I am
Next word: the


In [70]:
# Greedy generation: repeatedly predict the next word and append it to the
# prompt, for at most 50 words.
input_text = "I am"
# The model's input length does not change inside the loop; compute it once.
max_sequence_length = model.input_shape[1] + 1
for step in range(50):
    next_word = predict_next_word(model, tokenizer, input_text, max_sequence_length)
    if next_word is None:
        # No word maps to the predicted index. Appending None to the prompt
        # would raise a TypeError, so stop generating instead.
        break
    print(next_word, end=" ")
    input_text = f"{input_text} {next_word}"

the king of the king of the king of the king of stone betime the rat betime of the betime of of betime the betime stone stone outherod's stone outherod's outherod's outherod's the rat stone traitorous traitorous of of rat traitorous traitorous traitorous traitorous traitorous traitorous traitorous traitorous traitorous traitorous 