# Next Word Prediction Using LSTM

Adapted from https://github.com/Vishwaaaah/Next_word_prediction_using_LSTM


In [1]:
import os
import pickle
import datetime
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [2]:
# Mode switch for the rest of the notebook:
#   True  -> build a new model, train it, and save model + tokenizer
#   False -> skip training and load a previously saved model
IS_TRAIN_MODE = False

In [None]:
# Load the data

baseDir = os.getcwd()
# curDir = os.path.join(baseDir, "T10 - LLM", "S02 - Recurrent")
curDir = baseDir
filePath = os.path.join(curDir, "hamlet.txt")
print(filePath)

# Fetch the corpus from NLTK's Gutenberg collection only when no local copy
# exists yet, so the download happens at most once.
if not os.path.exists(filePath):
    import nltk
    nltk.download('gutenberg')
    from nltk.corpus import gutenberg

    data = gutenberg.raw("shakespeare-hamlet.txt")
    # Pin the encoding: the platform default differs between systems, and the
    # file must be readable again by the open() call below.
    with open(filePath, "w", encoding="utf-8") as file:
        file.write(data)

# Read the corpus back with the same encoding and lower-case it as a whole.
with open(filePath, encoding="utf-8") as file:
    text = file.read().lower()

In [None]:
# Show the first 10 lines
# Slicing caps the preview at exactly 10 lines.
for line in text.split("\n")[:10]:
    print(line)

In [None]:
# Build the word-level vocabulary from the full corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# Vocabulary size plus one extra slot (presumably for index 0, which Keras
# reserves and never assigns to a word -- confirm against the Tokenizer docs).
vocabulary = tokenizer.word_index
total_words = len(vocabulary) + 1
print(total_words)

In [None]:
# Show the first 20 words as a dictionary
from itertools import islice

# islice stops after exactly 20 (word, index) pairs without copying the dict.
for word, index in islice(tokenizer.word_index.items(), 20):
    print(f"{word:5s} -> {index:5d}")

In [None]:
# Build the training n-grams.
#
# Each line of the corpus is tokenized on its own; for a line with tokens
# [t0, t1, ..., tk] this yields every prefix of length >= 2:
# [t0, t1], [t0, t1, t2], ..., [t0, ..., tk].
input_sequences = [
    line_tokens[:prefix_end]
    for line_tokens in tokenizer.texts_to_sequences(text.split("\n"))
    for prefix_end in range(2, len(line_tokens) + 1)
]

input_sequences[:5]

In [None]:
# Length of the longest n-gram; the padding step uses it as the common length.
max_sequence_length = max(map(len, input_sequences))
print(max_sequence_length)

In [None]:
# Pad sequences
# padding="pre" puts the fill values in front, so the last column of every row
# is always the final token of its n-gram.
padded = pad_sequences(input_sequences, maxlen=max_sequence_length, padding="pre")
input_sequences = np.array(padded)
input_sequences

In [10]:
# Create predictors and labels
# Every column except the last is the input context; the last column is the
# token to predict.
X = input_sequences[:, :-1]
yt = input_sequences[:, -1]

In [None]:
# Shape of the predictor matrix, followed by its first five rows
print(X.shape, X[:5], sep="\n")

In [None]:
# Peek at the first five label token ids
yt[:5]

In [None]:
# One-hot encode the label ids: one column per vocabulary entry
y = tf.keras.utils.to_categorical(yt, num_classes=total_words)
print(y.shape, y[:10], sep="\n")

In [14]:
# Split the data
# Hold out 20% of the samples as a test set.
splits = train_test_split(X, y, test_size=0.2)
x_train, x_test, y_train, y_test = splits

In [None]:
# Create the model

# Reset Keras' global state before (re)building the model in this cell.
tf.keras.backend.clear_session()
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import BatchNormalization

if IS_TRAIN_MODE:
    # Model 1: embedding -> bidirectional LSTM -> softmax over the vocabulary
    #  accuracy: 0.8373 - loss: 0.6894
    model = Sequential(
        [
            Embedding(total_words, 100),
            Bidirectional(LSTM(150)),
            Dense(total_words, activation="softmax"),
        ]
    )
    model.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )

    # Model 2 (alternative architecture, kept for reference)
    # model = Sequential()
    # model.add(Embedding(total_words, 100, input_length=max_sequence_length-1))
    # model.add(Bidirectional(LSTM(units=128, return_sequences=True)))
    # model.add(Dropout(0.3))
    # model.add(Bidirectional(LSTM(units=64)))
    # model.add(Dropout(0.3))
    # model.add(BatchNormalization())
    # model.add(Dense(64, activation='relu'))
    # model.add(Dense(total_words, activation='softmax'))
    # model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Build with the predictor width (sequence length minus the label column)
    # so that summary() can report concrete shapes.
    model.build(input_shape=(None, max_sequence_length - 1))
    model.summary()

In [16]:
# Train the model

if IS_TRAIN_MODE:
    history = model.fit(x_train, y_train, epochs=40, verbose=1)
    # history=model.fit(x_train,y_train,epochs=100,validation_data=(x_test,y_test),verbose=1)

    # Timestamped file name, so each training run writes its own model file
    dateTime = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    model_path = os.path.join(curDir, f"model-{dateTime}.keras")
    model.save(model_path)

    # Store the fitted tokenizer in the same directory as the model
    tokenizer_path = os.path.join(curDir, "tokenizer.pkl")
    with open(tokenizer_path, "wb") as tokenizer_file:
        pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Load the model
if not IS_TRAIN_MODE:
    # Resolve the file against curDir, the directory the training cell saves
    # models into, rather than relying on the current working directory.
    model_path = os.path.join(curDir, 'model-20250303-050608.keras')
    model = tf.keras.models.load_model(model_path)
    model.summary()

In [19]:
# Function to predict next word

def predict_next_word(model, tokenizer, text):
    """Return the word the model predicts to follow ``text``.

    Args:
        model: Trained Keras model. ``model.input_shape[1]`` is read as the
            number of tokens expected per sample; ``None`` means no fixed
            length, in which case the tokens are passed through unpadded.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        text: Seed text to continue.

    Returns:
        The predicted word, or ``None`` when the predicted class index has no
        entry in ``tokenizer.word_index``.
    """
    input_length = model.input_shape[1]
    token_list = tokenizer.texts_to_sequences([text])[0]
    # Keep only the most recent tokens that fit into the model's input window.
    if input_length is not None and len(token_list) > input_length:
        token_list = token_list[-input_length:]
    token_list = pad_sequences([token_list], maxlen=input_length, padding="pre")
    predicted = model.predict(token_list, verbose=0)
    # The batch holds a single sample: reduce argmax's 1-element array to a
    # plain int so the comparison below is an unambiguous scalar test.
    predicted_word_index = int(np.argmax(predicted, axis=1)[0])
    return next(
        (
            word
            for word, index in tokenizer.word_index.items()
            if index == predicted_word_index
        ),
        None,
    )

In [None]:
# Predict next word (step by step, printing every intermediate result)

input_text = "With mirth the king"
print(f"Input text: {input_text}")
# Raw token ids of the input text
token_list = tokenizer.texts_to_sequences([input_text])[0]
print(f"Token list: {token_list}")
# Left-pad to the predictor width used for training
token_list = pad_sequences([token_list], maxlen=max_sequence_length - 1, padding="pre")
print(f"Padded token list: {token_list}")
# Probability distribution over the vocabulary
predicted = model.predict(token_list, verbose=0)
print(f"Predicted: {predicted}")
# Index of the most probable class (1-element array: the batch has one sample)
predicted_word_index = np.argmax(predicted, axis=1)
print(f"Predicted word index: {predicted_word_index}")
# Map the index back to its word; compare as a plain int, not as an array
best_index = int(predicted_word_index[0])
for word, index in tokenizer.word_index.items():
    if index == best_index:
        print(f"Predicted next word: {word}")
        break


In [None]:
# Generate text by repeatedly feeding the growing string back into the model
input_text = "I am"

textStr = input_text
print(textStr, end=" ")
for i in range(1, 150):
    next_word = predict_next_word(model, tokenizer, textStr)
    # predict_next_word returns None when the predicted index maps to no word;
    # stop here instead of failing on the string concatenation below.
    if next_word is None:
        break
    print(next_word, end=" ")
    # Wrap the output after every 20 generated words
    if i % 20 == 0:
        print("\n", end="")
    textStr = textStr + " " + next_word