In [1]:
import nltk
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
import tensorflow as tf
import pickle

In [2]:
# Kaggle dataset file holding the text the model is trained on.
HAMLET_PATH = "/kaggle/input/shakespears-hamlet-text/hamlet.txt"

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding; lower-case everything before building the vocabulary.
with open(HAMLET_PATH, "r", encoding="utf-8") as file:
    text = file.read().lower()

# Fit the word -> integer index vocabulary on the whole text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# +1 leaves room for index 0, which the padded sequences use as filler.
total_words = len(tokenizer.word_index) + 1
total_words

4818

In [3]:
# Encode the text one line at a time, then expand every encoded line into all
# of its prefixes of length >= 2: the last token of each prefix is the word to
# predict and the tokens before it are the context.
encoded_lines = (tokenizer.texts_to_sequences([row])[0] for row in text.split('\n'))
input_sequences = [
    encoded[:end]
    for encoded in encoded_lines
    for end in range(2, len(encoded) + 1)
]

In [4]:
# Length of the longest n-gram; every sequence is padded to this width.
max_sequence_len = max(map(len, input_sequences))
max_sequence_len

14

In [5]:
# Left-pad ('pre') every n-gram with zeros to max_sequence_len so that all rows
# have the same width. pad_sequences already returns a NumPy array.
input_sequences = pad_sequences(
    input_sequences,
    maxlen=max_sequence_len,
    padding='pre',
)
input_sequences

array([[   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       [   0,    0,    0, ...,  687,    4,   45],
       ...,
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4],
       [   0,    0,    0, ..., 1047,    4,  193]], dtype=int32)

In [6]:
# Features: every column except the last (the context tokens).
X = input_sequences[:, :-1]
# Labels: the last column (the token that follows the context).
y = input_sequences[:, -1]

In [7]:
# Show the feature matrix: each padded n-gram with its final token removed.
X

array([[   0,    0,    0, ...,    0,    0,    1],
       [   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       ...,
       [   0,    0,    0, ...,  687,    4,   45],
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4]], dtype=int32)

In [8]:
# Show the integer labels: the final token of each padded n-gram.
y

array([ 687,    4,   45, ..., 1047,    4,  193], dtype=int32)

In [9]:
# One-hot encode the next-word labels over the whole vocabulary, as required by
# the categorical_crossentropy loss. The labels are read straight from the last
# column of input_sequences instead of from `y` itself, so re-running this cell
# does not one-hot encode an already one-hot matrix.
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

In [10]:
# Show the labels after one-hot encoding (one column per vocabulary index).
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [11]:
# Hold out 20% of the n-grams for validation. A fixed random_state makes the
# split, and therefore every metric reported below, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [12]:
# Next-word classifier: embedding -> two stacked LSTMs -> softmax over the vocabulary.
model = Sequential()
model.add(Embedding(input_dim=total_words, output_dim=100))
# return_sequences=True emits one vector per time step so the second LSTM
# receives a sequence rather than a single vector.
model.add(LSTM(150, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
# One output unit per vocabulary index, matching the one-hot labels.
model.add(Dense(total_words, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# The model is fed X, i.e. each padded n-gram minus its final (label) token, so
# it sees max_sequence_len - 1 time steps per sample, not max_sequence_len.
model.build(input_shape=(None, max_sequence_len - 1))

model.summary()


In [13]:
# Train for 150 epochs, evaluating on the held-out split after each one.
# NOTE(review): in the recorded log val_loss rises from epoch 2 onward while the
# training loss keeps falling -- consider early stopping or fewer epochs.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=150,
    verbose=1,
)

Epoch 1/150
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 43ms/step - accuracy: 0.0303 - loss: 7.1705 - val_accuracy: 0.0392 - val_loss: 6.7370
Epoch 2/150
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 41ms/step - accuracy: 0.0353 - loss: 6.4548 - val_accuracy: 0.0460 - val_loss: 6.8088
Epoch 3/150
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 44ms/step - accuracy: 0.0434 - loss: 6.3230 - val_accuracy: 0.0517 - val_loss: 6.8636
Epoch 4/150
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 42ms/step - accuracy: 0.0485 - loss: 6.1640 - val_accuracy: 0.0519 - val_loss: 6.8869
Epoch 5/150
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 40ms/step - accuracy: 0.0549 - loss: 6.0340 - val_accuracy: 0.0542 - val_loss: 6.9307
Epoch 6/150
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 41ms/step - accuracy: 0.0600 - loss: 5.9164 - val_accuracy: 0.0596 - val_loss: 6.9391
Epoch 7/15

In [14]:
def predict_next_word(model, tokenizer, text, max_sequence_len):
    """Return the word most likely to follow ``text``, or ``None``.

    Args:
        model: Trained model whose ``predict`` returns, for each input row,
            one score per vocabulary index.
        tokenizer: Fitted Keras ``Tokenizer`` used to encode ``text`` and to
            map the winning index back to a word.
        text: Seed text to continue.
        max_sequence_len: Length of the training n-grams *including* the label
            token; the model input is ``max_sequence_len - 1`` tokens wide.

    Returns:
        The predicted word, or ``None`` when the winning index has no entry in
        the vocabulary (e.g. index 0, which is only used as padding filler).
    """
    context_len = max_sequence_len - 1
    token_list = tokenizer.texts_to_sequences([text])[0]
    # pad_sequences left-pads short inputs with zeros and, with
    # truncating='pre', keeps only the most recent context_len tokens of long
    # ones, so no manual slicing is needed beforehand.
    padded = pad_sequences(
        [token_list], maxlen=context_len, padding='pre', truncating='pre'
    )
    # verbose must be the integer 0 (not the string '0') to silence the progress bar.
    predicted = model.predict(padded, verbose=0)
    # argmax over axis 1 yields a 1-element array; reduce it to a plain int so
    # it can be used as a dictionary key.
    predicted_word_index = int(np.argmax(predicted, axis=1)[0])
    # index_word is the tokenizer's reverse vocabulary: a direct lookup replaces
    # the linear scan over word_index.
    return tokenizer.index_word.get(predicted_word_index)

In [15]:
input_text = "how are "
print(f"Input text:{input_text}")
# Reuse the max_sequence_len computed from the training n-grams so the seed text
# is padded to the same width as the training inputs (max_sequence_len - 1),
# instead of re-deriving it from model.input_shape and overwriting the global.
next_word = predict_next_word(model, tokenizer, input_text, max_sequence_len)
print(f"Next word prediction:{next_word}")

Input text:how are 
Next word prediction:a


In [16]:
# Persist the trained network to the Kaggle working directory.
model.save("/kaggle/working/next_word_lstm.h5")

# Persist the fitted tokenizer next to it: inference needs the exact same
# word <-> index mapping that was used for training.
with open("/kaggle/working/tokenizer.pickle", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)
    

In [17]:
# app_code =

# """import streamlit as st
# import numpy as np
# import pickle
# from tensorflow.keras.models import load_model
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# model = load_model('/kaggle/working/next_word_lstm.h5')

# with open('/kaggle/working/tokenizer.pickle', 'rb') as handle:
#     tokenizer = pickle.load(handle)

# def predict_next_word(model, tokenizer, text, max_sequence_len):
#     token_list = tokenizer.texts_to_sequences([text])[0]
#     if len(token_list) >= max_sequence_len:
#         token_list = token_list[-(max_sequence_len - 1):]
#     token_list = pad_sequences([token_list], maxlen = max_sequence_len - 1, padding = "pre")
#     predicted = model.predict(token_list, verbose = 0)
#     predicted_word_index = np.argmax(predicted, axis = 1)
#     for word, index in tokenizer.word_index.items():
#         if index == predicted_word_index:
#             return word
#     return None


# st.title("Next word prediction with LSTM")
# input_text = st.text_input("Please enter one sentence", "To be or not to be")
# if st.button("Predict next word"):
#     max_sequence_len = model.input_shape[1] + 1
#     next_word = predict_next_word(model, tokenizer, input_text, max_sequence_len)
#     st.write(f"Next word: {next_word}")"""

# with open('/kaggle/working/app.py', 'w') as f:
#     f.write(app_code)

# print("app.py has been created successfully!")