<a href="https://colab.research.google.com/github/ishankhurana27/next-word-prediction/blob/main/Next_word_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# data collection
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg
import pandas as pd

# load the dataset
df=gutenberg.raw('shakespeare-hamlet.txt')

#save to file
with open('hamlet.txt','w') as f:
  f.write(df)

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


In [28]:
# data processing

import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer  # convert text to vectors
from tensorflow.keras.preprocessing.sequence import pad_sequences # makes sure all sentences have same length while training lstm rnn
from sklearn.model_selection import train_test_split

# load the dataset
with open('hamlet.txt','r') as f:
  text=f.read().lower()

# tokenize the text-- creating index for words
tokenizer=Tokenizer(num_words=5000,oov_token='<OOV>')
tokenizer.fit_on_texts([text])
total_words=len(tokenizer.word_index)+1
total_words


4819

In [29]:
total_words=min(5000,len(tokenizer.word_index)+1)

In [30]:
# create input sequence. converting every sentence into sequence of indexes
input_sequences=[]
for line in text.split('\n'):
  token_list=tokenizer.texts_to_sequences([line])[0]
  for i in range(1,len(token_list)):
    n_gram_sequence=token_list[:i+1]
    input_sequences.append(n_gram_sequence)

In [31]:
input_sequences

[[2, 688],
 [2, 688, 5],
 [2, 688, 5, 46],
 [2, 688, 5, 46, 42],
 [2, 688, 5, 46, 42, 1887],
 [2, 688, 5, 46, 42, 1887, 1888],
 [2, 688, 5, 46, 42, 1887, 1888, 1889],
 [1181, 1890],
 [1181, 1890, 1891],
 [1181, 1890, 1891, 1892],
 [58, 408],
 [58, 408, 3],
 [58, 408, 3, 1182],
 [58, 408, 3, 1182, 178],
 [58, 408, 3, 1182, 178, 1893],
 [408, 1183],
 [408, 1183, 64],
 [409, 163],
 [409, 163, 378],
 [409, 163, 378, 22],
 [409, 163, 378, 22, 248],
 [409, 163, 378, 22, 248, 883],
 [19, 67],
 [452, 225],
 [452, 225, 249],
 [452, 225, 249, 2],
 [452, 225, 249, 2, 31],
 [409, 408],
 [452, 26],
 [409, 7],
 [409, 7, 44],
 [409, 7, 44, 63],
 [409, 7, 44, 63, 1894],
 [409, 7, 44, 63, 1894, 97],
 [409, 7, 44, 63, 1894, 97, 19],
 [409, 7, 44, 63, 1894, 97, 19, 567],
 [452, 72],
 [452, 72, 52],
 [452, 72, 52, 1895],
 [452, 72, 52, 1895, 568],
 [452, 72, 52, 1895, 568, 379],
 [452, 72, 52, 1895, 568, 379, 81],
 [452, 72, 52, 1895, 568, 379, 81, 4],
 [452, 72, 52, 1895, 568, 379, 81, 4, 274],
 [452, 72

In [32]:
# now we will make sure every above sentence are of equal length
# apply pad sequences

max_sequence_len=max([len(x) for x in input_sequences])
input_sequences=np.array(pad_sequences(input_sequences,maxlen=max_sequence_len,padding='pre'))

In [33]:
input_sequences

array([[   0,    0,    0, ...,    0,    2,  688],
       [   0,    0,    0, ...,    2,  688,    5],
       [   0,    0,    0, ...,  688,    5,   46],
       ...,
       [   0,    0,    0, ...,    5,   46, 1048],
       [   0,    0,    0, ...,   46, 1048,    5],
       [   0,    0,    0, ..., 1048,    5,  194]], dtype=int32)

In [34]:
# therefore now wvery sentence has equal length

In [35]:
# create predictors and labels
import tensorflow as tf
x,y=input_sequences[:,:-1],input_sequences[:,-1]
# x= all the words except for last word
# y=last word

In [36]:
x

array([[   0,    0,    0, ...,    0,    0,    2],
       [   0,    0,    0, ...,    0,    2,  688],
       [   0,    0,    0, ...,    2,  688,    5],
       ...,
       [   0,    0,    0, ...,  688,    5,   46],
       [   0,    0,    0, ...,    5,   46, 1048],
       [   0,    0,    0, ...,   46, 1048,    5]], dtype=int32)

In [37]:
y

array([ 688,    5,   46, ..., 1048,    5,  194], dtype=int32)

In [38]:
y=tf.keras.utils.to_categorical(y,num_classes=total_words)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [39]:
#train the dataset
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [42]:
# train our lstm rnn

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout

#define the model
model=Sequential()
model.add(Embedding(total_words,100,input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(150,return_sequences=True)))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(total_words,activation='softmax'))

#compile the model
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()



In [43]:
# train the model

model.fit(x_train,y_train,epochs=50,batch_size=128,validation_data=(x_test,y_test),verbose=1)

Epoch 1/50
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 200ms/step - accuracy: 0.0301 - loss: 7.4213 - val_accuracy: 0.0336 - val_loss: 6.7804
Epoch 2/50
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 177ms/step - accuracy: 0.0345 - loss: 6.5508 - val_accuracy: 0.0385 - val_loss: 6.7303
Epoch 3/50
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 174ms/step - accuracy: 0.0377 - loss: 6.4286 - val_accuracy: 0.0443 - val_loss: 6.7733
Epoch 4/50
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 173ms/step - accuracy: 0.0463 - loss: 6.2618 - val_accuracy: 0.0449 - val_loss: 6.7710
Epoch 5/50
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 177ms/step - accuracy: 0.0529 - loss: 6.1586 - val_accuracy: 0.0464 - val_loss: 6.7689
Epoch 6/50
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 177ms/step - accuracy: 0.0517 - loss: 6.1191 - val_accuracy: 0.0497 - val_loss: 6.7938
Epoch 7/50

<keras.src.callbacks.history.History at 0x7cb7b72bdd10>

In [44]:
# Function to predict the next word
def predict_next_word(model, tokenizer, text, max_sequence_len):
    token_list = tokenizer.texts_to_sequences([text])[0]
    if len(token_list) >= max_sequence_len:
        token_list = token_list[-(max_sequence_len-1):]  # Ensure the sequence length matches max_sequence_len-1
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    predicted_word_index = np.argmax(predicted, axis=1)
    for word, index in tokenizer.word_index.items():
        if index == predicted_word_index:
            return word
    return None

In [45]:
input_text=" to be or not to be"
print(f"input text:{input_text}")
max_sequence_len=model.input_shape[1]+1
next_word=predict_next_word(model,tokenizer,input_text,max_sequence_len)
print(f"next word prediction:{next_word}")

input text: to be or not to be
next word prediction:against


In [46]:
# save the model
model.save("next_word_lstm.h5")

# save the tokenizer
import pickle
with open('tokenizer.pickle','wb') as handle:
  pickle.dump(tokenizer,handle,protocol=pickle.HIGHEST_PROTOCOL)




In [54]:
# DEPLOYING WITH STREAMLIT

!pip install streamlit
import streamlit as st
import numpy as np
import pickle
from tensorflow import keras
from keras.utils import pad_sequences

model=keras.models.load_model('next_word_lstm.h5')

with open('tokenizer.pickle','rb') as handle:
  tokenizer=pickle.load(handle)

def predict_next_word(model, tokenizer, text, max_sequence_len):
    token_list = tokenizer.texts_to_sequences([text])[0]
    if len(token_list) >= max_sequence_len:
        token_list = token_list[-(max_sequence_len-1):]  # Ensure the sequence length matches max_sequence_len-1
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    predicted_word_index = np.argmax(predicted, axis=1)
    for word, index in tokenizer.word_index.items():
        if index == predicted_word_index:
            return word
    return None

# streamlit app
st.title("Next Word Prediction With LSTM And Early Stopping")
input_text=st.text_input("Enter the sequence of Words","To be or not to")
if st.button("Predict Next Word"):
    max_sequence_len = model.input_shape[1] + 1  # Retrieve the max sequence length from the model input shape
    next_word = predict_next_word(model, tokenizer, input_text, max_sequence_len)
    st.write(f'Next word: {next_word}')




2025-04-20 07:39:38.174 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-04-20 07:39:38.185 Session state does not function when running a script without `streamlit run`
