In [3]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Sequential,load_model
import tensorflow.keras.utils as ku
import pandas as pd
import numpy as np
import string, os
import re

In [4]:
word=pd.read_csv("/kaggle/input/play-s/Shakespeare_data.csv")
df = word['PlayerLine'].dropna().tolist()
print(df[:10])

['ACT I', 'SCENE I. London. The palace.', 'Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others', 'So shaken as we are, so wan with care,', 'Find we a time for frighted peace to pant,', 'And breathe short-winded accents of new broils', 'To be commenced in strands afar remote.', 'No more the thirsty entrance of this soil', "Shall daub her lips with her own children's blood,", 'Nor more shall trenching war channel her fields,']


In [5]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
s_w=set(stopwords.words('english'))
print(s_w)

{"you'd", 'or', "aren't", "couldn't", 'haven', 'any', 'what', 'doing', "mightn't", 'doesn', 'between', 'by', "wasn't", 's', "it's", 'needn', 'after', 're', 'in', 'their', 'these', 'before', 'o', 'you', 'who', 'not', 'yours', 'shan', 'the', 'was', "shouldn't", 'they', 'only', 'again', 'we', 'out', 'here', 'ma', 'her', 'very', "should've", 'herself', 'through', 've', 'about', 'there', "didn't", 'y', "you've", 'whom', 'this', 'a', 'such', 'had', 'should', 'is', 'he', 'below', 'him', 'weren', 'are', 'no', 'wasn', 'but', 'of', 'she', 'each', 'were', "isn't", 'why', 'both', 'other', 'being', 'to', 'm', "you're", 'just', 'wouldn', 'mustn', 'because', 'our', 'has', 'didn', 'those', "weren't", 'while', 'than', "hasn't", 'ours', 'now', 'does', 'all', 'can', 'mightn', 'on', 'which', 'itself', 'theirs', 'himself', 'then', 'above', "you'll", 'further', 'aren', 'own', 'where', 'some', "mustn't", 'off', "she's", 'will', 'won', 'hadn', 'too', 'and', 'it', "don't", 'my', "hadn't", 'from', "needn't", 'i

In [7]:
def remove_(sentence):
    temp=re.sub(r'[^a-zA-Z\s]','',sentence.lower())
    temp=' '.join([i for i in temp.split() if i not in s_w])
    return temp

In [8]:
df_no_stop_word=[remove_(i) for i in df]
df_no_stop_word[:10]

['act',
 'scene london palace',
 'enter king henry lord john lancaster earl westmoreland sir walter blunt others',
 'shaken wan care',
 'find time frighted peace pant',
 'breathe shortwinded accents new broils',
 'commenced strands afar remote',
 'thirsty entrance soil',
 'shall daub lips childrens blood',
 'shall trenching war channel fields']

In [9]:
token=Tokenizer()
dictionary=df_no_stop_word[:30000]
token.fit_on_texts(dictionary)
total_word=len(token.word_index)+1

seq_tok=[]
for i in dictionary:
    temp_lis=token.texts_to_sequences([i])[0]
    
    for j in range(1,len(temp_lis),1):
        n_g_t=temp_lis[:j+1]
        seq_tok.append(n_g_t)
seq_tok[:10]

[[46, 315],
 [46, 315, 293],
 [8, 14],
 [8, 14, 59],
 [8, 14, 59, 6],
 [8, 14, 59, 6, 227],
 [8, 14, 59, 6, 227, 461],
 [8, 14, 59, 6, 227, 461, 341],
 [8, 14, 59, 6, 227, 461, 341, 890],
 [8, 14, 59, 6, 227, 461, 341, 890, 10]]

In [10]:
padded_sequences = pad_sequences(seq_tok, maxlen=5, padding='pre')
print(padded_sequences[:10])

[[  0   0   0  46 315]
 [  0   0  46 315 293]
 [  0   0   0   8  14]
 [  0   0   8  14  59]
 [  0   8  14  59   6]
 [  8  14  59   6 227]
 [ 14  59   6 227 461]
 [ 59   6 227 461 341]
 [  6 227 461 341 890]
 [227 461 341 890  10]]


In [11]:
print(total_word)

13789


In [20]:
x_train = padded_sequences[:, :-1]
y_train = padded_sequences[:, -1]
y_train=ku.to_categorical(y_train,num_classes=total_word)

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout

model = Sequential()
model.add(Embedding(input_dim=total_word, output_dim=200, input_length=x_train.shape[1]))

# First LSTM layer with return_sequences=True
model.add(LSTM(200, return_sequences=True))
model.add(Dropout(0.2))

# Second LSTM layer with return_sequences=True
model.add(LSTM(200, return_sequences=True))
model.add(Dropout(0.2))

# Third LSTM layer (the last one) with return_sequences=False
model.add(LSTM(200))
model.add(Dropout(0.2))

# Output layer
model.add(Dense(total_word, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [24]:
model.fit(x_train, y_train, epochs=80, batch_size=32)
model.save('//kaggle//working//lstm_next_word_model.keras')
print("Model saved successfully.")

# Load the model from .keras file
loaded_model = load_model('/kaggle/working/lstm_next_word_model.keras')
print("Model loaded successfully.")

Epoch 1/80
[1m2599/2599[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 19ms/step - accuracy: 0.5093 - loss: 2.3325
Epoch 2/80
[1m2599/2599[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 19ms/step - accuracy: 0.5133 - loss: 2.3186
Epoch 3/80
[1m2599/2599[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 19ms/step - accuracy: 0.5099 - loss: 2.3384
Epoch 4/80
[1m2599/2599[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 19ms/step - accuracy: 0.5164 - loss: 2.3120
Epoch 5/80
[1m2599/2599[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 19ms/step - accuracy: 0.5184 - loss: 2.3110
Epoch 6/80
[1m2599/2599[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 19ms/step - accuracy: 0.5146 - loss: 2.3187
Epoch 7/80
[1m2599/2599[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 19ms/step - accuracy: 0.5177 - loss: 2.3095
Epoch 8/80
[1m2599/2599[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 19ms/step - accuracy: 0.5205 - loss: 2.2935
Epoch 9/

In [16]:
import pickle
loaded_model = load_model('/kaggle/working/lstm_next_word_model.keras')
print("Model loaded successfully.")
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(token, f)

Model loaded successfully.


In [18]:
def predict_next_words(model, tokenizer, input_text, num_words=5, max_length=5):
    # Preprocess input_text to match the training format
    input_seq = tokenizer.texts_to_sequences([input_text])[0]
    
    predicted_words = []

    for _ in range(num_words):
        
        input_seq = pad_sequences([input_seq], maxlen=max_length, padding='pre')
        
        predicted = model.predict(input_seq, verbose=0)
        
        predicted_word_index = np.argmax(predicted, axis=-1)[0]
        
        predicted_word = tokenizer.index_word[predicted_word_index]
        
        predicted_words.append(predicted_word)
        
        input_seq = np.append(input_seq[0][1:], predicted_word_index)
    
    return ' '.join(predicted_words)

In [19]:
input_text = "My glow is good"
next_words = predict_next_words(loaded_model, token, input_text, num_words=3, max_length=5)
print(f"{input_text} : {next_words}")

My glow is good : mans adriana indeed
