In [1]:
import re

import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, GRU, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


In [2]:
# Load the whole dialogue corpus as one lower-cased string.
# The encoding is pinned so decoding does not depend on the platform's
# locale default (assumes the file is UTF-8 -- TODO confirm against the dataset).
with open('dialogues_text.txt', 'r', encoding='utf-8') as file:
    text = file.read().lower()

In [3]:
# Split the corpus into utterances on the end-of-utterance marker.
# Splitting on the bare substring 'eou' also cuts inside ordinary words
# (e.g. "gorgeous") and leaves the marker's underscores attached, so match
# the marker itself: `__eou__` (DailyDialog-style, assumed -- verify against
# the file) or a standalone `eou` token.
utterances = re.split(r'__eou__|\beou\b', text)
# Drop surrounding whitespace and discard empty fragments.
utterances = [u.strip() for u in utterances if u.strip()]

In [4]:
# Keep only the first `max_samples` utterances; everything downstream
# (vocabulary, n-grams, training data) is built from this truncated list.
max_samples = 10_000
utterances = utterances[:max_samples]

In [5]:
# Learn the word -> integer index vocabulary from the utterances.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(utterances)

# One more than the number of distinct words; used below as both the
# embedding input size and the softmax output size.
total_words = 1 + len(tokenizer.word_index)

In [6]:
# For every utterance, emit each prefix of its token ids with length >= 2:
# [w1, w2], [w1, w2, w3], ... The last id of a prefix later becomes the target.
input_sequences = [
    token_ids[:prefix_len]
    for token_ids in tokenizer.texts_to_sequences(utterances)
    for prefix_len in range(2, len(token_ids) + 1)
]

In [7]:
# The longest prefix sets the common width; shorter ones are left-padded.
max_sequence_len = max(map(len, input_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

# The final column is the word to predict; all preceding columns are the context.
x = input_sequences[:, :-1]
y = input_sequences[:, -1]

In [8]:
# `input_sequences` is already a left-padded array of width `max_sequence_len`
# at this point, so padding it again and copying it through np.array() was
# redundant work that produced an identical array. Just display it.
input_sequences

array([[   0,    0,    0, ...,    0,    3,  596],
       [   0,    0,    0, ...,    3,  596, 2340],
       [   0,    0,    0, ...,    0,   48, 1215],
       ...,
       [   0,    0,    0, ...,    2,   55,   48],
       [   0,    0,    0, ...,   55,   48,   54],
       [   0,    0,    0, ...,   48,   54, 1089]])

In [11]:
# Hold out 20% of the (context, next-word) pairs; the fixed seed keeps the
# split the same from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Callback watching `val_loss` with a patience of 3 epochs; configured to
# restore the best weights seen when training is stopped.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

In [12]:
## Train our LSTM RNN

## Define the model
# Mini-batch size; also read by the training cell's model.fit(...) call.
batch_size = 128

model = Sequential()
# One integer token id per context position.
model.add(Input(shape=(x_train.shape[1],)))
# 50-dimensional embedding (kept small). The sequence length comes from the
# Input layer above, so the deprecated `input_length` argument is not passed.
model.add(Embedding(total_words, 50))
# First LSTM returns the full sequence so the second LSTM can consume it.
model.add(LSTM(50, return_sequences=True))
model.add(LSTM(50))
# One softmax score per vocabulary index.
model.add(Dense(total_words, activation="softmax"))

## Compile the model
# Targets are integer word ids (not one-hot), hence the sparse loss.
model.compile(loss="sparse_categorical_crossentropy", optimizer='adam', metrics=['accuracy'])
model.summary()



In [13]:
# Train for up to 10 epochs, validating on the held-out split each epoch;
# the early-stopping callback may end training sooner.
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=10,
    validation_data=(x_test, y_test),
    callbacks=[early_stopping],
    verbose=1,
)

Epoch 1/10
[1m520/520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 499ms/step - accuracy: 0.0385 - loss: 6.8723 - val_accuracy: 0.0446 - val_loss: 6.2054
Epoch 2/10
[1m520/520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m323s[0m 622ms/step - accuracy: 0.0511 - loss: 6.1105 - val_accuracy: 0.0569 - val_loss: 6.1509
Epoch 3/10
[1m520/520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m223s[0m 428ms/step - accuracy: 0.0612 - loss: 5.9851 - val_accuracy: 0.0749 - val_loss: 6.0275
Epoch 4/10
[1m520/520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m217s[0m 417ms/step - accuracy: 0.0851 - loss: 5.8048 - val_accuracy: 0.0900 - val_loss: 5.8949
Epoch 5/10
[1m520/520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 370ms/step - accuracy: 0.0968 - loss: 5.6373 - val_accuracy: 0.1017 - val_loss: 5.7989
Epoch 6/10
[1m520/520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m179s[0m 344ms/step - accuracy: 0.1126 - loss: 5.4870 - val_accuracy: 0.1106 - val_loss: 5.7227
Epoc

In [14]:
def predict_next_word(model, tokenizer, text, max_sequence_len):
    """Return the word the model scores highest as the continuation of ``text``.

    Args:
        model: Object whose ``predict(batch, verbose=0)`` returns one row of
            per-word-index scores for each input row.
        tokenizer: Object providing ``texts_to_sequences`` and a
            ``word_index`` mapping of word -> integer index.
        text: Seed text to continue.
        max_sequence_len: Padded n-gram length; the model is fed the last
            ``max_sequence_len - 1`` tokens of ``text``, left-padded.

    Returns:
        The predicted word, or ``None`` when the highest-scoring index has no
        entry in ``tokenizer.word_index`` (for example the padding index 0).
    """
    context_len = max_sequence_len - 1
    token_list = tokenizer.texts_to_sequences([text])[0]
    # Keep only the most recent tokens that fit, then left-pad to the model's width.
    token_list = pad_sequences([token_list[-context_len:]], maxlen=context_len, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    predicted_word_index = np.argmax(predicted, axis=1)[0]
    # Stop at the first matching word; fall back to None instead of raising
    # IndexError when the index is not part of the vocabulary.
    return next(
        (word for word, index in tokenizer.word_index.items() if index == predicted_word_index),
        None,
    )

In [22]:
## Save the model
model.save("LSTM.keras")

## Save the tokenizer
# Pickled alongside the model so the same word -> index mapping can be reloaded.
import pickle

with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(
        tokenizer,
        handle,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

In [21]:
# Try the trained model on a sample prompt.
input_text = "  Barn. Last night of all,When you"
print(f"Input text:{input_text}")

# The model's input width is the context length, i.e. one less than the
# padded n-gram length that predict_next_word expects.
max_sequence_len = model.input_shape[1] + 1
next_word = predict_next_word(model, tokenizer, input_text, max_sequence_len)
print(f"Next Word:{next_word}")

Input text:  Barn. Last night of all,When you
Next Word:have
