In [7]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os


In [8]:
# Colab-only: open the browser file picker and upload the corpus into the
# runtime. NOTE(review): the recorded output shows the upload being saved as
# "text (1).txt", which suggests a "text.txt" already existed; the loading
# cell opens "text.txt", so it may be reading the older copy - confirm.
from google.colab import files
uploaded = files.upload()

Saving text.txt to text (1).txt


In [9]:
# Read the corpus line by line. The context manager closes the handle even if
# reading fails (it was previously left open for the life of the kernel).
with open("text.txt", "r", encoding="utf8") as file:
    # store file in list
    lines = list(file)

# Convert list to string. One join is enough; re-joining inside a per-line
# loop repeated the same work for every line of the book.
data = ' '.join(lines)

# remove unnecessary stuff: new line, carriage return, BOM and curly quotes
# are deleted outright (the ' ' inserted by the join above keeps words from
# adjacent lines separated)
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '').replace('“','').replace('”','')

# remove unnecessary spaces: collapse every run of whitespace to one space
data = ' '.join(data.split())
data[:493]

'The Project Gutenberg eBook of Pride and Prejudice This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this ebook or online at www.gutenberg.org. If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook.'

In [10]:
# Length of the cleaned corpus in characters.
len(data)

733694

In [11]:
# Build the word -> integer index vocabulary from the whole corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# saving the tokenizer for predict function. The with-block guarantees the
# pickle is flushed and the handle closed before the file is read back later.
with open('token.pkl', 'wb') as token_file:
    pickle.dump(tokenizer, token_file)

# Encode the corpus as one flat list of word indices and preview the start.
sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:15]

[1, 181, 390, 1000, 3, 298, 4, 946, 41, 1000, 23, 21, 1, 507, 3]

In [13]:
# Number of word indices in the encoded corpus.
len(sequence_data)

131180

In [14]:
# Size used for the Embedding input and the softmax output. The +1 leaves
# room for index 0: the Keras Tokenizer numbers words starting at 1, so the
# largest index equals len(word_index).
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

7252


In [15]:
# Slide a four-token window over the encoded corpus: each window holds three
# context words followed by the word that comes next.
sequences = [
    sequence_data[end - 3:end + 1]
    for end in range(3, len(sequence_data))
]

print("The Length of sequences are: ", len(sequences))
sequences = np.array(sequences)
sequences[:10]

The Length of sequences are:  131177


array([[   1,  181,  390, 1000],
       [ 181,  390, 1000,    3],
       [ 390, 1000,    3,  298],
       [1000,    3,  298,    4],
       [   3,  298,    4,  946],
       [ 298,    4,  946,   41],
       [   4,  946,   41, 1000],
       [ 946,   41, 1000,   23],
       [  41, 1000,   23,   21],
       [1000,   23,   21,    1]])

In [16]:
# Split every window into the model input (its first three tokens) and the
# target (its fourth token).
X = np.array([window[0:3] for window in sequences])
y = np.array([window[3] for window in sequences])

In [17]:
# Sanity check: each target should equal the last token of the next input row.
print("Data: ", X[:10])
print("Response: ", y[:10])

Data:  [[   1  181  390]
 [ 181  390 1000]
 [ 390 1000    3]
 [1000    3  298]
 [   3  298    4]
 [ 298    4  946]
 [   4  946   41]
 [ 946   41 1000]
 [  41 1000   23]
 [1000   23   21]]
Response:  [1000    3  298    4  946   41 1000   23   21    1]


In [18]:
# One-hot encode the integer targets into vectors of length vocab_size, the
# label format used with the categorical_crossentropy loss at compile time.
# NOTE(review): this rebinds y, so the cell is not idempotent - re-running it
# without rebuilding y first would one-hot encode the one-hot matrix.
# NOTE(review): with ~131k samples and a 7,252-word vocabulary the dense
# float32 matrix is roughly 3.8 GB; integer targets with
# sparse_categorical_crossentropy would avoid materialising it.
y = to_categorical(y, num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [19]:
# Next-word classifier: 10-dimensional embeddings of the three context words,
# two stacked 1000-unit LSTMs (the first returns the full sequence so the
# second can consume it), a 1000-unit ReLU layer and a softmax over the
# vocabulary.
model = Sequential([
    Embedding(vocab_size, 10, input_length=3),
    LSTM(1000, return_sequences=True),
    LSTM(1000),
    Dense(1000, activation="relu"),
    Dense(vocab_size, activation="softmax"),
])

In [20]:
# Print per-layer output shapes and parameter counts as a sanity check.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 10)             72520     
                                                                 
 lstm (LSTM)                 (None, 3, 1000)           4044000   
                                                                 
 lstm_1 (LSTM)               (None, 1000)              8004000   
                                                                 
 dense (Dense)               (None, 1000)              1001000   
                                                                 
 dense_1 (Dense)             (None, 7252)              7259252   
                                                                 
Total params: 20380772 (77.75 MB)
Trainable params: 20380772 (77.75 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [22]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Configure the optimiser and loss; the targets are one-hot vectors, hence
# categorical cross-entropy.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="categorical_crossentropy",
)

# Overwrite next_words.h5 only when the monitored training loss improves, so
# the file on disk always holds the best epoch seen so far.
checkpoint = ModelCheckpoint(
    "next_words.h5",
    monitor='loss',
    verbose=1,
    save_best_only=True,
)

model.fit(
    X,
    y,
    epochs=70,
    batch_size=64,
    callbacks=[checkpoint],
)


Epoch 1/70
Epoch 1: loss improved from inf to 6.22173, saving model to next_words.h5


  saving_api.save_model(


Epoch 2/70
Epoch 2: loss improved from 6.22173 to 5.61378, saving model to next_words.h5
Epoch 3/70
Epoch 3: loss improved from 5.61378 to 5.29342, saving model to next_words.h5
Epoch 4/70
Epoch 4: loss improved from 5.29342 to 5.06574, saving model to next_words.h5
Epoch 5/70
Epoch 5: loss improved from 5.06574 to 4.86644, saving model to next_words.h5
Epoch 6/70
Epoch 6: loss improved from 4.86644 to 4.66808, saving model to next_words.h5
Epoch 7/70
Epoch 7: loss improved from 4.66808 to 4.47146, saving model to next_words.h5
Epoch 8/70
Epoch 8: loss improved from 4.47146 to 4.27100, saving model to next_words.h5
Epoch 9/70
Epoch 9: loss improved from 4.27100 to 4.06513, saving model to next_words.h5
Epoch 10/70
Epoch 10: loss improved from 4.06513 to 3.85426, saving model to next_words.h5
Epoch 11/70
Epoch 11: loss improved from 3.85426 to 3.64047, saving model to next_words.h5
Epoch 12/70
Epoch 12: loss improved from 3.64047 to 3.42310, saving model to next_words.h5
Epoch 13/70
Epo

<keras.src.callbacks.History at 0x798912498e20>

In [23]:
from tensorflow.keras.models import load_model
import numpy as np
import pickle

# Load the model and tokenizer
model = load_model('next_words.h5')
# NOTE: unpickling can execute arbitrary code, so only load a token.pkl that
# this notebook produced. The with-block closes the handle once it is read.
with open('token.pkl', 'rb') as token_file:
    tokenizer = pickle.load(token_file)

def Predict_Next_Words(model, tokenizer, text):
    """Predict, print and return the word most likely to follow ``text``.

    Args:
        model: object whose ``predict`` method returns per-word scores for an
            array of encoded word indices.
        tokenizer: fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index`` (``index_word`` is used when it is available).
        text: the context, handed to the tokenizer as a single sample.

    Returns:
        The predicted word, or ``""`` when the tokenizer produced no indices
        for ``text`` or the winning index has no word attached to it.
    """
    sequence = np.array(tokenizer.texts_to_sequences([text]))

    predicted_word = ""

    # With no encoded context (e.g. none of the words are in the vocabulary)
    # there is nothing meaningful to feed the model, so skip the prediction
    # instead of passing it an empty array.
    if sequence.size > 0:
        preds = int(np.argmax(model.predict(sequence)))

        # Direct index -> word lookup instead of scanning word_index on every
        # call; fall back to inverting word_index if the tokenizer does not
        # carry the reverse mapping.
        index_word = getattr(tokenizer, "index_word", None)
        if not index_word:
            index_word = {index: word for word, index in tokenizer.word_index.items()}
        predicted_word = index_word.get(preds, "")

    print(predicted_word)
    return predicted_word

In [24]:
# Interactive prompt: predict the word following the last three words of each
# entered line until the user types "0".
while True:
    text = input("Enter your line: ")

    if text == "0":
        print("Execution completed.....")
        break

    # split() with no argument ignores repeated, leading and trailing
    # whitespace; split(" ") produced empty strings there, which took up
    # context slots and were then discarded by the tokenizer.
    words = text.split()[-3:]
    if not words:
        # Blank input: nothing to predict from, so just prompt again.
        continue

    try:
        print(words)

        Predict_Next_Words(model, tokenizer, words)

    except Exception as e:
        # Report the problem and keep the prompt alive for the next line.
        print("Error occurred: ", e)

Enter your line: how can you abuse your own
['abuse', 'your', 'own']
children
Enter your line: He was quite
['He', 'was', 'quite']
young
Enter your line: He could not help seeing that you were about five times as
['five', 'times', 'as']
pretty
Enter your line: quit
['quit']
netherfield
Enter your line: 0
Execution completed.....
