In [6]:
# Importing Libraries

import os
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical


In [7]:
# Load and pre-process the data

# Read every line of the book. The context manager closes the file handle as
# soon as the read is finished instead of leaving it open for the whole session.
with open("blue_castle.txt", "r", encoding="utf8") as file:
    lines = file.readlines()

# Join the lines into one string. This is done once; the two-space separator
# keeps the last word of a line apart from the first word of the next one after
# the newline characters are removed below.
data = '  '.join(lines)

# Remove (not replace with a space) newlines, carriage returns, the byte-order
# mark and curly double quotes.
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '').replace('“','').replace('”','')

# Collapse every run of whitespace into a single space.
data = ' '.join(data.split())
data[:500]

'The Project Gutenberg eBook of The Blue Castle, by Lucy Maud Montgomery This eBook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.org. If you are not located in the United States, you will have to check the laws of the country where you are located before usi'

In [8]:
# Number of characters in the cleaned corpus string.
len(data)

402906

In [9]:
# Build the word -> integer id vocabulary from the cleaned corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# Save the fitted tokenizer for the predict function. The `with` block flushes
# and closes the file, so the pickle is complete on disk before it is read back.
with open('token.pkl', 'wb') as token_file:
    pickle.dump(tokenizer, token_file)

# Encode the whole corpus as one flat list of token ids.
sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:15]


[1, 112, 97, 587, 4, 1, 94, 147, 58, 2383, 2384, 1818, 51, 587, 42]

In [10]:
# Number of token ids the corpus was encoded into.
len(sequence_data)

72052

In [11]:
# Number of distinct words in the tokenizer's word -> id mapping.
len(tokenizer.word_index)

8412

In [12]:
# Word ids run from 1 up to len(word_index), so one extra slot is needed for
# every id to be a valid index into the embedding table and the softmax output.
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

8413


In [13]:
# Slide a four-token window over the corpus one position at a time: the window
# ending at position `end` holds the three preceding ids plus the id at `end`.
sequences = [
    sequence_data[end - 3:end + 1]
    for end in range(3, len(sequence_data))
]

print("The Length if sequences are:" , len(sequences))
sequences = np.array(sequences)
sequences[:20]


The Length if sequences are: 72049


array([[   1,  112,   97,  587],
       [ 112,   97,  587,    4],
       [  97,  587,    4,    1],
       [ 587,    4,    1,   94],
       [   4,    1,   94,  147],
       [   1,   94,  147,   58],
       [  94,  147,   58, 2383],
       [ 147,   58, 2383, 2384],
       [  58, 2383, 2384, 1818],
       [2383, 2384, 1818,   51],
       [2384, 1818,   51,  587],
       [1818,   51,  587,   42],
       [  51,  587,   42,   20],
       [ 587,   42,   20,    1],
       [  42,   20,    1,  252],
       [  20,    1,  252,    4],
       [   1,  252,    4, 1250],
       [ 252,    4, 1250,  846],
       [   4, 1250,  846,    9],
       [1250,  846,    9,    1]])

In [14]:
# The first three ids of each window are the model input; the fourth id is the
# word to be predicted.
X = np.array([window[0:3] for window in sequences])
y = np.array([window[3] for window in sequences])

In [15]:
# Model inputs: the first three token ids of every window.
X

array([[   1,  112,   97],
       [ 112,   97,  587],
       [  97,  587,    4],
       ...,
       [8412,    3,  393],
       [   3,  393,   62],
       [ 393,   62,  219]])

In [16]:
# Targets: the fourth token id of every window.
y

array([ 587,    4,    1, ...,   62,  219, 1050])

In [17]:
# Sanity check: there must be exactly one target per input row.
X.shape,y.shape

((72049, 3), (72049,))

In [18]:
# Preview the first ten input windows next to the ids they should predict.
print("Data:", X[:10])
print("Response:", y[:10])

Data: [[   1  112   97]
 [ 112   97  587]
 [  97  587    4]
 [ 587    4    1]
 [   4    1   94]
 [   1   94  147]
 [  94  147   58]
 [ 147   58 2383]
 [  58 2383 2384]
 [2383 2384 1818]]
Response: [ 587    4    1   94  147   58 2383 2384 1818   51]


In [19]:
# One-hot encode the integer targets so each row has vocab_size entries, the
# same width as the model's softmax output.
# NOTE(review): this materialises a dense (len(y), vocab_size) array. Keeping
# integer labels and training with sparse_categorical_crossentropy would avoid
# the allocation; confirm the memory cost is acceptable.
y = to_categorical(y, num_classes=vocab_size)
y[:10]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [20]:
# Next-word model: three token ids in, one probability per vocabulary index out.
# Sequential, Input, Embedding, LSTM and Dense come from the import cell at the
# top of the notebook.
model = Sequential()
model.add(Input(shape=(3,)))  # each sample is a window of three token ids
# `input_length` is deprecated in Keras 3 and was redundant here: the Input
# layer above already fixes the sequence length.
model.add(Embedding(vocab_size, 10))
# return_sequences=True makes this layer emit one vector per timestep, which is
# the sequence input the second LSTM needs.
model.add(LSTM(1000, return_sequences=True))
model.add(LSTM(1000))
model.add(Dense(1000, activation="relu"))
model.add(Dense(vocab_size, activation="softmax"))



In [21]:
# Print the layer-by-layer summary of the model built above.
model.summary()

In [22]:
# Write a diagram of the architecture to plot.png (graphviz must be installed).
# Called through the `tf` import from the top of the notebook so that every
# Keras symbol in this notebook comes from the same tensorflow.keras namespace.
tf.keras.utils.plot_model(model, to_file='plot.png', show_layer_names=True)

You must install graphviz (see instructions at https://graphviz.gitlab.io/download/) for `plot_model` to work.


In [36]:
# Training hyper-parameters, named so they can be tuned in one place.
CHECKPOINT_PATH = "next_word.keras"
LEARNING_RATE = 0.001
EPOCHS = 20
BATCH_SIZE = 64

# Save the model every time the training loss reaches a new minimum.
# (.keras is used instead of the older 'next_word.h5' file name.)
checkpoint = ModelCheckpoint(CHECKPOINT_PATH, monitor='loss', verbose=1, save_best_only=True)
model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=LEARNING_RATE))

# Keep the returned History object so the per-epoch loss can be inspected or
# plotted after training instead of being discarded.
history = model.fit(X, y, epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks=[checkpoint])

Epoch 1/20
[1m1126/1126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 183ms/step - loss: 7.0195
Epoch 1: loss improved from inf to 6.76594, saving model to next_word.keras
[1m1126/1126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m211s[0m 184ms/step - loss: 7.0193
Epoch 2/20
[1m1126/1126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 192ms/step - loss: 6.2638
Epoch 2: loss improved from 6.76594 to 6.21900, saving model to next_word.keras
[1m1126/1126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m217s[0m 193ms/step - loss: 6.2637
Epoch 3/20
[1m1126/1126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 192ms/step - loss: 5.8490
Epoch 3: loss improved from 6.21900 to 5.81626, saving model to next_word.keras
[1m1126/1126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m217s[0m 193ms/step - loss: 5.8489
Epoch 4/20
[1m1126/1126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172ms/step - loss: 5.5362
Epoch 4: loss improved from 5.81626 to 5.51724, saving m

<keras.src.callbacks.history.History at 0x23211a58940>

**Let's Predict**

In [52]:
# Load the checkpoint saved during training and the tokenizer pickled right
# after it was fitted. load_model and pickle come from the import cell at the
# top of the notebook.
model = load_model('next_word.keras')

# NOTE: unpickling can execute arbitrary code, so only load a token.pkl that
# this notebook produced. The `with` block closes the file after reading.
with open('token.pkl', 'rb') as token_file:
    tokenizer = pickle.load(token_file)

def Predict_Next_Words(model, tokenizer, text):
    """Predict, print and return the word most likely to follow ``text``.

    Args:
        model: Object whose ``predict`` accepts a 2-D array of token ids and
            returns one score per vocabulary index. A single sample is
            assumed, because the argmax is taken over the whole result.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index`` (word -> integer id).
        text: The context to predict from. It is handed to the tokenizer as
            one sample.

    Returns:
        The word whose id received the highest score, or ``""`` when that id
        has no entry in ``tokenizer.word_index``.

    Raises:
        ValueError: If the tokenizer produces no ids for ``text``, i.e. none
            of the input words are in its vocabulary.
    """
    sequence = np.array(tokenizer.texts_to_sequences([text]))
    if sequence.size == 0:
        # Fail with a clear message instead of sending an empty batch to the
        # model.
        raise ValueError("None of the input words are in the tokenizer vocabulary")

    preds = np.argmax(model.predict(sequence))

    # Reverse lookup id -> word. The "" default covers ids that no word maps
    # to; previously the result variable was left unbound in that case and the
    # function crashed with UnboundLocalError.
    predicted_word = next(
        (word for word, index in tokenizer.word_index.items() if index == preds),
        "",
    )

    print(predicted_word)
    return predicted_word


# In[ ]:




In [53]:
# Interactive prompt: keep predicting the next word until the user enters "0".
while True:
    text = input("Enter your line:")

    if text == "0":
        print("Execution completed....")
        break

    try:
        # split() with no argument drops the empty strings that split(" ")
        # produced for repeated, leading or trailing spaces. Only the last
        # three words are kept, matching the three-word windows the model
        # was trained on.
        words = text.split()[-3:]
        if not words:
            # Blank input: there is nothing to predict from, so ask again.
            print("Please enter at least one word.")
            continue
        print(words)

        Predict_Next_Words(model, tokenizer, words)

    except Exception as e:
        # Best-effort prompt: report the problem and keep the loop alive.
        print("Error occured: ", e)


['natural', 'language', 'processing']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 309ms/step
in
Execution completed....
