In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np 
import os

In [None]:
content= open ("NEXT WORD DATASET.txt","r", encoding="utf8")

In [6]:
#storing the text file in a list

lines = []
for i in content:
    lines.append(i)
    
#converting list to string

data = ""
for i in lines:
    data= ''.join(lines)
    
    
#replace unnecessary stuffs with spaces 
data = data.replace('\n', '').replace('\r','').replace('\ufeff','').replace('“','').replace('”','').replace('*','')

#remove unnecessary spaces
data = data.split()
data = ' '.join(data) 
data[:500]

"Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan DoyleThis eBook is for the use of anyone anywhere at no cost and withalmost no restrictions whatsoever. You may copy it, give it away orre-use it under the terms of the Project Gutenberg License includedwith this eBook or online at www.gutenberg.netTitle: The Adventures of Sherlock HolmesAuthor: Arthur Conan DoyleRelease Date: November 29, 2002 [EBook #1661]Last Updated: May 20, 2019Language: EnglishCharacter set encoding: UT"

In [7]:
len(data)

567092

In [8]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

#Saving the tokenizer for predict function
pickle.dump(tokenizer, open('token.pkl','wb'))

sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:15]

[159, 4872, 1, 963, 5, 124, 32, 45, 554, 2015, 4873, 1042, 13, 21, 1]

In [9]:
len(sequence_data)

103104

In [10]:
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

13188


In [11]:
sequences=[]

for i in range(3, len(sequence_data)):
    words = sequence_data[i-3:i+1]
    sequences.append(words)
    
print("The length of sequences are: ",len(sequences))
sequences = np.array(sequences)
sequences[:10]

The length of sequences are:  103101


array([[ 159, 4872,    1,  963],
       [4872,    1,  963,    5],
       [   1,  963,    5,  124],
       [ 963,    5,  124,   32],
       [   5,  124,   32,   45],
       [ 124,   32,   45,  554],
       [  32,   45,  554, 2015],
       [  45,  554, 2015, 4873],
       [ 554, 2015, 4873, 1042],
       [2015, 4873, 1042,   13]])

In [13]:
X= []
y= []

for i in sequences:
    X.append(i[0:3])
    y.append(i[3])
    
X=np.array(X)
y=np.array(y)

In [14]:
print("Data: ",X[:10])
print("Response: ",y[:10])

Data:  [[ 159 4872    1]
 [4872    1  963]
 [   1  963    5]
 [ 963    5  124]
 [   5  124   32]
 [ 124   32   45]
 [  32   45  554]
 [  45  554 2015]
 [ 554 2015 4873]
 [2015 4873 1042]]
Response:  [ 963    5  124   32   45  554 2015 4873 1042   13]


In [15]:
y = to_categorical(y, num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [16]:
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=3))
model.add(LSTM(1000, return_sequences = True))
model.add(LSTM(1000))
model.add(Dense(1000, activation = "relu"))
model.add(Dense(vocab_size, activation = "softmax"))

In [13]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 10)             131880    
                                                                 
 lstm (LSTM)                 (None, 3, 1000)           4044000   
                                                                 
 lstm_1 (LSTM)               (None, 1000)              8004000   
                                                                 
 dense (Dense)               (None, 1000)              1001000   
                                                                 
 dense_1 (Dense)             (None, 13188)             13201188  
                                                                 
Total params: 26382068 (100.64 MB)
Trainable params: 26382068 (100.64 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

checkpoint = ModelCheckpoint("next_words.h5", monitor='loss', verbose=1, save_best_only= True)
model.compile(loss="categorical_crossentropy", optimizer= Adam(learning_rate=0.001), metrics=['accuracy'])
model.fit(X,y, epochs=70, batch_size=64 , callbacks=[checkpoint])

Epoch 1/70
Epoch 1: loss improved from inf to 6.87204, saving model to next_words.h5


  saving_api.save_model(


Epoch 2/70
Epoch 2: loss improved from 6.87204 to 6.35439, saving model to next_words.h5
Epoch 3/70
Epoch 3: loss improved from 6.35439 to 5.96949, saving model to next_words.h5
Epoch 4/70
Epoch 4: loss improved from 5.96949 to 5.67866, saving model to next_words.h5
Epoch 5/70
Epoch 5: loss improved from 5.67866 to 5.42441, saving model to next_words.h5
Epoch 6/70
Epoch 6: loss improved from 5.42441 to 5.18141, saving model to next_words.h5
Epoch 7/70
Epoch 7: loss improved from 5.18141 to 4.94473, saving model to next_words.h5
Epoch 8/70
Epoch 8: loss improved from 4.94473 to 4.71342, saving model to next_words.h5
Epoch 9/70
Epoch 9: loss improved from 4.71342 to 4.47486, saving model to next_words.h5
Epoch 10/70
Epoch 10: loss improved from 4.47486 to 4.22682, saving model to next_words.h5
Epoch 11/70
Epoch 11: loss improved from 4.22682 to 3.96983, saving model to next_words.h5
Epoch 12/70
Epoch 12: loss improved from 3.96983 to 3.70628, saving model to next_words.h5
Epoch 13/70
Epo

Epoch 32/70
Epoch 32: loss improved from 0.69596 to 0.65031, saving model to next_words.h5
Epoch 33/70
Epoch 33: loss improved from 0.65031 to 0.61992, saving model to next_words.h5
Epoch 34/70
Epoch 34: loss improved from 0.61992 to 0.58935, saving model to next_words.h5
Epoch 35/70
Epoch 35: loss improved from 0.58935 to 0.56558, saving model to next_words.h5
Epoch 36/70
Epoch 36: loss improved from 0.56558 to 0.54868, saving model to next_words.h5
Epoch 37/70
Epoch 37: loss improved from 0.54868 to 0.52730, saving model to next_words.h5
Epoch 38/70
Epoch 38: loss improved from 0.52730 to 0.50804, saving model to next_words.h5
Epoch 39/70
Epoch 39: loss improved from 0.50804 to 0.50216, saving model to next_words.h5
Epoch 40/70
Epoch 40: loss improved from 0.50216 to 0.47951, saving model to next_words.h5
Epoch 41/70
Epoch 41: loss improved from 0.47951 to 0.47387, saving model to next_words.h5
Epoch 42/70
Epoch 42: loss improved from 0.47387 to 0.46150, saving model to next_words.h5

In [None]:
#PREDICTION

from tensorflow.keras.models import load_model
import numpy as np
import pickle

#Load the model and tokenizer

model = load_model('next_words.h5')
tokenizer= pickle.load(open('token.pkl',  'rb'))

def Predict_Next_Words(model, tokenizer, text):
    
    sequence = tokenizer.texts_to_sequences([text])
    sequence = np.array(sequence)
    preds = np.argmax(model.predict(sequence))
    predicted_word = ""
    
    for key, value in tokenizer.word_index.items():
        if value == preds:
            predicted_word = key
            break
    
    print(predicted_word)
    return predicted_word


In [None]:
while(True):
    text = input("Enter your line: ")
    
    if text == "0":
        print("Executuion completed.....")
        break
    else:
        try:
            text = text.split(" ")
            text = text[-3:]
            print(text)
            
            Predict_Next_Words(model, tokenizer,  text)
            
        except Exception as e:
            print("Error occurred: ",e)
            continue