<h1>Next Word Prediction Model Using TensorFlow & Keras</h1>

<h5>Importing Libraries</h5>

In [None]:
from numpy import array
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import load_model


<h5>Preprocessing Data</h5>

In [None]:
class Preprocessing():
    """Turn a text file of sentences into next-word training data.

    The methods populate the instance attributes step by step and are meant
    to be called in this order: ``load_data`` -> ``encode_data`` ->
    ``generate_sequence`` -> ``get_data``.
    """

    def __init__(self,input_file):
        """Remember the corpus path; every derived attribute starts as None.

        Args:
            input_file: Path of the text file, one training sentence per line.
        """
        self.input_data_file = input_file
        self.data = None          # list of raw text lines
        self.vocab_size = None    # number of distinct words + 1
        self.encoded_data = None  # one list of word indices per line
        self.max_length = None    # length of the longest generated sequence
        self.sequences = None     # left-padded 2-D array of sequences
        self.x = None             # model inputs (all but the last column)
        self.y = None             # one-hot targets (last column)
        self.tokenizer = None     # fitted Keras Tokenizer

    def load_data(self):
        """Read the input file into ``self.data`` as a list of lines."""
        # The context manager closes the file even if read() raises.
        with open(self.input_data_file,'r') as fp:
            self.data = fp.read().splitlines()

    def encode_data(self):
        """Fit a tokenizer on the loaded lines and encode them as indices."""
        self.tokenizer = Tokenizer()
        self.tokenizer.fit_on_texts(self.data)
        self.encoded_data = self.tokenizer.texts_to_sequences(self.data)
        # +1 because Keras word indices start at 1; index 0 is left for padding.
        self.vocab_size = len(self.tokenizer.word_counts)+1

    def generate_sequence(self):
        """Expand every encoded line into all of its prefixes of length >= 2.

        The prefixes are left-padded to the length of the longest one and
        stored in ``self.sequences``; that length is kept in
        ``self.max_length``.

        Raises:
            ValueError: If no encoded line contains at least two tokens, so
                there is nothing to train on.
        """
        seq_list = [
            item[:end]
            for item in self.encoded_data
            for end in range(2, len(item) + 1)
        ]
        if not seq_list:
            raise ValueError(
                "no training sequences: every input line has fewer than two known words"
            )
        self.max_length = max(len(seq) for seq in seq_list)
        self.sequences = pad_sequences(seq_list, maxlen=self.max_length, padding='pre')
        self.sequences = array(self.sequences)

    def get_data(self):
        """Split the padded sequences into inputs ``x`` and one-hot targets ``y``.

        ``x`` is every column except the last; ``y`` is the last column
        (the word to predict) one-hot encoded over ``vocab_size`` classes.
        """
        self.x = self.sequences[:,:-1]
        self.y = self.sequences[:,-1]
        self.y = to_categorical(self.y,num_classes=self.vocab_size)

In [None]:
# Build the training data from the corpus file. Each step fills in the
# attributes the next one reads, so the calls must run in this order.
pr = Preprocessing('cab_booking.txt')
pr.load_data()          # read the file into pr.data
pr.encode_data()        # fit the tokenizer; sets pr.encoded_data and pr.vocab_size
pr.generate_sequence()  # sets pr.sequences and pr.max_length
pr.get_data()           # sets pr.x (inputs) and pr.y (one-hot targets)

<h3>Model</h3>

In [None]:
class Model():
    """Embedding + LSTM next-word classifier with train and save helpers.

    ``x`` and ``y`` are not constructor arguments; the caller assigns them
    on the instance before calling ``run``.
    """

    def __init__(self, vocab_size=None, max_len=None):
        """Set up an empty model holder.

        Args:
            vocab_size: Number of output classes. When omitted it is read
                from the module-level ``pr`` object, as before.
            max_len: Length of the padded training sequences (inputs plus
                target). When omitted it is read from ``pr.max_length``.
        """
        self.model = None
        self.history = None
        self.x = None
        self.y = None
        # Only touch the global `pr` when the caller did not supply a value,
        # so the class can be used without the notebook's global state.
        if vocab_size is None:
            vocab_size = pr.vocab_size
        if max_len is None:
            max_len = pr.max_length
        self.vocab_size = vocab_size
        self.max_len = max_len

    def create_model(self):
        """Build and compile the network, then print its summary."""
        self.model = Sequential()
        # Inputs are max_len-1 long: the last token of each sequence is the target.
        self.model.add(Embedding(self.vocab_size,10,input_length=self.max_len-1))
        self.model.add(LSTM(50))
        self.model.add(Dropout(0.1))
        self.model.add(Dense(self.vocab_size,activation='softmax'))
        self.model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
        # summary() prints the table itself and returns None, so wrapping it
        # in print() only appended a stray "None" line.
        self.model.summary()

    def run(self,epochs,batch_size):
        """Fit the model on ``self.x`` / ``self.y`` and keep the history.

        Args:
            epochs: Number of training epochs.
            batch_size: Mini-batch size.

        The last 20% of the samples are held out for validation.
        """
        self.history = self.model.fit(self.x,self.y,epochs=epochs,batch_size=batch_size,validation_split=0.2)

    def save(self, path="word_prediction_model.h5"):
        """Save the model to ``path`` (default: the original fixed filename)."""
        self.model.save(path)
        

In [None]:
# Model() takes vocab_size and max_length from the global `pr`, so the
# preprocessing cell must have been run first.
model_obj = Model()
# Training data is attached as attributes rather than passed to the constructor.
model_obj.x = pr.x
model_obj.y = pr.y
model_obj.create_model()

In [None]:
# Train for 700 epochs with a batch size of 2, then write the trained model
# to "word_prediction_model.h5" for the prediction cells below.
model_obj.run(700,2)
model_obj.save()

<h4>Prediction</h4>

In [None]:
class Prediction():
    """Suggest the most likely next words for a text prefix."""

    def __init__(self,tokenizer,max_len):
        """Store the fitted tokenizer and build the index -> word lookup.

        Args:
            tokenizer: Fitted tokenizer exposing ``word_index`` and
                ``texts_to_sequences``.
            max_len: Length of the padded training sequences; the model
                input is one token shorter.
        """
        self.model = None
        self.tokenizer = tokenizer
        self.idx2word = {v:k for k,v in self.tokenizer.word_index.items()}
        self.max_length = max_len

    def load_model(self, path="word_prediction_model.h5"):
        """Load the trained model from ``path`` (default: the saved filename)."""
        self.model = load_model(path)

    def _top_words(self, probabilities, num_words):
        """Return up to ``num_words`` words ranked by descending probability.

        Class indices with no entry in ``idx2word`` are skipped. Index 0 is
        such a class (it is the padding slot), and looking it up directly
        raised KeyError whenever it ranked among the top predictions.
        """
        ranked = np.argsort(-probabilities)
        known = [self.idx2word[idx] for idx in ranked if idx in self.idx2word]
        return known[:num_words]

    def predict_words(self,text,num_words):
        """Print and return the ``num_words`` most likely next words.

        Args:
            text: Prefix to continue.
            num_words: How many candidate words to report.

        Returns:
            The candidate words, most probable first.
        """
        encoded_data = self.tokenizer.texts_to_sequences([text])[0]
        # Pad to the model's input length (one less than the training length).
        padded_data = pad_sequences([encoded_data],maxlen = self.max_length-1,padding='pre')
        y_preds = self.model.predict(padded_data)
        # A single prefix was submitted, so only the first row is relevant.
        possible_words = self._top_words(y_preds[0], num_words)
        print(text,possible_words)
        print(possible_words)
        return possible_words


In [None]:
# Reuse the tokenizer and sequence length from preprocessing so that inputs
# are encoded and padded the same way as the training data.
pred = Prediction(pr.tokenizer,pr.max_length)    
pred.load_model()

In [None]:
# Show the two most likely next words for a couple of sample prefixes.
pred.predict_words("I would like to",2)
pred.predict_words("can you please",2)