<h1>Language Model Using TensorFlow & Keras</h1>

<h5>Importing Libraries</h5>

In [2]:
from numpy import array
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import load_model


<h5>Preprocessing Data</h5>

In [7]:
class Preprocessing():
    """Turn a line-oriented text corpus into next-word training data.

    The methods are meant to be called in this order, each one reading the
    attributes produced by the previous step:
    ``load_data`` -> ``encode_data`` -> ``generate_sequence`` -> ``get_data``.
    """

    def __init__(self,input_file):
        """Store the corpus path; all other attributes are filled in later.

        Args:
            input_file: Path of the text file to read. Each line is encoded
                as a separate sequence.
        """
        self.input_data_file = input_file
        self.data = None          # raw text lines (set by load_data)
        self.vocab_size = None    # number of word indices + 1 (set by encode_data)
        self.encoded_data = None  # one list of word indices per line (set by encode_data)
        self.max_length = None    # length of the longest prefix (set by generate_sequence)
        self.sequences = None     # left-padded prefix matrix (set by generate_sequence)
        self.x = None             # model inputs (set by get_data)
        self.y = None             # one-hot targets (set by get_data)
        self.tokenizer = None     # fitted Tokenizer (set by encode_data)

    def load_data(self):
        """Read the corpus file into ``self.data`` as a list of lines."""
        # The context manager closes the file even when read() raises.
        with open(self.input_data_file,'r') as fp:
            self.data = fp.read().splitlines()

    def encode_data(self):
        """Fit a Tokenizer on the loaded lines and integer-encode every line."""
        self.tokenizer = Tokenizer()
        self.tokenizer.fit_on_texts(self.data)
        self.encoded_data = self.tokenizer.texts_to_sequences(self.data)
        print(self.encoded_data)
        # Size the vocabulary from word_index, the same mapping that is used to
        # decode predictions; +1 leaves room for index 0, which padding uses.
        self.vocab_size = len(self.tokenizer.word_index)+1

    def generate_sequence(self):
        """Expand every encoded line into its prefixes and left-pad them.

        For a line ``[a, b, c]`` the prefixes are ``[a, b]`` and ``[a, b, c]``;
        lines with fewer than two tokens contribute nothing.

        Raises:
            ValueError: If no line yields a prefix, i.e. there is nothing to
                train on.
        """
        seq_list = [
            item[:end]
            for item in self.encoded_data
            for end in range(2, len(item)+1)
        ]
        if not seq_list:
            raise ValueError(
                "no training sequences: at least one line must contain two or more words"
            )
        self.max_length = max(len(seq) for seq in seq_list)
        self.sequences = pad_sequences(seq_list, maxlen=self.max_length, padding='pre')
        print(self.sequences)
        self.sequences = array(self.sequences)

    def get_data(self):
        """Split the padded prefixes into inputs and one-hot next-word targets."""
        # Every column but the last is the context; the last column is the
        # word to predict.
        self.x = self.sequences[:,:-1]
        self.y = self.sequences[:,-1]
        print("y before:",self.y)
        self.y = to_categorical(self.y,num_classes=self.vocab_size)
        print("y After:",self.y)

In [8]:
# Run the preprocessing pipeline on the corpus file; the call order matters
# because each step reads attributes set by the one before it.
pr = Preprocessing('data.txt')
pr.load_data()  # read the lines of data.txt
pr.encode_data()  # tokenize and integer-encode each line
pr.generate_sequence()  # build the left-padded prefixes
pr.get_data()  # split into inputs pr.x and one-hot targets pr.y

[[2, 1, 3, 4, 5, 6, 7], [8, 9, 10, 11, 12, 13], [2, 14, 15, 1, 16, 17, 18], [1, 3, 19, 20, 21]]
[[ 0  0  0  0  0  2  1]
 [ 0  0  0  0  2  1  3]
 [ 0  0  0  2  1  3  4]
 [ 0  0  2  1  3  4  5]
 [ 0  2  1  3  4  5  6]
 [ 2  1  3  4  5  6  7]
 [ 0  0  0  0  0  8  9]
 [ 0  0  0  0  8  9 10]
 [ 0  0  0  8  9 10 11]
 [ 0  0  8  9 10 11 12]
 [ 0  8  9 10 11 12 13]
 [ 0  0  0  0  0  2 14]
 [ 0  0  0  0  2 14 15]
 [ 0  0  0  2 14 15  1]
 [ 0  0  2 14 15  1 16]
 [ 0  2 14 15  1 16 17]
 [ 2 14 15  1 16 17 18]
 [ 0  0  0  0  0  1  3]
 [ 0  0  0  0  1  3 19]
 [ 0  0  0  1  3 19 20]
 [ 0  0  1  3 19 20 21]]
y before: [ 1  3  4  5  6  7  9 10 11 12 13 14 15  1 16 17 18  3 19 20 21]
y After: [[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0

<h3>Model</h3>

In [None]:
class Model():
    """Embedding -> LSTM -> Dropout -> Dense next-word classifier.

    Assign the training arrays to ``x`` and ``y``, then call ``create_model``,
    ``run`` and (optionally) ``save``.
    """

    def __init__(self,params):
        """Copy the hyper-parameters out of ``params``.

        Args:
            params: Mapping with the required keys ``vocab_size``, ``max_len``,
                ``activation``, ``optimizer``, ``epochs`` and ``metrics``.
                Optional keys: ``loss`` (defaults to
                ``'categorical_crossentropy'``) and ``verbose`` (forwarded to
                ``fit`` only when present).

        Raises:
            KeyError: If a required key is missing.
        """
        self.model = None
        self.history = None
        self.x = None
        self.y = None
        self.vocab_size = params['vocab_size']
        self.max_len = params['max_len']
        self.activation = params['activation']
        self.optimizer = params['optimizer']
        self.epochs = params['epochs']
        self.metrics = params['metrics']
        # Optional settings: honoured when supplied instead of being ignored.
        self.loss = params.get('loss','categorical_crossentropy')
        self.verbose = params.get('verbose')

    def create_model(self):
        """Build and compile the network, then print its summary."""
        self.model = Sequential()
        # Inputs are max_len-1 tokens long: the last token of each training
        # sequence is the target, not part of the input.
        self.model.add(Embedding(self.vocab_size,10,input_length=self.max_len-1))
        self.model.add(LSTM(50))
        self.model.add(Dropout(0.1))
        self.model.add(Dense(self.vocab_size,activation=self.activation))
        self.model.compile(loss=self.loss,optimizer=self.optimizer,metrics=self.metrics)
        # summary() does its own printing; wrapping it in print() would add a
        # stray "None" line.
        self.model.summary()

    def run(self):
        """Fit the model on ``self.x`` / ``self.y`` and keep the History object.

        Raises:
            RuntimeError: If ``create_model`` has not been called.
            ValueError: If ``x`` or ``y`` has not been assigned.
        """
        if self.model is None:
            raise RuntimeError("create_model() must be called before run()")
        if self.x is None or self.y is None:
            raise ValueError("x and y must be assigned before run()")
        fit_kwargs = {'epochs': self.epochs}
        # Only forward verbose when the caller configured it, so the library
        # default applies otherwise.
        if self.verbose is not None:
            fit_kwargs['verbose'] = self.verbose
        self.history = self.model.fit(self.x,self.y,**fit_kwargs)

    def save(self,path="lang_model.h5"):
        """Write the model to ``path`` (defaults to ``lang_model.h5``).

        Raises:
            RuntimeError: If ``create_model`` has not been called.
        """
        if self.model is None:
            raise RuntimeError("create_model() must be called before save()")
        self.model.save(path)
        

In [None]:
# Hyper-parameters plus the data dimensions measured during preprocessing.
params = {
    "vocab_size": pr.vocab_size,
    "max_len": pr.max_length,
    "activation": "softmax",
    "loss": "categorical_crossentropy",
    "optimizer": "adam",
    "metrics": ['accuracy'],
    "epochs": 500,
    "verbose": 2,
}

# Hand the training arrays to the wrapper, then build the network.
model_obj = Model(params)
model_obj.x, model_obj.y = pr.x, pr.y
model_obj.create_model()

In [None]:
# Train on the arrays assigned to model_obj, then write the fitted model to disk.
model_obj.run()
model_obj.save()

<h4>Prediction</h4>

In [None]:
class Prediction():
    """Greedy next-word text generation with a saved language model."""

    def __init__(self,tokenizer,max_len):
        """Keep the tokenizer and build the reverse (index -> word) lookup.

        Args:
            tokenizer: Fitted tokenizer exposing ``word_index`` and
                ``texts_to_sequences``.
            max_len: Length of the padded training sequences; model inputs
                are ``max_len - 1`` tokens long.
        """
        self.model = None
        self.tokenizer = tokenizer
        self.idx2word = {v:k for k,v in self.tokenizer.word_index.items()}
        self.max_length = max_len

    def load_model(self,path="lang_model.h5"):
        """Load the trained model from ``path`` (defaults to ``lang_model.h5``)."""
        self.model = load_model(path)

    def predict_sequnce(self,text,num_words):
        """Extend ``text`` by up to ``num_words`` greedily predicted words.

        Args:
            text: Seed text to continue.
            num_words: Maximum number of words to append.

        Returns:
            The seed text followed by the generated words. Generation stops
            early when the predicted index has no word in ``idx2word`` (for
            example index 0, which is used for padding).

        Raises:
            RuntimeError: If no model has been loaded.
        """
        if self.model is None:
            raise RuntimeError("load_model() must be called before predicting")
        for _ in range(num_words):
            encoded_data = self.tokenizer.texts_to_sequences([text])[0]
            padded_data = pad_sequences([encoded_data],maxlen = self.max_length-1,padding='pre')
            y_pred = self.model.predict(padded_data)
            # A single sequence is predicted at a time, so row 0 is taken as
            # the score vector; assumes predict returns (1, vocab_size).
            word_id = int(np.argmax(np.asarray(y_pred)[0]))
            predict_word = self.idx2word.get(word_id)
            if predict_word is None:
                # The index maps to no word, so there is nothing to append.
                break
            text += ' ' + predict_word
        return text

    # Correctly spelled alias; the original name is kept for existing callers.
    predict_sequence = predict_sequnce

In [None]:
# Restore the trained model and wire it to the tokenizer fitted earlier.
pred = Prediction(pr.tokenizer, pr.max_length)
pred.load_model()

# Each entry is (seed text, number of words to generate).
prompts = [
    ("Jack and", 5),
    ('And Jill', 4),
    ('fell down', 5),
    ('pail of', 3),
]
for seed_text, n_words in prompts:
    print(pred.predict_sequnce(seed_text, n_words))