<h1>Language Model Using TensorFlow & Keras</h1>

<h5>Importing Libraries</h5>

In [None]:
from numpy import array
import tensorflow as tf
print(tf.version.VERSION)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding

<h5>Preprocessing Data</h5>

In [None]:
class Preprocessing():
    """Turn a line-oriented text file into next-word training data.

    The methods are meant to be called in this order, because each one reads
    attributes populated by the previous one:
    ``load_data`` -> ``encode_data`` -> ``get_sequence`` -> ``get_data``.
    """

    def __init__(self, input_file, seq_len=3):
        """Store the corpus path and the sliding-window length.

        Args:
            input_file: Path of the text file to read.
            seq_len: Number of tokens in each training window. The last token
                of a window becomes the target and the preceding ones the
                input, so at least 2 are required. Defaults to 3.

        Raises:
            ValueError: If ``seq_len`` is smaller than 2.
        """
        if seq_len < 2:
            raise ValueError(
                "seq_len must be at least 2 (one input token plus the target), "
                "got {}".format(seq_len)
            )
        self.input_data_file = input_file
        self.data = None
        self.vocab_size = None
        self.encoded_data = None
        self.max_length = None
        self.seq_len = seq_len
        self.sequences = None
        self.x = None
        self.y = None
        self.tokenizer = None

    def load_data(self):
        """Read the input file into ``self.data`` as a list of lines."""
        # The context manager closes the file even if read() raises.
        with open(self.input_data_file, 'r') as fp:
            self.data = fp.read().splitlines()

    def encode_data(self):
        """Fit a tokenizer on the loaded lines and integer-encode them.

        Sets ``self.tokenizer``, ``self.encoded_data`` (one list of word
        indices per line) and ``self.vocab_size``.
        """
        self.tokenizer = Tokenizer()
        self.tokenizer.fit_on_texts(self.data)
        self.encoded_data = self.tokenizer.texts_to_sequences(self.data)
        # One more than the number of distinct words: the Keras Tokenizer
        # numbers words from 1, leaving index 0 unassigned.
        self.vocab_size = len(self.tokenizer.word_counts) + 1

    def get_sequence(self):
        """Slice every encoded line into overlapping windows of ``seq_len``.

        Lines with fewer than ``seq_len`` tokens contribute no windows.
        Sets ``self.max_length`` and ``self.sequences``.

        Raises:
            ValueError: If no line is long enough to yield a window.
        """
        seq_list = []
        for encoded_line in self.encoded_data:
            window_count = len(encoded_line) - self.seq_len + 1
            for start in range(window_count):
                seq_list.append(encoded_line[start:start + self.seq_len])

        if not seq_list:
            raise ValueError(
                "no line in the input has at least {} tokens; "
                "cannot build training sequences".format(self.seq_len)
            )

        self.max_length = max(len(seq) for seq in seq_list)
        self.sequences = pad_sequences(seq_list, maxlen=self.max_length, padding='pre')
        self.sequences = array(self.sequences)

    def get_data(self):
        """Split the windows into inputs ``self.x`` and one-hot targets ``self.y``.

        ``x`` is every column but the last; ``y`` is the last column, one-hot
        encoded over ``self.vocab_size`` classes.
        """
        self.x = self.sequences[:, :-1]
        self.y = self.sequences[:, -1]
        self.y = to_categorical(self.y, num_classes=self.vocab_size)
# Build the training data from 'data.txt'. The calls below are order-dependent:
# each step reads attributes that the previous step populated on `pr`.
pr = Preprocessing('data.txt')
# Read the file into pr.data as a list of lines.
pr.load_data()
# Fit the tokenizer; sets pr.tokenizer, pr.encoded_data and pr.vocab_size.
pr.encode_data()
# Cut the encoded lines into windows; sets pr.sequences and pr.max_length.
pr.get_sequence()
# Split the windows into inputs pr.x and one-hot targets pr.y.
pr.get_data()

<h3>Model</h3>

In [None]:
class Model():
    """Embedding -> LSTM -> Dense next-word classifier configured from a dict.

    Training data is attached after construction by assigning ``x`` and ``y``.
    """

    def __init__(self, params):
        """Read the hyperparameters from ``params``.

        Args:
            params: Mapping with the required keys ``vocab_size``, ``max_len``,
                ``activation``, ``optimizer``, ``epochs`` and ``metrics``.
                Optional keys: ``loss`` (defaults to
                ``'categorical_crossentropy'``) and ``verbose`` (when absent,
                ``fit`` is called without it and uses its own default).

        Raises:
            KeyError: If a required key is missing.
        """
        self.model = None
        self.x = None
        self.y = None
        self.vocab_size = params['vocab_size']
        self.max_len = params['max_len']
        self.activation = params['activation']
        self.optimizer = params['optimizer']
        self.epochs = params['epochs']
        self.metrics = params['metrics']
        # Optional settings: honour them when supplied instead of silently
        # ignoring them; the defaults keep the previous behaviour.
        self.loss = params.get('loss', 'categorical_crossentropy')
        self.verbose = params.get('verbose')

    def create_model(self):
        """Build and compile the network into ``self.model``."""
        self.model = Sequential()
        # Inputs are max_len - 1 tokens long: the last token of every
        # window is used as the target, not as input.
        self.model.add(Embedding(self.vocab_size, 10, input_length=self.max_len - 1))
        self.model.add(LSTM(50))
        self.model.add(Dense(self.vocab_size, activation=self.activation))
        self.model.compile(loss=self.loss, optimizer=self.optimizer, metrics=self.metrics)

    def run(self):
        """Train ``self.model`` on ``self.x`` / ``self.y`` for ``self.epochs`` epochs.

        Raises:
            RuntimeError: If ``create_model`` has not been called yet.
            ValueError: If ``x`` or ``y`` has not been assigned.
        """
        if self.model is None:
            raise RuntimeError("create_model() must be called before run()")
        if self.x is None or self.y is None:
            raise ValueError("x and y must be assigned before run()")

        fit_kwargs = {'epochs': self.epochs}
        if self.verbose is not None:
            fit_kwargs['verbose'] = self.verbose
        self.model.fit(self.x, self.y, **fit_kwargs)
    
# Hyperparameters handed to Model. vocab_size and max_len are taken from the
# preprocessing object so the network dimensions match the prepared data.
params = {"activation":"softmax","epochs":500,"verbose":2,"loss":"categorical_crossentropy",
          "optimizer":"adam","metrics":['accuracy'],"vocab_size":pr.vocab_size,"max_len":pr.max_length}
model_obj = Model(params)
# Training data is attached as attributes rather than passed to the constructor.
model_obj.x = pr.x
model_obj.y = pr.y
# Build and compile the network, then train it on the attached data.
model_obj.create_model()
model_obj.run()
        

<h4>Prediction</h4>

In [None]:
class Prediction():
    """Greedy next-word text generation with a trained model."""

    def __init__(self, model, tokenizer, max_len):
        """Store the collaborators used for generation.

        Args:
            model: Trained model; must provide ``predict``.
            tokenizer: Fitted tokenizer; must provide ``texts_to_sequences``
                and ``word_index``.
            max_len: Window length used in training. The model input is
                ``max_len - 1`` tokens long.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.max_length = max_len

    def generate_seq(self, text, num_words):
        """Extend ``text`` by ``num_words`` predicted words.

        Each step encodes the text generated so far, pads it to the model's
        input length, takes the most probable class and appends the matching
        word. The extended text is fed into the next step.

        Args:
            text: Seed text.
            num_words: Number of words to append.

        Returns:
            The seed text followed by the generated words, space separated.
            A predicted index with no word in the vocabulary appends an empty
            word.
        """
        # Reverse lookup built once, instead of scanning word_index per word.
        index_to_word = {
            index: word for word, index in self.tokenizer.word_index.items()
        }
        for _ in range(num_words):
            encoded = self.tokenizer.texts_to_sequences([text])[0]
            print("encoded:",encoded)
            encoded = pad_sequences([encoded], maxlen=self.max_length - 1, padding='pre')
            # Sequential.predict_classes was removed in TensorFlow 2.6;
            # taking the argmax of the predicted probabilities is equivalent.
            probabilities = self.model.predict(encoded, verbose=0)
            predicted_index = int(probabilities.argmax(axis=-1)[0])
            out_word = index_to_word.get(predicted_index, '')
            # Append to the input so the next step sees the new word.
            text += ' ' + out_word
        return text
            
# Generate continuations for a few seed phrases using the trained model and
# the tokenizer fitted during preprocessing.
pred = Prediction(model_obj.model,pr.tokenizer,pr.max_length)        
# The second argument is the number of words to append to the seed text.
print(pred.generate_seq("Jack and",4))
print(pred.generate_seq('And Jill', 3))
print(pred.generate_seq('fell down', 5))
print(pred.generate_seq('pail of', 5))