<h1>Language Model Using TensorFlow & Keras</h1>

<h2>Importing Libraries</h2>

In [None]:
from numpy import array
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding


In [None]:
class language_model():
    """Word-level LSTM language model that predicts a word from the two before it.

    Intended call order: ``preprocessing()`` -> ``define_model()`` ->
    ``create_model()`` -> ``generate_seq(...)``.

    Args:
        data: Training text as a single string.
        params: Configuration dict. Required keys: ``"activation"`` (output
            layer activation), ``"optimizer"``, ``"metrics"``, ``"epochs"``
            and ``"verbose"``. Optional keys: ``"loss"`` (default
            ``"categorical_crossentropy"``), ``"embedding_dim"`` (default 10)
            and ``"lstm_units"`` (default 50).
    """

    def __init__(self,data,params):
        self.data = data
        self.model = None
        self.tokenizer = None
        self.max_length = None
        self.params = params
        self.x = None
        self.y = None
        # Was misspelled ``voacb_size``, which left ``vocab_size`` undefined
        # until preprocessing() had run.
        self.vocab_size = None

    def preprocessing(self):
        """Tokenize ``self.data`` and build the training arrays.

        Sets ``tokenizer``, ``vocab_size``, ``max_length``, ``x`` (all but the
        last word id of every window) and ``y`` (one-hot encoding of the last
        word id of every window).

        Raises:
            ValueError: If the text yields fewer than three tokens, so no
                (2 words -> 1 word) training window can be formed.
        """
        # Integer-encode the text. Read the instance's own data rather than a
        # module-level global so the constructor argument is actually used.
        self.tokenizer = Tokenizer()
        self.tokenizer.fit_on_texts([self.data])
        encoded = self.tokenizer.texts_to_sequences([self.data])[0]

        # Word indices start at 1; the extra slot covers index 0.
        self.vocab_size = len(self.tokenizer.word_index) + 1
        print('Vocabulary Size: %d' % self.vocab_size)

        # Sliding windows of three word ids: 2 context words -> 1 target word.
        sequences = [encoded[i - 2:i + 1] for i in range(2, len(encoded))]
        print('Total Sequences: %d' % len(sequences))
        if not sequences:
            raise ValueError(
                'Need at least 3 tokens to build training sequences, got %d'
                % len(encoded)
            )

        # Pad to a common length (a no-op while every window has 3 ids, but it
        # keeps max_length derived from the data instead of hard-coded).
        self.max_length = max(len(seq) for seq in sequences)
        sequences = pad_sequences(sequences, maxlen=self.max_length, padding='pre')
        print('Max Sequence Length: %d' % self.max_length)

        # Last column is the target word; everything before it is the input.
        sequences = array(sequences)
        self.x, self.y = sequences[:, :-1], sequences[:, -1]
        self.y = to_categorical(self.y, num_classes=self.vocab_size)

    def define_model(self):
        """Build the Embedding -> LSTM -> Dense network into ``self.model``.

        Requires ``preprocessing()`` to have set ``vocab_size`` and
        ``max_length``.
        """
        embedding_dim = self.params.get("embedding_dim", 10)
        lstm_units = self.params.get("lstm_units", 50)

        self.model = Sequential()
        self.model.add(Embedding(self.vocab_size, embedding_dim,
                                 input_length=self.max_length-1))
        self.model.add(LSTM(lstm_units))
        self.model.add(Dense(self.vocab_size, activation=self.params["activation"]))

    def create_model(self):
        """Compile ``self.model`` and fit it on ``self.x`` / ``self.y``."""
        # Honour params["loss"] when given. The targets are one-hot encoded,
        # so the loss must accept categorical targets.
        self.model.compile(loss=self.params.get("loss", "categorical_crossentropy"),
                           optimizer=self.params['optimizer'],
                           metrics=self.params["metrics"])
        self.model.fit(self.x, self.y, epochs=self.params["epochs"],
                       verbose=self.params["verbose"])

    # generate a sequence from a language model
    def generate_seq(self,seed_text, n_words):
        """Extend ``seed_text`` by ``n_words`` greedily predicted words.

        Args:
            seed_text: Starting phrase.
            n_words: Number of words to append.

        Returns:
            ``seed_text`` followed by the predicted words, space-separated.
        """
        in_text = seed_text
        # generate a fixed number of words
        for _ in range(n_words):
            # encode the text generated so far as integers
            encoded = self.tokenizer.texts_to_sequences([in_text])[0]
            # pre-pad / pre-truncate to the model's input length, so only the
            # most recent words are fed to the network
            encoded = pad_sequences([encoded], maxlen=self.max_length-1, padding='pre')
            # predict_classes() was removed in TensorFlow 2.6; take the argmax
            # of the predicted distribution instead
            probabilities = self.model.predict(encoded, verbose=0)
            predicted_index = int(probabilities.argmax(axis=-1)[0])
            # direct index -> word lookup; '' when the index has no word
            out_word = self.tokenizer.index_word.get(predicted_index, '')
            # append to input
            in_text += ' ' + out_word
        return in_text

# Training corpus: the nursery rhyme as one raw string.
data = """ Jack and Jill went up the hill\n
        To fetch a pail of water\n
        Jack fell down and broke his crown\n
        And Jill came tumbling after\n """

# Settings handed to language_model.
params = {
    "activation": "softmax",
    "epochs": 500,
    "verbose": 2,
    "loss": "categorical_crossentropy",
    "optimizer": "adam",
    "metrics": ['accuracy'],
}

# Run the pipeline: tokenize, define the network, then compile and fit it.
lang_model = language_model(data, params)
lang_model.preprocessing()
lang_model.define_model()
lang_model.create_model()

# Print the layer-by-layer summary of the fitted network.
print(lang_model.model.summary())

# Generate continuations for a handful of two-word seeds.
for seed_text, n_words in (
    ('Jack and', 5),
    ('And Jill', 3),
    ('fell down', 5),
    ('pail of', 5),
):
    print(lang_model.generate_seq(seed_text, n_words))