In [3]:
import tensorflow as tf

In [4]:
from numpy import array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding


In [21]:
class language_model():
    """Word-level next-word predictor: two context words in, one word out.

    Intended call order is ``preprocessing()`` -> ``define_model()`` ->
    ``create_model()`` -> ``generate_seq(...)``.

    Parameters
    ----------
    data : str
        Raw training text.
    params : dict
        Settings read by the methods below: ``"activation"`` (output layer),
        ``"optimizer"``, ``"metrics"``, ``"epochs"``, ``"verbose"`` and,
        optionally, ``"loss"`` (defaults to ``'categorical_crossentropy'``).
    """

    def __init__(self,data,params):
        self.data = data
        self.model = None
        self.tokenizer = None
        self.max_length = None
        # vocabulary size (+1 for the reserved padding index 0); set by preprocessing()
        self.vocab_size = None
        self.params = params
        self.x = None
        self.y = None

    def preprocessing(self):
        """Tokenize ``self.data`` and build the training arrays.

        Sets ``self.tokenizer``, ``self.vocab_size``, ``self.max_length``,
        ``self.x`` (the two context word ids per sample) and ``self.y``
        (one-hot encoded id of the following word).

        Raises
        ------
        ValueError
            If the text has fewer than three words, so no training sample
            can be formed.
        """
        # integer encode the instance's own text (not a module-level global)
        self.tokenizer = Tokenizer()
        self.tokenizer.fit_on_texts([self.data])
        encoded = self.tokenizer.texts_to_sequences([self.data])[0]

        # retrieve vocabulary size
        self.vocab_size = len(self.tokenizer.word_index) + 1
        print('Vocabulary Size: %d' % self.vocab_size)

        # encode 2 words -> 1 word: sliding windows of three consecutive ids
        sequences = [encoded[i-2:i+1] for i in range(2, len(encoded))]
        print('Total Sequences: %d' % len(sequences))
        if not sequences:
            raise ValueError('data must contain at least 3 words to build a training sequence')

        # pad sequences
        self.max_length = max(len(seq) for seq in sequences)
        sequences = array(pad_sequences(sequences, maxlen=self.max_length, padding='pre'))
        print('Max Sequence Length: %d' % self.max_length)

        # split into input and output elements
        self.x, self.y = sequences[:, :-1], sequences[:, -1]
        self.y = to_categorical(self.y, num_classes=self.vocab_size)

    def define_model(self):
        """Build the Embedding(10) -> LSTM(50) -> Dense(vocab_size) network.

        Raises
        ------
        RuntimeError
            If ``preprocessing()`` has not been run yet; the layer sizes
            come from the values it computes.
        """
        if self.vocab_size is None or self.max_length is None:
            raise RuntimeError('preprocessing() must be called before define_model()')
        self.model = Sequential()
        self.model.add(Embedding(self.vocab_size, 10, input_length=self.max_length-1))
        self.model.add(LSTM(50))
        self.model.add(Dense(self.vocab_size, activation=self.params["activation"]))

    def create_model(self):
        """Compile ``self.model`` and fit it on ``self.x`` / ``self.y``."""
        # compile network; honour an optional "loss" entry in params
        self.model.compile(loss=self.params.get('loss', 'categorical_crossentropy'),
                           optimizer=self.params['optimizer'],
                           metrics=self.params["metrics"])
        # fit network
        self.model.fit(self.x, self.y, epochs=self.params["epochs"], verbose=self.params["verbose"])

    # generate a sequence from a language model
    def generate_seq(self,seed_text, n_words):
        """Extend ``seed_text`` by ``n_words`` predicted words.

        Parameters
        ----------
        seed_text : str
            Starting text; its trailing words are used as context.
        n_words : int
            Number of words to append.

        Returns
        -------
        str
            ``seed_text`` followed by the generated words, space separated.
            An id with no entry in the vocabulary contributes an empty word.
        """
        # reverse lookup built once instead of scanning word_index per word
        index_to_word = {index: word for word, index in self.tokenizer.word_index.items()}
        in_text = seed_text
        # generate a fixed number of words
        for _ in range(n_words):
            # encode the text as integer
            encoded = self.tokenizer.texts_to_sequences([in_text])[0]
            # pre-pad sequences to a fixed length
            encoded = pad_sequences([encoded], maxlen=self.max_length-1, padding='pre')
            # predict probabilities for each word and take the most likely id;
            # Sequential.predict_classes was removed in TensorFlow 2.6 and
            # argmax over predict() is its replacement for a multi-class output
            probabilities = self.model.predict(encoded, verbose=0)
            yhat = int(probabilities.argmax(axis=-1)[0])
            # map predicted word index to word
            out_word = index_to_word.get(yhat, '')
            # append to input
            in_text += ' ' + out_word
        return in_text

# source text
data = """ Jack and Jill went up the hill\n
        To fetch a pail of water\n
        Jack fell down and broke his crown\n
        And Jill came tumbling after\n """

# training / model settings handed to language_model
params = {"activation":"softmax","epochs":500,"verbose":2,"loss":"categorical_crossentropy",
          "optimizer":"adam","metrics":['accuracy']}


# build the training data, define the network, then compile and fit it
lang_model = language_model(data,params)
lang_model.preprocessing()
lang_model.define_model()
lang_model.create_model()

# show the architecture; summary() prints the table itself and returns None,
# so wrapping it in print() would only add a stray "None" line
lang_model.model.summary()

# generate text from a few two-word seeds
for seed_text, n_words in [('Jack and', 5), ('And Jill', 3), ('fell down', 5), ('pail of', 5)]:
    print(lang_model.generate_seq(seed_text, n_words))

Vocabulary Size: 22
Total Sequences: 23
Max Sequence Length: 3
Epoch 1/500
 - 2s - loss: 3.0914 - acc: 0.0000e+00
Epoch 2/500
 - 0s - loss: 3.0907 - acc: 0.0000e+00
Epoch 3/500
 - 0s - loss: 3.0900 - acc: 0.0435
Epoch 4/500
 - 0s - loss: 3.0892 - acc: 0.0870
Epoch 5/500
 - 0s - loss: 3.0883 - acc: 0.1304
Epoch 6/500
 - 0s - loss: 3.0875 - acc: 0.1304
Epoch 7/500
 - 0s - loss: 3.0866 - acc: 0.1304
Epoch 8/500
 - 0s - loss: 3.0857 - acc: 0.1304
Epoch 9/500
 - 0s - loss: 3.0848 - acc: 0.1304
Epoch 10/500
 - 0s - loss: 3.0839 - acc: 0.1304
Epoch 11/500
 - 0s - loss: 3.0830 - acc: 0.1304
Epoch 12/500
 - 0s - loss: 3.0821 - acc: 0.1304
Epoch 13/500
 - 0s - loss: 3.0811 - acc: 0.1304
Epoch 14/500
 - 0s - loss: 3.0801 - acc: 0.1304
Epoch 15/500
 - 0s - loss: 3.0791 - acc: 0.1304
Epoch 16/500
 - 0s - loss: 3.0780 - acc: 0.1304
Epoch 17/500
 - 0s - loss: 3.0769 - acc: 0.1304
Epoch 18/500
 - 0s - loss: 3.0758 - acc: 0.1304
Epoch 19/500
 - 0s - loss: 3.0747 - acc: 0.1304
Epoch 20/500
 - 0s - loss:

Epoch 169/500
 - 0s - loss: 0.8668 - acc: 0.9130
Epoch 170/500
 - 0s - loss: 0.8470 - acc: 0.9130
Epoch 171/500
 - 0s - loss: 0.8275 - acc: 0.9130
Epoch 172/500
 - 0s - loss: 0.8084 - acc: 0.9130
Epoch 173/500
 - 0s - loss: 0.7897 - acc: 0.9130
Epoch 174/500
 - 0s - loss: 0.7713 - acc: 0.9130
Epoch 175/500
 - 0s - loss: 0.7533 - acc: 0.9130
Epoch 176/500
 - 0s - loss: 0.7357 - acc: 0.9130
Epoch 177/500
 - 0s - loss: 0.7186 - acc: 0.9130
Epoch 178/500
 - 0s - loss: 0.7018 - acc: 0.9130
Epoch 179/500
 - 0s - loss: 0.6853 - acc: 0.9130
Epoch 180/500
 - 0s - loss: 0.6693 - acc: 0.9130
Epoch 181/500
 - 0s - loss: 0.6537 - acc: 0.9130
Epoch 182/500
 - 0s - loss: 0.6385 - acc: 0.9565
Epoch 183/500
 - 0s - loss: 0.6237 - acc: 0.9565
Epoch 184/500
 - 0s - loss: 0.6092 - acc: 0.9565
Epoch 185/500
 - 0s - loss: 0.5951 - acc: 0.9565
Epoch 186/500
 - 0s - loss: 0.5814 - acc: 0.9565
Epoch 187/500
 - 0s - loss: 0.5681 - acc: 0.9565
Epoch 188/500
 - 0s - loss: 0.5551 - acc: 0.9565
Epoch 189/500
 - 0s 

 - 0s - loss: 0.0925 - acc: 0.9565
Epoch 337/500
 - 0s - loss: 0.0921 - acc: 0.9565
Epoch 338/500
 - 0s - loss: 0.0918 - acc: 0.9565
Epoch 339/500
 - 0s - loss: 0.0915 - acc: 0.9565
Epoch 340/500
 - 0s - loss: 0.0912 - acc: 0.9565
Epoch 341/500
 - 0s - loss: 0.0909 - acc: 0.9565
Epoch 342/500
 - 0s - loss: 0.0906 - acc: 0.9565
Epoch 343/500
 - 0s - loss: 0.0903 - acc: 0.9565
Epoch 344/500
 - 0s - loss: 0.0901 - acc: 0.9565
Epoch 345/500
 - 0s - loss: 0.0898 - acc: 0.9565
Epoch 346/500
 - 0s - loss: 0.0895 - acc: 0.9565
Epoch 347/500
 - 0s - loss: 0.0892 - acc: 0.9565
Epoch 348/500
 - 0s - loss: 0.0890 - acc: 0.9565
Epoch 349/500
 - 0s - loss: 0.0887 - acc: 0.9565
Epoch 350/500
 - 0s - loss: 0.0885 - acc: 0.9565
Epoch 351/500
 - 0s - loss: 0.0882 - acc: 0.9565
Epoch 352/500
 - 0s - loss: 0.0880 - acc: 0.9565
Epoch 353/500
 - 0s - loss: 0.0877 - acc: 0.9565
Epoch 354/500
 - 0s - loss: 0.0875 - acc: 0.9565
Epoch 355/500
 - 0s - loss: 0.0872 - acc: 0.9565
Epoch 356/500
 - 0s - loss: 0.0870

Jack and jill came tumbling after after
And Jill came tumbling after
fell down and broke his crown and
pail of water jack fell down and
