In [38]:
import numpy as np

In [39]:
def load_text(filename, encoding=None):
    """Read a whole text file and return its contents as one string.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read.
    encoding : str, optional
        Text encoding passed to ``open``. ``None`` (the default) keeps the
        platform default encoding.

    Returns
    -------
    str
        The complete contents of the file.
    """
    # The context manager closes the handle even if read() raises.
    with open(file=filename, mode='r', encoding=encoding) as source:
        return source.read()

In [40]:
def save_file(sequence, filename, encoding=None):
    """Write an iterable of strings to a file, one item per line.

    Items are joined with ``"\\n"``; no trailing newline is written.

    Parameters
    ----------
    sequence : iterable of str
        The lines to write.
    filename : str or path-like
        Destination path; an existing file is overwritten.
    encoding : str, optional
        Text encoding passed to ``open``. ``None`` (the default) keeps the
        platform default encoding.
    """
    data = "\n".join(sequence)
    # The context manager flushes and closes the handle even if write() raises.
    with open(filename, mode='w', encoding=encoding) as target:
        target.write(data)

In [41]:
# Load the training corpus (a single nursery rhyme) as one string and display it.
raw_text = load_text('rhyme.txt')
raw_text

"Sing a song of sixpence,\nA pocket full of rye.\nFour and twenty blackbirds,\nBaked in a pie.\nWhen the pie was opened\nThe birds began to sing;\nWasn't that a dainty dish,\nTo set before the king.\nThe king was in his counting house,\nCounting out his money;\nThe queen was in the parlour,\nEating bread and honey.\nThe maid was in the garden,\nHanging out the clothes,\nWhen down came a blackbird\nAnd pecked off her nose."

In [42]:
# Collapse every run of whitespace (including the line breaks) into a single
# space so the corpus becomes one continuous line of characters.
# Note: this rebinds `raw_text`; the original text with newlines is no longer available.
tokens = raw_text.split()
raw_text = " ".join(tokens)

In [43]:
raw_text

"Sing a song of sixpence, A pocket full of rye. Four and twenty blackbirds, Baked in a pie. When the pie was opened The birds began to sing; Wasn't that a dainty dish, To set before the king. The king was in his counting house, Counting out his money; The queen was in the parlour, Eating bread and honey. The maid was in the garden, Hanging out the clothes, When down came a blackbird And pecked off her nose."

##### Now that we have one long string of characters, we can create the input-output sequences used to train the model. Each input sequence is 10 characters long and is paired with one output character (the character that follows it).

In [44]:
# Number of input characters per training example. Each window holds
# SEQ_LENGTH input characters plus the one character to predict.
SEQ_LENGTH = 10

# Slide a (SEQ_LENGTH + 1)-character window over the text one step at a time.
# The range stops early enough that every window is full length.
sequence = [raw_text[start:start + SEQ_LENGTH + 1]
            for start in range(len(raw_text) - SEQ_LENGTH)]

In [45]:
len(sequence)
# sequence[5:10]

399

In [46]:
# Persist the character windows, one 11-character window per line.
save_file(sequence, 'char_sequence_rhyme.txt')

In [47]:
# Reload the saved windows. Note that `raw_text` is rebound here: from this
# point on it is the newline-joined windows file, not the rhyme itself.
raw_text = load_text('char_sequence_rhyme.txt')

# One window per line; show a few of them as a sanity check.
line_seprate = raw_text.split('\n')
line_seprate[5:10]

['a song of s', ' song of si', 'song of six', 'ong of sixp', 'ng of sixpe']

In [48]:
# Vocabulary: every distinct character in the loaded windows file, in sorted order.
# `raw_text` is the newline-joined file content here, so the '\n' separator is
# part of the vocabulary as well.
chars = sorted(set(raw_text))

# Character -> integer index lookup table.
mapped_char = {char: index for index, char in enumerate(chars)}
    

In [49]:
# Integer-encode every window: one list of vocabulary indices per line.
sequences = [[mapped_char[char] for char in line] for line in line_seprate]

In [50]:
sequences[5:10]

[[15, 1, 32, 28, 27, 21, 1, 28, 20, 1, 32],
 [1, 32, 28, 27, 21, 1, 28, 20, 1, 32, 23],
 [32, 28, 27, 21, 1, 28, 20, 1, 32, 23, 36],
 [28, 27, 21, 1, 28, 20, 1, 32, 23, 36, 29],
 [27, 21, 1, 28, 20, 1, 32, 23, 36, 29, 19]]

In [51]:
# Split each encoded window into model input and target:
# every column except the last is the input, the last column is the character to predict.
sequences = np.array(sequences)

X = sequences[:, :-1]
y = sequences[:, -1]

In [52]:
from keras.utils import to_categorical, plot_model
from keras.models import Sequential
from keras import layers
import pickle
from keras import regularizers

In [53]:
# One-hot encode every input character in a single call; the result has one
# extra trailing axis whose size is the vocabulary size.
input_X = to_categorical(X, num_classes=len(mapped_char))
input_X.shape

(399, 10, 38)

### 399 - number of training sequences (samples).
### 10 - each sequence has 10 input characters (time steps).
### 38 - length of the one-hot vector for each character (vocabulary size).

In [54]:
# One-hot encode the target character of every sequence
# (one row per sample, one column per vocabulary entry).
input_y = to_categorical(y, len(mapped_char))
input_y.shape

(399, 38)

In [91]:
def define_model(input_X):
    """Build and compile a stacked two-layer LSTM next-character classifier.

    The network is LSTM(75) -> LSTM(50) -> Dense(50, relu) -> Dense(softmax),
    compiled with RMSprop and categorical cross-entropy. The model summary is
    printed as a side effect.

    Parameters
    ----------
    input_X : object with a 3-element ``shape``
        The one-hot encoded inputs, shaped (samples, time steps, vocabulary
        size). Only ``shape[1]`` and ``shape[2]`` are read.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    # Derive both sizes from the argument so the function does not depend on
    # the notebook-level `mapped_char` variable being defined and in sync.
    timesteps, vocab_size = input_X.shape[1], input_X.shape[2]
    weight_decay = 0.0001

    model = Sequential()
    # First recurrent layer returns the full sequence so it can feed the next LSTM.
    # The L2 factor is passed positionally because its keyword name differs
    # between Keras releases.
    model.add(layers.LSTM(units=75, input_shape=(timesteps, vocab_size),
                          kernel_initializer='uniform', unit_forget_bias=True,
                          kernel_regularizer=regularizers.l2(weight_decay),
                          dropout=0.3, return_sequences=True))

    # Second recurrent layer emits only its final state; its input shape is
    # inferred from the previous layer.
    model.add(layers.LSTM(units=50,
                          kernel_initializer='uniform', unit_forget_bias=True,
                          kernel_regularizer=regularizers.l2(weight_decay),
                          dropout=0.3))

    model.add(layers.Dense(units=50, activation='relu'))

    # One output unit per vocabulary entry.
    model.add(layers.Dense(units=vocab_size, activation='softmax'))

    model.compile(optimizer='rmsprop', loss="categorical_crossentropy", metrics=['accuracy'])
    model.summary()
    return model

In [92]:
model = define_model(input_X)

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_11 (LSTM)               (None, 10, 75)            34200     
_________________________________________________________________
lstm_12 (LSTM)               (None, 50)                25200     
_________________________________________________________________
dense_13 (Dense)             (None, 50)                2550      
_________________________________________________________________
dense_14 (Dense)             (None, 38)                1938      
Total params: 63,888
Trainable params: 63,888
Non-trainable params: 0
_________________________________________________________________


In [93]:
# Train for 100 epochs with 10% of the samples held out for validation
# (the log below reports 359 training / 40 validation samples).
# NOTE(review): in the logged run, accuracy stays around 0.19-0.20 and val_loss
# does not improve, so the model appears to have learned very little —
# presumably worth revisiting the initializer / dropout / depth choices.
model.fit(x=input_X, y=input_y, epochs=100, verbose=2, validation_split=0.1)

Train on 359 samples, validate on 40 samples
Epoch 1/100
 - 16s - loss: 3.5335 - accuracy: 0.1671 - val_loss: 3.3247 - val_accuracy: 0.1750
Epoch 2/100
 - 1s - loss: 3.1899 - accuracy: 0.1922 - val_loss: 3.2559 - val_accuracy: 0.1750
Epoch 3/100
 - 1s - loss: 3.1176 - accuracy: 0.1922 - val_loss: 3.2326 - val_accuracy: 0.1750
Epoch 4/100
 - 1s - loss: 3.0887 - accuracy: 0.1922 - val_loss: 3.2446 - val_accuracy: 0.1750
Epoch 5/100
 - 1s - loss: 3.0434 - accuracy: 0.1922 - val_loss: 3.2364 - val_accuracy: 0.1750
Epoch 6/100
 - 1s - loss: 3.0481 - accuracy: 0.1922 - val_loss: 3.2263 - val_accuracy: 0.1750
Epoch 7/100
 - 1s - loss: 3.0377 - accuracy: 0.1922 - val_loss: 3.1906 - val_accuracy: 0.1750
Epoch 8/100
 - 1s - loss: 3.0125 - accuracy: 0.1922 - val_loss: 3.2164 - val_accuracy: 0.1750
Epoch 9/100
 - 1s - loss: 3.0142 - accuracy: 0.1922 - val_loss: 3.2473 - val_accuracy: 0.1750
Epoch 10/100
 - 1s - loss: 3.0196 - accuracy: 0.1922 - val_loss: 3.2233 - val_accuracy: 0.1750
Epoch 11/100


Epoch 87/100
 - 1s - loss: 2.8464 - accuracy: 0.1866 - val_loss: 3.2042 - val_accuracy: 0.1750
Epoch 88/100
 - 1s - loss: 2.8433 - accuracy: 0.1978 - val_loss: 3.2301 - val_accuracy: 0.1750
Epoch 89/100
 - 1s - loss: 2.8291 - accuracy: 0.1950 - val_loss: 3.2295 - val_accuracy: 0.1750
Epoch 90/100
 - 1s - loss: 2.8389 - accuracy: 0.2089 - val_loss: 3.3037 - val_accuracy: 0.1750
Epoch 91/100
 - 1s - loss: 2.8281 - accuracy: 0.1950 - val_loss: 3.2278 - val_accuracy: 0.1750
Epoch 92/100
 - 1s - loss: 2.7986 - accuracy: 0.2033 - val_loss: 3.3244 - val_accuracy: 0.1750
Epoch 93/100
 - 1s - loss: 2.8019 - accuracy: 0.2033 - val_loss: 3.2477 - val_accuracy: 0.1750
Epoch 94/100
 - 1s - loss: 2.8078 - accuracy: 0.1950 - val_loss: 3.2863 - val_accuracy: 0.1750
Epoch 95/100
 - 1s - loss: 2.8218 - accuracy: 0.1950 - val_loss: 3.2869 - val_accuracy: 0.1750
Epoch 96/100
 - 1s - loss: 2.8226 - accuracy: 0.1894 - val_loss: 3.2206 - val_accuracy: 0.2000
Epoch 97/100
 - 1s - loss: 2.8013 - accuracy: 0.20

<keras.callbacks.callbacks.History at 0x2a04e4212e8>

In [94]:
model.save("Character_based_Language_model.h5")

In [95]:
# Persist the character -> index vocabulary next to the saved model.
# The context manager guarantees the file is flushed and closed once the dump finishes.
with open('Character_based_LM_Dictionary.pkl', mode='wb') as vocab_file:
    pickle.dump(obj=mapped_char, file=vocab_file)

### Generate text with the trained model

In [96]:
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

In [97]:
loaded_model = load_model("Character_based_Language_model.h5")

In [98]:
loaded_model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_11 (LSTM)               (None, 10, 75)            34200     
_________________________________________________________________
lstm_12 (LSTM)               (None, 50)                25200     
_________________________________________________________________
dense_13 (Dense)             (None, 50)                2550      
_________________________________________________________________
dense_14 (Dense)             (None, 38)                1938      
Total params: 63,888
Trainable params: 63,888
Non-trainable params: 0
_________________________________________________________________


In [99]:
# Reload the character -> index vocabulary saved during training.
# NOTE: unpickling can execute arbitrary code — only load pickle files you created yourself.
with open("Character_based_LM_Dictionary.pkl", 'rb') as vocab_file:
    dictionary = pickle.load(vocab_file)

In [100]:
input_text = 'Eating honey'

In [101]:
# Integer-encode the seed text with the saved vocabulary.
# A character that is not a key of `dictionary` raises KeyError here.
encoded_text = [dictionary[i] for i in input_text]
encoded_text

[9, 15, 33, 23, 27, 21, 1, 22, 28, 27, 19, 37]

In [102]:
# Fit the encoded seed to the 10-step input window the model was trained on.
# NOTE(review): the seed above encodes to 12 indices, so 2 are dropped here;
# pad_sequences presumably truncates from the front by default (keeping the
# last 10) — confirm against the Keras documentation.
padded_encode = pad_sequences([encoded_text], maxlen=10,)

In [103]:
to_input = to_categorical(padded_encode, num_classes=len(dictionary))

In [104]:
# `Sequential.predict_classes` was removed from newer Keras releases. Taking the
# arg-max over the predicted class probabilities gives the same class indices
# and works on both old and new versions.
pred_class = np.argmax(loaded_model.predict(to_input, verbose=0), axis=-1)
pred_class

array([1], dtype=int64)

In [105]:
# Map the predicted class index back to its character.
# An inverse lookup table avoids comparing a Python int with a NumPy array,
# which only works as a condition while the array holds exactly one element.
index_to_char = {map_index: char for char, map_index in dictionary.items()}

# A single sample was predicted, so take element 0; fall back to a space if
# the index is not in the vocabulary.
out_chr = index_to_char.get(int(pred_class[0]), " ")

out_chr

' '

In [106]:
def generate_char(model, vocab, seed_text, n_char, seq_length=10):
    """Extend ``seed_text`` by ``n_char`` characters predicted by ``model``.

    At every step the text generated so far is integer-encoded, padded or
    truncated to ``seq_length`` steps, one-hot encoded and fed to the model.
    The most probable class is mapped back to a character and appended.

    Parameters
    ----------
    model : object with ``predict(x, verbose=0)``
        Returns per-class probabilities shaped (samples, classes).
    vocab : dict
        Character -> integer index mapping used to train the model.
    seed_text : str
        Starting text. Every character must be a key of ``vocab``, otherwise
        a KeyError is raised.
    n_char : int
        Number of characters to generate.
    seq_length : int, optional
        Number of time steps the model expects (default 10, the window length
        used by this notebook).

    Returns
    -------
    str
        ``seed_text`` followed by the generated characters.
    """
    # Build the index -> character table once instead of scanning the
    # vocabulary for every generated character.
    index_to_char = {index: char for char, index in vocab.items()}

    in_text = seed_text
    for _ in range(n_char):
        encode_text = [vocab[char] for char in in_text]
        padded = pad_sequences([encode_text], maxlen=seq_length)
        to_input = to_categorical(padded, num_classes=len(vocab))
        # `predict_classes` was removed from newer Keras releases; arg-max over
        # the predicted probabilities is the equivalent, version-independent form.
        probabilities = model.predict(to_input, verbose=0)
        y_hat = int(np.argmax(probabilities, axis=-1)[0])
        # Fall back to a space when the predicted index has no character.
        in_text += index_to_char.get(y_hat, ' ')
    return in_text

In [107]:
# Demo with a 7-character seed, i.e. shorter than the 10-step window that
# generate_char pads/truncates to.
text = 'eat lun'
generate_char(model=loaded_model, vocab=dictionary, seed_text=text, n_char=5)

'eat lun     '

In [108]:
# Demo with a 14-character seed, i.e. longer than the 10-step window that
# generate_char pads/truncates to.
text = 'Sing a sok lkh'
generate_char(model=loaded_model, vocab=dictionary, seed_text=text, n_char=10)

'Sing a sok lkh   eee    '