In [5]:
import numpy as np 
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
import os

In [8]:
# Read the whole training corpus into memory as one long string.
# The context manager closes the file handle as soon as the read finishes
# instead of leaving it open until garbage collection.
filename = './story.txt'
with open(filename) as corpus_file:
    raw_text = corpus_file.read()
len(raw_text)

148574

In [9]:
# Lower-cased copy of the corpus; used below to inspect its distinct characters.
content = raw_text.lower()

In [10]:
# Distinct characters occurring in the lower-cased corpus, then the same
# characters in sorted order.
characters = list(set(content))
list_chars = sorted(characters)

In [11]:
# Map every distinct corpus character to its position in the sorted list.
dictionary = {symbol: position for position, symbol in enumerate(list_chars)}

In [12]:
import string
# Display the lowercase ASCII letters that seed the model vocabulary below.
string.ascii_lowercase

'abcdefghijklmnopqrstuvwxyz'

In [20]:
# Fixed model vocabulary: the 26 lowercase letters, a handful of extra symbols,
# and a final 'unk' token standing in for every other character.
list_chars_new = [*string.ascii_lowercase, '0', '.', ',', ' ', '!', '?', 'unk']
# Forward (token -> id) and reverse (id -> token) lookup tables.
char_to_int = {token: token_id for token_id, token in enumerate(list_chars_new)}
int_to_char = dict(enumerate(list_chars_new))

In [14]:
def _encode_sentence(text):
    """Encode a string as a list of vocabulary ids, one id per character.

    The text is lower-cased first. Characters that are not in the vocabulary
    are mapped to the id of the 'unk' token.

    Args:
        text: String to encode.

    Returns:
        List of ints, one per character of the lower-cased text.
    """
    unk_id = char_to_int['unk']
    # Every character is a length-1 string, so it can never collide with the
    # multi-character 'unk' key; a dict lookup with a default therefore gives
    # the same ids as scanning the vocabulary list, without rebuilding a list
    # slice and searching it linearly for each character.
    return [char_to_int.get(each_char, unk_id) for each_char in text.lower()]

In [15]:
# Sanity check of the encoder: the input is lower-cased before lookup, and '#',
# which is not in the vocabulary, is encoded as the 'unk' id.
example_sentence = 'Alice is a wonderful story. #'
_encode_sentence(example_sentence)

[0,
 11,
 8,
 2,
 4,
 29,
 8,
 18,
 29,
 0,
 29,
 22,
 14,
 13,
 3,
 4,
 17,
 5,
 20,
 11,
 29,
 18,
 19,
 14,
 17,
 24,
 27,
 29,
 32]

In [30]:
def _decode_sentence(vector):
    """Turn a sequence of vocabulary ids back into text.

    Each id is looked up in `int_to_char` and the resulting tokens are
    concatenated, so the 'unk' id is rendered as the literal string 'unk'.
    """
    return ''.join(int_to_char[token_id] for token_id in vector)

In [31]:
# Round trip: the text comes back lower-cased, and the out-of-vocabulary '#'
# comes back as the literal token 'unk'.
_decode_sentence(_encode_sentence(example_sentence))

'alice is a wonderful story. unk'

In [50]:
n_chars = len(raw_text)
seq_length = 100

# Encode the corpus once and slice the training windows out of the id list.
# Encoding works character by character, so this yields the same ids as
# encoding every overlapping window separately, without redoing the work
# seq_length times.
encoded_text = _encode_sentence(raw_text)

# dataX[i] holds seq_length consecutive ids; dataY[i] holds the id that follows
# that window, kept as a one-element list.
dataX = []
dataY = []
for i in range(len(encoded_text) - seq_length):
    dataX.append(encoded_text[i: i + seq_length])
    dataY.append([encoded_text[i + seq_length]])

# Shape (samples, time steps, features) for the LSTM; the window length comes
# from seq_length so that changing it above does not break the reshape.
X_train = np.reshape(dataX, (len(dataX), seq_length, 1))
# Scale the ids by the vocabulary size.
X_train = X_train / len(char_to_int)

In [52]:
# Number of time steps in the last training sample.
len(X_train[-1])

100

In [56]:
# One-hot encode the next-character targets.
# num_classes is pinned to the vocabulary size: left to its default,
# to_categorical sizes the output from the largest label present, so the width
# would shrink if the highest id never occurred as a target.
y_train = np_utils.to_categorical(dataY, num_classes=len(char_to_int))
y_train.shape

(148474, 33)

In [57]:
# (samples, time steps, features per step), as set by the reshape above.
X_train.shape

(148474, 100, 1)

In [64]:
# Character-level LSTM: one recurrent layer, dropout, then a softmax over the
# one-hot target classes.
model = Sequential([
    # Per-sample input shape is (time steps, features).
    LSTM(256, input_shape=X_train.shape[1:]),
    Dropout(0.2),
    Dense(y_train.shape[1], activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 256)               264192    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 33)                8481      
Total params: 272,673
Trainable params: 272,673
Non-trainable params: 0
_________________________________________________________________


In [65]:
# Save the weights whenever validation accuracy improves.
filepath = 'weights-improvement-{epoch:02d}-{loss:.4f}.hdf5'
# The model is compiled with metrics=['accuracy'], which this Keras version
# logs as 'accuracy' / 'val_accuracy'. Monitoring 'val_acc' would reference a
# key that is never logged, so the best-only checkpoint would never be written.
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,
    mode='max',
)
callback_list = [checkpoint]

In [66]:
# Train for a few epochs, holding out a fraction of the samples for validation
# and checkpointing through the callbacks defined above.
model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=5,
    validation_split=0.33,
    callbacks=callback_list,
    verbose=1,
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f8ffc3afc10>

In [122]:
# Seed text for generation. It is longer than 100 characters; the prediction
# helper below only feeds the last 100 encoded characters to the model.
base_word = 'ssghjhjggffgfgjhtghqgffgghnbvfgAlice was beginning to get very tired of sitting by her sister on the bank'
len(base_word)


105

In [126]:
def _predict_let(text, len_sen = 1):
    """Greedily extend `text` by `len_sen` predicted tokens.

    At each step the last `seq_length` encoded ids are scaled by the vocabulary
    size, left-padded with zeros when the context is shorter than `seq_length`,
    and passed to `model`; the most probable token is appended.

    Args:
        text: Seed string; it is encoded with `_encode_sentence`.
        len_sen: Number of prediction steps to run.

    Returns:
        The extended text with its first `len_sen` characters removed.
    """
    vocab_size = len(int_to_char)
    # Encode the seed once and append predicted ids afterwards. This avoids
    # re-encoding the whole text on every step, and keeps a predicted 'unk' as
    # a single id in the context instead of the three letters 'u', 'n', 'k'.
    context_ids = _encode_sentence(text)
    for _ in range(len_sen):
        window = np.array(context_ids[-seq_length:], dtype=float) / vocab_size
        if window.shape[0] < seq_length:
            # Left-pad short contexts with zeros. NOTE(review): 0 is also the
            # scaled id of the first vocabulary token, so the padding is not
            # distinguishable from that token.
            padding = np.zeros(seq_length - window.shape[0])
            window = np.concatenate((padding, window), axis=0)
        # Shape (sample = 1, time_step = seq_length, feature = 1).
        x_input = window.reshape(1, seq_length, 1)
        y_prob = model.predict(x_input)
        next_id = int(np.argmax(y_prob, axis=1)[0])
        context_ids.append(next_id)
        text = text + int_to_char[next_id]
    return text[len_sen:]
_predict_let(base_word, 100)

205


' bank  and the was  hn  hn  hn  hu                                                                       '

In [100]:
# Check the pad-and-reshape logic on a context of only 10 encoded characters:
# left-pad with zeros up to 100 steps, then add the sample and feature axes.
a = np.array(_encode_sentence(base_word)[-10:])
b = np.concatenate((np.zeros(100 - a.shape[0]), a), axis=0)
b[np.newaxis, :, np.newaxis].shape

(1, 100, 1)

In [89]:
# Inspect the integer encoding of the seed text.
_encode_sentence(base_word)

[0,
 11,
 8,
 2,
 4,
 29,
 22,
 0,
 18,
 29,
 1,
 4,
 6,
 8,
 13,
 13,
 8,
 13,
 6,
 29,
 19,
 14,
 29,
 6,
 4,
 19,
 29,
 21,
 4,
 17,
 24,
 29,
 19,
 8,
 17,
 4,
 3,
 29,
 14,
 5,
 29,
 18,
 8,
 19,
 19,
 8,
 13,
 6,
 29,
 1,
 24,
 29,
 7,
 4,
 17,
 29,
 18,
 8,
 18,
 19,
 4,
 17,
 29,
 14,
 13,
 29,
 19,
 7,
 4,
 29,
 1,
 0,
 13,
 10]