In [1]:
import keras
import numpy as np
import re, collections            # for text processing
from google.colab import files    # for download files

Using TensorFlow backend.


In [2]:
# Fetch the Nietzsche corpus with Keras' download helper and read it into memory.
# `get_file` is reached through its public name `keras.utils.get_file` rather than
# the internal `keras.utils.data_utils` module path.
path = keras.utils.get_file(
    'nietzsche.txt',
    origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt',
)
with open(path, encoding='utf-8') as f:
    text = f.read()
print('corpus length:', len(text))


corpus length: 600893


In [3]:
# Normalise the corpus: lower-case everything and turn line breaks into spaces.
text = text.lower().replace("\n", " ")
# Drop the characters ë, æ, é, ä, digits, underscores, square brackets, '=' and
# parentheses. The pattern is a raw string so that `\[`, `\]`, `\(` and `\)` reach
# the regex engine as written instead of being invalid string escapes (which
# newer Python versions warn about).
text = re.sub(r'[ëæéä0-9_\[\]=\(\)]', '', text)

# Vocabulary: the distinct characters that remain, in sorted order.
chars = sorted(set(text))
num_chars = len(chars)
print('total characters in vocabulary:', num_chars)

# Character frequencies, displayed from rarest to most common.
charcounts = collections.Counter(text)
sorted(charcounts.items(), key=lambda i: i[1])

total characters in vocabulary: 36


[('z', 261),
 ("'", 271),
 ('?', 400),
 ('!', 409),
 ('q', 547),
 ('j', 557),
 (';', 627),
 (':', 667),
 ('x', 888),
 ('"', 1990),
 ('k', 2147),
 ('.', 2787),
 ('-', 3740),
 ('v', 5273),
 ('b', 5944),
 (',', 8201),
 ('w', 8329),
 ('y', 8497),
 ('g', 8843),
 ('p', 9484),
 ('m', 11829),
 ('f', 12222),
 ('u', 13221),
 ('c', 13655),
 ('d', 16943),
 ('l', 20336),
 ('h', 26940),
 ('r', 27224),
 ('s', 34500),
 ('n', 35598),
 ('o', 36591),
 ('a', 36796),
 ('i', 37548),
 ('t', 44450),
 ('e', 59595),
 (' ', 101142)]

In [0]:
# Hyperparameters used by the data-preparation and model-building cells.
SEQ_LENGTH = 40  # number of characters in each training window
STEP = 10  # offset between the start positions of consecutive windows
DEPTH = 1  # how many LSTM + Dropout layer pairs are stacked
UNIT_SIZE = 128  # units in each LSTM layer
DROPOUT = 0.1  # rate given to each Dropout layer

In [5]:
# Slide a SEQ_LENGTH-character window over the corpus in strides of STEP.
# Each target is its input window shifted one character to the right, so the
# character at position j of the target is the one following position j of the input.
window_starts = range(0, len(text) - SEQ_LENGTH - 1, STEP)
sentences = [text[start: start + SEQ_LENGTH] for start in window_starts]
targets = [text[start + 1: start + SEQ_LENGTH + 1] for start in window_starts]
print('number of sequences:', len(sentences))

number of sequences: 59842


In [0]:
# Lookup tables between characters and integer ids; ids follow the order of `chars`.
char_to_indices = {c: i for i, c in enumerate(chars)}
indices_to_char = {i: c for i, c in enumerate(chars)}

# One-hot encode every input window (X) and its shifted target window (y) into
# arrays of shape (number of windows, SEQ_LENGTH, num_chars).
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
X = np.zeros((len(sentences), SEQ_LENGTH, num_chars), dtype=bool)
y = np.zeros((len(sentences), SEQ_LENGTH, num_chars), dtype=bool)
for i, (sentence, target) in enumerate(zip(sentences, targets)):
    for j in range(SEQ_LENGTH):
        X[i, j, char_to_indices[sentence[j]]] = True
        y[i, j, char_to_indices[target[j]]] = True

In [30]:
print('Build model...')
model = keras.models.Sequential()
for layer_index in range(DEPTH):
    # Only the first layer declares the input shape: sequences of any length
    # (None) whose elements are one-hot vectors of size num_chars.
    shape_kwargs = {'input_shape': (None, num_chars)} if layer_index == 0 else {}
    # return_sequences=True keeps one output per timestep, matching the
    # per-timestep targets in y.
    model.add(keras.layers.LSTM(UNIT_SIZE, return_sequences=True, **shape_kwargs))
    model.add(keras.layers.Dropout(DROPOUT))
# Apply the same Dense projection to every timestep, then turn the scores into
# a probability distribution over the vocabulary. TimeDistributed is referenced
# by its public name rather than the internal `keras.layers.wrappers` path.
model.add(keras.layers.TimeDistributed(keras.layers.Dense(num_chars)))
model.add(keras.layers.Activation('softmax'))

Build model...


In [0]:
# RMSprop with a step size of 0.01; cross-entropy is measured against the
# one-hot targets and accuracy is tracked alongside the loss.
# NOTE(review): newer Keras releases spell this argument `learning_rate` —
# confirm against the installed version before renaming.
optimizer = keras.optimizers.RMSprop(lr=0.01)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [0]:
def multinomial_with_temperature(preds, temperature=1.0):
    """Sample an index from a probability array after temperature scaling.

    Args:
        preds: 1-D array-like of probabilities (e.g. a softmax output).
        temperature: scaling applied to the log-probabilities. Values below 1
            sharpen the distribution, values above 1 flatten it. A temperature
            of exactly 0 selects the most probable index deterministically.

    Returns:
        The sampled index into `preds`.
    """
    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        # Limit of temperature -> 0 is greedy decoding; avoids dividing by zero.
        return np.argmax(preds)
    # The small constant keeps log() finite for zero probabilities.
    scaled = np.log(preds + 1e-8) / temperature
    # Shift by the maximum before exponentiating: the result is unchanged after
    # normalisation, but at least one term is exp(0) = 1, so the sum cannot
    # underflow to zero (which produced NaN probabilities at low temperatures).
    scaled = scaled - np.max(scaled)
    exp_preds = np.exp(scaled)
    preds = exp_preds / np.sum(exp_preds)
    # A single multinomial draw yields a one-hot row; its argmax is the sample.
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


def sample(model, char_to_indices, indices_to_char, 
           seed_string=" ", temperature=1.0, test_length=150):
    """Generate text one character at a time, starting from `seed_string`.

    On every step the whole text produced so far is one-hot encoded and passed
    to `model.predict`; the prediction for the last timestep is sampled with
    `multinomial_with_temperature` and the chosen character is appended.
    Higher temperatures give more varied (and more error-prone) text.

    Args:
        model: object whose `predict` returns per-timestep probabilities.
        char_to_indices: mapping from character to vocabulary index.
        indices_to_char: mapping from vocabulary index to character.
        seed_string: initial text; every character must be in `char_to_indices`.
        temperature: forwarded to `multinomial_with_temperature`.
        test_length: number of characters to generate.

    Returns:
        `seed_string` followed by `test_length` generated characters.
    """
    vocab_size = len(char_to_indices)
    generated = seed_string
    for _ in range(test_length):
        # Re-encode the full text so far; the model sees the complete context.
        codes = [char_to_indices[ch] for ch in generated]
        one_hot = np.zeros((1, len(codes), vocab_size))
        for position, code in enumerate(codes):
            one_hot[0, position, code] = 1
        # Keep only the distribution predicted after the final character.
        last_step_probs = model.predict(one_hot, verbose=0)[0][-1]
        chosen = multinomial_with_temperature(last_step_probs, temperature)
        generated = generated + indices_to_char[chosen]
    return generated

In [33]:
# Train on the one-hot windows; `history` records the per-epoch loss and metrics.
history = model.fit(x=X, y=y, epochs=10, batch_size=1024)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10

Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [34]:
# Generate a continuation of "truth" with slightly sharpened sampling.
sample(
    model,
    char_to_indices,
    indices_to_char,
    seed_string="truth",
    temperature=0.8,
)

'truth is a really one af the sympathy with its deperfection of his own rilers like the moral sembs, something of the german socrates and epines the break a'

In [0]:
# Name the saved model after the loss of the last training epoch, write it to
# disk, then download it through google.colab's `files` helper.
final_loss = history.history['loss'][-1]
model_filename = 'nietzsche.loss{0:.2f}.h5'.format(final_loss)
model.save(model_filename)
files.download(model_filename)