# Setup

https://github.com/fchollet/keras/blob/master/examples/lstm_text_generation.py

Example script to generate text from Nietzsche's writings.
At least 20 epochs are required before the generated text
starts sounding coherent.
It is recommended to run this script on GPU, as recurrent
networks are quite computationally intensive.
If you try this script on new data, make sure your corpus
has at least ~100k characters. ~1M is better.

In [1]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys

Using TensorFlow backend.


In [2]:
path = get_file('nietzsche.txt', origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
text = open(path).read().lower()
print('corpus length:', len(text))

corpus length: 600893


In [3]:
print(text[:100])

preface


supposing that truth is a woman--what then? is there not ground
for suspecting that all ph


In [4]:
chars = sorted(list(set(text)))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

total chars: 57


In [5]:
print(char_indices)

{'\n': 0, ' ': 1, '!': 2, '"': 3, "'": 4, '(': 5, ')': 6, ',': 7, '-': 8, '.': 9, '0': 10, '1': 11, '2': 12, '3': 13, '4': 14, '5': 15, '6': 16, '7': 17, '8': 18, '9': 19, ':': 20, ';': 21, '=': 22, '?': 23, '[': 24, ']': 25, '_': 26, 'a': 27, 'b': 28, 'c': 29, 'd': 30, 'e': 31, 'f': 32, 'g': 33, 'h': 34, 'i': 35, 'j': 36, 'k': 37, 'l': 38, 'm': 39, 'n': 40, 'o': 41, 'p': 42, 'q': 43, 'r': 44, 's': 45, 't': 46, 'u': 47, 'v': 48, 'w': 49, 'x': 50, 'y': 51, 'z': 52, 'ä': 53, 'æ': 54, 'é': 55, 'ë': 56}


In [6]:
print(indices_char)

{0: '\n', 1: ' ', 2: '!', 3: '"', 4: "'", 5: '(', 6: ')', 7: ',', 8: '-', 9: '.', 10: '0', 11: '1', 12: '2', 13: '3', 14: '4', 15: '5', 16: '6', 17: '7', 18: '8', 19: '9', 20: ':', 21: ';', 22: '=', 23: '?', 24: '[', 25: ']', 26: '_', 27: 'a', 28: 'b', 29: 'c', 30: 'd', 31: 'e', 32: 'f', 33: 'g', 34: 'h', 35: 'i', 36: 'j', 37: 'k', 38: 'l', 39: 'm', 40: 'n', 41: 'o', 42: 'p', 43: 'q', 44: 'r', 45: 's', 46: 't', 47: 'u', 48: 'v', 49: 'w', 50: 'x', 51: 'y', 52: 'z', 53: 'ä', 54: 'æ', 55: 'é', 56: 'ë'}


In [7]:
# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40 # of input char vector
step = 3  # don't get every next char. Why desirable?
stopping_char_index = len(text) - maxlen  # 600853, to avoid IndexError at end of a file
sentences = []
next_chars = []
for i in range(0, stopping_char_index, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

nb sequences: 200285


In [8]:
print(sentences[0])

preface


supposing that truth is a woma


In [9]:
print(sentences[1])

face


supposing that truth is a woman--


In [10]:
print(sentences[2])

e


supposing that truth is a woman--wha


In [11]:
print(next_chars[0])

n


In [12]:
print(next_chars[1])

w


In [13]:
print(next_chars[2])

t


In [14]:
print('Vectorization...')
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)  # a 3-tensor
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)  # a 2-tensor

Vectorization...


In [15]:
type(X)

numpy.ndarray

In [16]:
len(X)  # number of "sentences", instances

200285

In [20]:
len(X[0])  # len of each "sentence"; there will be made a vector for each of the forty characters

40

In [25]:
len(X[0][0])  # the total number of features, ie the columns; one-hot encodings will be put into this

57

In [27]:
X

array([[[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ..., 
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False]],

       [[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ..., 
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False]],

       [[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ..., 
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, 

In [28]:
X[0]

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ..., 
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]], dtype=bool)

In [29]:
X[0][0]

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False], dtype=bool)

In [34]:
# one-hot encode each sentence into the 3-tensor
# one-hot encode each target char into the 2-tensor
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        # about filling out the 3-tensor
        # X[<0>] is the sentence index
        # X[<1>] is the character index
        # X[<2>] is the one-hot character index
        X[i, t, char_indices[char]] = 1
    # about filling out the 2-tensor
    # y[<0>] is the sentence index
    # y[<1>] is the character index, contains the "next" character
    y[i, char_indices[next_chars[i]]] = 1

In [35]:
for i, sentence in enumerate(sentences):
    print(i, sentence)
    input()

0 preface


supposing that truth is a woma

1 face


supposing that truth is a woman--


KeyboardInterrupt: 

In [36]:
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        print(t, char)
        input()

0 p

1 r

2 e

3 f

4 a

5 c

6 e

7 


8 


9 



KeyboardInterrupt: 

# Make model

In [37]:
# build the model: a single LSTM
print('Build model...')
model = Sequential()
# on the values of the first layer:
# input_shape[0] is the length of the feature vector sentences
# input_shape[1] is the total number of characters/features possible
# so this is essentially the shape of each one-hot encoded input vector (there being ~200K total)
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))

Build model...


In [38]:
optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [40]:
def sample(preds, temperature=1.0):
    """A helper function to sample an index from a probability array.
    For each input sentence, an output array of features is made, with each feature
    given a different weight.
    
    On the 'temperature' or 'diversity':
    https://github.com/karpathy/char-rnn
    - the temperature is dividing the predicted log probabilities before the Softmax
    - lower temperature will cause the model to make more likely, but also more boring and conservative predictions
    - Higher temperatures cause the model to take more chances and increase diversity of results, but at a cost of more mistakes
    """
    preds = np.asarray(preds).astype('float64')
    # https://en.wikipedia.org/wiki/Smoothing
    # to smooth a data set is to create an approximating function that attempts to capture important patterns 
    # in the data, while leaving out noise or other fine-scale structures/rapid phenomena
    preds = np.log(preds) / temperature
    # https://stackoverflow.com/a/31952102
    # e^x where e is a mathematical constant called Euler's number, approximately 2.718281
    #? what is the point of this here? is it to separate the now-smoothed numbers?
    exp_preds = np.exp(preds)
    # essentially get the avg. probability of each char; this is necessary for the multinomial() function next
    # so `preds` is a list of fractions (e.g., `[3/6., 1/6., .5/6., .5/6., .5/6., .5/6.]`) that together add up to 1
    preds = exp_preds / np.sum(exp_preds)
    # https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.random.multinomial.html
    # rolling dice example: roll 20 times, 6 possible options
    # >>> np.random.multinomial(20, [1/6., 1/6., 1/6., 1/6., 1/6., 1/6.], size=1)
    # array([[4, 1, 7, 5, 2, 1]])
    # this means that in this test the die landed 4 times on 1, once on 2, etc.
    # but we can weight the rolls, too, to make one side more likely:
    # >>> np.random.multinomial(20, [3/6., 1/6., .5/6., .5/6., .5/6., .5/6.], size=1)
    # array([[11,  4,  1,  1,  3,  0]])
    # so the "1" side has a 50% chance, the others much less
    # in multinomial(), first param is the number of "rolls",
    # preds the number of possible "sides" (in this case, each char)
    # but due to the uneven liklihood of each char (some more likely, as if weighted dice), some chars
    # more likely to roll out than others
    probas = np.random.multinomial(1, preds, size=1)
    # in the random "rollout", yet with each character weighted, the most commonly occuring
    # character comes out
    # >>> np.argmax([4, 2, 9, 6])
    # 2
    # thus the index point 2 has the highest value among this list
    return np.argmax(probas)

In [None]:
# train the model, output generated text after each iteration
for iteration in range(1, 60):  # do 60 total iterations
    print()
    print('-' * 50)
    print('Iteration', iteration)
    model.fit(X, y,
              batch_size=128,
              epochs=1)  # we could just put the eteration/epochs here ...
    # ... however we wouldn't be able to print out the results to manually
    # check the quality of the text generated by the model
    # could be increased if we only want to check the output every n epochs

    # generatr a "seed" string of text for manually checking
    start_index = random.randint(0, len(text) - maxlen - 1)

    # that is, diversity of temperature
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print()
        print('----- diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        # using sys.stdout.write() here because it does not start a newline
        # >>> sys.stdout.write('a')
        # sys.stdout.write('b')
        # sys.stdout.write('c')
        # abc
        sys.stdout.write(generated)

        # generated sentence will be 400 chars long
        for i in range(400):
            # make an 3-tensor for 1 sentence; other two "inner" lists are same as input
            x = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                # only 1 instance to iterate through, but must do it for each char in the newly generated seed sentence
                # `t` here is the index of the char w/in the sentence
                # the innermost value (`char_indices[char]`) is the one-hot, getting the one-hot column's index w/ the character mapping
                x[0, t, char_indices[char]] = 1.

            # use model to get prediction distribution of each character
            # the `[0]` added because .predict() returns a 2-tensort with a len of 1
            '''
            print(model.predict(x, verbose=0))
            [[  1.12908916e-03   2.82170554e-03   8.17681666e-06   9.73082279e-05
            ...
            1.08626102e-06   4.76212680e-10   5.19392807e-10   5.03852682e-10
            6.59048205e-10]]
            '''
            preds = model.predict(x, verbose=0)[0]
            '''
            print(type(preds))
            <class 'numpy.ndarray'>
            print(len(preds))  # i.e., the total number of unique characters; == len(chars)
            53
            print(preds)
            [  5.63451322e-03   6.24113195e-02   5.61064326e-06   1.89539453e-04
               5.23562312e-06   3.58946739e-09   6.11432199e-07   3.76065401e-03
               9.97148527e-05   7.50456355e-04   3.09984955e-07   4.14119359e-06
               1.70199803e-06   2.60797742e-06   4.45179069e-07   2.18999546e-07
               5.98566203e-07   3.62546700e-07   2.31203103e-07   7.92739286e-07
               8.65528818e-06   8.75548449e-06   2.16324429e-06   3.82856433e-06
               9.58330801e-08   2.07838355e-07   5.81625734e-08   3.64573151e-02
               2.15078962e-05   5.13660780e-04   8.96637328e-04   5.39803803e-01
               5.37416257e-04   4.68064053e-03   4.04005223e-05   2.57718772e-01
               1.44889341e-07   8.06269236e-06   5.32099220e-04   2.06660107e-03
               8.01626220e-03   2.35885438e-02   2.02944051e-04   1.21316859e-07
               1.99312028e-02   2.55254954e-02   6.40479208e-04   2.53212755e-03
               3.33019823e-04   4.86706085e-05   5.96273230e-06   3.00484011e-03
               4.17966959e-07   1.45478837e-10   1.63744171e-10   1.71333586e-10
               1.96869632e-10]
            '''
            
            # smooth the weighting of this distribution and choose a character randomly (yet weighted by this distribution)
            next_index = sample(preds, diversity)
            '''
            print(next_index)
            35
            '''
            # convert this character index into a real character
            next_char = indices_char[next_index]

            # add the predicted char to the seed sentence
            generated += next_char
            # now slide the window one character
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
print()


--------------------------------------------------
Iteration 1
Epoch 1/1

----- diversity: 0.2
----- Generating with seed: " night
vigils, fasts, ardent prayer, per"
 night
vigils, fasts, ardent prayer, per[[  1.12908916e-03   2.82170554e-03   8.17681666e-06   9.73082279e-05
    1.03873963e-05   3.11837425e-06   2.10473013e-06   2.48148048e-04
    9.23632324e-05   8.47898016e-04   4.95617769e-07   5.61147863e-06
    1.87149828e-06   8.85386498e-07   4.65500352e-06   5.56936300e-07
    8.86192367e-07   1.47843593e-07   8.48043157e-07   1.54683732e-06
    4.95718223e-06   9.54393545e-06   3.00270017e-06   7.94278185e-06
    1.95011535e-06   6.61827357e-07   7.76587456e-07   7.12269964e-03
    1.54276669e-03   6.85653090e-02   2.74163345e-03   1.28264111e-02
    2.56166663e-02   1.23041985e-03   2.84018397e-01   1.46878108e-01
    3.13137192e-04   1.19867986e-02   8.68887524e-04   2.05761790e-02
    2.17084796e-03   3.86586203e-03   2.88721044e-02   8.33353115e-05
    6.32750662e-03   3.

# Evaluate