
reference:

https://github.com/martin-gorner/tensorflow-rnn-shakespeare/blob/master/my_txtutils.py
https://github.com/fchollet/keras/blob/master/examples/lstm_text_generation.py


In [1]:
import numpy as np

In [2]:
# Size of the alphabet that we work with: code 0 (unknown), 1 (tab),
# 2-96 (ASCII 32-126 shifted down by 30) and 97 (line feed) -> 98 codes.
ALPHASIZE = 98


# Specification of the supported alphabet (subset of ASCII-7)
# 10 line feed LF
# 32-64 numbers and punctuation
# 65-90 upper-case letters
# 91-96 more punctuation
# 97-122 lower-case letters
# 123-126 more punctuation
def convert_from_alphabet(a):
    """Map a single character to its integer code in the supported alphabet.

    :param a: one character
    :return: 1 for TAB, 97 for LF, ``ord(a) - 30`` for printable ASCII
             (32..126), and 0 for anything else (unknown)
    """
    code_point = ord(a)

    # Control characters that get dedicated slots in the alphabet.
    special_codes = {9: 1, 10: 127 - 30}
    if code_point in special_codes:
        return special_codes[code_point]

    # Printable ASCII is shifted down by 30 so that space lands on code 2.
    is_printable = 32 <= code_point <= 126
    return code_point - 30 if is_printable else 0


# encoded values:
# unknown = 0
# tab = 1
# space = 2
# all chars from 32 to 126 = c-30
# LF mapped to 127-30
def convert_to_alphabet(c, avoid_tab_and_lf=False):
    """Decode an alphabet code back into a character.

    This is the inverse of ``convert_from_alphabet``:
    code 1 is TAB, code 97 (127 - 30) is LF and codes 2..96 are the
    printable ASCII characters 32..126.

    :param c: integer code (plain int or NumPy integer)
    :param avoid_tab_and_lf: if True, TAB is rendered as a space and LF as a
        backslash, so the decoded text stays on a single line
    :return: the decoded one-character string, or '?' for any code outside
        the alphabet (including 0, the "unknown" code)
    """
    # The special codes must return immediately: neither 1 nor 97 satisfies
    # the printable-range test below, so falling through would turn them
    # into '?'.
    if c == 1:
        return ' ' if avoid_tab_and_lf else '\t'
    if c == 127 - 30:
        return '\\' if avoid_tab_and_lf else '\n'
    if 32 <= c + 30 <= 126:
        return chr(c + 30)
    return '?'  # unknown

In [3]:
# Toy corpus: upper-case letters, digits, then lower-case letters.
text = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890abcdefghijklmnopqrstuvwxyz'
print('corpus length:', len(text))

# Sorted vocabulary plus lookup tables in both directions.
chars = sorted(set(text))
print('total chars:', len(chars))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = dict(enumerate(chars))

('corpus length:', 62)
('total chars:', 62)


In [4]:
def generate_batches(text, batch_size, sequence_size, num_of_chars, num_of_epochs):
    """
    Generate one-hot encoded training data for a stateful RNN, one epoch per step.

    The text is cut into ``batch_size`` consecutive rows. Each row is then cut
    into ``num_of_batchs`` windows of ``sequence_size`` characters. The arrays
    yielded for an epoch stack those windows batch after batch (rows
    ``k * batch_size .. (k + 1) * batch_size - 1`` form batch ``k``), so row
    ``i`` of one batch continues in row ``i`` of the next batch.

    From one epoch to the next the rows are rolled by one position, so each
    row continues with the text that follows it, apart from the row that
    wraps around from the end of the text back to its beginning. The
    remainder of the text that does not fill a whole batch is ignored.

    :param text: the training text
    :param batch_size: the size of a training minibatch
    :param sequence_size: the unroll size of the RNN
    :param num_of_chars: width of the one-hot encoding; must be larger than
        every code returned by convert_from_alphabet
    :param num_of_epochs: number of epochs to generate (one yield per epoch)
    :return: a generator yielding, once per epoch, the tuple
        X_encode: one-hot inputs, shape
            (num_of_batchs * batch_size, sequence_size, num_of_chars)
        Y_encode: one-hot labels, shape (num_of_batchs * batch_size, num_of_chars);
            the label is the character that follows each input sequence
        X: the input sequences, not encoded, used for debugging
        Y: the label characters, not encoded, used for debugging
        epoch: the current epoch number (starting at 0)
    :raises AssertionError: if the text is too short for a single batch
    """

    data = np.array(list(text))
    data_len = data.shape[0]

    # One character is held back (the "- 1") because the labels are the
    # inputs shifted by one position.
    num_of_batchs = (data_len - 1) // (batch_size * sequence_size)

    assert num_of_batchs > 0, "Not enough data, even for a single batch. Try using a smaller batch_size."

    rounded_data_len = num_of_batchs * batch_size * sequence_size

    xdata = np.reshape(data[0:rounded_data_len], [batch_size, num_of_batchs * sequence_size])
    ydata = np.reshape(data[1:rounded_data_len + 1], [batch_size, num_of_batchs * sequence_size])

    for epoch in range(num_of_epochs):
        # Collect the per-batch slices and stack them once, instead of
        # growing the arrays with a vstack/hstack per batch.
        x_parts = []
        y_parts = []
        for batch in range(num_of_batchs):
            window = slice(batch * sequence_size, (batch + 1) * sequence_size)
            # Roll the rows to continue the text from epoch to epoch
            # (do not reset the RNN state!).
            x_parts.append(np.roll(xdata[:, window], -epoch, axis=0))
            # Only the last shifted character is kept: it is the character
            # that follows the input window.
            y_parts.append(np.roll(ydata[:, window], -epoch, axis=0)[:, -1])
        X = np.vstack(x_parts)
        Y = np.hstack(y_parts)

        # Builtin bool instead of np.bool: the alias was removed from NumPy.
        X_encode = np.zeros((num_of_batchs * batch_size, sequence_size, num_of_chars), dtype=bool)
        Y_encode = np.zeros((num_of_batchs * batch_size, num_of_chars), dtype=bool)
        for i, sentence in enumerate(X):
            for t, char in enumerate(sentence):
                X_encode[i, t, convert_from_alphabet(char)] = 1

        for i, char in enumerate(Y):
            Y_encode[i, convert_from_alphabet(char)] = 1

        yield X_encode, Y_encode, X, Y, epoch
        

## Test generate_batches() function

In [5]:
# Small settings so that the generated arrays are easy to inspect by eye.
seq_len = 7
batch_size=4

# Generator producing 10 epochs of data from the toy text, one-hot width ALPHASIZE.
ge = generate_batches(text, batch_size, seq_len, ALPHASIZE, 10)

In [6]:
# Fetch the first epoch. The builtin next() works on Python 2.6+ and
# Python 3, whereas the generator method .next() only exists on Python 2.
X_encode, Y_encode, X, Y, epoch = next(ge)
print('Epoch number {}'.format(epoch))
print('X_encode.shape', X_encode.shape)
print('Y_encode.shape', Y_encode.shape)
print('X.shape', X.shape)
print('Y.shape', Y.shape)
print(X)
print(Y)

# Round-trip check: decoding the one-hot arrays should reproduce X and Y.
X_decode = [[convert_to_alphabet(np.argmax(char_code)) for char_code in sentence]
            for sentence in X_encode]
print(np.array(X_decode))

Y_decode = [convert_to_alphabet(np.argmax(char_code)) for char_code in Y_encode]
print(np.array(Y_decode))

Epoch number 0
('X_encode.shape', (8, 7, 98))
('Y_encode.shape', (8, 98))
('X.shape', (8, 7))
('Y.shape', (8,))
[['A' 'B' 'C' 'D' 'E' 'F' 'G']
 ['O' 'P' 'Q' 'R' 'S' 'T' 'U']
 ['3' '4' '5' '6' '7' '8' '9']
 ['g' 'h' 'i' 'j' 'k' 'l' 'm']
 ['H' 'I' 'J' 'K' 'L' 'M' 'N']
 ['V' 'W' 'X' 'Y' 'Z' '1' '2']
 ['0' 'a' 'b' 'c' 'd' 'e' 'f']
 ['n' 'o' 'p' 'q' 'r' 's' 't']]
['H' 'V' '0' 'n' 'O' '3' 'g' 'u']
[['A' 'B' 'C' 'D' 'E' 'F' 'G']
 ['O' 'P' 'Q' 'R' 'S' 'T' 'U']
 ['3' '4' '5' '6' '7' '8' '9']
 ['g' 'h' 'i' 'j' 'k' 'l' 'm']
 ['H' 'I' 'J' 'K' 'L' 'M' 'N']
 ['V' 'W' 'X' 'Y' 'Z' '1' '2']
 ['0' 'a' 'b' 'c' 'd' 'e' 'f']
 ['n' 'o' 'p' 'q' 'r' 's' 't']]
['H' 'V' '0' 'n' 'O' '3' 'g' 'u']


In [7]:
# Fetch the next epoch; its rows should be rolled by one position relative
# to the previous epoch. next() and single-argument print(...) behave the
# same on Python 2 and Python 3.
X_encode, Y_encode, X, Y, epoch = next(ge)
print(epoch)
print(X_encode.shape)
print(X.shape)
print(Y.shape)
print(X)
print(Y)

# Round-trip check: decoding the one-hot arrays should reproduce X and Y.
X_decode = [[convert_to_alphabet(np.argmax(char_code)) for char_code in sentence]
            for sentence in X_encode]
print(np.array(X_decode))

Y_decode = [convert_to_alphabet(np.argmax(char_code)) for char_code in Y_encode]
print(np.array(Y_decode))

1
(8, 7, 98)
(8, 7)
(8,)
[['O' 'P' 'Q' 'R' 'S' 'T' 'U']
 ['3' '4' '5' '6' '7' '8' '9']
 ['g' 'h' 'i' 'j' 'k' 'l' 'm']
 ['A' 'B' 'C' 'D' 'E' 'F' 'G']
 ['V' 'W' 'X' 'Y' 'Z' '1' '2']
 ['0' 'a' 'b' 'c' 'd' 'e' 'f']
 ['n' 'o' 'p' 'q' 'r' 's' 't']
 ['H' 'I' 'J' 'K' 'L' 'M' 'N']]
['V' '0' 'n' 'H' '3' 'g' 'u' 'O']
[['O' 'P' 'Q' 'R' 'S' 'T' 'U']
 ['3' '4' '5' '6' '7' '8' '9']
 ['g' 'h' 'i' 'j' 'k' 'l' 'm']
 ['A' 'B' 'C' 'D' 'E' 'F' 'G']
 ['V' 'W' 'X' 'Y' 'Z' '1' '2']
 ['0' 'a' 'b' 'c' 'd' 'e' 'f']
 ['n' 'o' 'p' 'q' 'r' 's' 't']
 ['H' 'I' 'J' 'K' 'L' 'M' 'N']]
['V' '0' 'n' 'H' '3' 'g' 'u' 'O']


In [8]:
# Show the epoch number returned by the most recent generator step.
epoch

1

## Data preparation as done in a Keras code example
https://github.com/fchollet/keras/blob/master/examples/lstm_text_generation.py

We will not use it, because we want to train a stateful LSTM model and we want our batch size to be large.

In [9]:
# cut the text in semi-redundant sequences of maxlen characters
maxlen = 6
step = 2
batch_size=4
num_epochs = 10

# Window k covers text[k*step : k*step + maxlen]; its label is the
# character right after the window.
window_starts = range(0, len(text) - maxlen, step)
sentences = [text[i: i + maxlen] for i in window_starts]
next_chars = [text[i + maxlen] for i in window_starts]
print('nb sequences:', len(sentences))

print('Vectorization...')
# Builtin bool instead of np.bool: the alias was removed from NumPy.
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

('nb sequences:', 28)
Vectorization...


In [10]:
# With step=2 and maxlen=6, window k starts at offset 2*k, so window 3
# begins exactly where window 0 ends.
print(sentences[0])
print(sentences[3])

# Likewise windows 2 and 5 are adjacent, non-overlapping slices of the text.
print(sentences[2])
print(sentences[5])

ABCDEF
GHIJKL
EFGHIJ
KLMNOP


In [11]:
# Number of training windows extracted from the text.
len(sentences)

28

In [12]:
# Shapes of the one-hot inputs and labels. Single-argument print(...) gives
# the same output on Python 2 and Python 3.
print(X.shape)
print(y.shape)

(28, 6, 62)
(28, 62)


# Build a very simple model and test our batch generator code:

In [22]:
from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys

# Number of codes in the alphabet; also the width of the one-hot vectors.
ALPHASIZE = 98

# Training corpus: nine back-to-back copies of the 62-character toy sequence.
text = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890abcdefghijklmnopqrstuvwxyz' * 9

# Hyper-parameters shared by the model and the batch generator.
num_of_chars = ALPHASIZE
batch_size = 10
sequence_size = 5
num_of_epochs = 400

# build the model: a single stateful LSTM followed by a softmax over the alphabet
print('Build model...')
model = Sequential([
    LSTM(256, batch_input_shape=(batch_size, sequence_size, num_of_chars), stateful = True),
    Dense(num_of_chars),
    Activation('softmax'),
])

# NOTE(review): newer Keras releases appear to name this argument
# `learning_rate` -- confirm against the installed version.
optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
# Open question from the author: the adam optimizer did not work here,
# or at least did not converge as fast -- why?

Build model...


In [23]:
def sample(preds, temperature=1.0):
    """Draw one index at random from a probability array.

    The probabilities are re-weighted as ``p ** (1 / temperature)`` and
    renormalised before the draw, then a single multinomial draw picks the
    index.

    :param preds: array-like of probabilities
    :param temperature: divisor applied to the log-probabilities
    :return: the sampled index
    """
    scaled_log_probs = np.log(np.asarray(preds).astype('float64')) / temperature
    weights = np.exp(scaled_log_probs)
    distribution = weights / np.sum(weights)
    # A single draw yields a one-hot row; argmax recovers the chosen index.
    one_hot_draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(one_hot_draw)

In [24]:
batch_generator = generate_batches(text, batch_size, sequence_size, num_of_chars, num_of_epochs)


def _generate_text(seed, diversity, length=100):
    """Sample `length` characters from the model, echoing each one to stdout.

    The recurrent state is reset first. The current window of characters is
    one-hot encoded into every row of a full batch, and only the prediction
    for row 0 is used to pick the next character.

    :param seed: starting window; must be no longer than sequence_size
    :param diversity: sampling temperature passed to sample()
    :param length: number of characters to generate
    :return: the seed followed by the generated characters
    """
    model.reset_states()

    window = seed
    generated = seed
    for _ in range(length):
        x = np.zeros((batch_size, sequence_size, num_of_chars))
        for t, char in enumerate(window):
            # Same window in every row of the batch.
            x[:, t, convert_from_alphabet(char)] = 1.

        preds = model.predict(x, verbose=0)[0]
        next_char = convert_to_alphabet(sample(preds, diversity))

        generated += next_char
        window = window[1:] + next_char  # slide the window by one character

        sys.stdout.write(next_char)
    return generated


# train the model, output generated text on every 10th iteration
for iteration in range(1, num_of_epochs):
    print()
    print('-' * 50)
    print('Iteration', iteration)

    # Builtin next() works on Python 2.6+ and 3; generator.next() is Python 2 only.
    X, y, _X, _y, epoch = next(batch_generator)

    # shuffle=False keeps the sample order the stateful LSTM relies on.
    model.fit(X, y,
              batch_size=batch_size,
              epochs=1,
              shuffle=False
              )

    # Only needed if the commented-out random seeds below are re-enabled.
    start_index = random.randint(0, len(text) - sequence_size - 1)

    if (iteration + 1) % 10 == 0:
        for diversity in [0.1]:
            print()
            print('----- diversity:', diversity)

            sentence = '_ABCD' #text[start_index: start_index + sequence_size]
            print('----- Generating with seed: "' + sentence + '"')
            sys.stdout.write(sentence)
            generated = _generate_text(sentence, diversity)

            sentence_2 = 'x#,3F' #text[start_index: start_index + sequence_size]
            print('\n----- Generating with seed: "' + sentence_2 + '"')
            sys.stdout.write(sentence_2)
            generated_2 = _generate_text(sentence_2, diversity)
            print()


--------------------------------------------------
Iteration 1
Epoch 1/1

--------------------------------------------------
Iteration 2
Epoch 1/1

--------------------------------------------------
Iteration 3
Epoch 1/1

--------------------------------------------------
Iteration 4
Epoch 1/1

--------------------------------------------------
Iteration 5
Epoch 1/1

--------------------------------------------------
Iteration 6
Epoch 1/1

--------------------------------------------------
Iteration 7
Epoch 1/1

--------------------------------------------------
Iteration 8
Epoch 1/1

--------------------------------------------------
Iteration 9
Epoch 1/1

----- diversity: 0.1
----- Generating with seed: "_ABCD"
_ABCDEJKPXZ5achijkmquzBFKPS56aefhhjmosyBGKPU16afghsjkowyBGHPXZ5abcdefhhopowABGKPX567efghijopyBHLRW289himd
----- Generating with seed: "x#,3F"
x#,3FD0aefhijopyBHLQW27efhityopuADINSX39cdfghimuwAEJKLPQRWXefhhojkswBDHIN9399cdfghimuzDEFGHNOTY40afghijop

---------------------------


----- diversity: 0.1
----- Generating with seed: "_ABCD"
_ABCDEFLMNPQSTUVWX34567bcdjkgiqrs7GjkprkoqrsEhinstcejklmprsu7wBDFKMNPQSTUVWX34567bcdjkgiqrs7GjkprNoqrDEpu
----- Generating with seed: "x#,3F"
x#,3FGIJOLMRSUX3890achihjkqrsDFGLPORSTU334567bcijkqpqrsuDEFLNOPRSTUVX345678cdjkqkqrwCEGLMOPQRSTZ3789cijij

--------------------------------------------------
Iteration 50
Epoch 1/1

--------------------------------------------------
Iteration 51
Epoch 1/1

--------------------------------------------------
Iteration 52
Epoch 1/1

--------------------------------------------------
Iteration 53
Epoch 1/1

--------------------------------------------------
Iteration 54
Epoch 1/1

--------------------------------------------------
Iteration 55
Epoch 1/1

--------------------------------------------------
Iteration 56
Epoch 1/1

--------------------------------------------------
Iteration 57
Epoch 1/1

--------------------------------------------------
Iteration 58
Epoch 1/1

------------------


--------------------------------------------------
Iteration 96
Epoch 1/1

--------------------------------------------------
Iteration 97
Epoch 1/1

--------------------------------------------------
Iteration 98
Epoch 1/1

--------------------------------------------------
Iteration 99
Epoch 1/1

----- diversity: 0.1
----- Generating with seed: "_ABCD"
_ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890abcdefghijklqnopqrsEFGHIJKLMNOPQRSTUVWXYZ1234567890abcdefghijklqnopq
----- Generating with seed: "x#,3F"
x#,3F5Wu890abcdefghijklqnopqrsEFGHIJKLMNOPQRSTUVWXYZ1234567890abcdefghijklqnopqrsEFGHIJKLMNOPQRSTUVWXYZ12

--------------------------------------------------
Iteration 100
Epoch 1/1

--------------------------------------------------
Iteration 101
Epoch 1/1

--------------------------------------------------
Iteration 102
Epoch 1/1

--------------------------------------------------
Iteration 103
Epoch 1/1

--------------------------------------------------
Iteration 104
Epoch 1/1

-------------


--------------------------------------------------
Iteration 142
Epoch 1/1

--------------------------------------------------
Iteration 143
Epoch 1/1

--------------------------------------------------
Iteration 144
Epoch 1/1

--------------------------------------------------
Iteration 145
Epoch 1/1

--------------------------------------------------
Iteration 146
Epoch 1/1

--------------------------------------------------
Iteration 147
Epoch 1/1

--------------------------------------------------
Iteration 148
Epoch 1/1

--------------------------------------------------
Iteration 149
Epoch 1/1

----- diversity: 0.1
----- Generating with seed: "_ABCD"
_ABCDEFKLMNOPQRSTUVWXYZ1234567890abddfghijklmnopqrstwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890abddfghijklmn
----- Generating with seed: "x#,3F"
x#,3F567890abddfghijklmnopqrstwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890abddfghijklmnopqrstwxyzABCDEFGHIJKL

--------------------------------------------------
Iteration 150
Epoch 1/1

---------


----- diversity: 0.1
----- Generating with seed: "_ABCD"
_ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890abcdef
----- Generating with seed: "x#,3F"
x#,3F5HIJKLMNOPQRSTUVWXYZ1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890abcdefgh

--------------------------------------------------
Iteration 190
Epoch 1/1

--------------------------------------------------
Iteration 191
Epoch 1/1

--------------------------------------------------
Iteration 192
Epoch 1/1

--------------------------------------------------
Iteration 193
Epoch 1/1

--------------------------------------------------
Iteration 194
Epoch 1/1

--------------------------------------------------
Iteration 195
Epoch 1/1

--------------------------------------------------
Iteration 196
Epoch 1/1

--------------------------------------------------
Iteration 197
Epoch 1/1

--------------------------------------------------
Iteration 198
Epoch 1/1

---------


--------------------------------------------------
Iteration 236
Epoch 1/1

--------------------------------------------------
Iteration 237
Epoch 1/1

--------------------------------------------------
Iteration 238
Epoch 1/1

--------------------------------------------------
Iteration 239
Epoch 1/1

----- diversity: 0.1
----- Generating with seed: "_ABCD"
_ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890abcdefghijklqnotqvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890abcdefghij
----- Generating with seed: "x#,3F"
x#,3F57890abcdefghijklqnstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890abcdefghijklqnotqvwxyzABCDEFGHIJKLMNO

--------------------------------------------------
Iteration 240
Epoch 1/1

--------------------------------------------------
Iteration 241
Epoch 1/1

--------------------------------------------------
Iteration 242
Epoch 1/1

--------------------------------------------------
Iteration 243
Epoch 1/1

--------------------------------------------------
Iteration 244
Epoch 1/1

---------


--------------------------------------------------
Iteration 282
Epoch 1/1

--------------------------------------------------
Iteration 283
Epoch 1/1

--------------------------------------------------
Iteration 284
Epoch 1/1

--------------------------------------------------
Iteration 285
Epoch 1/1

--------------------------------------------------
Iteration 286
Epoch 1/1

--------------------------------------------------
Iteration 287
Epoch 1/1

--------------------------------------------------
Iteration 288
Epoch 1/1

--------------------------------------------------
Iteration 289
Epoch 1/1

----- diversity: 0.1
----- Generating with seed: "_ABCD"
_ABCDEFGHIJKLMNOPQRSTUVWXYZ12345678efghijklmnopqrstwvyzBBCDEFGHIJKLMNOPQRSTUVWXYZ12345678efghijklmnopqrst
----- Generating with seed: "x#,3F"
x#,3F56789cdefghijklmnstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ12345678efghijklmnopqrstwvyzBBCDEFGHIJKLMNOPQRSTU

--------------------------------------------------
Iteration 290
Epoch 1/1

---------


----- diversity: 0.1
----- Generating with seed: "_ABCD"
_ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890abcdef
----- Generating with seed: "x#,3F"
x#,3Fk7ukJKLMNOPQRSTUVWXYZ1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890abcdefg

--------------------------------------------------
Iteration 330
Epoch 1/1

--------------------------------------------------
Iteration 331
Epoch 1/1

--------------------------------------------------
Iteration 332
Epoch 1/1

--------------------------------------------------
Iteration 333
Epoch 1/1

--------------------------------------------------
Iteration 334
Epoch 1/1

--------------------------------------------------
Iteration 335
Epoch 1/1

--------------------------------------------------
Iteration 336
Epoch 1/1

--------------------------------------------------
Iteration 337
Epoch 1/1

--------------------------------------------------
Iteration 338
Epoch 1/1

---------


--------------------------------------------------
Iteration 376
Epoch 1/1

--------------------------------------------------
Iteration 377
Epoch 1/1

--------------------------------------------------
Iteration 378
Epoch 1/1

--------------------------------------------------
Iteration 379
Epoch 1/1

----- diversity: 0.1
----- Generating with seed: "_ABCD"
_ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890abcdijklmnopqrwtyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890abcdijklmnopqr
----- Generating with seed: "x#,3F"
x#,3F577890hijklmnopqrsFuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890abddijklmnopqrwtyvABCDEFGHIJKLMNOPQRSTUV

--------------------------------------------------
Iteration 380
Epoch 1/1

--------------------------------------------------
Iteration 381
Epoch 1/1

--------------------------------------------------
Iteration 382
Epoch 1/1

--------------------------------------------------
Iteration 383
Epoch 1/1

--------------------------------------------------
Iteration 384
Epoch 1/1

---------