In [1]:
import numpy as np

In [2]:
import idx2numpy

# Read both label files with the same converter: training labels first, then
# the held-out ("t10k") labels. Tuple unpacking consumes the map immediately,
# so the files are read right here, in this order.
y_train, y_test = map(
    idx2numpy.convert_from_file,
    (
        '../dataset/train-labels.idx1-ubyte',
        '../dataset/t10k-labels.idx1-ubyte',
    ),
)

In [3]:
# Start from an empty corpus string; the following cells append the string
# form of every label to it, so the labels become one long character sequence.
text = ''

In [4]:
# Append the string form of every training label to the corpus.
# str.join builds the addition in a single pass instead of re-allocating the
# string on every label, and the generator keeps its loop variable local, so
# no module-level `x` is left behind (that name is used later for the
# one-hot training tensor).
text += ''.join(str(label) for label in y_train)

In [5]:
# Append the string form of every test label after the training labels.
# str.join builds the addition in a single pass instead of re-allocating the
# string on every label, and the generator keeps its loop variable local, so
# no module-level `x` is left behind (that name is used later for the
# one-hot training tensor).
text += ''.join(str(label) for label in y_test)

In [6]:
# Sanity check: total corpus length on the first line, then its first nine
# characters on the second line.
print(len(text), text[:9], sep='\n')

70000
504192131


In [31]:
# Length of extracted character sequences
maxlen = 50

# We sample a new sequence every `step` characters
step = 3

# Start offsets of every window. Stopping at len(text) - maxlen guarantees
# that the character right after each window (its target) exists.
window_starts = range(0, len(text) - maxlen, step)

# This holds our extracted sequences
sentences = [text[start: start + maxlen] for start in window_starts]

# This holds the targets (the follow-up characters)
next_chars = [text[start + maxlen] for start in window_starts]
print('Number of sequences:', len(sentences))

# List of unique characters in the corpus
chars = sorted(set(text))
print('Unique characters:', len(chars))
# Dictionary mapping unique characters to their index in `chars`
# (enumerate avoids a linear `chars.index` scan per character).
char_indices = {char: index for index, char in enumerate(chars)}

# Next, one-hot encode the characters into binary arrays.
# x[i, t, c] is True when position t of sequence i is character c;
# y[i, c] is True when the character following sequence i is c.
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
print('Vectorization...')
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = True
    y[i, char_indices[next_chars[i]]] = True

Number of sequences: 23317
Unique characters: 10
Vectorization...


In [32]:
from keras import layers, models, optimizers

# One LSTM layer reading (maxlen, len(chars)) one-hot windows, followed by a
# softmax layer with one output per character in the vocabulary.
model = models.Sequential([
    layers.LSTM(256, input_shape=(maxlen, len(chars))),
    layers.Dense(len(chars), activation='softmax'),
])

In [33]:
# RMSprop optimizer configured with 0.01 as its `lr` argument.
# NOTE(review): `lr` looks like the legacy keyword; newer Keras releases
# appear to expect `learning_rate` instead — confirm against the installed
# Keras version before changing it.
optimizer = optimizers.RMSprop(lr=0.01)
# Categorical cross-entropy matches the one-hot targets `y` and the softmax
# output layer of the model.
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [34]:
def sample(preds, temperature=1.0):
    """Draw one class index from a probability vector after temperature scaling.

    The probabilities are re-weighted as ``softmax(log(preds) / temperature)``
    and a single index is drawn from the resulting distribution with
    ``np.random.multinomial``.

    Args:
        preds: 1-D array-like of class probabilities (e.g. a softmax output).
        temperature: Re-weighting factor. Values below 1 sharpen the
            distribution, values above 1 flatten it. ``0`` is treated as the
            greedy limit and returns the most probable index directly.

    Returns:
        The sampled class index (a NumPy integer).
    """
    preds = np.asarray(preds).astype('float64')

    # T == 0 would divide by zero and turn the distribution into NaNs; the
    # limit of the re-weighted distribution is a point mass on the arg-max.
    if temperature == 0:
        return np.argmax(preds)

    # Zero probabilities legitimately map to -inf logits (and back to zero
    # probability after exp), so silence the divide-by-zero warning from log.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    # Shift by the maximum before exponentiating: the softmax is unchanged,
    # but the largest term becomes exp(0) == 1, so tiny temperatures can no
    # longer underflow every term to 0 and produce 0/0.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)

    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [35]:
# Fit the model for 1 epoch on the available training data.
# `x` / `y` are the one-hot input windows and one-hot next-character targets
# built in the vectorization cell. fit() is the last expression of the cell,
# so the notebook displays the History object it returns.
model.fit(x, y, batch_size=128, epochs=1)

Epoch 1/1


<keras.callbacks.History at 0x7f6bdc37ca58>

In [40]:
import random
import sys

# Number of characters to generate per seed and temperature, and the number
# of random seeds to try.
chars_to_generate = 50
test_to_perform = 10

# Per-trial records: the seed window, the true continuation of that seed in
# the corpus, and the generated continuation for each temperature.
seed_text = []
text_to_generate = []
text_generated_0_2 = []
text_generated_0_5 = []
text_generated_1_0 = []
text_generated_1_2 = []

# Route each temperature to its result list. Iterating over the dict keys
# replaces a chain of float-equality checks and keeps the temperature list
# and the result lists in one place.
generated_by_temperature = {
    0.2: text_generated_0_2,
    0.5: text_generated_0_5,
    1.0: text_generated_1_0,
    1.2: text_generated_1_2,
}

# Latest start offset that still leaves room for a full seed window AND a
# full-length reference continuation after it; otherwise seeds drawn near the
# end of the corpus get a truncated "original text" to compare against.
last_start_index = len(text) - maxlen - chars_to_generate

for trial in range(test_to_perform):

    # Select a text seed at random
    start_index = random.randint(0, last_start_index)
    end_index = start_index + maxlen
    seed = text[start_index: end_index]

    seed_text.append(seed)
    text_to_generate.append(text[end_index: end_index + chars_to_generate])

    print('--- Generating with seed: "' + seed + '"')
    print('--- Original Text: "' + str(text_to_generate[trial]) + '"')

    for temperature, results in generated_by_temperature.items():
        print('------ temperature:', temperature)

        # Restart from the original seed for every temperature. Without this
        # reset each temperature would be conditioned on the previous
        # temperature's output, while still being compared against the true
        # continuation of the seed.
        generated_text = seed
        generated_chars = []

        for _ in range(chars_to_generate):
            # One-hot encode the current window as a batch of one sequence.
            sampled = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(generated_text):
                sampled[0, t, char_indices[char]] = 1.

            preds = model.predict(sampled, verbose=0)[0]
            next_index = sample(preds, temperature)
            next_char = chars[next_index]

            # Slide the window one character forward and keep the new
            # character separately: the window only equals the generated text
            # when chars_to_generate happens to equal maxlen.
            generated_text = generated_text[1:] + next_char
            generated_chars.append(next_char)

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

        results.append(''.join(generated_chars))

--- Generating with seed: "75396859656403432447390061398132579991300662545093"
--- Original Text: "94262685568701740492564289114513517448566317106576"
------ temperature: 0.2
86666666666666666666668686666666666668666663666666
------ temperature: 0.5
66586668668856662628666662368868686366662858886626
------ temperature: 1.0
61636267166817688665936666866615383580663325885858
------ temperature: 1.2
50064768206666836656261666865262686777905662616832
--- Generating with seed: "27050571326633361620321891964140141341767179105162"
--- Original Text: "73848566375809504122630437586940616233647516473819"
------ temperature: 0.2
66666666666666866666686668666686668688686666686666
------ temperature: 0.5
56866818566666668586686686166888896665666665258666
------ temperature: 1.0
69488686573396586886861230662665636888688806962796
------ temperature: 1.2
68582816913846763863736626886292896622946885638888
--- Generating with seed: "61061911810646777061306804774640188431761496398407"
--- Original Text: 

In [43]:
# Print, for every trial, the seed, the true continuation and the text
# generated at each temperature, aligned for side-by-side comparison.
# The index is deliberately not called `x`: that name holds the one-hot
# training tensor, and rebinding it here would break any later re-run of
# model.fit(x, y).
for trial in range(test_to_perform):
    print('seed text:            ', seed_text[trial])
    print('text to be generated: ', text_to_generate[trial])
    print('text generated 0.2:   ', text_generated_0_2[trial])
    print('text generated 0.5:   ', text_generated_0_5[trial])
    print('text generated 1.0:   ', text_generated_1_0[trial])
    print('text generated 1.2:   ', text_generated_1_2[trial])
    print()

seed text:             75396859656403432447390061398132579991300662545093
text to be generated:  94262685568701740492564289114513517448566317106576
text generated 0.2:    86666666666666666666668686666666666668666663666666
text generated 0.5:    66586668668856662628666662368868686366662858886626
text generated 1.0:    61636267166817688665936666866615383580663325885858
text generated 1.2:    50064768206666836656261666865262686777905662616832

seed text:             27050571326633361620321891964140141341767179105162
text to be generated:  73848566375809504122630437586940616233647516473819
text generated 0.2:    66666666666666866666686668666686668688686666686666
text generated 0.5:    56866818566666668586686686166888896665666665258666
text generated 1.0:    69488686573396586886861230662665636888688806962796
text generated 1.2:    68582816913846763863736626886292896622946885638888

seed text:             61061911810646777061306804774640188431761496398407
text to be generated:  1726314054617