In [1]:
import numpy as np

In [2]:
import idx2numpy

y_train = idx2numpy.convert_from_file('../dataset/train-labels.idx1-ubyte')
y_test = idx2numpy.convert_from_file('../dataset/t10k-labels.idx1-ubyte')

In [3]:
text = ''

In [4]:
for x in y_train:
    text+=str(x)

In [5]:
for x in y_test:
    text+=str(x)

In [6]:
print (len(text))
print (text[0:9])

70000
504192131


In [17]:
# Length of extracted character sequences
maxlen = 50

# We sample a new sequence every `step` characters
step = 3

# This holds our extracted sequences
sentences = []

# This holds the targets (the follow-up characters)
next_chars = []

for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('Number of sequences:', len(sentences))

# List of unique characters in the corpus
chars = sorted(list(set(text)))
print('Unique characters:', len(chars))
# Dictionary mapping unique characters to their index in `chars`
char_indices = dict((char, chars.index(char)) for char in chars)

# Next, one-hot encode the characters into binary arrays.
print('Vectorization...')
x_data = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y_data = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x_data[i, t, char_indices[char]] = 1
    y_data[i, char_indices[next_chars[i]]] = 1

Number of sequences: 23317
Unique characters: 10
Vectorization...


In [18]:
from keras import layers, models, optimizers

model = models.Sequential()
model.add(layers.LSTM(256, input_shape=(maxlen, len(chars))))
model.add(layers.Dense(len(chars), activation='softmax'))

In [19]:
optimizer = optimizers.RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [20]:
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [22]:
# Fit the model for 1 epoch on the available training data
model.fit(x_data, y_data, batch_size=128, epochs=1)

Epoch 1/1


<keras.callbacks.History at 0x7efd15aadb00>

In [23]:
import random
import sys

chars_to_generate = 50
test_to_perform = 3

seed_text = []
text_to_generate = []

for epoch in range(test_to_perform):

    # Select a text seed at random
    start_index = random.randint(0, len(text) - maxlen - 1)
    end_index = start_index + maxlen
    
    seed_text.append(text[start_index: end_index])
    text_to_generate.append(text[end_index: end_index + chars_to_generate])

    
    print('Seed Text {}: "'.format(epoch) + str(seed_text[epoch]) + '"')
    print('OG Text   {}: "'.format(epoch) + str(text_to_generate[epoch]) + '"')       
    print()
    #model.save('model_epoch_{}.hdf5'.format(epoch))
    #model.save_weights('text_generator_gigantic_weights{}.h5'.format(epoch))

Seed Text 0: "56972849801172032435269778793690681022636779642701"
OG Text   0: "69611731041010342167951781437373712689373493408235"

Seed Text 1: "03842043193092073121509766221287166006599478049333"
OG Text   1: "37782055389380760883470815318840615213341728490051"

Seed Text 2: "30311902137775612792380790515986940172538163656354"
OG Text   2: "01060880082934992216305112739546272899300112443536"



In [24]:
text_generated_0_2 = []
text_generated_0_5 = []
text_generated_1_0 = []
text_generated_1_2 = []

for x in range(len(seed_text)):
    
    for temperature in [0.2, 0.5, 1.0, 1.2]:
        print('------ temperature:', temperature)
        gentext = ''
        # We generate 50 characters
        for i in range(chars_to_generate):
            sampled = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(seed_text[x]):
                sampled[0, t, char_indices[char]] = 1.

            preds = model.predict(sampled, verbose=0)[0]
            next_index = sample(preds, temperature)
            next_char = chars[next_index]

            gentext += '{}'.format(next_char)
            
            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()
        
        if temperature == 0.2:
            text_generated_0_2.append(gentext)
        if temperature == 0.5:
            text_generated_0_5.append(gentext)
        if temperature == 1.0:
            text_generated_1_0.append(gentext)
        if temperature == 1.2:
            text_generated_1_2.append(gentext)

------ temperature: 0.2
44444444444444444444444444444444444444444444444444
------ temperature: 0.5
49253141444444442448414143344111834434424448434492
------ temperature: 1.0
11194284414913144845914844624837111742381149342480
------ temperature: 1.2
15921753171921414456346131549248520321814933275709
------ temperature: 0.2
44444444444444444444444444444444444444444444444444
------ temperature: 0.5
92458449942421444484449444414813444145443844484814
------ temperature: 1.0
41745443883922464841283985105543449448348645424481
------ temperature: 1.2
49864483714684294292522481891824143441234985731743
------ temperature: 0.2
41444444444444444444444444444444444444444444444444
------ temperature: 0.5
44444547442449544441447445444817444144444414444735
------ temperature: 1.0
57453884134441911282504645041121031514115103154815
------ temperature: 1.2
21394661081243472192598284738380514445459898453151


In [25]:
for x in range(test_to_perform):
    print ('{} seed text:            '.format(x), seed_text[x])
    print ('{} text to be generated: '.format(x), text_to_generate[x])
    print ('{} text generated 0.2:   '.format(x), text_generated_0_2[x])
    print ('{} text generated 0.5:   '.format(x), text_generated_0_5[x])
    print ('{} text generated 1.0:   '.format(x), text_generated_1_0[x])
    print ('{} text generated 1.2:   '.format(x), text_generated_1_2[x])
    print ()

0 seed text:             56972849801172032435269778793690681022636779642701
0 text to be generated:  69611731041010342167951781437373712689373493408235
0 text generated 0.2:    44444444444444444444444444444444444444444444444444
0 text generated 0.5:    49253141444444442448414143344111834434424448434492
0 text generated 1.0:    11194284414913144845914844624837111742381149342480
0 text generated 1.2:    15921753171921414456346131549248520321814933275709

1 seed text:             03842043193092073121509766221287166006599478049333
1 text to be generated:  37782055389380760883470815318840615213341728490051
1 text generated 0.2:    44444444444444444444444444444444444444444444444444
1 text generated 0.5:    92458449942421444484449444414813444145443844484814
1 text generated 1.0:    41745443883922464841283985105543449448348645424481
1 text generated 1.2:    49864483714684294292522481891824143441234985731743

2 seed text:             30311902137775612792380790515986940172538163656354
2 text to 

In [26]:
count_0_2 = []
per_0_2 = []
count_0_5 = []
per_0_5 = []
count_1_0 = []
per_1_0 = []
count_1_2 = []
per_1_2 = []

for y in range(test_to_perform):
    a=0
    for x in range(chars_to_generate):
        if text_to_generate[y][x] == text_generated_0_2[y][x]:
            a+=1
    count_0_2.append(a)
    per_0_2.append((a/chars_to_generate)*100)


for y in range(test_to_perform):
    a=0
    for x in range(chars_to_generate):
        if text_to_generate[y][x] == text_generated_0_5[y][x]:
            a+=1
    count_0_5.append(a)
    per_0_5.append((a/chars_to_generate)*100)

    
for y in range(test_to_perform):
    a=0
    for x in range(chars_to_generate):
        if text_to_generate[y][x] == text_generated_1_0[y][x]:
            a+=1
    count_1_0.append(a)
    per_1_0.append((a/chars_to_generate)*100)

    
for y in range(test_to_perform):
    a=0
    for x in range(chars_to_generate):
        if text_to_generate[y][x] == text_generated_1_2[y][x]:
            a+=1
    count_1_2.append(a)
    per_1_2.append((a/chars_to_generate)*100)

print()
print ('temp 0.2')
for x in range(test_to_perform):
    print (count_0_2[x], per_0_2[x])
    
print()
print ('temp 0.5')
for x in range(test_to_perform):
    print (count_0_5[x], per_0_5[x])

print()
print ('temp 1.0')
for x in range(test_to_perform):
    print (count_1_0[x], per_1_0[x])

print()
print ('temp 1.2')
for x in range(test_to_perform):
    print (count_1_2[x], per_1_2[x])



temp 0.2
5 10.0
4 8.0
5 10.0

temp 0.5
7 14.000000000000002
6 12.0
5 10.0

temp 1.0
5 10.0
10 20.0
6 12.0

temp 1.2
5 10.0
2 4.0
8 16.0


## Additional Training

In [27]:
# Fit the model for more 10 epoch on the data
model.fit(x_data, y_data, batch_size=128, epochs=9)

Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9


<keras.callbacks.History at 0x7efd16358e48>

In [28]:
text_generated_0_2 = []
text_generated_0_5 = []
text_generated_1_0 = []
text_generated_1_2 = []

for x in range(len(seed_text)):
    
    for temperature in [0.2, 0.5, 1.0, 1.2]:
        print('------ temperature:', temperature)
        gentext = ''
        # We generate 50 characters
        for i in range(chars_to_generate):
            sampled = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(seed_text[x]):
                sampled[0, t, char_indices[char]] = 1.

            preds = model.predict(sampled, verbose=0)[0]
            next_index = sample(preds, temperature)
            next_char = chars[next_index]

            gentext += '{}'.format(next_char)
            
            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()
        
        if temperature == 0.2:
            text_generated_0_2.append(gentext)
        if temperature == 0.5:
            text_generated_0_5.append(gentext)
        if temperature == 1.0:
            text_generated_1_0.append(gentext)
        if temperature == 1.2:
            text_generated_1_2.append(gentext)

------ temperature: 0.2
11211711112111221211221111111111211110111111111111
------ temperature: 0.5
21111600597311112112110233126011312616202114682141
------ temperature: 1.0
12313010531146214854652190232741112262738406511406
------ temperature: 1.2
74324341480030811752472165021150039991096122811213
------ temperature: 0.2
47444643444331403344464304444474434444640364444444
------ temperature: 0.5
34943334443440660453073134444193306349436357330747
------ temperature: 1.0
34596337974614244443304439626716283374473272834433
------ temperature: 1.2
57396792480783374141197427369653543533459004872753
------ temperature: 0.2
00363866363026060042600622603310666303431366486336
------ temperature: 0.5
16306216066743160944003225666393933726973332633380
------ temperature: 1.0
20262335608453547354565701665047360632977763060766
------ temperature: 1.2
36403063714614637362755526150663931802907185319326


In [29]:
for x in range(test_to_perform,test_to_perform+9):
    print ('{} seed text:            '.format(x), seed_text[x])
    print ('{} text to be generated: '.format(x), text_to_generate[x])
    print ('{} text generated 0.2:   '.format(x), text_generated_0_2[x])
    print ('{} text generated 0.5:   '.format(x), text_generated_0_5[x])
    print ('{} text generated 1.0:   '.format(x), text_generated_1_0[x])
    print ('{} text generated 1.2:   '.format(x), text_generated_1_2[x])
    print ()

0 seed text:             56972849801172032435269778793690681022636779642701
0 text to be generated:  69611731041010342167951781437373712689373493408235
0 text generated 0.2:    11211711112111221211221111111111211110111111111111
0 text generated 0.5:    21111600597311112112110233126011312616202114682141
0 text generated 1.0:    12313010531146214854652190232741112262738406511406
0 text generated 1.2:    74324341480030811752472165021150039991096122811213

1 seed text:             03842043193092073121509766221287166006599478049333
1 text to be generated:  37782055389380760883470815318840615213341728490051
1 text generated 0.2:    47444643444331403344464304444474434444640364444444
1 text generated 0.5:    34943334443440660453073134444193306349436357330747
1 text generated 1.0:    34596337974614244443304439626716283374473272834433
1 text generated 1.2:    57396792480783374141197427369653543533459004872753

2 seed text:             30311902137775612792380790515986940172538163656354
2 text to 

In [26]:
count_0_2 = []
per_0_2 = []
count_0_5 = []
per_0_5 = []
count_1_0 = []
per_1_0 = []
count_1_2 = []
per_1_2 = []

for y in range(test_to_perform):
    a=0
    for x in range(chars_to_generate):
        if text_to_generate[y][x] == text_generated_0_2[y][x]:
            a+=1
    count_0_2.append(a)
    per_0_2.append((a/chars_to_generate)*100)


for y in range(test_to_perform):
    a=0
    for x in range(chars_to_generate):
        if text_to_generate[y][x] == text_generated_0_5[y][x]:
            a+=1
    count_0_5.append(a)
    per_0_5.append((a/chars_to_generate)*100)

    
for y in range(test_to_perform):
    a=0
    for x in range(chars_to_generate):
        if text_to_generate[y][x] == text_generated_1_0[y][x]:
            a+=1
    count_1_0.append(a)
    per_1_0.append((a/chars_to_generate)*100)

    
for y in range(test_to_perform):
    a=0
    for x in range(chars_to_generate):
        if text_to_generate[y][x] == text_generated_1_2[y][x]:
            a+=1
    count_1_2.append(a)
    per_1_2.append((a/chars_to_generate)*100)

print()
print ('temp 0.2')
for x in range(test_to_perform):
    print (count_0_2[x], per_0_2[x])
    
print()
print ('temp 0.5')
for x in range(test_to_perform):
    print (count_0_5[x], per_0_5[x])

print()
print ('temp 1.0')
for x in range(test_to_perform):
    print (count_1_0[x], per_1_0[x])

print()
print ('temp 1.2')
for x in range(test_to_perform):
    print (count_1_2[x], per_1_2[x])



temp 0.2
5 10.0
4 8.0
5 10.0

temp 0.5
7 14.000000000000002
6 12.0
5 10.0

temp 1.0
5 10.0
10 20.0
6 12.0

temp 1.2
5 10.0
2 4.0
8 16.0
