In [1]:
import numpy as np
import sys

import keras
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, LSTM, Dropout, Input
from keras.optimizers import RMSprop, Adam

from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

import tensorflow as tf

Using TensorFlow backend.


In [2]:
print(tf.__version__)
print(keras.__version__)

1.2.0
2.0.5


## Load the corpus

In [3]:
# load ascii text and covert to lowercase
filename = "wonderland.txt"
raw_text = open(filename).read()
raw_text = raw_text.lower()

## Make look up tables based on characters

In [5]:
# create mapping of unique chars to integers
chars = sorted(list(set(raw_text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i, c) for i, c in enumerate(chars))

# summarize the loaded data
n_chars = len(raw_text)
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print("Total Unique chars: ", n_vocab)

Total Characters:  144345
Total Unique chars:  46


In [6]:
#raw_text = raw_text[1:]

## Make the sequences

This shows you an example of making sequences sampled from the overall text data. 

We are creating sequences that are 100 characters long

In [7]:
# create input and output pairs
seq_length = 50
dataX = []
dataY = []
for i in range(0, n_chars - seq_length, 1):
    seq_in = raw_text[i:i + seq_length]
    seq_out = raw_text[i + seq_length]
    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)
print("Total sequences: ", n_patterns)

Total sequences:  144295


### Lets examine some of these sequences

In [7]:
print(dataX[0])
print(dataY[0])

[43, 19, 24, 17, 32, 36, 21, 34, 1, 25, 10, 1, 20, 31, 39, 30, 1, 36, 24, 21, 1, 34, 17, 18, 18, 25, 36, 9, 24, 31, 28, 21, 0, 0, 17, 28, 25, 19, 21, 1, 39, 17, 35, 1, 18, 21, 23, 25, 30, 30]
25


In [8]:
print("\"", ''.join([int_to_char[value] for value in dataX[0]]), "\"")
print(int_to_char[dataY[0]])

" ﻿chapter i. down the rabbit-hole

alice was beginn "
i


In [9]:
print(dataX[1])
print(dataY[1])

[19, 24, 17, 32, 36, 21, 34, 1, 25, 10, 1, 20, 31, 39, 30, 1, 36, 24, 21, 1, 34, 17, 18, 18, 25, 36, 9, 24, 31, 28, 21, 0, 0, 17, 28, 25, 19, 21, 1, 39, 17, 35, 1, 18, 21, 23, 25, 30, 30, 25]
30


In [10]:
print("\"", ''.join([int_to_char[value] for value in dataX[1]]), "\"")
print(int_to_char[dataY[1]])

" chapter i. down the rabbit-hole

alice was beginni "
n


In [11]:
print("\"", ''.join([int_to_char[value] for value in dataX[4]]), "\"")
print(int_to_char[dataY[4]])

" pter i. down the rabbit-hole

alice was beginning  "
t


### Reshaping the sequences to become timesteps into the LSTM

In [12]:
# reshape X to be [samples, time steps, features]
X = np.reshape(dataX, (n_patterns, seq_length, 1))
print(X.shape)

# normalize
X = X / float(n_vocab)

(144293, 50, 1)


In [13]:
print(X[0][:10])
print(dataY[0])

[[ 0.97727273]
 [ 0.43181818]
 [ 0.54545455]
 [ 0.38636364]
 [ 0.72727273]
 [ 0.81818182]
 [ 0.47727273]
 [ 0.77272727]
 [ 0.02272727]
 [ 0.56818182]]
25


In [14]:
# one hot encode the output variable
y = np_utils.to_categorical(dataY)

In [15]:
print(X[0][:10])
print(y[0])

[[ 0.97727273]
 [ 0.43181818]
 [ 0.54545455]
 [ 0.38636364]
 [ 0.72727273]
 [ 0.81818182]
 [ 0.47727273]
 [ 0.77272727]
 [ 0.02272727]
 [ 0.56818182]]
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.]


## Creating our model

We will use the return sequences = true to pass the sequence up to the 2nd LSTM

In [16]:
# define the input shape
inp = Input(shape=(X.shape[1], X.shape[2]))
print('our input shape is ',(X.shape[1], X.shape[2]) )

our input shape is  (50, 1)


In [17]:
#x = LSTM(256, return_sequences = True)(inp) 
#x = Dropout(0.2)(x)
x = LSTM(256)(inp)
x = Dropout(0.2)(x)
output = Dense(y.shape[1], activation ='softmax')(x)

In [18]:
generative_model = Model(inputs = inp, outputs=output )

In [19]:
optimizer = RMSprop(lr=0.01)
generative_model.compile(loss='categorical_crossentropy', optimizer='adam')

In [20]:
# define the checkpoint
filepath="checkpoints/weights-improvement-{epoch:02d}-{loss:.4f}-gentext-CharRNN-simple.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [21]:
generative_model.fit(X, y, epochs=10, batch_size=64, callbacks=callbacks_list)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fa7cf7f9be0>

In [22]:
generative_model.save('Text_gen_01-CharRNN_no_embedding-simple')

In [23]:
generative_model.fit(X, y, epochs=10, batch_size=64, callbacks=callbacks_list)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fa7dca3e2b0>

In [24]:
generative_model.save('Text_gen_01-CharRNN_no_embedding-simple')

In [25]:
#generative_model = load_model('Text_gen_01_no_embedding')

In [26]:
# create mapping of unique chars to integers, and a reverse mapping
chars = sorted(list(set(raw_text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i, c) for i, c in enumerate(chars))

# summarize the loaded data
n_chars = len(raw_text)
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  144343
Total Vocab:  44


In [27]:
# pick a random seed
start = np.random.randint(0, len(dataX)-1)
pattern = dataX[start]
seed = dataX[start]
print(pattern)
print("Seed pattern:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")

[30, 20, 1, 37, 30, 36, 39, 25, 35, 36, 1, 25, 36, 10, 1, 17, 22, 36, 21, 34, 1, 17, 1, 39, 24, 25, 28, 21, 1, 35, 24, 21, 0, 34, 21, 29, 21, 29, 18, 21, 34, 21, 20, 1, 36, 24, 17, 36, 1, 35]
Seed pattern:
" nd untwist it. after a while she
remembered that s "


In [28]:
generated_text = []

# generate characters
for i in range(100):
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = generative_model.predict(x, verbose=0)
    index = np.argmax(prediction)
    result = int_to_char[index]
    seq_in = [int_to_char[value] for value in pattern]
    pattern.append(index)
    generated_text.append(index)
    pattern = pattern[1:len(pattern)]
print("\nDone.")


Done.


In [29]:
print(pattern)
print("\"", ''.join([int_to_char[value] for value in seed]), "\"")
print("\"", ''.join([int_to_char[value] for value in generated_text]), "\"")

[18, 17, 37, 36, 21, 1, 36, 24, 21, 1, 24, 31, 34, 35, 21, 1, 39, 17, 35, 1, 35, 24, 21, 1, 39, 17, 35, 1, 35, 31, 25, 1, 36, 25, 34, 21, 1, 36, 24, 21, 1, 39, 24, 35, 36, 1, 35, 17, 34, 1]
" nd untwist it. after a while she
remembered that sh "
" he was no thee th the whrl shye the was toinking abaute the horse was she was soi tire the whst sar  "


In [30]:
generated_text = []

# generate characters
for i in range(100):
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = generative_model.predict(x, verbose=0)
    index = np.argmax(prediction)
    result = int_to_char[index]
    seq_in = [int_to_char[value] for value in pattern]
    pattern.append(index)
    generated_text.append(index)
    pattern = pattern[1:len(pattern)]
print(pattern)
print("\"", ''.join([int_to_char[value] for value in seed]), "\"")
print("\"", ''.join([int_to_char[value] for value in generated_text]), "\"")

[21, 1, 35, 17, 29, 10, 1, 4, 1, 0, 4, 36, 24, 21, 1, 35, 25, 29, 21, 1, 36, 24, 21, 1, 29, 17, 40, 36, 21, 34, 1, 39, 17, 34, 1, 17, 1, 29, 25, 36, 36, 28, 21, 1, 17, 25, 36, 8, 4, 1]
" nd untwist it. after a while she
remembered that sh "
" ofating an in lo hn an in eiong on the temte on the sam. ' 
'the sime the maxter war a mittle ait,'  "


In [31]:
generated_text = []

# generate characters
for i in range(500):
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = generative_model.predict(x, verbose=0)
    index = np.argmax(prediction)
    result = int_to_char[index]
    seq_in = [int_to_char[value] for value in pattern]
    pattern.append(index)
    generated_text.append(index)
    pattern = pattern[1:len(pattern)]
print(pattern)
print("\"", ''.join([int_to_char[value] for value in seed]), "\"")
print("\"", ''.join([int_to_char[value] for value in generated_text]), "\"")

[21, 21, 1, 24, 17, 34, 21, 21, 34, 8, 1, 39, 24, 21, 1, 24, 17, 36, 36, 21, 34, 1, 39, 17, 35, 1, 36, 24, 21, 1, 22, 17, 34, 36, 21, 34, 8, 1, 17, 30, 20, 1, 36, 24, 21, 1, 39, 24, 34, 36]
" nd untwist it. after a while she
remembered that sh "
" said alice, ''whll in sae sfmtert,' said alice. ''whll io sase'' said the caterpillar.

'well, i shanl heve your mave youl mo,  she macci hare aglan a gittlls of the gad tuiee her ined th the thate oabrirg hn a linute of the ladte oatter and lare on the taale, and the whrt hardey so be a lortle shing so her hn a loeen tone. ''lakd the mart ri tha bir here the ragt of the samts on the samd bel, 'no that ds wat a little brere't thick tay th the karee hareer, whe hatter was the farter, and the whrt "
