In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense, Activation
from keras.callbacks import ModelCheckpoint
from keras.utils import to_categorical


Using TensorFlow backend.


In [None]:
class Corpus:
    """Character-level view of a text corpus.

    Reads the whole file into memory and builds the lookup tables needed to
    turn characters into integer ids and back.

    Attributes:
        corpus: Full text of the file as a single string.
        vocab: Sorted list of the distinct characters found in ``corpus``.
        encoder: Mapping from character to its index in ``vocab``.
        decoder: Mapping from index in ``vocab`` back to the character.
        vocab_len: Number of distinct characters.
        Tx: The ``Tx`` argument, stored unchanged.
        corpus_len: Number of characters in ``corpus``.
    """

    def __init__(self, corpus_path, Tx):
        """Load the corpus and build the character <-> index tables.

        Args:
            corpus_path: Path of the text file to read.
            Tx: Value stored on the instance as ``self.Tx``.

        Raises:
            OSError: If ``corpus_path`` cannot be opened.
        """
        with open(corpus_path) as corpus_file:
            self.corpus = corpus_file.read()

        # Get a unique identifier for each char in the corpus,
        # then make some dicts to ease encoding and decoding.
        # Sorting makes the char -> id assignment deterministic across runs.
        self.vocab = sorted(set(self.corpus))
        self.encoder = {c: i for i, c in enumerate(self.vocab)}
        self.decoder = {i: c for i, c in enumerate(self.vocab)}

        # Some fields we'll need later
        self.vocab_len = len(self.vocab)
        self.Tx = Tx
        self.corpus_len = len(self.corpus)

In [2]:
# Read the entire sonnet file into memory as one string.
with open("data/sonnets.txt") as corpus_file:
    corpus = corpus_file.read()

# Sanity check: show the first 20 characters and the total character count.
print(
    "corpus sample = '{}' corpus len = {}".format(corpus[:20], len(corpus))
)

corpus sample = 'From fairest creatur' corpus len = 94651


In [4]:
# The vocabulary is every distinct character in the corpus, in sorted order
# so that the character -> id assignment is the same on every run.
vocab = sorted(set(corpus))
vocab_len = len(vocab)

# Lookup tables in both directions: character -> id and id -> character.
char_to_index = dict(zip(vocab, range(vocab_len)))
index_to_char = dict(enumerate(vocab))

print("vocab = {} vocab_len = {}".format(vocab, vocab_len))

vocab = ['\n', ' ', '!', "'", '(', ')', ',', '-', '.', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] vocab_len = 61


Let's create a dataset like the one below, for the corpus <i><b>from fairest creatures we desire increase</b></i> with Tx = 10.
-    X   ==>          Y
- from faire ==>       s
- rom faires ==>       t
- om fairest ==>      ' '

We can create corpus_len - Tx examples

In [9]:
sentence_length = 50  # Tx

# Slide a window of `sentence_length` characters over the corpus, one
# character at a time. Each window (encoded as integer ids) is one input
# example; the character right after the window is its label.
X_train_raw, Y_train_raw = [], []
for i in range(len(corpus) - sentence_length):
    window_end = i + sentence_length
    sentence = corpus[i:window_end]
    next_char = corpus[window_end]
    X_train_raw.append([char_to_index[ch] for ch in sentence])
    Y_train_raw.append(char_to_index[next_char])

# Short aliases: m = number of examples, Tx = length of each input sequence.
m = num_sentences = len(X_train_raw)
Tx = sentence_length
print("Sliced our corpus into {0} sentences of length {1}".format(num_sentences, sentence_length))

Sliced our corpus into 94601 sentences of length 50


- The model maps X of shape (m, Tx, vec_size) to Y of shape (m, 1, vec_size), i.e. (m, vec_size).
- This is a many-to-one RNN architecture.
- Let's convert the data into a one-hot encoding.
- The LSTM layer drops the Tx dimension (it returns only the last time step's output) unless you specify return_sequences=True.
- At prediction time, you can pass a random text of length up to Tx to kick-start the prediction, then loop, feeding back each predicted character.
- 

In [13]:
# Vectorize our data and labels. We want everything in one-hot.
# Pass num_classes explicitly: without it, to_categorical infers the width
# as max(label) + 1, so if the highest-index character happened to be absent
# from the labels (or windows) the one-hot width would be smaller than
# vocab_len and would no longer match the model's input/output size.
X_train = to_categorical(X_train_raw, num_classes=vocab_len)
Y_train = to_categorical(Y_train_raw, num_classes=vocab_len)

print('X_train.shape = ',X_train.shape,'\nY_train.shape = ',Y_train.shape)


Vectorizing X and y...
X_train.shape =  (94601, 50, 61) 
Y_train.shape =  (94601, 61)


In [15]:
# Define our model: a single LSTM layer that reads a (Tx, vocab_len) one-hot
# sequence, followed by a softmax over the vocabulary for the next character.
print("Let's build a brain!")
model = Sequential()
model.add(LSTM(256, input_shape=(Tx, vocab_len)))
model.add(Dense(vocab_len))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Dump our model architecture to a file so we can load it elsewhere.
# Open in write mode ('w'), not append: re-running this cell must replace the
# file, otherwise several YAML documents get concatenated into model.yaml.
# NOTE(review): to_yaml() has been removed from recent TensorFlow/Keras
# releases -- confirm the installed version still provides it.
architecture = model.to_yaml()
with open('model.yaml', 'w') as model_file:
    model_file.write(architecture)

# Set up checkpoints: the file name is filled in with the epoch number and
# the training loss, and a file is written only when the loss improves.
file_path="weights-{epoch:02d}-{loss:.3f}.hdf5"
checkpoint = ModelCheckpoint(file_path, monitor="loss", verbose=1, save_best_only=True, mode="min")
callbacks = [checkpoint]

# Action time! [Insert guitar solo here]
model.fit(X_train, Y_train, epochs=30, batch_size=128, callbacks=callbacks)

Let's build a brain!




Epoch 1/30

Epoch 00001: loss improved from inf to 2.61865, saving model to weights-01-2.619.hdf5
Epoch 2/30

Epoch 00002: loss improved from 2.61865 to 2.11454, saving model to weights-02-2.115.hdf5
Epoch 3/30

Epoch 00003: loss improved from 2.11454 to 1.95527, saving model to weights-03-1.955.hdf5
Epoch 4/30

Epoch 00004: loss improved from 1.95527 to 1.85029, saving model to weights-04-1.850.hdf5
Epoch 5/30

Epoch 00005: loss improved from 1.85029 to 1.76882, saving model to weights-05-1.769.hdf5
Epoch 6/30

Epoch 00006: loss improved from 1.76882 to 1.70265, saving model to weights-06-1.703.hdf5
Epoch 7/30

Epoch 00007: loss improved from 1.70265 to 1.64594, saving model to weights-07-1.646.hdf5
Epoch 8/30

Epoch 00008: loss improved from 1.64594 to 1.59491, saving model to weights-08-1.595.hdf5
Epoch 9/30

Epoch 00009: loss improved from 1.59491 to 1.54878, saving model to weights-09-1.549.hdf5
Epoch 10/30

Epoch 00010: loss improved from 1.54878 to 1.50384, saving model to weigh

<keras.callbacks.History at 0x12824e4e0>