In [1]:
from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import io

Using TensorFlow backend.


In [2]:
# Read the padded token-id matrix and the per-row topic labels in one step.
ids, labels = np.load('quote_matrix10.npy'), np.load('final_topics10.npy')

# Both arrays should report the same number of rows.
(ids.shape, labels.shape)

((9983, 200), (9983,))

In [3]:
# Inspect the labels on either side of index 1000, the cut-off used for the
# subset taken in the following cells.
labels[999], labels[1000]

('death', 'family')

In [4]:
# Keep only the first 1000 rows (all columns) of the id matrix.
ids = ids[:1000,:]

In [5]:
# Confirm the shape of the id matrix after truncation.
ids.shape

(1000, 200)

In [6]:
# Truncate the labels to the same first 1000 entries so they stay aligned with ids.
labels = labels[:1000]

In [8]:
# Slide a window of `maxlen` token ids over every row of `ids`, moving `step`
# positions at a time. Each window is an input sample; the id immediately
# after the window is its prediction target. Windows whose first id is not
# positive are skipped.
# NOTE(review): the recorded outputs further down show windows of length 1,
# so they were presumably produced with a different `maxlen` -- confirm.
maxlen = 100
step = 1
quote_len = ids.shape[1]

seq, next_seq = [], []
for quote in ids:
    for i in range(0, quote_len - maxlen, step):
        if quote[i] <= 0:
            continue
        window_end = i + maxlen
        seq.append(quote[i:window_end])
        next_seq.append(quote[window_end])
print('nb sequences:', len(seq))

nb sequences: 23340


In [15]:
# Peek at the first ten input windows and their corresponding next-id targets.
seq[0:10], next_seq[0:10]

([array([137], dtype=int32),
  array([7430], dtype=int32),
  array([13376], dtype=int32),
  array([13503], dtype=int32),
  array([3442], dtype=int32),
  array([4934], dtype=int32),
  array([11355], dtype=int32),
  array([6656], dtype=int32),
  array([8280], dtype=int32),
  array([4823], dtype=int32)],
 [7430, 13376, 13503, 3442, 4934, 11355, 6656, 8280, 4823, 12374])

In [16]:
# The vocabulary mappings are dicts saved with np.save, i.e. pickled inside a
# 0-d object array. NumPy >= 1.16.3 refuses to unpickle unless allow_pickle=True
# is passed explicitly.
# SECURITY: unpickling can execute arbitrary code -- only load files you trust.
word_to_int = np.load('word_to_int10.npy', allow_pickle=True)
int_to_word = np.load('int_to_word10.npy', allow_pickle=True)

# .item() unwraps the 0-d object array back into the original Python dict.
word_to_int = word_to_int.item()
int_to_word = int_to_word.item()

In [22]:
# Number of entries in the id -> word mapping; used later as the one-hot width.
len(int_to_word)

13661

In [24]:
# Convert the Python lists of windows and targets into NumPy arrays.
seq = np.asarray(seq)
next_seq = np.asarray(next_seq)

In [27]:
# Check the window array's shape and the largest id among inputs and targets.
seq.shape, seq.max(), next_seq.max()

((23340, 1), 13654, 13654)

In [26]:
# Largest target id, stored as a 0-d array; on_epoch_end divides by it to
# scale the seed ids before prediction.
max_word = np.asarray(next_seq.max())

In [28]:
from keras.utils import to_categorical

In [32]:
# Inputs: lay the windows out as [samples, time steps, features] and scale the
# ids by the largest target id in the same expression.
X = np.reshape(seq, (len(seq), maxlen, 1)) / float(next_seq.max())

# Targets: one-hot vectors as wide as the id -> word mapping.
y = to_categorical(next_seq, num_classes=len(int_to_word))

In [33]:
# Verify the target and input tensor shapes before building the model.
y.shape, X.shape

((23340, 13661), (23340, 1, 1))

In [34]:
# Model: one LSTM layer feeding a dense layer with a softmax over all classes.
print('Build model...')
model = Sequential([
    LSTM(128, input_shape=(X.shape[1], X.shape[2])),
    Dense(y.shape[1]),
    Activation('softmax'),
])

# Targets are one-hot vectors, so train with categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=0.01))

Build model...


In [35]:
def sample(preds, temperature=1.0):
    """Draw one index from a probability array, reweighted by ``temperature``.

    The probabilities are raised to the power ``1 / temperature`` (done in log
    space) and renormalised, then a single index is drawn with
    ``np.random.multinomial``.

    Args:
        preds: 1-D array-like of probabilities.
        temperature: Values below 1 sharpen the distribution towards its
            largest entries; values above 1 flatten it.

    Returns:
        The sampled index (a NumPy integer).
    """
    preds = np.asarray(preds).astype('float64')
    # A zero probability legitimately maps to -inf (it stays impossible), so
    # the divide-by-zero warning from log(0) is noise here.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift so the largest logit is 0. The renormalised result is unchanged
    # mathematically, but the best entry now exponentiates to exactly 1, so
    # small temperatures can no longer underflow every term to 0 (0/0 -> NaN).
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [37]:
# 1-D stream of token ids used to pick generation seeds. Take only the first
# id of every window: flattening the whole window array would repeat each
# token once per overlapping window whenever maxlen > 1, so slices of it would
# not be contiguous text. For windows of length 1 the result is unchanged.
text = np.asarray(seq)[:, 0]

In [38]:
# Length of the 1-D token stream.
text.shape

(23340,)

In [43]:
# Module-level placeholders so the most recent model input, seed window and
# generated text can be inspected from other cells after (or during) training.
x_pred = 0
generated = 0
sentence = 0
def on_epoch_end(epoch, logs):
    """Print sample text generated by the model at the end of an epoch.

    One random seed window is taken from ``text`` and, for each diversity
    (sampling temperature) value, 400 words are generated and streamed to
    stdout. Relies on the module-level ``model``, ``text``, ``maxlen``,
    ``max_word``, ``int_to_word`` and ``sample``.

    Args:
        epoch: Index of the epoch that just finished (as passed by Keras).
        logs: Metrics dict passed by Keras; unused.
    """
    global x_pred
    global sentence
    global generated
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    start_index = random.randint(0, len(text) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)

        sentence = text[start_index: start_index + maxlen]
        print(sentence)
        # Decode the seed ids to words. `generated` accumulates the seed plus
        # everything produced below (str.join returns a new string, so the
        # result has to be assigned).
        generated = ' '.join(str(int_to_word[value]) for value in sentence)
        print('----- Generating with seed: %s' % generated)

        for i in range(400):
            # Same layout and scaling as the training inputs.
            x_pred = np.reshape(sentence, (1, maxlen, 1))
            x_pred = x_pred / max_word

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_word = str(int_to_word[next_index])

            generated += ' ' + next_word
            # Slide the window: drop the oldest id, append the new one.
            sentence = np.append(sentence[1:], next_index)

            sys.stdout.write(next_word)
            sys.stdout.write(" ")
            sys.stdout.flush()
        print()

In [44]:
# Have Keras call on_epoch_end after every epoch so sample text is printed
# while training progresses.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

model.fit(x=X, y=y, batch_size=128, epochs=60, callbacks=[print_callback])

Epoch 1/60

----- Generating text after Epoch: 0
----- diversity: 0.2
[11928]
----- Generating with seed: [11928]
04 04 and 04 of the death death 04 is the death the death is 04 04 04 the death 04 is of the to death death the death 04 of the to the death the death 04 the death 04 the to 04 the death 04 04 04 the the death 04 04 is 04 the death death 04 the death 04 the the death the death the to death a of death 04 04 the death the of and of death 04 04 04 04 04 04 04 the death the death 04 04 to death the to death 04 the death 04 04 the the death 04 of the death the death of death 04 the the death of the death the to death the death 04 04 the death the death of 04 04 the to death the death the death 04 and 04 04 the to the death 04 04 the 04 04 of death 04 the death 04 04 04 04 04 04 the and 04 is 04 and 04 of the death death is 04 of death 04 to death a the and the death and 04 04 the death to death and 04 the death and 04 04 the death of 04 04 of death 04 the death 04 04 04 the the 

KeyboardInterrupt: 