In [71]:
from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import io

In [72]:
ids = np.load('quote_matrix10.npy')
labels = np.load('final_topics10.npy')

ids.shape, labels.shape

((9983, 200), (9983,))

In [76]:
labels[999], labels[1000], labels[1998], labels[1999]

('death', 'family', 'family', 'freedom')

In [77]:
ids = ids[1000:1999,:]

In [78]:
ids.shape

(999, 200)

In [79]:
for quote in ids:
    for word in quote:
        if word == word_to_int['dearly']:
            print(word)

In [80]:
ids[1][:30]

array([ 6074,   137,  2839,  6656, 12374,  1190,  2801,  5045,   611,
        1215,   137,  8087,  8415,  1209,  7758,  6035, 11724,  4696,
       12236,   779, 12296,  6850, 11302,  7641, 13376,  1809,  7413,
         137,  3448, 12247], dtype=int32)

In [81]:
labels = labels[:1000]

In [82]:
# cut the text in semi-redundant sequences of maxlen characters
maxlen = 10
step = 1
seq = []
next_seq = []
quote_len = ids.shape[1]

for quote in ids:
    for i in range(0, quote_len - maxlen, step):
        if quote[i] >0:
            seq.append(quote[i: i + maxlen])
            next_seq.append(quote[i + maxlen])
print('nb sequences:', len(seq))

nb sequences: 28961


In [83]:
seq[0:10], next_seq[0:10]

([array([ 6086,  8180, 11656,  3785, 12205,  8468,  3081, 13269,  1809,  1190], dtype=int32),
  array([ 8180, 11656,  3785, 12205,  8468,  3081, 13269,  1809,  1190,   137], dtype=int32),
  array([11656,  3785, 12205,  8468,  3081, 13269,  1809,  1190,   137,  9885], dtype=int32),
  array([ 3785, 12205,  8468,  3081, 13269,  1809,  1190,   137,  9885,  4601], dtype=int32),
  array([12205,  8468,  3081, 13269,  1809,  1190,   137,  9885,  4601, 12384], dtype=int32),
  array([ 8468,  3081, 13269,  1809,  1190,   137,  9885,  4601, 12384,   497], dtype=int32),
  array([ 3081, 13269,  1809,  1190,   137,  9885,  4601, 12384,   497,  8415], dtype=int32),
  array([13269,  1809,  1190,   137,  9885,  4601, 12384,   497,  8415, 12939], dtype=int32),
  array([ 1809,  1190,   137,  9885,  4601, 12384,   497,  8415, 12939,  6998], dtype=int32),
  array([ 1190,   137,  9885,  4601, 12384,   497,  8415, 12939,  6998,   611], dtype=int32)],
 [137, 9885, 4601, 12384, 497, 8415, 12939, 6998, 611, 1204

In [84]:
word_to_int = np.load('word_to_int10.npy')
int_to_word = np.load('int_to_word10.npy')

word_to_int = word_to_int.item()
int_to_word = int_to_word.item()

In [85]:
len(int_to_word)

13661

In [86]:
len(set(next_seq))

3258

In [87]:
word_to_int['dearly'], word_to_int['haley']

(3107, 5588)

In [88]:
seq = np.asarray(seq)
next_seq = np.asarray(next_seq)

In [89]:
seq.shape, seq.max(), next_seq.max()

((28961, 10), 13660, 13660)

In [90]:
max_word = np.asarray(next_seq.max())

In [91]:
from keras.utils import to_categorical

In [92]:
# reshape X to be [samples, time steps, features]
X = np.reshape(seq, (len(seq), maxlen, 1))
# normalize
X = X / float(next_seq.max())
# one hot encode the output variable
y = to_categorical(next_seq, num_classes= len(int_to_word))

In [93]:
y.shape, X.shape

((28961, 13661), (28961, 10, 1))

In [94]:
# build the model: a single LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(128, input_shape=(X.shape[1], X.shape[2])))



model.add(Dense(y.shape[1]))
model.add(Activation('softmax'))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

Build model...


In [95]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [96]:
text = np.ndarray.flatten(seq)

In [97]:
text.shape

(289610,)

In [98]:
x_pred = 0
generated = 0
sentence = 0
def on_epoch_end(epoch, logs):
    global x_pred
    global sentence 
    global generated
    # Function invoked at end of each epoch. Prints generated text.
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    start_index = random.randint(0, len(text) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        print(sentence)
        generated.join([str([int_to_word[value]]).join(' ') for value in sentence])
        print('----- Generating with seed: %s'%sentence)
#         sys.stdout.write(generated)

        for i in range(400):
            x_pred = np.reshape(sentence,(1, maxlen, 1))
            x_pred = x_pred / max_word

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = int_to_word[next_index]

            generated.join(str(next_char))
            sentence = np.append(sentence[1:],next_index)

            sys.stdout.write(next_char)
            sys.stdout.write(" ")
            sys.stdout.flush()
        print()

In [99]:
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

model.fit(X, y,
          batch_size=128,
          epochs=60,
          callbacks=[print_callback])

Epoch 1/60

----- Generating text after Epoch: 0
----- diversity: 0.2
[12283  6035 13236 13270   845   137  4601 13236  6035 12283]
----- Generating with seed: [12283  6035 13236 13270   845   137  4601 13236  6035 12283]
i 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04 04

KeyboardInterrupt: 