In [1]:
import corpora
# Display the value of `corpora.shakespeare_plays`; the cell output shows it is a
# path string to the plays text file.
corpora.shakespeare_plays

'data\\shakespeare_plays.txt'

In [3]:
import nltk
# Make sure the NLTK 'punkt' package is installed locally.
# NOTE(review): presumably needed by the `word_tokenize` call in the inference
# cell further down -- confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Preprocessing

In [4]:
import preprocessing
# Corpus used by every later cell (training and the weight-file name).
corpus = corpora.shakespeare_sonnets
# `word2idx` is the word -> index mapping (its length is used as the vocabulary
# size); `words` is passed to generate_text as the index -> word lookup.
words, word2idx = preprocessing.get_words(corpus)

# Model specification


In [10]:
import os
# Keras picks its backend from this environment variable when it is imported,
# so the assignment has to stay above the keras imports below.
os.environ['KERAS_BACKEND'] = 'theano'
from keras.models import Sequential

from keras.layers import Dense, Activation, LSTM, Reshape, TimeDistributed, Embedding
from keras.callbacks import Callback


def get_model(num_timesteps, num_words, embedding_dim, hidden_dim, batch_size):
    """Build and compile the word-level language model.

    The network is a stack of four layers:
    Embedding -> stateful LSTM (returns the full sequence) ->
    per-timestep Dense over the vocabulary -> softmax.

    Args:
        num_timesteps: sequence length the model is built for.
        num_words: vocabulary size (embedding input size and output width).
        embedding_dim: size of each word embedding vector.
        hidden_dim: number of LSTM units.
        batch_size: fixed batch size baked into the input shape.

    Returns:
        The compiled ``Sequential`` model (rmsprop optimizer, categorical
        cross-entropy loss, accuracy metric).
    """
    embedding = Embedding(input_dim=num_words,
                          input_length=num_timesteps,
                          batch_input_shape=[batch_size, num_timesteps],
                          output_dim=embedding_dim)
    recurrent = LSTM(units=hidden_dim,
                     batch_input_shape=[batch_size, num_timesteps, embedding_dim],
                     return_sequences=True,
                     stateful=True)
    # One Dense projection onto the vocabulary, applied at every timestep.
    projection = TimeDistributed(Dense(num_words), input_shape=(num_timesteps, hidden_dim))

    network = Sequential()
    for layer in (embedding, recurrent, projection, Activation("softmax")):
        network.add(layer)

    network.compile(optimizer='rmsprop',
                    loss='categorical_crossentropy',
                    metrics=['accuracy'])
    return network


class ResetStates(Callback):
    """Callback that resets the model's states at the start of every epoch.

    The model built in this notebook uses a stateful LSTM; resetting at each
    epoch boundary keeps state from one epoch from carrying into the next.
    """

    def on_epoch_begin(self, epoch, logs=None):
        """Reset the attached model's states.

        ``epoch`` and ``logs`` are accepted to match the callback hook
        signature and are not used. ``logs`` defaults to ``None`` rather than
        a shared mutable ``{}``.
        """
        self.model.reset_states()

# Training

In [11]:
def train_model(num_timesteps, hidden_dim, embedding_dim, batch_size, num_epochs, corpus, word2idx, model=None):
    """Train the word-level model on ``corpus`` and return it.

    Args:
        num_timesteps: number of words per training sequence.
        hidden_dim: LSTM size, used only when a new model is built.
        embedding_dim: embedding size, used only when a new model is built.
        batch_size: number of sequences per batch.
        num_epochs: number of epochs to train for.
        corpus: corpus identifier handed to the ``preprocessing`` helpers.
        word2idx: word -> index mapping; its length is the vocabulary size.
        model: optional existing model to continue training. A new model is
            built with ``get_model`` only when this is ``None``.

    Returns:
        The trained model (the one passed in, or the newly built one).

    Raises:
        ValueError: if the corpus holds fewer words than one full batch
            (``num_timesteps * batch_size``), so no training step could run.
    """
    num_words = len(word2idx)
    # Test identity, not truthiness: a supplied model must never be silently
    # replaced just because the object happens to evaluate as falsy.
    if model is None:
        model = get_model(num_timesteps, num_words, embedding_dim, hidden_dim, batch_size)
    examples = preprocessing.vectorized_example_stream(corpus, num_timesteps, batch_size, word2idx, word_level=True)

    # Keras 2 `fit_generator` counts *batches* per epoch (steps_per_epoch), not
    # samples. One batch consumes num_timesteps * batch_size words, so one pass
    # over the corpus is this many whole batches.
    # NOTE(review): assumes every item yielded by vectorized_example_stream is
    # one batch of `batch_size` sequences -- confirm against preprocessing.
    words_per_batch = num_timesteps * batch_size
    steps_per_epoch = preprocessing.count_words(corpus) // words_per_batch
    if steps_per_epoch < 1:
        raise ValueError(
            "corpus is too small: need at least %d words for one batch "
            "(num_timesteps * batch_size)" % words_per_batch)

    model.fit_generator(examples,
                        steps_per_epoch=steps_per_epoch,
                        epochs=num_epochs,
                        callbacks=[ResetStates()])
    return model

In [12]:
# Training hyperparameters; embedding_dim and hidden_dim are reused when the
# inference model is rebuilt further down.
num_timesteps = 30   # words per training sequence
hidden_dim = 128     # LSTM units
embedding_dim = 50   # embedding vector size
batch_size = 32      # sequences per batch
num_epochs = 1       # passes requested from train_model
trained_model = train_model(
    num_timesteps=num_timesteps,
    hidden_dim=hidden_dim,
    embedding_dim=embedding_dim,
    batch_size=batch_size,
    num_epochs=num_epochs,
    corpus=corpus,
    word2idx=word2idx,
)

  "The file scan_perform.c is not available. This do"
  "The file scan_perform.c is not available. This do"
  "The file scan_perform.c is not available. This do"
  "The file scan_perform.c is not available. This do"


Epoch 1/1
total num symbols 26643


In [13]:
# Weight file is named after the corpus file: 'wordlevel.<corpus basename>.h5'.
model_name = 'wordlevel.%s.h5' % os.path.basename(corpus)
# Only the weights are written; the inference cell rebuilds the architecture
# with get_model and loads this file into it.
trained_model.save_weights(model_name, overwrite=True)

# Inference

In [14]:
import numpy as np

def generate_text(model, seed_words, temperature, word2ix, idx2word, N):
    """Generate ``N`` words by sampling from ``model`` one word at a time.

    The model's states are reset, every seed word is fed through the model in
    order to prime it, and then each sampled word is fed back in to predict
    the next one.

    Args:
        model: model exposing ``reset_states()``, ``input_shape`` and
            ``predict(x, verbose=0)``. The word index is written to
            ``x[0, 0]`` and the distribution is read from ``probs[0, 0, :]``.
        seed_words: iterable of words used to prime the model; must yield at
            least one word, and each word must be a key of ``word2ix``.
        temperature: sampling temperature forwarded to ``sample``.
        word2ix: word -> index mapping used to encode the seed words.
        idx2word: index -> word lookup (indexable by sampled index).
        N: number of words to generate.

    Returns:
        A list of ``N`` generated words (empty when ``N <= 0``). The seed
        words are not included.

    Raises:
        ValueError: if ``seed_words`` is empty.
        KeyError: if a seed word is missing from ``word2ix``.
    """
    if N <= 0:
        return []

    model.reset_states()
    # A single input buffer is reused: only position [0, 0] ever changes.
    x = np.zeros(shape=model.input_shape)

    # Prime the model with the seed words, using the mapping that was passed
    # in rather than a module-level global.
    probs = None
    for seed_word in seed_words:
        x[0, 0] = word2ix[seed_word]
        probs = model.predict(x, verbose=0)
    if probs is None:
        raise ValueError("seed_words must contain at least one word to prime the model")

    # The first generated word comes from the prediction after the last seed word.
    generated_idx = [sample(probs[0, 0, :], temperature)]
    for _ in range(N - 1):
        x[0, 0] = generated_idx[-1]
        probs = model.predict(x, verbose=0)
        generated_idx.append(sample(probs[0, 0, :], temperature))

    return [idx2word[idx] for idx in generated_idx]
    
def sample(preds, temperature=1.0):
    """Sample one index from a probability array after temperature scaling.

    The probabilities are re-weighted as ``p ** (1 / temperature)`` and
    renormalised, then a single draw is taken with ``np.random.multinomial``.

    Args:
        preds: 1-D array-like of probabilities.
        temperature: values below 1 sharpen the distribution, values above 1
            flatten it. ``0`` selects the most probable index
            deterministically instead of dividing by zero.

    Returns:
        The sampled index (a numpy integer, as returned by ``np.argmax``).
    """
    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        # Limit of temperature -> 0: greedy choice.
        return np.argmax(preds)

    # Zero probabilities legitimately map to -inf; silence the log warning.
    with np.errstate(divide='ignore'):
        scaled = np.log(preds) / temperature
    # Subtract the maximum before exponentiating so that small temperatures do
    # not underflow every term to 0 (which would give NaN after normalising).
    scaled = scaled - np.max(scaled)
    exp_preds = np.exp(scaled)
    probs = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, probs, 1)
    return np.argmax(probas)

In [15]:
from nltk import word_tokenize

# Seed prompt: tokenised and lower-cased; the same list is printed in front of
# the generated words in the next cell.
seed_str = '''To be or not to be'''
seed_str = [token.lower() for token in word_tokenize(seed_str)]

# Rebuild the network for word-by-word inference (one timestep, batch of one)
# and load the weights saved after training.
trained_model_test = get_model(
    num_timesteps=1,
    num_words=len(word2idx),
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    batch_size=1,
)
trained_model_test.load_weights(model_name)

# Arguments: model, seed words, temperature, word -> index, index -> word, N.
generated_text = generate_text(
    trained_model_test, seed_str, 1.1, word2idx, words, 3000)

  "The file scan_perform.c is not available. This do"
  "The file scan_perform.c is not available. This do"


NameError: name 'xrange' is not defined

In [None]:
# Show the seed words followed by the generated words as one space-separated string.
print(' '.join(seed_str + generated_text))