In [1]:
import tensorflow as tf
import numpy as np

In [2]:
def build_model(vocab_size, embedding_dim, rnn_units, use_LSTM, batch_size):
    model = tf.keras.Sequential()

    if embedding_dim != None:
        model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                            batch_input_shape=[batch_size, None]))
    else: # use one-hot encoding instead of embedding layer
        def one_hot(x):
            return tf.one_hot(tf.cast(x, 'uint8'), depth=vocab_size)
        model.add(tf.keras.layers.Lambda(one_hot, batch_input_shape=[batch_size,None]))

    # recurrent layer
    if use_LSTM:
        model.add(tf.keras.layers.LSTM(rnn_units, return_sequences=True, stateful=True))
    else:
        model.add(tf.keras.layers.SimpleRNN(rnn_units, return_sequences=True, stateful=True))

    # output layer
    model.add(tf.keras.layers.Dense(vocab_size))

    return model

In [3]:
def preprocess(filename, batch_size, seq_length):
    text = open('datasets/'+filename+'.txt', 'rb').read().decode(encoding='utf-8')
    print ('Length of text: {} characters'.format(len(text)))

    vocab = sorted(set(text))
    print ('{} unique characters'.format(len(vocab)))

    char_to_idx = {u:i for i, u in enumerate(vocab)}
    idx_to_char = np.array(vocab)

    text_as_int = np.array([char_to_idx[c] for c in text])

    # Create training examples / targets
    char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
    sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

    def split_input_target(chunk):
        input_text = chunk[:-1]
        target_text = chunk[1:]
        return input_text, target_text

    dataset = sequences.map(split_input_target)

    BUFFER_SIZE = 10000
    dataset = dataset.shuffle(BUFFER_SIZE).batch(batch_size, drop_remainder=True)

    return dataset, idx_to_char, char_to_idx, vocab

In [4]:
def train_model(model, dataset, batch_size, epochs, checkpoint_prefix):
    def loss(labels, logits):
        return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

    model.compile(optimizer='adam', loss=loss)

    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_prefix,
        save_weights_only=True,
        save_best_only=True,
        monitor='loss') # TODO monitor val_loss instead

    return model.fit(dataset, batch_size=batch_size, validation_split=0.1, epochs=epochs, callbacks=[checkpoint_callback])

In [5]:
def generate_text(model, char_to_idx, idx_to_char,
                  start_string, num_generate, temperature):
    # Converting our start string to numbers (vectorizing)
    input_eval = [char_to_idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []

    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
         # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # using a categorical distribution to predict the character returned by the model
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        # We pass the predicted character as the next input to the model
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx_to_char[predicted_id])

    return (start_string + ''.join(text_generated))

In [6]:
filename = 'shakespeare'
epochs = 1
batch_size = 32
seq_length = 100
rnn_units = 128
use_LSTM = True
embedding_dim = None
checkpoint_dir = './training_checkpoints/' + filename
checkpoint_prefix = checkpoint_dir + '/ckpt'

In [7]:
dataset, idx_to_char, char_to_idx, vocab = preprocess(filename, batch_size, seq_length)
vocab_size = len(vocab)

Length of text: 1115378 characters
63 unique characters


In [8]:
model = build_model(vocab_size, embedding_dim, rnn_units, use_LSTM, batch_size)

In [22]:
# Optional, resume training from best weights
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fa15d9d3100>

In [9]:
history = train_model(model, dataset, batch_size=batch_size, epochs=epochs, checkpoint_prefix=checkpoint_prefix)

ValueError: `validation_split` is only supported for Tensors or NumPy arrays, found: (<BatchDataset shapes: ((32, 100), (32, 100)), types: (tf.int64, tf.int64)>, None, None)

In [56]:
# Generate text
start_string = u"ROMEO:"
temperature = 1.0
num_generate = 500
gen_model = build_model(vocab_size, embedding_dim, rnn_units, use_LSTM, batch_size=1)
gen_model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
gen_model.build(tf.TensorShape([1, None]))
result = generate_text(gen_model,char_to_idx,idx_to_char,
                       start_string=start_string,
                       num_generate=num_generate,
                       temperature=temperature)
print(result)

ROMEO:3:?
WYRUCKE:
Then by twe bud why, thy leat with lifes!
For to you good pais, and Master foll the;
In Hadinca, and your bace,n.

JULIE:
Dreg the ghalk, but I have no grest them,
In thee undenal wound. I'll exect.

LUCINo:
Here I dost we hath the doy, too mirst for me to her, sivine.

LUCENTIO:
Your torgee, are,
Lith; blother they benoun of Yord,
Footho chost wind tinge but and heap, unto your gno-s
Sanalef he be please of the weartat
The show her, it not fright bed and the will ilverlo
Is no'd sh


In [55]:
# print(history.history)
# todo visualize using matplotlib