## Shakespeare generation explorer

Explore generators as an extension to the tensorflow tutorial.

In [41]:
!pip install --upgrade pip
!pip install unidecode

import tensorflow as tf
import os
import unidecode
import numpy as np

from keras.models import Sequential
from keras import layers
from keras.utils import Sequence

Requirement already up-to-date: pip in /usr/local/lib/python3.6/dist-packages (18.0)


### Data prep
Get the data using the instructions from the tensorflow tutorial

In [15]:
if not os.path.isdir('/tmp/shakespeare'):
    os.mkdir('/tmp/shakespeare')
path_to_file = tf.keras.utils.get_file('/tmp/shakespeare/shakespeare.txt', 
                                       'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
text = unidecode.unidecode(open(path_to_file).read())
# unique contains all the unique characters in the file
unique = sorted(set(text))

# creating a mapping from unique characters to indices
char2idx = {u:i for i, u in enumerate(unique)}
idx2char = {i:u for i, u in enumerate(unique)}

### Set data specific parameters

In [105]:
# setting the maximum length sentence we  want for a single input in characters
max_length = 100

# length of the vocabulary in chars
vocab_size = len(unique)

# the embedding dimension 
embedding_dim = vocab_size

# # number of RNN (here GRU) units
# units = 48 #1024 OOM

# batch size 
BATCH_SIZE = 2 #64 gave OOM error

# buffer size to shuffle our dataset
BUFFER_SIZE = 10000


### Create data tensors
We create *max_length* chunks of input, where each input vector is all the characters in that chunk apart from the last one. The target vector is all of the characters in the chunk except the first.

eg if text = 'tensorflow' and max_length = 9:

So, the input = 'tensorflo' and output = 'ensorflow'

After creating the vectors, we convert each character into numbers using the char2idx dictionary we created above.

Note, this is a function, so that the inputs are slightly randomised each epoch.

In [29]:
## Alternative randomised batch load

def load_text_splitter(text, max_length):
    input_text = []
    target_text = []
    offset = np.random.randint(max_length)
    for f in range(0 + offset, len(text) - max_length, max_length):
        inps = text[f:f+max_length]
        targ = text[f+1:f+1+max_length]
        input_text.append([char2idx[i] for i in inps])
        target_text.append([char2idx[t] for t in targ])
    return input_text, target_text

input_text, target_text = load_text_splitter(text, max_length)


### Create model

In [108]:
print('vocab_size:', vocab_size)
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(layers.LSTM(16, return_sequences=True))
model.add(layers.TimeDistributed(layers.Dense(embedding_dim)))
model.add(layers.Flatten())
model.add(layers.Dense(embedding_dim, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['sparse_categorical_accuracy'])
model.summary()

vocab_size: 65
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 100, 65)           4225      
_________________________________________________________________
lstm_35 (LSTM)               (None, 100, 16)           5248      
_________________________________________________________________
time_distributed_22 (TimeDis (None, 100, 65)           1105      
_________________________________________________________________
flatten_21 (Flatten)         (None, 6500)              0         
_________________________________________________________________
dense_36 (Dense)             (None, 65)                422565    
Total params: 433,143
Trainable params: 433,143
Non-trainable params: 0
_________________________________________________________________


In [109]:
# Create generator
def data_gen(text, max_length, batch_size=2):
    input_text, target_text = load_text_splitter(text, max_length)
    while True:
        for ii in range(0, len(input_text), batch_size):
            x = np.array(input_text[ii:ii+batch_size])
            y = np.array(target_text[ii:ii+batch_size])
            print(x.shape, y.shape)
            yield (x, y)
data_generator = data_gen(text, max_length, batch_size=batch_size)
steps_per_epoch = int(np.floor((float(len(text)) / float(max_length)) / float(batch_size)))
print('steps_per_epoch:', steps_per_epoch)

steps_per_epoch: 5576


In [110]:
batch_size = 2
history = model.fit_generator(data_generator, epochs=1, shuffle=True, steps_per_epoch=steps_per_epoch)

Epoch 1/1
(2, 100) (2, 100)
(2, 100) (2, 100)
(2, 100) (2, 100)
(2, 100) (2, 100)
(2, 100) (2, 100)
(2, 100) (2, 100)
(2, 100) (2, 100)
(2, 100) (2, 100)
(2, 100) (2, 100)
(2, 100) (2, 100)


ValueError: Error when checking target: expected dense_36 to have shape (1,) but got array with shape (100,)