[Tensorflow text generation with an RNN tutorial](https://www.tensorflow.org/tutorials/text/text_generation)

In [27]:
import tensorflow as tf

from string import punctuation
import numpy as np
import os
import time
import pickle

GPU memory growth must be enabled in TensorFlow for the model to train (training ran into issues without it).  The code in the cell below is borrowed from the [TensorFlow documentation](https://www.tensorflow.org/guide/gpu).

In [28]:
# enable memory growth on every visible GPU; the whole block is skipped when
# no GPU is found

gpus = tf.config.experimental.list_physical_devices('GPU')

if gpus:
    try:
        # currently, memory growth needs to be the same across GPUs,
        # so the flag is set on each physical device in turn
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True) # enabling memory growth
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        # report how many physical and logical devices TensorFlow sees
        print(len(gpus), 'Physical GPUs,', len(logical_gpus), 'Logical GPU')
    except RuntimeError as e:
        # memory growth must be set before GPUs have been initialized;
        # the error is printed rather than raised so the notebook keeps running
        print(e)

1 Physical GPUs, 1 Logical GPU


# Data

Text must all be in a single `.txt` file.

In [29]:
# read the whole lyrics corpus into memory; the context manager closes the
# file handle as soon as the read is done (read() already returns a str in
# text mode, so no str() conversion is needed)
with open('data/drake_lyrics.txt', 'r') as lyrics_file:
    text = lyrics_file.read()

In [30]:
# peek into file
print(text[:250])

Yo what's goin on, this is Drake
And I'ma let you know what you about to witness aight?
This right here, is a Drake, and DJ Smallz collaboration
So I'm from Canada, my mans from down South
You understand the #1 DJ in the South to be exact
You heard t


# Data Preprocessing

In [31]:
# word-level tokenization, done in one chain:
#   1. lowercase everything
#   2. pad newlines with spaces so '\n' survives as its own token
#   3. delete every character listed in string.punctuation
#   4. split on single spaces (runs of spaces therefore yield '' tokens)
words = (
    text.lower()
    .replace('\n', ' \n ')
    .translate(str.maketrans('', '', punctuation))
    .split(' ')
)

In [32]:
vocab = sorted(set(words))

In [33]:
# persist the vocabulary; the context manager closes the file even if
# pickling raises
with open(file='vocab', mode='wb') as outfile:
    pickle.dump(vocab, outfile)

In [34]:
# reload the vocabulary written by the previous cell; the context manager
# closes the file even if unpickling raises
# (pickle.load executes arbitrary code -- only use it on files you wrote yourself)
with open(file='vocab', mode='rb') as infile:
    vocab = pickle.load(infile)
vocab

['',
 '\n',
 '0',
 '000',
 '000s',
 '010',
 '02',
 '05',
 '06',
 '07',
 '09',
 '1',
 '10',
 '100',
 '1017',
 '11',
 '110',
 '11th',
 '12',
 '12bedroom',
 '13',
 '14',
 '15',
 '150',
 '1503',
 '15th',
 '16',
 '16s',
 '17',
 '1799',
 '18',
 '19',
 '1991',
 '1998',
 '1da',
 '1s',
 '1st',
 '2',
 '20',
 '200',
 '2007',
 '2008',
 '2010',
 '2015',
 '2017',
 '2018',
 '20s',
 '21',
 '22',
 '224',
 '23',
 '23s',
 '24',
 '247',
 '25',
 '26',
 '27',
 '28',
 '281',
 '29',
 '2999',
 '3',
 '30',
 '300',
 '305',
 '31st',
 '325',
 '33rd',
 '35',
 '360',
 '3months',
 '3rd',
 '3s',
 '4',
 '40',
 '4000',
 '401',
 '4040',
 '4201',
 '4301',
 '44',
 '48',
 '4th',
 '5',
 '50',
 '500',
 '504',
 '50k',
 '50ms',
 '52',
 '54',
 '5s',
 '5th',
 '6',
 '60',
 '60000',
 '61',
 '62',
 '6449393',
 '645',
 '66',
 '680',
 '6am',
 '6er',
 '6ix',
 '6s',
 '7',
 '70',
 '70s',
 '747',
 '7am',
 '8',
 '80',
 '808',
 '80s',
 '81',
 '82',
 '85',
 '86',
 '87',
 '876',
 '8am',
 '8th',
 '9',
 '90',
 '90210',
 '90s',
 '91',
 '911',
 '

In [38]:
len(vocab)

9597

In [39]:
# word -> integer id (the id is the word's position in `vocab`)
word2idx = dict(zip(vocab, range(len(vocab))))

# integer id -> word; kept as an array so an array of ids can be decoded in one indexing step
idx2word = np.array(vocab)

# the entire corpus encoded with the word-to-id mapping above
words_as_int = np.array([word2idx[token] for token in words])

# sample: the first 13 tokens next to their ids
print(f'"{words[:13]}" ---- characters mapped to int ---- > {words_as_int[:13]}')

"['yo', 'whats', 'goin', 'on', 'this', 'is', 'drake', '\n', 'and', 'ima', 'let', 'you', 'know']" ---- characters mapped to int ---- > [9515 9247 3558 5767 8480 4294 2565    1  379 4161 4767 9522 4613]


In [66]:
# persist both lookup tables since they are needed in text generation
# (word2idx encodes the prompt, idx2word decodes the sampled ids)
os.makedirs('pkl', exist_ok=True)  # open() below fails if the folder is missing

# the context managers close each file even if pickling raises
with open(file='pkl/word2idx', mode='wb') as outfile:
    pickle.dump(word2idx, outfile)

with open(file='pkl/idx2word', mode='wb') as outfile:
    pickle.dump(idx2word, outfile)

In [67]:
# reload the word -> id mapping from disk; the context manager closes the
# file even if unpickling raises
with open(file='pkl/word2idx', mode='rb') as infile:
    word2idx = pickle.load(infile)

In [40]:
idx2word[1]

'\n'

In [41]:
vocab

['',
 '\n',
 '0',
 '000',
 '000s',
 '010',
 '02',
 '05',
 '06',
 '07',
 '09',
 '1',
 '10',
 '100',
 '1017',
 '11',
 '110',
 '11th',
 '12',
 '12bedroom',
 '13',
 '14',
 '15',
 '150',
 '1503',
 '15th',
 '16',
 '16s',
 '17',
 '1799',
 '18',
 '19',
 '1991',
 '1998',
 '1da',
 '1s',
 '1st',
 '2',
 '20',
 '200',
 '2007',
 '2008',
 '2010',
 '2015',
 '2017',
 '2018',
 '20s',
 '21',
 '22',
 '224',
 '23',
 '23s',
 '24',
 '247',
 '25',
 '26',
 '27',
 '28',
 '281',
 '29',
 '2999',
 '3',
 '30',
 '300',
 '305',
 '31st',
 '325',
 '33rd',
 '35',
 '360',
 '3months',
 '3rd',
 '3s',
 '4',
 '40',
 '4000',
 '401',
 '4040',
 '4201',
 '4301',
 '44',
 '48',
 '4th',
 '5',
 '50',
 '500',
 '504',
 '50k',
 '50ms',
 '52',
 '54',
 '5s',
 '5th',
 '6',
 '60',
 '60000',
 '61',
 '62',
 '6449393',
 '645',
 '66',
 '680',
 '6am',
 '6er',
 '6ix',
 '6s',
 '7',
 '70',
 '70s',
 '747',
 '7am',
 '8',
 '80',
 '808',
 '80s',
 '81',
 '82',
 '85',
 '86',
 '87',
 '876',
 '8am',
 '8th',
 '9',
 '90',
 '90210',
 '90s',
 '91',
 '911',
 '

## Text Vectorization

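Note that the (commented-out) cell below is **character vectorization**.  This notebook uses **word vectorization** instead (see the `word2idx` / `idx2word` mapping above), which would probably make more coherent sentences.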

In [None]:
# # map unique characters to indices
# char2idx = {u:i for i, u in enumerate(vocab)}

# # reverse the map - use this to specify an index to obtain a character
# idx2char = np.array(vocab)

# # entire text document represented in the above character-to-indices mapping
# text_as_int = np.array([char2idx[c] for c in text])

# # sample
# print(f'"{text[:13]}" ---- characters mapped to int ---- > {text_as_int[:13]}')

## Create Training Examples & Targets

**model input**: sequence of characters

**model output (prediction)**: the following character at each step (based on previous characters in the sequence)

Divide the text into **example sequences**.  Each input sequence will contain `seq_length` characters from the text.

**For each input sequence, the corresponding targets contain the same length of text, except shifted one character to the right.**

So, break the text into chunks of `seq_length + 1`.  e.g. if `seq_length` is 4 and our text is "Hello", the input sequence would be "Hell" and the target sequence would be "ello".

`tf.data.Dataset.from_tensor_slices` converts the text vector into a stream of character indices.

In [42]:
# length of a single model input, in words (the corpus is tokenized by word)
seq_length = 100
# number of whole (seq_length + 1)-word chunks in the corpus (floored division)
examples_per_epoch = len(words) // (seq_length + 1)

# turn the encoded corpus into a dataset of individual word ids
word_dataset = tf.data.Dataset.from_tensor_slices(words_as_int)

# data type of the dataset object
print(type(word_dataset))

<class 'tensorflow.python.data.ops.dataset_ops.TensorSliceDataset'>


In [43]:
# preview the first five elements as words (decoding the ids stored in word_dataset)
for i in word_dataset.take(5):
    print(idx2word[i.numpy()]) # .numpy() converts into numpy data format (in this case, a numpy integer)

yo
whats
goin
on
this


Use the `batch` method on `word_dataset` (type `tensorflow.python.data.ops.dataset_ops.TensorSliceDataset`) to convert the individual words to sequences of the desired size (`seq_length + 1`, so that each sequence can later be split into an input and a target of `seq_length` words).

In [44]:
# group the word ids into sequences of seq_length + 1 ids each; the extra id
# lets every sequence be split into an input and a one-step-shifted target
sequences = word_dataset.batch(seq_length + 1, drop_remainder=True)
print(type(sequences), '\n')

# preview the first five sequences, decoded back to words
for item in sequences.take(5):
    print(repr(' '.join(idx2word[item.numpy()])))

<class 'tensorflow.python.data.ops.dataset_ops.BatchDataset'> 

'yo whats goin on this is drake \n and ima let you know what you about to witness aight \n this right here is a drake and dj smallz collaboration \n so im from canada my mans from down south \n you understand the 1 dj in the south to be exact \n you heard that at the vmas you heard it wherever he goes \n my man smallz is out there down south \n same time reppin for toronto canada yknahmean \n so this right here what you bout what you listenin to right now \n is the official southern'
'smoke special edition \n i call it the room for improvement mixtape \n cause im not perfect and i bet neither are you if you listenin \n so you need to just accept whats there yknahmean but look \n you need to just let this play out \n from track one to track whatever however many i put on here \n you need to just listen to this because look man \n its the first time canadian down south my man smallz \n your boy drake you know what it is man 

For each sequence, duplicate and shift it to form the input and target text using the `map` method on the batch object to apply a simple function to each batch.

In [45]:
# define the shifting (splitting) function
def split_input_target(chunk):
    """Split one sequence into an (input, target) pair offset by one step.

    Args:
        chunk: any sliceable sequence (tensor, array, list, string).

    Returns:
        A tuple ``(input_text, target_text)`` where ``input_text`` is
        ``chunk`` without its last element and ``target_text`` is ``chunk``
        without its first element, so ``target_text[i]`` is the element that
        follows ``input_text[i]`` in ``chunk``.
    """
    return chunk[:-1], chunk[1:]

In [46]:
# apply the shifting to create input texts and target texts that comprise of our dataset
dataset = sequences.map(split_input_target)
print(type(dataset))

<class 'tensorflow.python.data.ops.dataset_ops.MapDataset'>


In [47]:
# decode the first (input, target) pair back to words; the target is the
# input shifted one word to the right
for input_example, target_example in dataset.take(1):
    print('Input data: ', repr(' '.join(idx2word[input_example.numpy()])))
    print('Target data:', repr(' '.join(idx2word[target_example.numpy()])))
    print()

Input data:  'yo whats goin on this is drake \n and ima let you know what you about to witness aight \n this right here is a drake and dj smallz collaboration \n so im from canada my mans from down south \n you understand the 1 dj in the south to be exact \n you heard that at the vmas you heard it wherever he goes \n my man smallz is out there down south \n same time reppin for toronto canada yknahmean \n so this right here what you bout what you listenin to right now \n is the official'
Target data: 'whats goin on this is drake \n and ima let you know what you about to witness aight \n this right here is a drake and dj smallz collaboration \n so im from canada my mans from down south \n you understand the 1 dj in the south to be exact \n you heard that at the vmas you heard it wherever he goes \n my man smallz is out there down south \n same time reppin for toronto canada yknahmean \n so this right here what you bout what you listenin to right now \n is the official southern'



During training, at time step 0 of the example above, the model receives the index for "yo" and tries to predict "whats" as the next word.  At the next time step, it does the same thing, but the RNN considers the previous time step context in addition to the current input word (it would consider both "yo" and "whats" in trying to predict "goin").

**BELOW CELL CAUSES GPU MEMORY SPIKE**

In [48]:
# # first few examples of prediction time steps
# for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
#     print(f"Step {i:4d}")
#     print(f"  input: {input_idx} ({repr(idx2char[input_idx]):s})")
#     print(f"  expected output: {target_idx} ({repr(idx2char[target_idx]):s})")

## Create Training *Batches*

`tf.data` was used to split the text into _sequences_.  But before feeding this data into the model, we must _shuffle_ the data and pack it into _batches_.  The first layer of the model will be a Keras `Embedding` layer

In [49]:
# batch size (number of sequences per training batch)
BATCH_SIZE = 64

# buffer size to shuffle the dataset
# (TensorFlow data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory.  Instead,
# it maintains a buffer in which it shuffles elements)
BUFFER_SIZE = 10000

# shuffle the (input, target) sequences, then group them into batches of
# BATCH_SIZE sequences
dataset_sb = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [50]:
dataset_sb

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int32, tf.int32)>

In [51]:
print(dataset_sb)

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int32, tf.int32)>


The data is now ready to be passed into an RNN model.

# Modelling

## Building the Model

Use `tf.keras.Sequential` to define the model. For this simple example three layers are used to define our model:

- `tf.keras.layers.Embedding`: The input layer. A trainable lookup table that will map the numbers of each character to a vector with `embedding_dim` dimensions
- `tf.keras.layers.GRU`: A type of RNN with size `units = rnn_units` (You can also use an LSTM layer here)
- `tf.keras.layers.Dense`: The output layer, with `vocab_size` outputs

In [52]:
# vocabulary length (number of unique words)
vocab_size = len(vocab)

# embedding dimension
embedding_dim = 256

# number of RNN units
rnn_units = 1024

In [57]:
# hyper-parameters needed to rebuild the same architecture later
model_params = [vocab_size, embedding_dim, rnn_units]

# the context manager closes the file even if pickling raises
with open(file='model_params', mode='wb') as outfile:
    pickle.dump(model_params, outfile)

In [58]:
# reload the three hyper-parameters in the order they were saved; the
# context manager closes the file even if unpickling raises
with open(file='model_params', mode='rb') as infile:
    vocab_size, embedding_dim, rnn_units = pickle.load(infile)

In [61]:
rnn_units

1024

In [28]:
# helper function to quickly build the RNN model based on vocab size, embedding dimension, number of RNN units, and batch size
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the Embedding -> GRU -> Dense network used in this notebook.

    Args:
        vocab_size: number of distinct tokens; used as the embedding's
            ``input_dim`` and as the number of units of the output layer.
        embedding_dim: ``output_dim`` of the embedding layer.
        rnn_units: number of units of the GRU layer.
        batch_size: fixed batch dimension placed in the embedding's
            ``batch_input_shape``; the sequence dimension is left as ``None``.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model. The Dense layer has no
        activation, so its outputs are raw scores (logits) per token.
    """
    embedding = tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        batch_input_shape=[batch_size, None],
    )

    # stateful GRU that returns its output at every time step
    recurrent = tf.keras.layers.GRU(
        units=rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )

    projection = tf.keras.layers.Dense(units=vocab_size)

    return tf.keras.Sequential([embedding, recurrent, projection])

In [29]:
# build the model
rnn = build_model(
    vocab_size = vocab_size,
    embedding_dim = embedding_dim,
    rnn_units = rnn_units,
    batch_size = BATCH_SIZE
)

## Try the Model (Without Training)

First, check the shape of the output:

In [30]:
# run a single batch through the untrained model to confirm the output shape;
# the loop variables stay defined afterwards and are reused in later cells
for input_example_batch, target_example_batch in dataset_sb.take(1):
    example_batch_predictions = rnn(input_example_batch)
    print(example_batch_predictions.shape, '# (batch_size, sequence_length, vocab_size)')

(64, 100, 9597) # (batch_size, sequence_length, vocab_size)


The sequence length (`seq_length`) was set to `100` but the model can be run on inputs of any length.

In [34]:
print(rnn.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           2456832   
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3938304   
_________________________________________________________________
dense (Dense)                (64, None, 9597)          9836925   
Total params: 16,232,061
Trainable params: 16,232,061
Non-trainable params: 0
_________________________________________________________________


None

To get actual predictions from the model, we must sample from the output distribution to get actual character indices.  This distribution is defined by the logits over the character vocabulary.

**Note**: It is important to _sample_ from this distribution, since taking the _argmax_ of the distribution can easily get the model stuck in a loop.

Try it for the first example in the batch:

In [38]:
# draw one id per time step from the logits of the first sequence in the batch
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
# axis=-1 drops only the trailing size-1 (num_samples) axis, leaving one id per time step
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

This gives us, at each timestep, a prediction of the next character index:

In [39]:
display(sampled_indices)
print(len(sampled_indices))

array([5816, 2650, 7238, 2160, 4167, 1206, 4934, 5845, 3529, 6255, 6132,
       2078, 4346, 7618, 1431, 8902, 5756, 4790, 1609, 7917,  664, 5745,
       1380, 4934, 4889, 9370, 6701,  470, 9531, 3868, 7702, 7029, 2726,
       5270, 2317, 7709,  573, 3822, 5280, 3049, 6777, 3065, 5522, 3468,
       7406, 5075, 3595, 8066, 3461,  293, 6167, 3442, 6027, 5830, 6919,
       7507, 7180, 7073, 6417, 4839, 4183, 1332, 6648, 7285, 7737, 8405,
       8913, 4946, 2526, 3808, 3269, 1196, 7069, 4521, 2391, 1861, 4393,
       7798,  746, 2385, 9204, 2489, 5349, 6896, 8805, 3875, 4707, 2221,
       7751,  102, 8486, 3523, 2214, 9301, 2852, 6277, 5370, 6369, 4223,
       3466], dtype=int64)

100


Decode these to see the text predicted by the untrained model:

In [40]:
# decode ids back to words: the first input sequence of the batch, then the
# ids sampled from the untrained model for that sequence
print(f'Input: {repr(" ".join(idx2word[input_example_batch[0]]))}\n')
print(f'Output: {repr(" ".join(idx2word[sampled_indices]))}')

Input: 'quit playin \n girl quit playin \n you aint love me from the start \n youre the reason that i feel this way  \n you broke my fuckin heart \n and i gave you all my trust but you just tore it all apart \n now youre all i think about while im layin in the dark \n  \n texted me the other night its been too long \n always tryna figure out if ima move on \n staring at the ceiling almost all night long \n probably why i had the time to write this song \n'

Output: 'opposite dry seat dallas imbalance brushed louie oughta glee please phase crucified jag smart catching union old lieutenants chips staringcan bajan oho caramel louie lollipop wolves reclinin arch youngings heartbeat solely roxx echelon milli derek som audit hate mindful featuring reminiscing feh nas george shoot marathon grace stunt generation album pierre gasoline paul orgasm righttighten sinkin scholarships saaaand prettiest lipstick impression calls razor sender sop testosterone unlucky lovey door harvard forces brr russ

## Training the Model

We now have a classification problem: **Given the previous RNN state, and the input at this time step, predict the class of the next character.**

### Attaching an Optimizer and Loss Function

The standard `tf.keras.losses.sparse_categorical_crossentropy` loss function works in this case because it is applied across the last dimension of the predictions.

Because the model returns logits, we need to set the `from_logits` flag to `True`.

In [41]:
# helper function to obtain the loss function
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    Args:
        labels: integer class ids (the target ids).
        logits: unnormalized model outputs; ``from_logits=True`` is passed
            because the model's Dense layer has no activation.

    Returns:
        Whatever ``tf.keras.losses.sparse_categorical_crossentropy`` returns
        for these inputs (no reduction is applied here).
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )

In [42]:
# sanity check on the untrained model: with near-uniform predictions the mean
# loss should sit close to ln(vocab_size) (ln(9597) is about 9.17)
example_batch_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 100, 9597)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       9.169176


Configure the training procedure using the `tf.keras.Model.compile` method.  Use `tf.keras.optimizers.Adam` with default arguments and the loss function.

In [43]:
rnn.compile(
    optimizer = 'adam',
    loss = loss,
    metrics = ['accuracy']
)

### Configure Checkpoints

Use a `tf.keras.callbacks.ModelCheckpoint` to ensure that checkpoints are saved during training:

In [44]:
# directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'

# name of the checkpoint files; there is no epoch placeholder in the name,
# so every save goes to the same 'checkpoint' prefix
checkpoint_prefix = os.path.join(checkpoint_dir, 'checkpoint')

# create checkpoints-saving object: weights only, and only when the training
# loss ('loss', mode 'min') improves on the best value seen so far
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix,
    monitor = 'loss',
    save_best_only = True,
    mode = 'min',
    save_weights_only = True
)

### Execute the Training

In [45]:
# set number of desired epochs
EPOCHS = 50

In [46]:
%%time

# training!
history = rnn.fit(
    x = dataset_sb,
    epochs = EPOCHS,
    callbacks = [checkpoint_callback]
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50

KeyboardInterrupt: 

---
## Generating Text (Making Predictions)

### Restore the Latest Checkpoint

- batch size 1 (for simplicity)
- because of the way the RNN state is passed from time step to time step, the model only accepts a fixed batch size once built
- **to run the model with a different `batch_size`, we need to rebuild the model and restore the weights from the last checkpoint**

In [47]:
# check the file in the working directory that contains the most recent checkpoint
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints\\checkpoint'

In [48]:
# initiate a new RNN model instance: same hyper-parameters as the trained
# model, but with a batch size of 1 for generation
rnn_cp = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# load the saved weights from the checkpoint into the new model instance
rnn_cp.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

# build the model with the new input shape (batch of 1, any sequence length)
rnn_cp.build(tf.TensorShape([1, None]))

In [49]:
rnn_cp.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            2456832   
_________________________________________________________________
gru_1 (GRU)                  (1, None, 1024)           3938304   
_________________________________________________________________
dense_1 (Dense)              (1, None, 9597)           9836925   
Total params: 16,232,061
Trainable params: 16,232,061
Non-trainable params: 0
_________________________________________________________________


### The Prediction Loop

- start by choosing a start string, initializing the RNN state and setting the number of characters to generate
- get the prediction distribution of the next character using the start string and the RNN state
- then, use a categorical distribution to calculate the index of the predicted character
- use this predicted character as our next input to the model
- the RNN state returned by the model is fed back into the model so that it now has more context, instead of only one character
- after predicting the next character, the modified RNN states are again fed back into the model, which is how it learns as it gets more context from the previously predicted characters

In [50]:
# text prediction function
def generate_text(model, start_string, num_generate=500, temperature=1.0):
    """Generate text word by word, seeded with ``start_string``.

    Relies on the notebook-level lookup tables ``word2idx`` (word -> id) and
    ``idx2word`` (id -> word).

    Args:
        model: callable Keras model with ``reset_states()``; it is fed id
            tensors whose leading (batch) dimension is 1.
        start_string: non-empty sequence of seed words, e.g.
            ``['bruh', 'no', 'cap']``. Every word must be a key of
            ``word2idx``.
        num_generate: number of words to generate after the seed.
        temperature: positive number the logits are divided by before
            sampling. Low temperature results in more predictable text, high
            temperature results in more surprising text.

    Returns:
        The seed words followed by the generated words, joined with single
        spaces.

    Raises:
        ValueError: if ``temperature`` is not positive or ``start_string``
            is empty.
        KeyError: if a seed word is not in ``word2idx``.
    """
    if temperature <= 0:
        raise ValueError('temperature must be a positive number')
    if len(start_string) == 0:
        raise ValueError('start_string must contain at least one word')

    # vectorize the seed words and add a leading batch axis of length 1
    input_eval = [word2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input=input_eval, axis=0)

    # generated words, in order
    text_generated = []

    # start from a clean recurrent state so earlier calls do not leak in
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)

        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # scale the logits by the temperature, then sample from the
        # categorical distribution of the last time step
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # feed the sampled word back in as the next input; the previous
        # context is carried by the model's recurrent state
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2word[predicted_id])

    return ' '.join(list(start_string) + text_generated)

In [None]:
%%time

# text generation!
print(generate_text(rnn_cp, start_string=['bruh', 'no', 'cap'], num_generate=200))