<a href="https://colab.research.google.com/github/kieranfmaguire/learningdeeplearning/blob/main/notebooks/generate_text_with_RNN_TF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Generating Shakespearean Text Using a Character RNN
---
Based on example in chapter 16 - Hands on Machine Learnings with Scikitlearn and Tensorflow (O'Reilly)

In [20]:
import numpy as np
import tensorflow as tf


## Downloading some data
---

In [21]:
shakespeare_url = "https://homl.info/shakespeare"
filepath = tf.keras.utils.get_file("shakespear.txt", shakespeare_url)
with open(filepath) as f:
    shakespeare_text = f.read()

print(f"shakespeare_text is a: {type(shakespeare_text)}")
print(f"shakespeare_text has length: {len(shakespeare_text)}")
print(f"First 500 characters are:\n\n{shakespeare_text[:500]}")

shakespeare_text is a: <class 'str'>
shakespeare_text has length: 1115394
First 500 characters are:

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


## Tokenizing the text data - each unique character is given a unique ID
---

In [22]:
tokeniser = tf.keras.preprocessing.text.Tokenizer(char_level=True)   # char level creates unique id for each charater (default is by word)
tokeniser.fit_on_texts(shakespeare_text)  # assigns ID (starting at 1) for each unique character
num_unique_chars = len(tokeniser.word_index)
num_chars = tokeniser.document_count    # will need this later for one hot encoding

print(f"Number of distinct characters: {len(tokeniser.word_index)}")
print(f"Total number of characters in documents: {num_chars}")




Number of distinct characters: 39
Total number of characters in documents: 1115394


In [23]:
# now encode the entire text, split into train and test, and create tf.data.Dataset object
encoded = np.array(tokeniser.texts_to_sequences([shakespeare_text])).flatten()

train_size = int(encoded.shape[0] * 0.9)    # use 90% for train, save the rest for test
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])
dataset_val = tf.data.Dataset.from_tensor_slices(encoded[train_size:])

### To train the model, will use sub-sequences of length `n_steps`
---
the tensorflow datasets API can handle this with the window method

In [24]:
n_steps = 50                    # number of characters in a given instance
window_length = n_steps + 1     # we will predict the next character in the sequence, so will need it as the target
batch_size = 32                 # batch size for gradient descent

In [25]:
dataset = dataset.window(size=window_length, shift=1, stride=1, drop_remainder=False)
dataset = dataset.flat_map(lambda window: window.batch(window_length))

dataset_val = dataset_val.window(size=window_length, shift=1, stride=1, drop_remainder=False)
dataset_val = dataset_val.flat_map(lambda window: window.batch(window_length))


In [26]:
# have a look at the structure
for i, dat in enumerate(dataset):
    print(dat)
    if i > 0: break

tf.Tensor(
[20  6  9  8  3  1 19  6  3  6 36  2 10 24 11 22  2 20  4  9  2  1 17  2
  1 23  9  4 19  2  2 13  1  5 10 16  1 20 14  9  3  7  2  9 18  1  7  2
  5  9  1], shape=(51,), dtype=int64)
tf.Tensor(
[ 6  9  8  3  1 19  6  3  6 36  2 10 24 11 22  2 20  4  9  2  1 17  2  1
 23  9  4 19  2  2 13  1  5 10 16  1 20 14  9  3  7  2  9 18  1  7  2  5
  9  1 15], shape=(51,), dtype=int64)


In [27]:
# shuffle, batch, then split into fetaures and target
dataset = dataset.shuffle(int(1e3)).batch(batch_size)
dataset = dataset.map(lambda x: (x[:, :-1], x[:, 1:]))   
# one hot encode the categorical features - small enough dictionary for this to be reasonable. 
# in general this would usually be an embedding
dataset = dataset.map(lambda x, y: (tf.one_hot(x, depth=num_unique_chars), y))
dataset = dataset.prefetch(1) # don't really need to worry about prefetching on this small dataset

dataset_val = dataset_val.batch(batch_size)\
  .map(lambda x: (x[:, :-1], x[:, 1:]))\
  .map(lambda x,y: (tf.one_hot(x, depth=num_unique_chars), y))

## Now we can create a model
---

In [32]:
model = tf.keras.models.Sequential([
  tf.keras.layers.GRU(units=64, input_shape=[None, num_unique_chars], return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
  tf.keras.layers.GRU(units=64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
  tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(units=num_unique_chars, activation="softmax"))
])
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_8 (GRU)                  (None, None, 64)          20160     
_________________________________________________________________
gru_9 (GRU)                  (None, None, 64)          24960     
_________________________________________________________________
time_distributed_4 (TimeDist (None, None, 39)          2535      
Total params: 47,655
Trainable params: 47,655
Non-trainable params: 0
_________________________________________________________________


In [33]:
model.fit(dataset, epochs=1, validation_data=dataset_val)

  21424/Unknown - 2413s 113ms/step - loss: 1.7703

InvalidArgumentError: ignored