In [1]:
import tensorflow as tf

### Data acquisition and preprocessing

In [2]:
# Download Shakespeare's works (the "tiny shakespeare" corpus) from
# Andrej Karpathy's char-rnn repository

url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
filepath = tf.keras.utils.get_file('shakespeare.txt', url)

# Decode explicitly as UTF-8 so the text read here does not depend on the
# platform's default encoding
with open(filepath, encoding='utf-8') as f:
  shakespeare_text = f.read()

Downloading data from https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
[1m1115394/1115394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [3]:
# Preview the first 148 characters of the raw corpus
print(shakespeare_text[:148])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?



In [4]:
# Character-level tokenizer: the text is lower-cased first and then split so
# that every single character becomes one token
text_vec_layer = tf.keras.layers.TextVectorization(
    standardize='lower',
    split='character',
)

# Learn the vocabulary from the corpus, then map the whole corpus to token ids
text_vec_layer.adapt([shakespeare_text])
encoded_text = text_vec_layer(tf.constant(shakespeare_text))

In [5]:
# The same 148-character excerpt as before, now shown as integer token ids
print(encoded_text[:148])

tf.Tensor(
[21  7 10  9  4  2 20  7  4  7 37  3 11 25 12 23  3 21  5 10  3  2 18  3
  2 24 10  5 20  3  3 14  2  6 11 17  2 21 15 10  4  8  3 10 19  2  8  3
  6 10  2 16  3  2  9 24  3  6 26 28 12 12  6 13 13 25 12  9 24  3  6 26
 19  2  9 24  3  6 26 28 12 12 21  7 10  9  4  2 20  7  4  7 37  3 11 25
 12 17  5 15  2  6 10  3  2  6 13 13  2 10  3  9  5 13 27  3 14  2 10  6
  4  8  3 10  2  4  5  2 14  7  3  2  4  8  6 11  2  4  5  2 21  6 16  7
  9  8 31 12], shape=(148,), dtype=int64)


In [6]:
# Number of distinct tokens in the learned vocabulary
tokens = text_vec_layer.vocabulary_size()
# Number of characters in the raw corpus
text_length = len(shakespeare_text)

print(f'Number of tokens in vocabulary: {tokens}')
print(f'Total length of text dataset: {text_length}')

Number of tokens in vocabulary: 41
Total length of text dataset: 1115394


In [7]:
# The full token sequence is far too long to train on directly, so we cut it
# into short overlapping chunks that the prediction model can consume.
def get_dataset(sequence, length, shuffle=False, batch_size=128, seed=None):
  """Build a tf.data pipeline of (input, target) pairs for next-token prediction.

  The sequence is cut into overlapping windows of `length + 1` tokens, each
  window starting one token after the previous one. Every window is then split
  into an input (all tokens but the last) and a target (all tokens but the
  first), i.e. the target is the input shifted by one position.

  Args:
    sequence: 1-D sequence of token ids; must support `len()`.
    length: number of tokens in each input and in each target.
    shuffle: if True, shuffle the windows using a buffer as large as the sequence.
    batch_size: number of windows per batch.
    seed: optional random seed forwarded to the shuffle so the sample order can
      be reproduced; None (default) leaves the shuffle unseeded as before.

  Returns:
    A batched tf.data.Dataset yielding `(inputs, targets)` tuples.
  """
  window_size = length + 1  # one extra token so input and target can be offset by one

  dataset = tf.data.Dataset.from_tensor_slices(sequence)
  dataset = dataset.window(window_size, shift=1, drop_remainder=True)
  # window() yields one nested dataset per window; batching each nested dataset
  # by the full window size turns it back into a single tensor
  dataset = dataset.flat_map(lambda window: window.batch(window_size))
  if shuffle:
    dataset = dataset.shuffle(len(sequence), seed=seed)
  dataset = dataset.batch(batch_size)
  dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:])).prefetch(1)
  return dataset

In [8]:
# Number of characters in every input / target sample
length = 100

# Split boundaries: the first 90% of the text is used for training, the next 5%
# for validation and the final 5% for testing
TRAIN_SPLIT = int(0.9 * text_length)
TEST_SPLIT = int(0.95 * text_length)

# Only the training windows are shuffled (shuffle defaults to False)
train_dataset = get_dataset(encoded_text[:TRAIN_SPLIT], length=length, shuffle=True)
val_dataset = get_dataset(encoded_text[TRAIN_SPLIT:TEST_SPLIT], length=length)
test_dataset = get_dataset(encoded_text[TEST_SPLIT:], length=length)

In [9]:
# Inspect a single training batch: its shapes first, then its first sample.
# The target is the input shifted by one character.
for inputs, targets in train_dataset.take(1):
  print(f'Input shape: {inputs.shape}')
  print(f'Target shape: {targets.shape}')

  first_input, first_target = inputs[0], targets[0]
  print(f'Input: {first_input}')
  print(f'Target: {first_target}')

Input shape: (128, 100)
Target shape: (128, 100)
Input: [20  3 14  2  9 16  7 13  3  9 30 12  6 11 14  2 23  5  4  8  2  6 10  3
  2 10  3  6 14 17  2  7 11  2  4  8  3  7 10  2  5 21 21  7 20  3  9 19
 12  6  4  2  6 11 17  2  4  7 16  3 19  2  4  5  2 22 10  6 20  3  2 16
 17  2  9  4 10  6  4  6 22  3 16  9 28 12 23 15  4  2 18  8  6  4 19  2
  7  9  2 20]
Target: [ 3 14  2  9 16  7 13  3  9 30 12  6 11 14  2 23  5  4  8  2  6 10  3  2
 10  3  6 14 17  2  7 11  2  4  8  3  7 10  2  5 21 21  7 20  3  9 19 12
  6  4  2  6 11 17  2  4  7 16  3 19  2  4  5  2 22 10  6 20  3  2 16 17
  2  9  4 10  6  4  6 22  3 16  9 28 12 23 15  4  2 18  8  6  4 19  2  7
  9  2 20  6]


### Building an RNN model

In [10]:
# Character-level language model built around one Gated Recurrent Unit layer:
# token embedding -> GRU emitting an output at every time step -> softmax over
# the vocabulary for each time step
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=tokens, output_dim=16),
    tf.keras.layers.GRU(units=128, return_sequences=True),
    tf.keras.layers.Dense(units=tokens, activation='softmax'),
])

In [11]:
# Print a summary of the model's layers
model.summary()

In [12]:
# Next-character prediction is a classification over the vocabulary, and the
# targets are integer token ids, so the loss is sparse categorical cross-entropy
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Checkpoint to 'model.keras', keeping only the model with the best validation accuracy
check_point = tf.keras.callbacks.ModelCheckpoint(
    'model.keras',
    monitor='val_accuracy',
    save_best_only=True,
)

# Train for 10 epochs, using the validation split as validation data
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10,
    callbacks=[check_point],
)

In [14]:
# For inference we want to feed raw strings, so the text-encoding layer is
# chained in front of the trained model
deploy_model = tf.keras.models.Sequential([text_vec_layer, model])

In [15]:
# The model outputs one probability distribution over the vocabulary per input
# position; [0, -1] selects the first (only) prompt and its last position, i.e.
# the distribution for the character following the prompt
all_probabilities = deploy_model.predict(tf.constant(['To be or not to b']))
y_probability = all_probabilities[0, -1]

# Index of the most probable token
y_pred = tf.argmax(y_probability, axis=-1)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 219ms/step


In [16]:
# Look the predicted index up in the vocabulary to get the actual character;
# the model correctly predicts the next letter
vocabulary = text_vec_layer.get_vocabulary()
y_letter = vocabulary[y_pred.numpy()]
print(f'Predicted letter: {y_letter}')

Predicted letter: e


In [17]:
# Building block for generating more than a single letter
def predict_next_letter(text):
  """Return the most likely next character after `text` (greedy choice).

  Args:
    text: prompt string passed to `deploy_model`.

  Returns:
    The vocabulary entry with the highest predicted probability at the last
    position of the prompt.
  """
  prompt_batch = tf.constant([text])
  probabilities = deploy_model.predict(prompt_batch, verbose=0)
  # First (only) prompt in the batch, last position
  last_step = probabilities[0, -1]
  best_index = tf.argmax(last_step, axis=-1).numpy()
  vocabulary = text_vec_layer.get_vocabulary()
  return vocabulary[best_index]

In [18]:
def generate_text(text, n_chars=100):
  """Extend `text` by `n_chars` characters, one greedy prediction at a time.

  Each predicted character is appended to the prompt before the next
  prediction, so every step sees everything generated so far.

  Args:
    text: initial prompt.
    n_chars: number of characters to append.

  Returns:
    The prompt followed by the generated characters.
  """
  generated = text
  for _step in range(n_chars):
    generated = generated + predict_next_letter(generated)
  return generated

In [19]:
# Note that greedy generation (always taking the most likely character) ends up
# producing the same words over and over
print(generate_text('To be or not to b'))

To be or not to be so stands the sea
me to the seat and leave the seat and man
that the seat and leave the seat and m


In [20]:
# To get more variation, we can sample from the resulting
# probability distribution rather than taking the most likely
# element
def sample_next_letter(text, temperature=1):
  """Sample the next character after `text` from the predicted distribution.

  Args:
    text: prompt string passed to `deploy_model`.
    temperature: strictly positive factor the log-probabilities are divided by
      before sampling. 1 (default) samples from the predicted distribution
      unchanged; values below 1 favour the most likely characters, values above
      1 flatten the distribution and give more varied output.

  Returns:
    The sampled vocabulary entry.

  Raises:
    ValueError: if `temperature` is not strictly positive.
  """
  if temperature <= 0:
    raise ValueError(f'temperature must be > 0, got {temperature}')
  # Slice with -1: (rather than -1) to keep a leading axis, because
  # tf.random.categorical expects 2-D logits
  y_probability = deploy_model.predict(tf.constant([text]), verbose=0)[0,-1:]
  rescaled_logits = tf.math.log(y_probability) / temperature
  y_pred = tf.random.categorical(rescaled_logits, num_samples=1)[0,0]
  return text_vec_layer.get_vocabulary()[y_pred.numpy()]

# NOTE: this reuses the name of the greedy generate_text defined earlier in the
# notebook; once this cell has run, generate_text samples instead.
def generate_text(text, n_chars=500, temperature=1):
  """Extend `text` by `n_chars` characters sampled one at a time.

  Args:
    text: initial prompt.
    n_chars: number of characters to append.
    temperature: forwarded to `sample_next_letter`; 1 keeps the model's distribution.

  Returns:
    The prompt followed by the sampled characters.
  """
  for _ in range(n_chars):
    text += sample_next_letter(text, temperature)
  return text

In [21]:
# Generate text from the same prompt, this time sampling each next character
print(generate_text('To be or not to b'))

To be or not to be a subcle,
these the sealchabour, on, and yet a word;
and guess by burnar?

julien:
man: no common mother was seek it shall you die, we by.

romeo:
who come you this was hard to your lives not, none:
he i speak this the earther serving
to fairs me with heaven shall. fearness, my begoistres on him.
ah, welcome dear arm, are not her father;
'tis not's blood of yet, but gentlemen, even now:
and that i breath, talke here; an hose bawd
as it die to please me all of i
hopes! this hate time to heir ou
