In [2]:
import tensorflow as tf


In [4]:
shakespeare_url = "https://homl.info/shakespeare"
filepath = tf.keras.utils.get_file("shakespeare.txt", shakespeare_url)
with open(filepath) as f:
  shakespeare_text = f.read()

Downloading data from https://homl.info/shakespeare
[1m1115394/1115394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [6]:
print(shakespeare_text[:500])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [8]:
# text to vector conversion for encoding
text_vec_layer = tf.keras.layers.TextVectorization(split="character", standardize="lower")
text_vec_layer.adapt([shakespeare_text])
encoded = text_vec_layer([shakespeare_text])[0]

In [17]:
# removing token 0 and 1
encoded -= 2  # droping tokens 0(padding) & 1(unknown chars), which we will not use
n_tokens = text_vec_layer.vocabulary_size() - 2 #number of distinct chars = 39
dataset_size = len(encoded)
print(dataset_size)
print(len(sorted(list(set(shakespeare_text)))))

1115394
65


In [24]:
#converting a long sequence of character IDs into a dataset of input/target window pairs:
def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):
  ds = tf.data.Dataset.from_tensor_slices(sequence)
  ds = ds.window(length+1, shift=1, drop_remainder=True)
  ds = ds.flat_map(lambda window_ds: window_ds.batch(length + 1))
  if shuffle:
    ds = ds.shuffle(buffer_size=100_000, seed=seed)
  ds = ds.batch(batch_size)
  return ds.map(lambda window: (window[:, :-1], window[:, 1:])).prefetch(1)

In [25]:
# Creating Training, test & Validation Set
length = 100
tf.random.set_seed(42)
train_set = to_dataset(encoded[:1_00_000], length=length, shuffle=True, seed=42)
valid_set = to_dataset(encoded[1_00_000:1_060_000], length=length)
test_set = to_dataset(encoded[1_060_000:], length=length)

In [28]:
# Buildin and Training the Char-RNN Model
model = tf.keras.Sequential([tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16),
                             tf.keras.layers.GRU(128, return_sequences=True),
                             tf.keras.layers.Dense(n_tokens, activation="softmax")])
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam", metrics=["accuracy"])
model_ckpt = tf.keras.callbacks.ModelCheckpoint(
    "my_shakespeare_model", monitor="val_accuracy", save_best_only=True
)
history = model.fit(train_set, validation_data=valid_set, epochs=10, callbacks=[model_ckpt])

ValueError: The filepath provided must end in `.keras` (Keras model format). Received: filepath=my_shakespeare_model