In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import sklearn

# Data Pipeline

In [2]:
shakespeare_url = "https://homl.info/shakespeare"
filepath = tf.keras.utils.get_file('shakespeare.txt', shakespeare_url)
with open(filepath) as f:
    shakespeare_text = f.read()

In [3]:
print(shakespeare_text[:80])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.


## Encoding

In [4]:
text_vec_layer = tf.keras.layers.TextVectorization(split = 'character',
                                                   standardize = 'lower')
text_vec_layer.adapt([shakespeare_text])
encoded = text_vec_layer([shakespeare_text])[0]

In [5]:
encoded

<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([21,  7, 10, ..., 22, 28, 12], dtype=int64)>

In [6]:
encoded -= 2

In [7]:
n_tokens = text_vec_layer.vocabulary_size()
dataset_size = len(encoded)

In [8]:
dataset_size

1115394

In [9]:
from Utility import to_dataset
length = 100 
tf.random.set_seed(42)
train_set = to_dataset(encoded[:1000000], length = length, shuffle=True, seed = 42)
valid_set = to_dataset(encoded[1000000: 1060000], length = length)
test_set = to_dataset(encoded[1060000:], length = length)

## Model

In [10]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim = n_tokens, output_dim = 16),
    tf.keras.layers.GRU(128, return_sequences= True),
    tf.keras.layers.Dense(n_tokens, activation = 'softmax')
])

model.compile(loss = tf.keras.losses.SparseCategoricalCrossentropy(),
              optimizer = tf.keras.optimizers.Nadam(),
              metrics = ['accuracy'])

In [11]:
history = model.fit(train_set , validation_data= valid_set, epochs = 10,
                    callbacks = tf.keras.callbacks.ModelCheckpoint('shakspeare_model.keras', monitor = 'val_accuracy', save_best_only= True))

Epoch 1/10
  31247/Unknown [1m1324s[0m 42ms/step - accuracy: 0.5433 - loss: 1.5126

  self.gen.throw(typ, value, traceback)


[1m31247/31247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1348s[0m 43ms/step - accuracy: 0.5433 - loss: 1.5126 - val_accuracy: 0.5347 - val_loss: 1.5937
Epoch 2/10
[1m31247/31247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1295s[0m 41ms/step - accuracy: 0.5989 - loss: 1.2867 - val_accuracy: 0.5439 - val_loss: 1.5755
Epoch 3/10
[1m31247/31247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1317s[0m 42ms/step - accuracy: 0.6036 - loss: 1.2661 - val_accuracy: 0.5458 - val_loss: 1.5620
Epoch 4/10
[1m31247/31247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1445s[0m 46ms/step - accuracy: 0.6057 - loss: 1.2564 - val_accuracy: 0.5463 - val_loss: 1.5583
Epoch 5/10
[1m31247/31247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1443s[0m 46ms/step - accuracy: 0.6074 - loss: 1.2497 - val_accuracy: 0.5471 - val_loss: 1.5584
Epoch 6/10
[1m31247/31247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1420s[0m 45ms/step - accuracy: 0.6083 - loss: 1.2452 - val_accuracy: 0.5477 - val_loss