# Hello RNN next character predictor.

A character-level RNN trained on Shakespeare's works to predict the next character in a sentence.

In [1]:
# Prerequisites
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Report interpreter and library versions so the recorded results can be reproduced.
for label, version in (
    ("Python", sys.version),
    ("Numpy", np.__version__),
    ("Pandas", pd.__version__),
    ("TensorFlow", tf.__version__),
):
    print(f"{label} Version: ", version)

Python Version:  3.12.7 (tags/v3.12.7:0b05ead, Oct  1 2024, 03:06:41) [MSC v.1941 64 bit (AMD64)]
Numpy Version:  2.0.2
Pandas Version:  2.2.3
TensorFlow Version:  2.18.0


### Get Data (Shakespeare's works)

In [2]:
input_data_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
# get_file downloads the corpus and returns the path of the local copy.
filepath = keras.utils.get_file("shakespeare.txt", input_data_url)
# Pass the encoding explicitly: without it open() falls back to the platform
# default (e.g. cp1252 on Windows), which makes the read machine-dependent.
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

In [3]:
print("Length of text:" , len(shakespeare_text))
# sep="" keeps print() from inserting a space after the newline, which would
# otherwise show up as a stray leading space in front of the excerpt.
print("Begins with:\n", shakespeare_text[:200], sep="")

Length of text: 1115394
Begins with:
 First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


### Convert to lowercase and Encode characters 

In [5]:
# Distinct characters of the lower-cased corpus, in sorted order
chars = "".join(sorted({ch for ch in shakespeare_text.lower()}))
print(f"Characters:  {chars}")
print(f"Number of characters:  {len(chars)}")

Characters:  
 !$&',-.3:;?abcdefghijklmnopqrstuvwxyz
Number of characters:  39


In [6]:
# Character-level tokenizer: lower-cases the text and maps every character to an integer ID
text_vec_layer = keras.layers.TextVectorization(standardize="lower", split="character")
text_vec_layer.adapt([shakespeare_text])  # learn the vocabulary from the corpus
encoded = text_vec_layer([shakespeare_text])[0]  # one document in, so take row 0

In [7]:
# Drop padding (0) and unknown (1) tokens: shift the IDs down by 2 so that
# character IDs start at 0.
# The shifted IDs are derived from the layer output rather than with
# `encoded -= 2`, so re-running this cell does not subtract 2 a second time.
encoded = text_vec_layer([shakespeare_text])[0] - 2
# Number of tokens
nr_tokens = text_vec_layer.vocabulary_size() - 2
print("Number of tokens: ", nr_tokens )
ds_size = len(encoded)
print("Dataset size: ", ds_size )


Number of tokens:  39
Dataset size:  1115394


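Helper function that converts a sequence of character IDs into a dataset of batched (input, target) windows, where each target is its input shifted one character ahead.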

In [8]:
def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32,
               shuffle_buffer_size=100_000):
    """Turn a sequence of token IDs into batched (inputs, targets) windows.

    Every window covers `length + 1` consecutive IDs and consecutive windows
    start one ID apart. Within a window the inputs are the first `length` IDs
    and the targets are the same IDs shifted one step ahead, so the target at
    each position is the next ID in the sequence.

    Args:
        sequence: 1-D tensor or array of token IDs.
        length: number of IDs in each input (and target) sequence.
        shuffle: if True, shuffle the windows before batching.
        seed: random seed passed to the shuffle step.
        batch_size: number of windows per batch; the last batch may be smaller.
        shuffle_buffer_size: size of the shuffle buffer, used only when
            `shuffle` is True. Defaults to the previously hard-coded 100_000.

    Returns:
        A `tf.data.Dataset` yielding `(inputs, targets)` pairs, each of shape
        `(batch, length)`.
    """
    window_size = length + 1  # one extra ID so the targets can be shifted by one
    ds = tf.data.Dataset.from_tensor_slices(sequence)
    ds = ds.window(window_size, shift=1, drop_remainder=True)
    # window() yields nested datasets; flatten each one into a single tensor.
    ds = ds.flat_map(lambda window_ds: window_ds.batch(window_size))
    if shuffle:
        ds = ds.shuffle(shuffle_buffer_size, seed=seed)
    ds = ds.batch(batch_size)
    return ds.map(lambda window: (window[:, :-1], window[:, 1:])).prefetch(1)

In [9]:
# Try to_dataset() on a tiny input: "To be" is 5 characters, so with length=4
# there is exactly one window and therefore one (inputs, targets) pair.
sample_ids = text_vec_layer(["To be"])[0]
ds_sample = list(to_dataset(sample_ids, length=4))
print(ds_sample)

[(<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[ 4,  5,  2, 23]])>, <tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[ 5,  2, 23,  3]])>)]


### Split into test, validation, and training sets

In [10]:
length = 100  # number of characters in each input sequence
# Split boundaries (indices into `encoded`). Naming them once keeps the three
# slices contiguous and non-overlapping: roughly 90% train, 5% validation and
# 5% test of the 1,115,394-character corpus.
train_end = 1_000_000
val_end = 1_060_000
tf.random.set_seed(42)
ds_train = to_dataset(encoded[:train_end], length=length, shuffle=True, seed=42)
ds_val = to_dataset(encoded[train_end:val_end], length=length)
ds_test = to_dataset(encoded[val_end:], length=length)

### Build and train the model

NOTE: A GPU is needed to train this model in a reasonable time.

In [None]:
tf.random.set_seed(42)  # fixed seed for reproducible weight initialisation

# Character-level model: embedding -> GRU -> softmax over the tokens at every time step
model = keras.Sequential()
model.add(keras.layers.Embedding(input_dim=nr_tokens, output_dim=16))  # embed the character IDs
model.add(keras.layers.GRU(128, return_sequences=True))
model.add(keras.layers.Dense(nr_tokens, activation="softmax"))
model.compile(
    optimizer="nadam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Keep only the checkpoint with the best validation accuracy seen so far
cb_model_ckpt = keras.callbacks.ModelCheckpoint(
    filepath="my_shakespeare_model.keras",
    monitor="val_accuracy",
    save_best_only=True,
)
history = model.fit(
    ds_train,
    epochs=10,
    validation_data=ds_val,
    callbacks=[cb_model_ckpt],
)

### Predict

In [None]:
# Wrap the model with its preprocessing: raw strings in, next-character probabilities out
model_shakespeare = keras.Sequential(
    layers=[
        text_vec_layer,                            # characters -> integer IDs
        keras.layers.Lambda(lambda ids: ids - 2),  # re-base IDs past the <PAD> and <UNK> slots
        model,                                     # the character model built above
    ]
)

In [None]:
# Pass the prompt as a string tensor: a bare Python list of strings is not an
# accepted `predict` input under Keras 3 (bundled with TF >= 2.16), while a
# tensor works with both Keras 2 and Keras 3.
y_proba = model_shakespeare.predict(tf.constant(["To be or not to b"]))[0, -1]
y_pred = tf.argmax(y_proba)  # Pick the most probable character
# +2 undoes the ID shift so the index lines up with the layer's vocabulary
text_vec_layer.get_vocabulary()[y_pred + 2]