This is an NLP modeling exercise (**Char-RNN**) that follows the instructions in [*Hands-on Machine Learning with Scikit-Learn, Keras, and TensorFlow*](https://www.oreilly.com/library/view/hands-on-machine-learning/9781492032632/), from the chapter **Natural Language Processing with RNNs and Attention**.

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

# Create the training dataset

## Download Shakespeare's work

In [3]:
# Fetch the Tiny Shakespeare corpus; get_file returns the path of the local copy
shakespeare_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
filepath = tf.keras.utils.get_file('shakespeare.txt', shakespeare_url)
# Decode explicitly as UTF-8: without `encoding`, open() uses the platform's
# locale encoding, so the text read could differ from machine to machine
with open(filepath, encoding='utf-8') as f:
  shakespeare_txt = f.read()

Downloading data from https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt


## Encode every character as an integer

In [4]:
# Build a character-level tokenizer (one ID per character rather than per word)
# and learn its vocabulary from the full corpus
tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(texts=[shakespeare_txt])

In [5]:
# Inspect the learned character -> integer ID mapping (IDs start at 1)
word_index = tokenizer.word_index
word_index

{'\n': 11,
 ' ': 1,
 '!': 31,
 '$': 39,
 '&': 38,
 "'": 28,
 ',': 18,
 '-': 32,
 '.': 27,
 '3': 37,
 ':': 24,
 ';': 29,
 '?': 30,
 'a': 5,
 'b': 22,
 'c': 19,
 'd': 13,
 'e': 2,
 'f': 20,
 'g': 21,
 'h': 7,
 'i': 6,
 'j': 33,
 'k': 25,
 'l': 12,
 'm': 15,
 'n': 10,
 'o': 4,
 'p': 23,
 'q': 34,
 'r': 9,
 's': 8,
 't': 3,
 'u': 14,
 'v': 26,
 'w': 17,
 'x': 35,
 'y': 16,
 'z': 36}

In [6]:
# Get the number of unique character IDs
# (used further down as the one-hot depth and as the size of the output layer)
max_id = len(tokenizer.word_index)
max_id

39

In [7]:
# Test on a sample: every character of the string is mapped to its integer ID
sample = 'First'
encoded_sample = tokenizer.texts_to_sequences([sample])
encoded_sample

[[20, 6, 9, 8, 3]]

In [8]:
# Map the IDs back to text; as the output shows, the result is lower-cased
# and the characters come back separated by spaces
decoded_sample = tokenizer.sequences_to_texts(encoded_sample)
decoded_sample

['f i r s t']

In [9]:
# Encode the whole corpus as a single sequence of character IDs.
# The tokenizer numbers characters from 1, so subtract 1 to get 0-based IDs.
encoded_text = np.array(tokenizer.texts_to_sequences([shakespeare_txt]))[0] - 1
encoded_text

array([19,  5,  8, ..., 20, 26, 10])

## Split sequential dataset

In [10]:
# Train/test split: the first 90% of the character IDs are used for training,
# the remaining 10% for testing (no shuffling, so the text order is kept)
dataset_size = len(encoded_text)
split_size = int(dataset_size * 0.9)
train_dataset, test_dataset = (
    tf.data.Dataset.from_tensor_slices(part)
    for part in (encoded_text[:split_size], encoded_text[split_size:])
)

train_dataset, test_dataset

(<TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int64, name=None)>,
 <TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int64, name=None)>)

## Chop sequential dataset into windows

In [11]:
N_STEPS = 100  # number of input characters per sequence
SHIFT = 1      # used both as the stride between windows and as the extra window length

# Each window holds N_STEPS + SHIFT characters so that inputs and targets can
# later be cut from the same window; incomplete trailing windows are dropped
window_length = N_STEPS + SHIFT
train_dataset, test_dataset = (
    ds.window(window_length, shift=SHIFT, drop_remainder=True)
    for ds in (train_dataset, test_dataset)
)

train_dataset, test_dataset

(<WindowDataset element_spec=DatasetSpec(TensorSpec(shape=(), dtype=tf.int64, name=None), TensorShape([]))>,
 <WindowDataset element_spec=DatasetSpec(TensorSpec(shape=(), dtype=tf.int64, name=None), TensorShape([]))>)

In [12]:
# window() yields a dataset of datasets; batching each inner window by its own
# length turns it into a single tensor, giving a flat dataset of windows
train_dataset, test_dataset = (
    ds.flat_map(lambda window: window.batch(window_length))
    for ds in (train_dataset, test_dataset)
)

In [13]:
# Group the windows into batches (only the training windows are shuffled,
# with a buffer of 10000 windows)
BATCH_SIZE = 32
train_dataset = train_dataset.shuffle(10000).batch(BATCH_SIZE)
test_dataset = test_dataset.batch(BATCH_SIZE)

# Split every window into features (all characters but the last) and
# labels (the same window shifted one character ahead)
train_dataset, test_dataset = (
    ds.map(lambda windows: (windows[:, :-1], windows[:, 1:]))
    for ds in (train_dataset, test_dataset)
)

train_dataset, test_dataset

(<MapDataset element_spec=(TensorSpec(shape=(None, None), dtype=tf.int64, name=None), TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>,
 <MapDataset element_spec=(TensorSpec(shape=(None, None), dtype=tf.int64, name=None), TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>)

In [14]:
# One-hot encode the features only; the labels stay as integer character IDs
train_dataset, test_dataset = (
    ds.map(lambda features, labels: (tf.one_hot(features, depth=max_id), labels))
    for ds in (train_dataset, test_dataset)
)

train_dataset, test_dataset

(<MapDataset element_spec=(TensorSpec(shape=(None, None, 39), dtype=tf.float32, name=None), TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>,
 <MapDataset element_spec=(TensorSpec(shape=(None, None, 39), dtype=tf.float32, name=None), TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>)

In [15]:
# Configure for better performance: prefetch batches, letting tf.data pick
# the buffer size
train_dataset, test_dataset = (
    ds.prefetch(tf.data.AUTOTUNE)
    for ds in (train_dataset, test_dataset)
)

train_dataset, test_dataset

(<PrefetchDataset element_spec=(TensorSpec(shape=(None, None, 39), dtype=tf.float32, name=None), TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>,
 <PrefetchDataset element_spec=(TensorSpec(shape=(None, None, 39), dtype=tf.float32, name=None), TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>)

In [16]:
# Get a view of the processed dataset: pull a single batch and print the
# shapes of its features and labels
for X_batch, Y_batch in train_dataset.take(1):
    print(X_batch.shape, Y_batch.shape)

(32, 100, 39) (32, 100)


In [17]:
# Rough number of batches contained in the held-out 10% of the text
dataset_size / BATCH_SIZE * 0.1

3485.6062500000003

# Build and train Char-RNN model

In [18]:
# Construct model: two stacked GRU layers that return the full sequence,
# followed by a softmax over the max_id characters at every time step
model = tf.keras.Sequential()
model.add(layers.GRU(128,
                     return_sequences=True,
                     input_shape=[None, max_id],
                     dropout=0.2,
                     recurrent_dropout=0.2))
model.add(layers.GRU(128,
                     return_sequences=True,
                     dropout=0.2,
                     recurrent_dropout=0.2))
model.add(layers.TimeDistributed(layers.Dense(max_id, activation='softmax')))

# Compile model (labels are integer character IDs, hence the sparse loss)
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy'],
)

# Fit model
# (due to the large size of the dataset and the limited computation power of my
# hardware, only a small portion of the dataset is used in the fitting process:
# 100 training batches per epoch and 20 validation batches)
history = model.fit(
    train_dataset,
    steps_per_epoch=100,
    epochs=20,
    validation_data=test_dataset,
    validation_steps=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


# Use the trained model to predict

In [19]:
# Define a function to preprocess text
def preprocess(text):
  """One-hot encode text so it can be fed to the model.

  Args:
    text: a list of strings, or a single string (treated as a batch of one).
      All strings must encode to the same number of character IDs so that
      they can be stacked into one rectangular array.

  Returns:
    A float32 tensor of shape (number of texts, number of characters, max_id).
  """
  # A bare string would be iterated character by character and encoded as many
  # one-character texts instead of one text, so wrap it in a list first.
  if isinstance(text, str):
    text = [text]
  # Tokenizer IDs start at 1; shift to the 0-based IDs used for training.
  X = np.array(tokenizer.texts_to_sequences(text)) - 1
  return tf.one_hot(X, max_id)

In [25]:
# Predict the next letter: take the most likely character ID at every time
# step, convert back to 1-based tokenizer IDs, decode, and keep only the
# character predicted after the last input character
X_encoded = preprocess(['How are yo'])
y_pred = model.predict(X_encoded)[0].argmax(axis=1)
y_pred_decoded = tokenizer.sequences_to_texts(y_pred[np.newaxis, :] + 1)[0][-1]
y_pred_decoded

'u'

# Generate fake Shakespeare text

In [28]:
# Class probabilities predicted for the character that follows 'yo':
# first (only) sequence of the batch, last time step, kept 2-D as (1, max_id)
X_encoded = preprocess(['yo'])
y_pred = model.predict(X_encoded)[0][-1:]
y_pred, y_pred.shape

(array([[1.91117469e-02, 8.13456194e-04, 1.72687676e-02, 1.23935929e-02,
         1.57819642e-03, 8.36038380e-04, 3.85815423e-04, 3.13795405e-03,
         1.89732444e-02, 1.70831718e-02, 2.93776277e-03, 7.95162097e-03,
         2.76257959e-03, 8.48160803e-01, 9.57166601e-04, 7.30419415e-04,
         2.47653052e-02, 4.01399238e-03, 4.29374020e-04, 8.80742329e-04,
         2.36371008e-04, 2.94618658e-03, 1.22105563e-03, 1.26672827e-03,
         2.17578723e-04, 7.01064884e-04, 2.44294410e-03, 1.51141966e-03,
         1.18303800e-03, 4.48955223e-04, 1.09984283e-03, 3.99836252e-04,
         7.90655686e-05, 2.86505638e-05, 5.73482284e-05, 7.23620935e-04,
         6.71743619e-05, 1.02050064e-04, 9.52681876e-05]], dtype=float32),
 (1, 39))

In [29]:
# Turn the probabilities into logits and divide by a temperature of 2, then
# draw one class at random; +1 converts back to the tokenizer's 1-based IDs
rescaled_logits = tf.math.log(y_pred) / 2
char_id = tf.random.categorical(logits=rescaled_logits, num_samples=1) + 1
char_id

<tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[17]])>

In [30]:
# Create a function to pick the next character randomly
def next_char(text, temperature):
  """Randomly sample the character that follows `text`.

  Args:
    text: the text so far, as a single string.
    temperature: positive number dividing the log-probabilities before
      sampling. Values below 1 favour the most likely characters; values
      above 1 make the choice more uniform.

  Returns:
    The sampled character as a string.

  Raises:
    ValueError: if `temperature` is not positive.
  """
  # Dividing by zero or a negative temperature gives infinite or inverted
  # logits, so reject it up front instead of sampling garbage.
  if temperature <= 0:
    raise ValueError(f'temperature must be positive, got {temperature}')
  X_encoded = preprocess([text])
  # verbose=0 keeps predict() silent; this function is called once per
  # generated character, so per-call progress output would flood the cell.
  # Keep only the last time step, as a (1, max_id) array of probabilities.
  y_proba = model.predict(X_encoded, verbose=0)[0, -1:, :]
  rescaled_logits = tf.math.log(y_proba) / temperature
  # +1 converts the sampled 0-based class back to a 1-based tokenizer ID.
  char_id = tf.random.categorical(rescaled_logits, num_samples=1) + 1
  return tokenizer.sequences_to_texts(char_id.numpy())[0]

In [31]:
# Write a function to generate characters consecutively
def complete_text(text, n_chars, temperature=1):
  """Extend `text` by `n_chars` characters sampled one at a time.

  Each new character is drawn by next_char from everything generated so far
  (seed text included), using the given temperature. Returns the seed text
  followed by the generated characters.
  """
  generated = text
  for _ in range(n_chars):
    sampled = next_char(generated, temperature)
    generated = generated + sampled
  return generated

In [39]:
# Test the model's predictive power: extend a seed text by 50 characters
# sampled at temperature 1
print(complete_text('She is ugl', n_chars=50, temperature=1))

She is ugle eser to preantly,
a pereifure an canure number f


In [40]:
# Higher temperature (2): sampling is more random
print(complete_text('A pig is flying w', n_chars=50, temperature=2))

A pig is flying witjenkat; quusproces and veigre.

all cmxic? you,



In [42]:
# Lower temperature (0.5): sampling sticks to the most likely characters
print(complete_text('A pig is flying w', n_chars=50, temperature=0.5))

A pig is flying were such of the people,
the people, the people, an
