In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

In [2]:
# Make runs repeatable: seed both the NumPy and TensorFlow global RNGs.
np.random.seed(0)
tf.random.set_seed(0)

# Matplotlib font sizes: 14 for axis labels, 12 for x/y tick labels.
for rc_group, label_size in (('axes', 14), ('xtick', 12), ('ytick', 12)):
    mpl.rc(rc_group, labelsize=label_size)

# Load the Data

In [3]:
alice_url = "https://raw.githubusercontent.com/grbruns/cst383/master/alice.txt"
# Fetch the text with Keras' download helper; it returns the local file path.
filepath = keras.utils.get_file("alice.txt", alice_url)
# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding (which is not UTF-8 everywhere, e.g. on Windows).
# NOTE: the file starts with a byte-order mark, which is kept as the first
# character of alice_text; use encoding="utf-8-sig" if it should be stripped.
with open(filepath, encoding="utf-8") as f:
    alice_text = f.read()

Downloading data from https://raw.githubusercontent.com/grbruns/cst383/master/alice.txt


In [4]:
# Peek at the first 150 characters to confirm the text loaded correctly.
print(alice_text[:150])

﻿
ALICE'S ADVENTURES IN WONDERLAND

Lewis Carroll

CHAPTER I. Down the Rabbit-Hole

Alice was beginning to get very tired of sitting by her sister on 


In [5]:
# Character-level tokenizer: every distinct character gets its own integer id.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
# Passing the raw string means the tokenizer iterates over it character by
# character when building its vocabulary.
tokenizer.fit_on_texts(alice_text)

In [6]:
# Sanity check: encode a sample word into its per-character ids.
tokenizer.texts_to_sequences(["First"])

[[22, 6, 10, 9, 3]]

In [7]:
# Sanity check: decode the ids back; characters come back space-separated.
tokenizer.sequences_to_texts([[22, 6, 10, 9, 3]])

['f i r s t']

In [8]:
# Vocabulary size: how many distinct characters the tokenizer has seen.
max_id = len(tokenizer.word_index)
# The tokenizer was fitted on a plain string, so its document count is the
# total number of characters in the text.
dataset_size = tokenizer.document_count
print(max_id, dataset_size, sep="\n")

44
144395


In [9]:
# Encode the full text as a single id sequence; subtracting 1 makes the ids
# start at 0 instead of 1.
encoded_batch = np.array(tokenizer.texts_to_sequences([alice_text])) - 1
[encoded] = encoded_batch

# The first 90% of the characters form the training portion.
train_size = (dataset_size * 90) // 100
train_ids = encoded[:train_size]
dataset = tf.data.Dataset.from_tensor_slices(train_ids)

In [10]:
n_steps = 100
# One extra character per window so the target can be the input shifted by one.
window_length = n_steps + 1
dataset = dataset.repeat()
dataset = dataset.window(window_length, shift=1, drop_remainder=True)

In [11]:
# Batch each window to its full length so it becomes a single element,
# and flatten the results back into one dataset.
dataset = dataset.flat_map(lambda window: window.batch(window_length))

In [12]:
batch_size = 128
dataset = dataset.shuffle(10000)
dataset = dataset.batch(batch_size)
# Inputs are every character but the last; targets are every character but
# the first (i.e. the inputs shifted one position ahead).
dataset = dataset.map(lambda batch: (batch[:, :-1], batch[:, 1:]))

In [13]:
def _one_hot_inputs(X_batch, Y_batch):
  """One-hot encode the input ids; leave the target ids as integers."""
  return tf.one_hot(X_batch, depth=max_id), Y_batch

dataset = dataset.map(_one_hot_inputs)

In [14]:
# Prefetch one batch ahead of the consumer.
dataset = dataset.prefetch(1)

In [15]:
# Pull a single batch and confirm the input/target shapes.
X_batch, Y_batch = next(iter(dataset.take(1)))
print(X_batch.shape, Y_batch.shape)

(128, 100, 44) (128, 100)


# Creating and Training the Model

In [16]:
# Settings shared by both recurrent layers.
gru_options = dict(return_sequences=True, dropout=0.2, recurrent_dropout=0.2)

# Two stacked GRU layers followed by a per-time-step softmax over characters.
model = keras.models.Sequential([
    keras.layers.GRU(128, input_shape=[None, max_id], **gru_options),
    keras.layers.GRU(128, **gru_options),
    keras.layers.TimeDistributed(
        keras.layers.Dense(max_id, activation='softmax')),
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# The dataset repeats forever, so the epoch length has to be given explicitly.
steps_per_epoch = train_size // batch_size
history = model.fit(dataset, epochs=3, steps_per_epoch=steps_per_epoch)

Epoch 1/3
Epoch 2/3
Epoch 3/3


# Using the Model to Generate Text

With this function, we can preprocess text so that it can be fed to the model.

In [17]:
def preprocess(texts):
  """Turn a list of strings into one-hot model inputs.

  Each string is encoded with the global `tokenizer`, the ids are shifted
  down by 1 (matching the training data), and the result is one-hot encoded
  with depth `max_id`.
  """
  token_ids = tokenizer.texts_to_sequences(texts)
  zero_based_ids = np.array(token_ids) - 1
  return tf.one_hot(zero_based_ids, max_id)

Here we can use the model to try and predict the next letter.

In [18]:
X_new = preprocess(['How are yo'])
# Most likely character id at every time step.
Y_proba = model.predict(X_new)
Y_pred = Y_proba.argmax(axis=-1)
# Shift ids back up by 1 before decoding; take the first sentence's last character.
tokenizer.sequences_to_texts(Y_pred + 1)[0][-1]

'u'

With the `next_char` and `complete_text` functions, we can generate text that resembles the Alice text.

In [19]:
def next_char(text, temperature=1):
  """Sample the character that follows `text` using the current global model.

  Args:
    text: prompt string; it is encoded with `preprocess`.
    temperature: positive number that divides the log-probabilities before
      sampling. Values below 1 sharpen the distribution, values above 1
      flatten it.

  Returns:
    The sampled character, decoded by the global `tokenizer`.

  Raises:
    ValueError: if `temperature` is not strictly positive (dividing by zero
      or a negative number would produce meaningless logits).
  """
  if not temperature > 0:
    raise ValueError(
        "temperature must be a positive number, got {!r}".format(temperature))
  X_new = preprocess([text])
  # Keep only the last time step; the `-1:` slice preserves a 2-D shape,
  # which is what tf.random.categorical expects.
  # verbose=0 avoids printing a progress bar for every generated character.
  y_proba = model.predict(X_new, verbose=0)[0, -1:, :]
  rescaled_logits = tf.math.log(y_proba) / temperature
  # Ids were shifted down by 1 in preprocess; shift back up before decoding.
  char_id = tf.random.categorical(rescaled_logits, num_samples=1) + 1
  return tokenizer.sequences_to_texts(char_id.numpy())[0]

In [20]:
# Sample a single next character for the prompt.
next_char('how are yo', temperature=1)

'u'

In [21]:
def complete_text(text, n_chars=50, temperature=1):
  """Extend `text` by `n_chars` sampled characters and return the result.

  Characters are generated one at a time with `next_char`; each new
  character is appended to the prompt before the next one is sampled.
  """
  extended = text
  for _ in range(n_chars):
    following = next_char(extended, temperature)
    extended = extended + following
  return extended

# Creative Outputs 

The model generates some interesting text; however, it does not make much sense.

In [22]:
# Generate 50 characters (the default n_chars) after the prompt.
print(complete_text('hello', temperature=1))

hellow.'

'oh, and ale then, out to even it,' said the 


In [23]:
# Same generation with a different prompt.
print(complete_text('there', temperature=1))

there' alice squeaked alice dansiday: 'i never followin


In [24]:
# A multi-word prompt.
print(complete_text('the hole', temperature=1))

the holent.

alice was no moment her like of the opters, a


# Experimenting with the model

The model from the Geron text uses only two GRU layers. I want to see how well the model performs when it is trained on only a subset of the Alice text. This model will use only the first half of the text, for faster training and faster tweaking. I will also lower the number of steps to see if we can improve the output. For this experiment, I will change the optimizer to RMSProp as well.

In [25]:
# Keep only the first half of the text for quicker experiments.
midpoint = len(alice_text) // 2
alice_shorter_text = alice_text[:midpoint]

# Fresh character-level tokenizer fitted on the shortened text only.
# NOTE: this replaces the global `tokenizer` used by preprocess/next_char.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(alice_shorter_text)

In [26]:
# Vocabulary size of the shortened text.
max_id = len(tokenizer.word_index)
# Total number of characters the new tokenizer was fitted on.
dataset_size = tokenizer.document_count
print(max_id, dataset_size, sep="\n")

42
72197


In [27]:
# Hyperparameters for the reduced experiment.
n_steps = 50
window_length = n_steps + 1  # target = input shifted 1 character ahead
batch_size = 64

# Encode the shortened text (ids shifted to start at 0) and keep the first 90%.
[encoded] = np.array(tokenizer.texts_to_sequences([alice_shorter_text])) - 1
train_size = dataset_size * 90 // 100

# Same pipeline as before, written as one chain:
# windows -> tensors -> shuffled batches -> (input, target) -> one-hot inputs.
dataset = (
    tf.data.Dataset.from_tensor_slices(encoded[:train_size])
    .repeat()
    .window(window_length, shift=1, drop_remainder=True)
    .flat_map(lambda window: window.batch(window_length))
    .shuffle(10000)
    .batch(batch_size)
    .map(lambda windows: (windows[:, :-1], windows[:, 1:]))
    .map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))
    .prefetch(1)
)

# Confirm the shapes of one batch.
X_batch, Y_batch = next(iter(dataset.take(1)))
print(X_batch.shape, Y_batch.shape)

(64, 50, 42) (64, 50)


In [28]:
# Reset Keras' global state before building a new model.
keras.backend.clear_session()

# Same two-layer GRU architecture as the first model.
gru_options = dict(return_sequences=True, dropout=0.2, recurrent_dropout=0.2)
model = keras.models.Sequential([
    keras.layers.GRU(128, input_shape=[None, max_id], **gru_options),
    keras.layers.GRU(128, **gru_options),
    keras.layers.TimeDistributed(
        keras.layers.Dense(max_id, activation='softmax')),
])

# This experiment swaps the optimizer for RMSProp.
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')
steps_per_epoch = train_size // batch_size
history = model.fit(dataset, epochs=3, steps_per_epoch=steps_per_epoch)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [29]:
# next_char reads the global `model`, so this samples from the model trained just above.
next_char('hol', temperature=1)

'd'

In [30]:
# Generate from a single-character prompt.
print(complete_text('h', temperature=1))

he e"cid a was glows here, alice, 'intwif, was she 


An interesting aspect of the model from the Geron text is that it uses dropout and recurrent dropout. I wonder how the model will perform without these hyperparameters. We can see that without dropout, training is a lot faster, since the model can use the cuDNN kernels.

In [31]:
# Reset Keras' global state before building a new model.
keras.backend.clear_session()

# Four stacked LSTM layers, none of them using dropout; only the first one
# declares the input shape.
lstm_stack = [
    keras.layers.LSTM(128, return_sequences=True, input_shape=[None, max_id])
]
lstm_stack += [
    keras.layers.LSTM(128, return_sequences=True) for _ in range(3)
]
char_output = keras.layers.TimeDistributed(
    keras.layers.Dense(max_id, activation='softmax'))

model = keras.models.Sequential(lstm_stack + [char_output])

In [32]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
# The dataset repeats forever, so the epoch length has to be given explicitly.
steps_per_epoch = train_size // batch_size
history = model.fit(dataset, epochs=5, steps_per_epoch=steps_per_epoch)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Using LSTM instead of GRU seems to generate interesting text.

In [33]:
# Sample one character from the LSTM model (the current global `model`).
next_char('hol', temperature=1)

'e'

In [34]:
# Generate 50 characters from the same prompt.
print(complete_text('hol', temperature=1))

hold
shusks bat unce! the rawt atcile withto dich on 


Now I want to see how the GRU model performs without dropout and recurrent dropout. Training is much faster, since the model meets the cuDNN kernel requirements.

In [35]:
# Reset Keras' global state before building a new model.
keras.backend.clear_session()

# Two GRU layers again, this time without dropout or recurrent dropout.
input_gru = keras.layers.GRU(
    128, return_sequences=True, input_shape=[None, max_id])
hidden_gru = keras.layers.GRU(128, return_sequences=True)
char_output = keras.layers.TimeDistributed(
    keras.layers.Dense(max_id, activation='softmax'))

model = keras.models.Sequential([input_gru, hidden_gru, char_output])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
steps_per_epoch = train_size // batch_size
history = model.fit(dataset, epochs=10, steps_per_epoch=steps_per_epoch)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [36]:
# Sample one character from the dropout-free GRU model.
next_char('nam', temperature=1)

'e'

In [37]:
# Generate text from a prompt that ends with a space.
print(complete_text('i am a ', temperature=1))

i am a dear little she
was moving them about as she could


In this last experiment, I will combine both layer types (GRU and LSTM) in one model and see how it performs at predicting and generating text.

In [38]:
# Reset Keras' global state before building a new model.
keras.backend.clear_session()

# Three GRU -> LSTM pairs stacked on top of each other; only the very first
# layer declares the input shape.
recurrent_stack = [
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id]),
    keras.layers.LSTM(128, return_sequences=True),
]
for _pair in range(2):
    recurrent_stack.append(keras.layers.GRU(128, return_sequences=True))
    recurrent_stack.append(keras.layers.LSTM(128, return_sequences=True))

char_output = keras.layers.TimeDistributed(
    keras.layers.Dense(max_id, activation='softmax'))

model = keras.models.Sequential(recurrent_stack + [char_output])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
steps_per_epoch = train_size // batch_size
history = model.fit(dataset, epochs=10, steps_per_epoch=steps_per_epoch)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [41]:
# Generate text with the mixed GRU/LSTM model.
print(complete_text('pa', temperature=1))

pair, but at the
hoar to come in amring her nint, an


In [43]:
# Another sample from the mixed GRU/LSTM model.
print(complete_text('she', temperature=1))

shes with in her high) at the sides of
west her head 


# Paragraph about Lewis Carroll

Lewis Carroll was an English novelist and poet. He was also a lecturer in mathematics at Oxford. He is best known as the author of the children's book Alice's Adventures in Wonderland and its sequel, Through the Looking-Glass. Besides being a mathematics lecturer, he was also an avid photographer and wrote essays, political pamphlets, and poetry.