Import our dependencies

In [None]:
import tensorflow as tf
tf.keras.backend.clear_session()
import numpy as np
import os
import time

Download the data

In [None]:
path_to_file = tf.keras.utils.get_file('gatsby.txt', 'https://www.gutenberg.org/cache/epub/64317/pg64317.txt')

**Read the data. In this ICP we are using the book 'The Great Gatsby' as our dataset.**

Our data had some legal information attached to it, so we decided to slice that part off. 

First, explore the text by looking at the length.

In [None]:
# Read, then decode for compatability
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')

# Slice text to get rid of all legal disclosures, we only want the raw text from the book
text = text[908:-18762]

# length of text is the number of characters in it
print ('Length of text: {} characters'.format(len(text)))

Length of text: 277003 characters


In [None]:
# Take a look at the first 1000 characters in text
print(text[:1000])

			   The Great Gatsby
				  by
			 F. Scott Fitzgerald


                           Table of Contents

I
II
III
IV
V
VI
VII
VIII
IX


                              Once again
                                  to
                                 Zelda

  Then wear the gold hat, if that will move her;
  If you can bounce high, bounce for her too,
  Till she cry “Lover, gold-hatted, high-bouncing lover,
  I must have you!”

  Thomas Parke d’Invilliers


                                  I

In my younger and more vulnerable years my father gave me some advice
that I’ve been turning over in my mind ever since.

“Whenever you feel like criticizing anyone,” he told me, “just
remember that all the people in this world haven’t had the advantages
that you’ve had.”

He didn’t say any more, but we’ve always been unusually communicative
in a reserved way, and I understood that he meant a great deal more
than that. In consequence, I’m inclined to reserve all

**Data Exploration**

Lets group the text by character to see the frequency distribution of each character

This is the frequency distribution for the raw data. We use the Counter module to aggregate the characters into buckets. We can then make decisions about preprocessing the data.

In [None]:
from collections import Counter                                                
  
# using collections.Counter() to get 
# count of each element in string 
res = Counter(text)
  
# printing result 
print ("Count of all characters in Gatsby is :\n " +  str(res))

Count of all characters in Gatsby is :
 Counter({' ': 44191, 'e': 25006, 't': 18096, 'a': 16841, 'o': 15738, 'n': 14063, 'i': 12528, 's': 12367, 'h': 12237, 'r': 11339, 'd': 9609, 'l': 8173, '\r': 6399, '\n': 6398, 'u': 5833, 'm': 5205, 'w': 4950, 'g': 4520, 'c': 4387, 'y': 4367, 'f': 4110, '.': 3107, 'p': 2993, 'b': 2975, ',': 2967, '-': 2047, 'k': 1927, 'v': 1883, 'I': 1671, '“': 1457, '”': 1455, '’': 1346, 'T': 690, 'W': 498, 'H': 421, '—': 417, 'G': 376, '?': 328, 'S': 324, 'A': 320, 'M': 314, 'D': 307, 'x': 288, 'B': 223, 'j': 203, 'Y': 181, 'C': 171, 'N': 163, 'q': 156, 'z': 145, 'O': 142, 'J': 142, '!': 124, 'E': 122, 'F': 97, 'L': 96, ':': 81, ';': 74, 'P': 65, '…': 54, 'R': 40, 'K': 37, 'V': 29, '‘': 25, '0': 22, 'é': 17, '1': 16, '\u200a': 12, '9': 11, '\t': 10, '5': 9, 'U': 9, '(': 7, ')': 7, '3': 7, '*': 6, '6': 5, 'Q': 4, '2': 4, '8': 3, 'X': 2, '7': 2, '4': 2, '[': 2, ']': 2, '$': 2, 'Z': 1, 'ô': 1, 'ê': 1, 'ç': 1})


We have decided to remove some characters and replace others. 

This reduction of data will hopefully help the model to learn faster.

In [None]:
# Use replace() to remove characters and replace with another
filtered_text = text.replace('\r', '\n').replace('\u200a', ' ').replace('é', 'e').replace('[', '(').replace(']', ')').replace('$', '').replace('ô','o').replace('ê','e').replace('ç','c')

# using collections.Counter() to get 
# count of each element in string 
char_freq = Counter(filtered_text)
  
# printing result 
# we now have removed some of the outliers
print ("Count of filtered characters in Gatsby is :\n " +  str(char_freq))

Count of filtered characters in Gatsby is :
 Counter({' ': 44203, 'e': 25024, 't': 18096, 'a': 16841, 'o': 15739, 'n': 14063, '\n': 12797, 'i': 12528, 's': 12367, 'h': 12237, 'r': 11339, 'd': 9609, 'l': 8173, 'u': 5833, 'm': 5205, 'w': 4950, 'g': 4520, 'c': 4388, 'y': 4367, 'f': 4110, '.': 3107, 'p': 2993, 'b': 2975, ',': 2967, '-': 2047, 'k': 1927, 'v': 1883, 'I': 1671, '“': 1457, '”': 1455, '’': 1346, 'T': 690, 'W': 498, 'H': 421, '—': 417, 'G': 376, '?': 328, 'S': 324, 'A': 320, 'M': 314, 'D': 307, 'x': 288, 'B': 223, 'j': 203, 'Y': 181, 'C': 171, 'N': 163, 'q': 156, 'z': 145, 'O': 142, 'J': 142, '!': 124, 'E': 122, 'F': 97, 'L': 96, ':': 81, ';': 74, 'P': 65, '…': 54, 'R': 40, 'K': 37, 'V': 29, '‘': 25, '0': 22, '1': 16, '9': 11, '\t': 10, '5': 9, '(': 9, ')': 9, 'U': 9, '3': 7, '*': 6, '6': 5, 'Q': 4, '2': 4, '8': 3, 'X': 2, '7': 2, '4': 2, 'Z': 1})


By replacing characters in the original text, we have reduced the number of unique characters from 90 to 81.

In [None]:
# The unique characters in the file

#Instead of having the vocab letters in an arbitrary format, we can sort the letters based on frequency
#For example, 'Z' has the lowest frequency of all letters and will get assigned the index of 0

#Hopefully this will have an impact on the learning
vocab = sorted(char_freq, key=char_freq.get, reverse=False)
print ('{} unique characters'.format(len(vocab)))
#Heres the list of all characters that appear in the text
print(vocab)                                                                  

81 unique characters
['Z', 'X', '7', '4', '8', 'Q', '2', '6', '*', '3', '5', '(', ')', 'U', '\t', '9', '1', '0', '‘', 'V', 'K', 'R', '…', 'P', ';', ':', 'L', 'F', 'E', '!', 'O', 'J', 'z', 'q', 'N', 'C', 'Y', 'j', 'B', 'x', 'D', 'M', 'A', 'S', '?', 'G', '—', 'H', 'W', 'T', '’', '”', '“', 'I', 'v', 'k', '-', ',', 'b', 'p', '.', 'f', 'y', 'c', 'g', 'w', 'm', 'u', 'l', 'd', 'r', 'h', 's', 'i', '\n', 'n', 'o', 'a', 't', 'e', ' ']


## Process the text

Vectorize the text

Before training, we need to map strings to a numerical representation. Create two lookup tables: one mapping characters to numbers, and another for numbers to characters.

In [None]:
# Creating a mapping from unique characters to indices
#Create a dictionary, i tracks the index of the char, u tracks the char
char2idx = dict((u,i) for i, u in enumerate(vocab))

#can access a char based on index
idx2char = np.array(vocab)

text_as_int = np.array([char2idx[c] for c in filtered_text])
#the corpus has been turned into an index for each char
print(text_as_int[500:600])

print(repr(''.join(idx2char[text_as_int[500:600]])))                                     #edit me

[80 80 80 80 80 80 80 80 80 80 80 80 80 80 80 80 80 80 80 80 53 74 74 74
 74 53 75 80 66 62 80 62 76 67 75 64 79 70 80 77 75 69 80 66 76 70 79 80
 54 67 68 75 79 70 77 58 68 79 80 62 79 77 70 72 80 66 62 80 61 77 78 71
 79 70 80 64 77 54 79 80 66 79 80 72 76 66 79 80 77 69 54 73 63 79 74 74
 78 71 77 78]
'                    I\n\n\n\nIn my younger and more vulnerable years my father gave me some advice\n\nthat'


The prediction task

Given a character, or a sequence of characters, what is the most probable next character? This is the task we're training the model to perform. The input to the model will be a sequence of characters, and we train the model to predict the output—the following character at each time step.

Since RNNs maintain an internal state that depends on the previously seen elements, given all the characters computed until this moment, what is the next character?

Create training examples and targets

Next divide the text into example sequences. Each input sequence will contain seq_length characters from the text.

For each input sequence, the corresponding targets contain the same length of text, except shifted one character to the right.

So break the text into chunks of seq_length+1. For example, say seq_length is 4 and our text is "Hello". The input sequence would be "Hell", and the target sequence "ello".

To do this first use the tf.data.Dataset.from_tensor_slices function to convert the text vector into a stream of character indices.

**We have changed the sequence length**

We hope that by changing the sequence length to be longer, the LSTM will have more information to learn from.

Doing this increases the number of time steps in which the model has to make a prediction.

In [None]:
# The maximum length sentence we want for a single input in characters
seq_length = 200                                                               

examples_per_epoch = len(text)//(seq_length+1)

# Create training examples / targets
# we use tensorflow dataset because it is good for streaming data
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

print(type(char_dataset))

#take() displays first x members of the tensor DataSet. For each element, we can get the data by calling .numpy() method
# we can then enter i into our lookup array to get the resultant character
for i in char_dataset.take(5):
  print(idx2char[i.numpy()])
  print(i)

<class 'tensorflow.python.data.ops.dataset_ops.TensorSliceDataset'>
	
tf.Tensor(14, shape=(), dtype=int64)
	
tf.Tensor(14, shape=(), dtype=int64)
	
tf.Tensor(14, shape=(), dtype=int64)
 
tf.Tensor(80, shape=(), dtype=int64)
 
tf.Tensor(80, shape=(), dtype=int64)


The batch method lets us easily convert these individual characters to sequences of the desired size.

In [None]:
#group the digits into strings of length 101
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

#Heres an example of changing a sequence of numbers back into alphanumeric chars
for item in sequences.take(5):
  print(repr(''.join(idx2char[item.numpy()])))

'\t\t\t   The Great Gatsby\n\n\t\t\t\t  by\n\n\t\t\t F. Scott Fitzgerald\n\n\n\n\n\n                           Table of Contents\n\n\n\nI\n\nII\n\nIII\n\nIV\n\nV\n\nVI\n\nVII\n\nVIII\n\nIX\n\n\n\n\n\n                              Once again\n\n      '
'                            to\n\n                                 Zelda\n\n\n\n  Then wear the gold hat, if that will move her;\n\n  If you can bounce high, bounce for her too,\n\n  Till she cry “Lover, gold-ha'
'tted, high-bouncing lover,\n\n  I must have you!”\n\n\n\n  Thomas Parke d’Invilliers\n\n\n\n\n\n                                  I\n\n\n\nIn my younger and more vulnerable years my father gave me some advice\n\nthat I’'
've been turning over in my mind ever since.\n\n\n\n“Whenever you feel like criticizing anyone,” he told me, “just\n\nremember that all the people in this world haven’t had the advantages\n\nthat you’ve had.”\n\n'
'\n\nHe didn’t say any more, but we’ve always been unusually communicative\n\nin a reserved way, a

For each sequence, duplicate and shift it to form the input and target text by using the map method to apply a simple function to each batch:

In [None]:
#Create a function to split the data
def split_input_target(chunk):
  input_text = chunk[:-1]
  target_text = chunk[1:]
  return input_text, target_text

#Create a mappedDataset object. For each item in this dataset, there is the input or training array and the output or testing array
dataset = sequences.map(split_input_target)

#We can see that the resultant is 2 numpy arrays for each batch
# The first array is the input and the second is the target
for i in dataset.take(1):
  print(i)

(<tf.Tensor: shape=(200,), dtype=int64, numpy=
array([14, 14, 14, 80, 80, 80, 49, 71, 79, 80, 45, 70, 79, 77, 78, 80, 45,
       77, 78, 72, 58, 62, 74, 74, 14, 14, 14, 14, 80, 80, 58, 62, 74, 74,
       14, 14, 14, 80, 27, 60, 80, 43, 63, 76, 78, 78, 80, 27, 73, 78, 32,
       64, 79, 70, 77, 68, 69, 74, 74, 74, 74, 74, 74, 80, 80, 80, 80, 80,
       80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80,
       80, 80, 80, 80, 80, 49, 77, 58, 68, 79, 80, 76, 61, 80, 35, 76, 75,
       78, 79, 75, 78, 72, 74, 74, 74, 74, 53, 74, 74, 53, 53, 74, 74, 53,
       53, 53, 74, 74, 53, 19, 74, 74, 19, 74, 74, 19, 53, 74, 74, 19, 53,
       53, 74, 74, 19, 53, 53, 53, 74, 74, 53,  1, 74, 74, 74, 74, 74, 74,
       80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80,
       80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 30, 75, 63, 79,
       80, 77, 64, 77, 73, 75, 74, 74, 80, 80, 80, 80, 80])>, <tf.Tensor: shape=(200,), dtype=int64, numpy=
array([14, 14, 80, 8

Print the first examples input and target values:

Create training batches

We used tf.data to split the text into manageable sequences. But before feeding this data into the model, we need to shuffle the data and pack it into batches.

**We added the .repeat() helper to the end of the shuffle method**

This will allow the model to repeatable and so it will never end. This gives us a huge advantage as Deep Learning needs to have lots of data. This allows us to increase our Steps per epoch in the model.

In [None]:
# Change the Batch size to 128
BATCH_SIZE = 128

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

#Add the repeat() helper so that the dataset is endless
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True).repeat()

dataset

<RepeatDataset element_spec=(TensorSpec(shape=(64, 200), dtype=tf.int64, name=None), TensorSpec(shape=(64, 200), dtype=tf.int64, name=None))>

Build The Model

Use tf.keras.Sequential to define the model. For this simple example three layers are used to define our model:

tf.keras.layers.Embedding: The input layer. A trainable lookup table that will map the numbers of each character to a vector with embedding_dim dimensions;

tf.keras.layers.GRU: A type of RNN with size units=rnn_units (You can also use a LSTM layer here.)

tf.keras.layers.Dense: The output layer, with vocab_size outputs.

In [None]:
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024                                                                                                          #KF: we could increase this 

**Adding a LSTM layer**
LSTM has a memory cell which can hold information in memory for a longer period of time. A set of gates is used to control when information enters the memory, when it's output, and when it's forgotten. We could have used the GRU but the problem with GRU is that it doesnt have the seperate memory cell and they have fewer gate cells.



**Adding a Dropuout layer**
We have added a Dropout Layer of Dropout(0.1) which means that 10% of data will be dropped out which aims to decrease the prossibility of overfitting ,thus we can decrease the loss and make a better prediction.

In [None]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  model = tf.keras.Sequential([
    #The embedding layer helps to add semantic meaning to the input
    #With embedding we can do word math such as   King - man + woman = Queen
    tf.keras.layers.Embedding(vocab_size, embedding_dim,
                              batch_input_shape=[batch_size, None]),
    # The LSTM layer brings in the concept of cell state. These cell states are 
    # how the network remembers what has previously been entered
    tf.keras.layers.LSTM(rnn_units,
                        return_sequences=True,
                        stateful=True,
                        recurrent_initializer='glorot_uniform'),
    # The dropout layer prevents overfitting
    tf.keras.layers.Dropout(0.1) ,  
    # The dense layer is where the model decides what to output                 
    tf.keras.layers.Dense(vocab_size)
    
  ])
  return model

Here we build the model with the chosen params

In [None]:
model = build_model(
    vocab_size = len(vocab),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE)

Try the model

Now run the model to see that it behaves as expected.

First check the shape of the output:

In [None]:
for input_example_batch, target_example_batch in dataset.take(1):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 200, 81) # (batch_size, sequence_length, vocab_size)


In the above example, the sequence length of the input is **XXXXXX** but the model can be run on inputs of any length:

In [None]:
# We can summarize the model to get information about the layers
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (64, None, 256)           20736     
                                                                 
 lstm (LSTM)                 (64, None, 1024)          5246976   
                                                                 
 dropout (Dropout)           (64, None, 1024)          0         
                                                                 
 dense (Dense)               (64, None, 81)            83025     
                                                                 
Total params: 5,350,737
Trainable params: 5,350,737
Non-trainable params: 0
_________________________________________________________________


To get actual predictions from the model we need to sample from the output distribution, to get actual character indices. This distribution is defined by the logits over the character vocabulary.

Note: It is important to sample from this distribution as taking the argmax of the distribution can easily get the model stuck in a loop.

Try it for the first example in the batch:

In [None]:
# We have not actually run the model, so this information will be gibberish
# However it helps us to understand what is going on in the model
# We use random so that the model does not output the same thing for similar inputs
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()

#This gives us, at each timestep, a prediction of the next character index:
sampled_indices

array([47, 21,  5, 39, 42, 61, 27, 56, 22, 55, 35, 45, 22, 67, 30, 37, 49,
       19, 22, 32, 28, 71, 17, 39, 37, 27, 57, 26, 29, 60,  0,  1,  4, 76,
       15, 19, 49, 48, 45, 44, 46,  0, 37, 65, 42, 42, 32, 16, 73, 24, 58,
       74, 53, 31, 40, 56, 40, 34, 73, 21, 17, 49, 21, 78, 40, 50, 25, 42,
       47,  4, 60, 54, 76, 14, 73, 41, 40,  8, 18, 50, 73, 75, 24,  3, 47,
       70,  4,  1, 40,  1, 52, 19, 16, 31, 13, 22, 50, 45, 66, 47, 79,  2,
       28, 27,  8, 38, 75,  9, 22, 60, 22, 22, 69, 80, 32, 12, 37, 28, 44,
       56, 31,  4, 74, 73, 18, 44, 58, 78, 38, 55, 71, 37, 58,  9, 44, 42,
       16, 47,  7, 54, 48, 50, 52, 78, 69, 49, 18, 75, 22, 36, 79, 14, 68,
       42, 39, 21, 64, 39, 25,  4, 68, 19,  4,  5, 14, 53, 73, 54,  8, 45,
       53, 40, 11, 75, 62, 21, 61, 52, 60, 37, 45, 62, 37, 32,  2, 28,  1,
       45, 80, 19, 38, 59, 13, 30, 32, 68, 52, 74, 63, 29])

Train the model

At this point the problem can be treated as a standard classification problem. Given the previous RNN state, and the input this time step, predict the class of the next character.

Attach an optimizer, and a loss function
The standard tf.keras.losses.sparse_categorical_crossentropy loss function works in this case because it is applied across the last dimension of the predictions.

Because our model returns logits, we need to set the from_logits flag.

In [None]:
# Define the loss function, we use sparse because our data is not one hot encoded
def loss(labels, logits):
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

# Before we run the model, lets see what the loss output will look like
example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 200, 81)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.3935976


Configure the training procedure using the tf.keras.Model.compile method. We'll use tf.keras.optimizers.Adam with default arguments and the loss function.

In [None]:
# Compile the model with the adam optimizer
model.compile(optimizer='adam', loss=loss)

Configure checkpoints

Use a tf.keras.callbacks.ModelCheckpoint to ensure that checkpoints are saved during training:

In [None]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

We also define an early stopping callback. This callback will allow the model to stop if the loss does not keep improving. 

With this callback, we can use large number of epochs but the model will stop after the optimal loss is found.

In [None]:
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    patience=5,
    monitor='loss',
    restore_best_weights=True,
    verbose=1
)

Execute the training

To keep training time reasonable, use 10 epochs to train the model. 

### **Changing Epoch size from 10 to 50.**

Increasing the epoch size aims to decrease the loss. The increased epoch size is to provide more iterations with our dataset. 

Increased epochs allow for more learning to be done.

In [None]:
EPOCHS = 50
INITIAL_EPOCH = 1
STEPS_PER_EPOCH = 200

In [None]:
history = model.fit(dataset, epochs=EPOCHS, steps_per_epoch=STEPS_PER_EPOCH, callbacks=[checkpoint_callback, early_stopping_callback])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 47: early stopping


Generate text

Restore the latest checkpoint
To keep this prediction step simple, use a batch size of 1.

Because of the way the RNN state is passed from timestep to timestep, the model only accepts a fixed batch size once built.

To run the model with a different batch_size, we need to rebuild the model and restore the weights from the checkpoint.

In [None]:
tf.train.latest_checkpoint(checkpoint_dir)
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

model.build(tf.TensorShape([1, None]))
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (1, None, 256)            20736     
                                                                 
 lstm_2 (LSTM)               (1, None, 1024)           5246976   
                                                                 
 dropout_2 (Dropout)         (1, None, 1024)           0         
                                                                 
 dense_2 (Dense)             (1, None, 81)             83025     
                                                                 
Total params: 5,350,737
Trainable params: 5,350,737
Non-trainable params: 0
_________________________________________________________________


The prediction loop

The following code block generates the text:

It Starts by choosing a start string, initializing the RNN state and setting the number of characters to generate.

Get the prediction distribution of the next character using the start string and the RNN state.

Then, use a categorical distribution to calculate the index of the predicted character. Use this predicted character as our next input to the model.

The RNN state returned by the model is fed back into the model so that it now has more context, instead than only one character. After predicting the next character, the modified RNN states are again fed back into the model, which is how it learns as it gets more context from the previously predicted characters.

Looking at the generated text, you'll see the model knows when to capitalize, make paragraphs and imitates a Shakespeare-like writing vocabulary. With the small number of training epochs, it has not yet learned to form coherent sentences.

In [80]:
def generate_text(model, start_string):
  # Evaluation step (generating text using the learned model)

  # Number of characters to generate
  num_generate = 1000

  # Converting our start string to numbers (vectorizing)
  input_eval = [char2idx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  # Empty string to store our results
  text_generated = []

  # Low temperatures results in more predictable text.
  # Higher temperatures results in more surprising text.
  # Experiment to find the best setting.
  temperature = 1.3

  # Here batch size == 1
  model.reset_states()
  for i in range(num_generate):
    predictions = model(input_eval)
    # remove the batch dimension
    predictions = tf.squeeze(predictions, 0)

    # using a categorical distribution to predict the character returned by the model
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

    # We pass the predicted character as the next input to the model
    # along with the previous hidden state
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(idx2char[predicted_id])

  return (start_string + ''.join(text_generated))

In [81]:
print(generate_text(model, start_string="The Great Gatsby of New York "))

The Great Gatsby of New York and then didn’t know—though

her husband, was urged ar. Roby rrethe a rafed into swetthe laye; the grass on his lawn hid

he got up and informed me, in an

uncertain voice, that he was going home.



“Why’s that?”



“Nobody’s coming.”



“Welve you beat the

shoulder. At past one of the girls in yellowdy don’t

you not a little afraid of missing something if I

forget that, as my father snobbidythen watch

answered the lethy was walking the words the

whited—a nice right act and cashouse and excited young offincerst speads come and said that his shoulder

in a dize or along the Sturn.



“Your place looks like the World’s Fair,” I said.



“Does it?” He turned his eyes toward it absently. “I have been

glancing into some of the rooms. Let’s go so the window and, leaning forward, tapped on the

frontom with a

slight nod, and she winked at me again. “—And we’ve plut in the Sentincemar.”



She looked at Tom alound. “And siTe dream, you. You

absolute little