# A Text Generation RNN

## Brian Chen | 2020

Trained on "To Bleach A Pigeon", by Brian Chen & Joe Salmon

*Based on https://www.tensorflow.org/tutorials/text/text_generation*

In [0]:
#imports

import tensorflow as tf
import numpy as np
import os
import time

In [0]:
# Read the whole training corpus. The context manager closes the file handle,
# and the explicit encoding keeps the non-ASCII punctuation in the text intact
# regardless of the platform's default encoding.
with open("/content/drive/My Drive/TextGen RNN/Datasets/ToBleachAPigeon.txt", "r", encoding="utf-8") as corpus_file:
  to_bleach = corpus_file.read()

In [0]:
# Doubling the corpus ten times in a row is the same as keeping
# 2**10 = 1024 back-to-back copies of it.
# Note: re-running this cell multiplies the text again.
to_bleach = to_bleach * (2 ** 10)

In [5]:
# Vocabulary: every distinct character in the corpus, in sorted order.
unique_chars = list(set(to_bleach))
unique_chars.sort()
print(len(unique_chars), unique_chars)

86 ['\n', ' ', '!', '&', "'", '(', ')', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '©', '–', '—', '’', '“', '”', '…']


In [0]:
# Lookup tables between characters and their integer ids (ids start at 0).
char_to_index = dict(zip(unique_chars, range(len(unique_chars))))
index_to_char = np.array(unique_chars)
# Encode the whole corpus: every character is replaced by its id.
text_as_int = np.array([char_to_index[ch] for ch in to_bleach])

In [0]:
# Parameters
max_input_length = 100  # characters fed to the model per training example
# Each example consumes max_input_length + 1 characters (the input plus the
# target shifted by one character), so count whole examples, not characters.
examples_per_epoch = len(to_bleach) // (max_input_length + 1)

# Stream of individual character ids.
dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [0]:
# Group the character stream into fixed-length chunks of max_input_length + 1
# ids; a trailing chunk that comes up short is dropped.
seqs = dataset.batch(batch_size=max_input_length + 1, drop_remainder=True)

In [9]:
# Decode the first five chunks back to text as a sanity check.
for chunk in seqs.take(5):
  decoded = ''.join(index_to_char[chunk.numpy()])
  print(repr(decoded))

'5025 Willow St, Vancouver, BC V5Z 3S1\nEric Hamber Secondary School Drama\nDepartment\nPhone: (778) 681-'
'4322, (604) 831-4943\nE-mail: ihasdapi@gmail.com,\njoesgot2bcool@gmail.com\nCopyright © 2020,\nby Brian C'
'hen and Joe Salmon\nHOW TO BLEACH A PIGEON\n____ ____\nA Play in Three Acts\nBy\nBrian Chen & Joe Salmon\n1'
'\nContents\nDRAMATIS PERSONAE....................................... 2\nSETTING.........................'
'........................ 2\nTIME.................................................... 2\nACT 1..........'


In [0]:
def create_target_input(sequence):
  """
  Split one chunk into an (input, target) pair offset by one position.

  For example, the chunk ABCDE gives
    Input:  ABCD
    Target: BCDE
  so for input A the expected output is B, for input B it is C, and so on.

  :param sequence: anything that supports slicing
  :return: tuple of (sequence without its last item, sequence without its first item)
  """
  model_input, expected_output = sequence[:-1], sequence[1:]
  return model_input, expected_output

In [0]:
# Turn every chunk into its (input, target) pair.
dataset = seqs.map(map_func=create_target_input)

In [12]:
BATCH_SIZE = 64
BUFFER_SIZE = 10000  # number of elements held in the shuffle buffer

# Build the training pipeline straight from `seqs` instead of re-wrapping
# `dataset`, so re-running this cell does not batch already-batched data.
dataset = (
    seqs.map(create_target_input)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)
dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [0]:
# Model hyperparameters
vocab_size = len(unique_chars)  # number of distinct characters in the corpus
embedding_size, rnn_units = 256, 1024  # embedding width, units per GRU layer

In [0]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """
  Assemble the network: an embedding layer for the input, two stacked stateful
  GRU layers as the RNN (an LSTM could be used instead), and a dense layer for
  the output.

  :param vocab_size: vocabulary size of the embedding and width of the dense output
  :param embedding_dim: width of the embedding vectors
  :param rnn_units: number of units in each GRU layer
  :param batch_size: fixed batch dimension used for the input shape
  :return: the uncompiled tf.keras.Sequential model
  """
  model = tf.keras.Sequential()
  model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                      batch_input_shape=[batch_size, None]))
  # Two identically configured recurrent layers, added in order.
  for _ in range(2):
    model.add(tf.keras.layers.GRU(rnn_units,
                                  return_sequences=True,
                                  stateful=True,
                                  recurrent_initializer='glorot_uniform'))
  model.add(tf.keras.layers.Dense(vocab_size))
  return model

In [15]:
# Model used for training, built with the training batch size.
shakspeard = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_size,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE,
)
shakspeard.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           22016     
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3938304   
_________________________________________________________________
gru_1 (GRU)                  (64, None, 1024)          6297600   
_________________________________________________________________
dense (Dense)                (64, None, 86)            88150     
Total params: 10,346,070
Trainable params: 10,346,070
Non-trainable params: 0
_________________________________________________________________


In [0]:
#Train Model
def loss(labels, logits):
  """
  Sparse categorical cross-entropy between the labels and the model output.

  The model returns logits rather than probabilities, which is why
  from_logits=True is passed.

  :param labels: target values
  :param logits: raw model outputs
  :return: result of tf.keras.losses.sparse_categorical_crossentropy
  """
  crossentropy = tf.keras.losses.sparse_categorical_crossentropy
  return crossentropy(labels, logits, from_logits=True)

In [0]:
# Adam is usually the best choice of optimizer.
shakspeard.compile(
    optimizer="adam",
    loss=loss,
)

In [0]:
# Directory for checkpoints, plus a per-epoch file name template.
check_dir = "/content/drive/My Drive/TextGen RNN/Checkpoints_Bleach"
check_file_dir = os.path.join(check_dir, "checkpoint_{epoch}")

# Callback that stores only the model weights.
checkpoints = tf.keras.callbacks.ModelCheckpoint(
    filepath=check_file_dir,
    save_weights_only=True,
)

In [27]:
EPOCHS = 10
STEPS = 150  # passed to fit() as steps_per_epoch

history = shakspeard.fit(
    dataset,
    epochs=EPOCHS,
    steps_per_epoch=STEPS,
    callbacks=[checkpoints],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [28]:
# Show the most recent checkpoint found in check_dir.
tf.train.latest_checkpoint(checkpoint_dir=check_dir)

'/content/drive/My Drive/TextGen RNN/Checkpoints_Bleach/checkpoint_10'

In [29]:
# Rebuild the same architecture with a batch size of one for generation,
# then load the weights from the newest checkpoint.
prediktor = build_model(vocab_size, embedding_size, rnn_units, 1)
prediktor.load_weights(tf.train.latest_checkpoint(check_dir))
prediktor.build(tf.TensorShape([1, None]))
prediktor.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (1, None, 256)            22016     
_________________________________________________________________
gru_4 (GRU)                  (1, None, 1024)           3938304   
_________________________________________________________________
gru_5 (GRU)                  (1, None, 1024)           6297600   
_________________________________________________________________
dense_2 (Dense)              (1, None, 86)             88150     
Total params: 10,346,070
Trainable params: 10,346,070
Non-trainable params: 0
_________________________________________________________________


In [0]:
def gen_text(model, seed, num_generate, temp):
  """
  Generate text one character at a time, starting from a seed string.

  :param model: tf/keras model
  :param seed: first line of text to build off of; must be non-empty, and
               every character must be a key of char_to_index
  :param num_generate: amount of chars to predict
  :param temp: how much variance is allowed in predictions: higher = more
               varied, and vice versa; must be greater than 0
  :return: the seed followed by the generated characters
  :raises ValueError: if temp is not positive or seed is empty
  :raises KeyError: if seed contains a character missing from char_to_index
  """
  # Validate up front: temp is used as a divisor below, and the sampling step
  # indexes the last timestep, so there has to be at least one seed character.
  if temp <= 0:
    raise ValueError("temp must be greater than 0, got {!r}".format(temp))
  if not seed:
    raise ValueError("seed must contain at least one character")

  # Encode the seed and add a leading batch dimension.
  gen_input = tf.expand_dims([char_to_index[ch] for ch in seed], 0)
  out = []

  model.reset_states()
  for _ in range(num_generate):
    pred = model(gen_input)
    pred = tf.squeeze(pred, 0)  # drop the batch dimension again
    pred = pred / temp
    # Sample one id per row and keep the one from the last row.
    pred_id = tf.random.categorical(pred, num_samples=1)[-1, 0].numpy()
    # The sampled character becomes the next input.
    gen_input = tf.expand_dims([pred_id], 0)
    out.append(index_to_char[pred_id])
  return seed + "".join(out)


In [0]:
ex_out = gen_text(prediktor, "Cherry: ", 2000, 1)

In [38]:
print(ex_out)

Cherry: Oh. Right, my bad. The doves will make
up for it.
[Beat. BIANCA begill hith and Alvin provided a better
family for him than I could ever have. I was thinking in
there, and I was thinking about
who signs the birth certificate, isn’t it?
CLARENCE: Bianca?
290 RACHEL: How long have you been here for?
BIANCA: What?
RACHEL: How could I not notice, Ken?
BIANCA: Notice what?
RACHEL, to BIANCA: That your father slept with my sister.
125 BIANCA, turning on RACHEL: And you knew this the whole
time? How come you never did a thing about it? No wonderRACHEL: It’s not like you would understand. What are they
going to do, stop being your friends?
RACHEL: I’ve seen it happen, Bianca. Who woily values
mean when we got to grow understanding looking for anyther
appreciate not having “door privileges” until the age
of seventeen.
RACHEL: I am worrying about
if you really wanted it.
BIANCA: Of course I do.
CLARENCE: You could’ve at least made actom to -our
155 the room for a year and now
you’re enga