In [1]:
import tensorflow as tf

import numpy as np
import os
import time

# Show which TensorFlow version this notebook is running against.
print(tf.__version__)

Init Plugin
2.5.0
Init Graph Optimizer
Init Kernel


In [9]:
# Read the word list as bytes and decode it explicitly as UTF-8. The `with`
# block guarantees the file handle is closed once the contents are read.
with open('20k.txt', 'rb') as word_file:
    text = word_file.read().decode(encoding='utf-8')

# Re-join the whitespace-separated words with '**' between them, so every word
# boundary in the training text is marked by asterisks.
# NOTE(review): the text begins with '**' but ends with a single '*' - confirm
# that the asymmetric terminator is intended.
text = '**' + ('**'.join(text.split())) + '*'

print(f'Length of text: {len(text)} characters')
print(text[:250])

Length of text: 175419 characters
**the**of**and**to**a**in**for**is**on**that**by**this**with**i**you**it**not**or**be**are**from**at**as**your**all**have**new**more**an**was**we**will**home**can**us**about**if**page**my**has**search**free**but**our**one**other**do**no**information*


In [10]:
# Character vocabulary: every distinct character occurring in `text`, sorted.
vocab = sorted({ch for ch in text})
print(f'{len(vocab)} unique characters')
print(vocab)

27 unique characters
['*', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [11]:
# TODO try setting mask_token = 'c' and see what the output looks like

# Forward lookup layer: character -> integer id.
ids_from_chars = tf.keras.layers.experimental.preprocessing.StringLookup(
    mask_token=None,
    vocabulary=list(vocab),
)

# Inverse lookup layer: integer id -> character. It is built from the forward
# layer's own vocabulary so both directions agree on the id assignment.
chars_from_ids = tf.keras.layers.experimental.preprocessing.StringLookup(
    mask_token=None,
    invert=True,
    vocabulary=ids_from_chars.get_vocabulary(),
)

def text_from_ids(ids):
  """Map `ids` back to characters and join them along the last axis."""
  chars = chars_from_ids(ids)
  return tf.strings.reduce_join(chars, axis=-1)

In [12]:
# Split the whole corpus into single characters and map each one to its id.
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
# Dataset that yields one character id per element.
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

# Sanity check: print the first ten ids next to the characters they decode to.
for element in ids_dataset.take(10):
    print(element, chars_from_ids(element).numpy().decode('utf-8'))

tf.Tensor(1, shape=(), dtype=int64) *
tf.Tensor(1, shape=(), dtype=int64) *
tf.Tensor(21, shape=(), dtype=int64) t
tf.Tensor(9, shape=(), dtype=int64) h
tf.Tensor(6, shape=(), dtype=int64) e
tf.Tensor(1, shape=(), dtype=int64) *
tf.Tensor(1, shape=(), dtype=int64) *
tf.Tensor(16, shape=(), dtype=int64) o
tf.Tensor(7, shape=(), dtype=int64) f
tf.Tensor(1, shape=(), dtype=int64) *


In [13]:
# TODO if we don't need a regular sequence length maybe we could make sequences
# always start and end at '*'.
# Not sure if this matters since we always prompt with * later when running the model

# read Karpathy to see how he chooses sequence length and whether it's important
# maybe longer is better? but we don't want to keep context between words

# try truncating and padding after the stop token

# try training on only 2 letter words 

# Length of each training input, in characters.
seq_length = 15
# Chunk the id stream into groups of seq_length+1 ids: split_input_target later
# drops the last id for the input and the first id for the target, leaving
# seq_length ids in each. drop_remainder=True discards a final short chunk.
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

# Preview the first five chunks, both as ids and as decoded text.
for seq in sequences.take(5):
  print(seq, text_from_ids(seq))

tf.Tensor([ 1  1 21  9  6  1  1 16  7  1  1  2 15  5  1  1], shape=(16,), dtype=int64) tf.Tensor(b'**the**of**and**', shape=(), dtype=string)
tf.Tensor([21 16  1  1  2  1  1 10 15  1  1  7 16 19  1  1], shape=(16,), dtype=int64) tf.Tensor(b'to**a**in**for**', shape=(), dtype=string)
tf.Tensor([10 20  1  1 16 15  1  1 21  9  2 21  1  1  3 26], shape=(16,), dtype=int64) tf.Tensor(b'is**on**that**by', shape=(), dtype=string)
tf.Tensor([ 1  1 21  9 10 20  1  1 24 10 21  9  1  1 10  1], shape=(16,), dtype=int64) tf.Tensor(b'**this**with**i*', shape=(), dtype=string)
tf.Tensor([ 1 26 16 22  1  1 10 21  1  1 15 16 21  1  1 16], shape=(16,), dtype=int64) tf.Tensor(b'*you**it**not**o', shape=(), dtype=string)


In [14]:
def split_input_target(sequence):
    """Split a sequence into an (input, target) pair offset by one position.

    The input is everything except the last element; the target is everything
    except the first element, so target[i] is the element that follows input[i].
    """
    return sequence[:-1], sequence[1:]
# Turn every chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)
# Show the first pair as text to confirm the one-character offset.
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())

Input : b'**the**of**and*'
Target: b'*the**of**and**'


2022-01-15 11:52:08.638928: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2022-01-15 11:52:08.639197: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


In [15]:
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` instead of from the previous
# value of `dataset`. The result is the same on a fresh run, but re-running
# this cell no longer shuffles and batches data that was already batched.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

# Peek at one batch as NumPy arrays.
list(dataset.take(1).as_numpy_iterator())

[(array([[20, 10,  8, 15,  6,  5,  1,  1,  2, 13, 16, 15,  8,  1,  1],
         [ 4, 21, 10,  4,  6, 20,  1,  1, 20, 16, 19, 21,  6,  5,  1],
         [20, 21,  6, 19, 20,  1,  1,  2, 20, 21,  6, 19, 10, 20, 12],
         [22, 13, 13, 26,  1,  1,  7,  2, 21,  9,  6, 19,  1,  1,  6],
         [15,  6, 15, 20,  1,  1, 20, 21,  6, 23,  6, 15, 20, 16, 15],
         [19, 19,  6,  4, 21, 15,  6, 20, 20,  1,  1, 24,  6, 20, 21],
         [15, 10, 20,  9,  6,  5,  1,  1, 11, 16, 15,  2, 20,  1,  1],
         [ 1,  1,  7,  2, 23, 16, 19,  1,  1, 23,  6, 21,  6, 19,  2],
         [12,  4,  1,  1,  4, 19,  2, 20,  9,  6, 20,  1,  1, 20, 21],
         [ 1, 13, 16,  2,  5, 20,  1,  1,  7, 19, 10,  6, 15,  5, 20],
         [ 9, 10, 19,  6,  1,  1, 17, 16, 17, 22, 13,  2, 21, 10, 16],
         [ 1,  1,  3,  2,  4, 12, 20,  1,  1, 16,  3, 20,  6, 19, 23],
         [13, 16, 16, 12,  6,  5,  1,  1,  5, 10, 20,  4, 22, 20, 20],
         [25,  1,  1,  4,  9, 10, 15,  2,  1,  1, 14,  2, 12, 10, 15],
      

### Define the model
We use an RNN with GRUs

In [188]:
# Number of distinct characters in the corpus. Note that the model below is
# constructed with len(ids_from_chars.get_vocabulary()) rather than this value.
vocab_size = len(vocab)
# Size of the embedding vector learned for each character id.
embedding_dim = 128
# Number of units in the GRU layer.
# NOTE(review): 257 is an unusual width - confirm it is not a typo for 256.
rnn_units = 257

class RnnGRUModel(tf.keras.Model):
  """Character-level next-id model: Embedding -> GRU -> Dense.

  Args:
    vocab_size: Number of ids; used as the embedding input size and as the
      width of the final Dense layer.
    embedding_dim: Size of the embedding vector for each id.
    rnn_units: Number of units in the GRU layer.
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    # The base constructor takes no positional arguments; do not forward `self`.
    super().__init__()
    # Keep the constructor arguments so get_config()/from_config() round-trip.
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.rnn_units = rnn_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the network on a batch of id sequences.

    Args:
      inputs: Batch of id sequences fed to the embedding layer.
      states: Optional GRU state to start from; when None the GRU's initial
        state is used.
      return_state: When True, also return the final GRU state.
      training: Forwarded to each layer.

    Returns:
      The Dense layer output (one vector of size vocab_size per position), or
      a `(output, states)` tuple when `return_state` is True.
    """
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

  def get_config(self):
    """Return the constructor arguments needed to rebuild this model."""
    # The keys must match the __init__ parameter names because from_config()
    # passes this dict straight back to the constructor.
    return {
        "vocab_size": self.vocab_size,
        "embedding_dim": self.embedding_dim,
        "rnn_units": self.rnn_units,
    }

  @classmethod
  def from_config(cls, config):
    """Rebuild a model from the dict produced by get_config()."""
    return cls(**config)



# Instantiate the network with the hyperparameters chosen above.
model = RnnGRUModel(
    rnn_units=rnn_units,
    embedding_dim=embedding_dim,
    # Be sure the vocabulary size matches the `StringLookup` layers.
    vocab_size=len(ids_from_chars.get_vocabulary()),
)

### Try the untrained model on the first sequence

In [189]:
# Pull a single batch from the dataset and run it through the untrained model.
input_example_batch, target_example_batch = next(iter(dataset.take(1)))
example_batch_predictions = model(input_example_batch)
print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

# vocab size is 28 because of [UNK]
# TODO we don't actually need [UNK] since input involves all possible characters already
model.summary()

(64, 15, 28) # (batch_size, sequence_length, vocab_size)
Model: "rnn_gru_model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      multiple                  3584      
_________________________________________________________________
gru_6 (GRU)                  multiple                  298377    
_________________________________________________________________
dense_8 (Dense)              multiple                  7224      
Total params: 309,185
Trainable params: 309,185
Non-trainable params: 0
_________________________________________________________________


In [190]:
# Draw one id per position from the logits of the first sequence in the batch,
# then drop the trailing num_samples axis and convert to a NumPy array.
sampled_indices = tf.squeeze(
    tf.random.categorical(example_batch_predictions[0], num_samples=1),
    axis=-1,
).numpy()
sampled_indices

array([18,  3,  2,  3,  5,  4, 18, 27,  4, 19,  0, 19,  4,  2,  9])

In [191]:
# Decode the first input sequence and the ids sampled from the untrained model
# so the two can be compared as text.
print("Input:\n", text_from_ids(input_example_batch[0]).numpy().decode('utf-8'))
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy().decode('utf-8'))

Input:
 sors**fox**unli
Next Char Predictions:
 qbabdcqzcr[UNK]rcah


In [192]:
# Could try with other loss functions, e.g. CosineSimilarity

# The model's Dense layer has no activation, so the loss is told to expect logits.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
example_batch_loss = loss(target_example_batch, example_batch_predictions)
mean_loss = np.mean(example_batch_loss.numpy())
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", mean_loss)


Prediction shape:  (64, 15, 28)  # (batch_size, sequence_length, vocab_size)
Mean loss:         3.3323288


### Train the model

In [194]:
# Configure training: Adam optimizer with the cross-entropy loss defined above.
model.compile(optimizer='adam', loss=loss)

In [219]:
# All checkpoint files are written under this directory.
checkpoint_dir = '20k_training_checkpoints'
# File name pattern for the checkpoints; `{epoch}` is a placeholder in the path.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback that stores only the model weights at the path pattern above.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix,
)

In [196]:
# Number of epochs passed to model.fit below.
EPOCHS = 10

In [197]:
# TODO: add a callback to print out generated words at a point in training
# TODO: add a callback to print out how the model evaluates a particular word at each point in training

# Train for EPOCHS epochs, invoking the checkpoint callback during training.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/10


2022-01-17 10:21:13.235064: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-01-17 10:21:13.426814: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


  4/171 [..............................] - ETA: 3s - loss: 3.3107  

2022-01-17 10:21:13.479811: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Create ModelWrapper which samples from the model one step at a time and evaluates words

In [198]:
class ModelWrapper():
  """Samples from a trained model one character at a time and scores words.

  Args:
    model: Callable as `model(inputs=ids, states=states, return_state=True)`,
      returning `(logits, states)`.
    chars_from_ids: Lookup layer mapping ids to characters.
    ids_from_chars: Lookup layer mapping characters to ids.
    temperature: Logits are divided by this value before sampling or scoring.
  """

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=0.5):
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for each input string.

    Args:
      inputs: Batch of strings to feed to the model.
      states: Optional model state carried over from a previous step.

    Returns:
      A `(predicted_chars, states)` tuple: the sampled characters and the
      model state after consuming `inputs`.
    """
    # Convert strings to token IDs.
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits]
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature

    # Sample the output logits to generate token IDs.
    # NOTE(review): nothing here prevents the [UNK] id from being sampled.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_chars = self.chars_from_ids(predicted_ids)

    # Return the characters and model state.
    return predicted_chars, states

  def probability_of_letter(self, logits, letter):
    """Return the log-probability of `letter` under `logits`.

    Raw logits are unnormalized scores, so they are passed through
    log_softmax first; the value returned is a true log-probability (<= 0).
    """
    log_probs = tf.nn.log_softmax(tf.squeeze(logits))
    return (log_probs[self.ids_from_chars(letter).numpy()]).numpy()

  def evaluate_word(self, word, show_work=False):
    """Score `word` as the sum of the log-probabilities of its characters.

    The model is seeded with '**', each letter of `word` is scored and then
    fed back in, and finally the log-probability of the terminating '*' is
    added.

    Args:
      word: The word to score, without the surrounding asterisks.
      show_work: When True, print the value computed for each character.

    Returns:
      The sum of the per-character log-probabilities, including the final
      '*'. Values closer to zero mean the model finds the word more likely.
    """
    # Seed the model with '**'
    seed_chars = tf.constant(['**'])
    input_chars = tf.strings.unicode_split(seed_chars, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()
    states = None

    # get the initial (temperature-scaled) logits for the first letter
    predicted_logits, states = self.model(inputs=input_ids, states=states, return_state=True)
    predicted_logits = predicted_logits[:, -1, :]/self.temperature

    total_log_prob = 0
    for letter in word:
      # check the log probability of the letter given current state
      probability = self.probability_of_letter(predicted_logits, letter)
      total_log_prob += probability
      if show_work:
        # The printed value is a log-probability.
        print("Probability of", letter, "=", probability)

      # feed this letter into the model
      this_char = tf.constant([letter])
      input_char = tf.strings.unicode_split(this_char, 'UTF-8')
      this_input_id = self.ids_from_chars(input_char).to_tensor()
      predicted_logits, states = self.model(inputs=this_input_id, states=states,
                                            return_state=True)
      predicted_logits = predicted_logits[:, -1, :]/self.temperature

    # check the log probability of '*' given current state
    probability = self.probability_of_letter(predicted_logits, '*')
    # The probability of the word is the product of the probabilities of its
    # letters, i.e. the sum of their log-probabilities. In other words, the
    # raw logits are only directionally correct, which is why
    # probability_of_letter normalizes them with log_softmax.
    total_log_prob += probability
    if show_work:
      print("Probability of * =", probability)

    # TODO normalize by length
    # TODO actually, shouldn't the model do this automatically if it's well trained?
    return total_log_prob

In [199]:
# Wrap the trained model together with both lookup layers (default temperature).
model_wrapper = ModelWrapper(model, chars_from_ids, ids_from_chars)

In [200]:
def create_words(given_model, num_words, seed='**char', max_length=100):
    """Generate and print `num_words` words sampled from `given_model`.

    Args:
      given_model: Object exposing `generate_one_step(inputs, states=...)`
        that returns `(next_char, states)`.
      num_words: How many words to generate.
      seed: Text every word starts from; it is included in the printed word.
      max_length: Maximum number of characters sampled after the seed.
    """
    for _ in range(num_words):
        # Every word starts from a fresh model state and the same seed.
        states = None
        next_char = tf.constant([seed])
        result = [next_char]

        for _ in range(max_length):
            next_char, states = given_model.generate_one_step(next_char, states=states)
            result.append(next_char)
            # '*' marks the end of a word, so stop sampling once it appears.
            if next_char == '*':
                break

        result = tf.strings.join(result)
        print(result[0].numpy().decode('utf-8'))

create_words(model_wrapper, 10)

2022-01-17 10:21:53.007935: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-01-17 10:21:53.032555: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-01-17 10:21:53.128436: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-01-17 10:21:53.151722: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


**charize*
**charmand*
**chare*
**charrica*
**charity*
**charture*
**charger*
**charl*
**chargers*
**charrie*
