In [2]:
import tensorflow as tf

import numpy as np
import os
import time

print(tf.__version__)

Init Plugin
Init Graph Optimizer
Init Kernel
2.5.0


In [35]:
full_text = open('20k.txt', 'rb').read().decode(encoding='utf-8')
text = ''
seq_length = 20
for word in full_text.split():
    if len(word) < 4:
        continue
    else:
        word = '_' + word
        for i in range(0, seq_length-len(word)):
            word = word + '*'
    text = text + word

print(f'Length of text: {len(text)} characters')
print(text[:250])

Length of text: 358262 characters
_that***************_this***************_with***************_from***************_your***************_have***************_more***************_will***************_home***************_about**************_page***************_search*************_free*****


In [48]:
vocab = sorted(set(text))
print(f'{len(vocab)} unique characters')
print(vocab)

28 unique characters
['*', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [49]:
# TODO try setting mask_token = 'c' and see what the output looks like

ids_from_chars = tf.keras.layers.experimental.preprocessing.StringLookup(
    vocabulary=list(vocab), mask_token=None)

chars_from_ids = tf.keras.layers.experimental.preprocessing.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

def text_from_ids(ids):
  return tf.strings.reduce_join(chars_from_ids(ids), axis=-1)

In [50]:
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

for element in ids_dataset.take(10):
    print(element, chars_from_ids(element).numpy().decode('utf-8'))

tf.Tensor(2, shape=(), dtype=int64) _
tf.Tensor(22, shape=(), dtype=int64) t
tf.Tensor(10, shape=(), dtype=int64) h
tf.Tensor(3, shape=(), dtype=int64) a
tf.Tensor(22, shape=(), dtype=int64) t
tf.Tensor(1, shape=(), dtype=int64) *
tf.Tensor(1, shape=(), dtype=int64) *
tf.Tensor(1, shape=(), dtype=int64) *
tf.Tensor(1, shape=(), dtype=int64) *
tf.Tensor(1, shape=(), dtype=int64) *


In [51]:
# TODO read karpathi to see how he chooses sequence length and if it's important
# maybe longer is better? but we don't want to keep context between words

# try truncating and padding after the stop token

# try training on only 2 letter words 

sequences = ids_dataset.batch(seq_length, drop_remainder=True)

for seq in sequences.take(5):
  print(seq, text_from_ids(seq))

tf.Tensor([ 2 22 10  3 22  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1], shape=(20,), dtype=int64) tf.Tensor(b'_that***************', shape=(), dtype=string)
tf.Tensor([ 2 22 10 11 21  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1], shape=(20,), dtype=int64) tf.Tensor(b'_this***************', shape=(), dtype=string)
tf.Tensor([ 2 25 11 22 10  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1], shape=(20,), dtype=int64) tf.Tensor(b'_with***************', shape=(), dtype=string)
tf.Tensor([ 2  8 20 17 15  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1], shape=(20,), dtype=int64) tf.Tensor(b'_from***************', shape=(), dtype=string)
tf.Tensor([ 2 27 17 23 20  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1], shape=(20,), dtype=int64) tf.Tensor(b'_your***************', shape=(), dtype=string)


In [80]:
def split_input_target(sequence):
    print(sequence)
    input_text = sequence
    target_text = tf.concat([sequence[1:], [1]], 0) # pad with extra *
    return input_text, target_text
dataset = sequences.map(split_input_target)
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())


Tensor("args_0:0", shape=(20,), dtype=int64)
Input : b'_that***************'
Target: b'that****************'


In [81]:
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

list(dataset.take(1).as_numpy_iterator())

[(array([[ 2, 22, 23, ...,  1,  1,  1],
         [ 2,  3,  6, ...,  1,  1,  1],
         [ 2,  6,  7, ...,  1,  1,  1],
         ...,
         [ 2, 15, 17, ...,  1,  1,  1],
         [ 2, 14,  3, ...,  1,  1,  1],
         [ 2, 21, 13, ...,  1,  1,  1]]),
  array([[22, 23, 20, ...,  1,  1,  1],
         [ 3,  6, 24, ...,  1,  1,  1],
         [ 6,  7,  8, ...,  1,  1,  1],
         ...,
         [15, 17, 22, ...,  1,  1,  1],
         [14,  3, 16, ...,  1,  1,  1],
         [21, 13,  3, ...,  1,  1,  1]]))]

### Define the model
We use an RNN with GRUs

In [82]:
vocab_size = len(vocab)
embedding_dim = 256
rnn_units = 512

class RnnGRUModel(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, rnn_units):
    super().__init__(self)
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)
    
  def call(self, inputs, states=None, return_state=False, training=False):
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)
    
    if return_state:
      return x, states
    else:
      return x

  def get_config(self):
    return {
        "embedding": self.embedding,
        "gru": self.gru,
        "dense": self.dense,
    }

  @classmethod
  def from_config(cls, config):
    return cls(**config)



model = RnnGRUModel(
    # Be sure the vocabulary size matches the `StringLookup` layers.
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

### Try the untrained model on the first sequence

In [83]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")
    
# vocab size is 29 because of [UNK]
# TODO we don't actually need [UNK] since input involves all possible characters already
model.summary()

(64, 20, 29) # (batch_size, sequence_length, vocab_size)
Model: "rnn_gru_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  7424      
_________________________________________________________________
gru (GRU)                    multiple                  1182720   
_________________________________________________________________
dense (Dense)                multiple                  14877     
Total params: 1,205,021
Trainable params: 1,205,021
Non-trainable params: 0
_________________________________________________________________


In [84]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()
sampled_indices

array([ 2, 21,  1, 21, 11,  1,  7, 18, 12, 14,  4,  4, 26,  7, 11, 25, 26,
       23, 14, 25])

In [85]:
print("Input:\n", text_from_ids(input_example_batch[0]).numpy().decode('utf-8'))
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy().decode('utf-8'))

Input:
 _save***************
Next Char Predictions:
 _s*si*epjlbbxeiwxulw


In [86]:
# Could try with other loss functions, e.g. CosineSimilarity

loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
example_batch_loss = loss(target_example_batch, example_batch_predictions)
mean_loss = example_batch_loss.numpy().mean()
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", mean_loss)


Prediction shape:  (64, 20, 29)  # (batch_size, sequence_length, vocab_size)
Mean loss:         3.3746936


### Train the model

In [87]:
model.compile(optimizer='adam', loss=loss)

In [88]:
# Directory where the checkpoints will be saved
checkpoint_dir = '20k_training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [89]:
EPOCHS = 30

In [90]:
# TODO: add a callback to print out generated words at a point in training
# TODO: add a callback to print out how the model evaluates a particular word at each point in training

history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/30


2022-01-17 15:16:27.899779: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-01-17 15:16:28.106708: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-01-17 15:16:28.771316: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


### Create ModelWrapper which samples from the model one step at a time and evaluates words

In [98]:
class ModelWrapper():
  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1):
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

  @tf.function
  def generate_one_step(self, inputs, states=None):
    # Convert strings to token IDs.
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits]
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_chars = self.chars_from_ids(predicted_ids)

    # Return the characters and model state.
    return predicted_chars, states

  def probability_of_letter(this, logits, letter):
    return (tf.squeeze(logits)[this.ids_from_chars(letter).numpy()]).numpy()

  def evaluate_word(this, word, show_work):
      # Seed the model with '**'
      seed_chars = tf.constant(['**'])
      input_chars = tf.strings.unicode_split(seed_chars, 'UTF-8')
      input_ids = this.ids_from_chars(input_chars).to_tensor()
      states = None

      # get the initial probability distribution
      predicted_logits, states = this.model(inputs=input_ids, states=states, return_state=True)
      predicted_logits = predicted_logits[:, -1, :]/this.temperature

      total_prob = 0
      for letter in word:
        # check the log probability of the letter given current state
        probability = this.probability_of_letter(predicted_logits, letter)
        total_prob += probability
        if show_work:
          print("Probability of", letter, "=", probability)

        # feed this letter into the model
        this_char = tf.constant([letter])        
        input_char = tf.strings.unicode_split(this_char, 'UTF-8')
        this_input_id = this.ids_from_chars(input_char).to_tensor()
        predicted_logits, states = this.model(inputs=this_input_id, states=states,
                                              return_state=True)
        predicted_logits = predicted_logits[:, -1, :]/this.temperature

      # check the log probability of '*' given current state
      probability = this.probability_of_letter(predicted_logits, '*')
      # TODO This assumes that the probability of the word is the product of the probability of each letter
      # the problem is that I'm not really training the RNN to generate probabilities for each letter. In otherwords
      # the log likelihoods in the logits is not real, just directionally correct
      total_prob += probability
      if show_work:
        print("Probability of * =", probability)

      # TODO normalize by length
      # TODO actually, shouldn't the model do this automatically if it's well trained?
      return total_prob

In [100]:
model_wrapper = ModelWrapper(model, chars_from_ids, ids_from_chars, temperature=1)

In [103]:
def create_words(given_model, num_words):
    for m in range(num_words):
      states = None
      next_char = tf.constant(['_'])
      result = [next_char]

      for n in range(100):
        next_char, states = given_model.generate_one_step(next_char, states=states)
        result.append(next_char)
        if next_char == '*':
          break

      result = tf.strings.join(result)
      end = time.time()
      print(result[0].numpy().decode('utf-8'))


In [105]:
create_words(model_wrapper, 50)

_workings*
_bouldigh*
_flyer*
_completely*
_stored*
_disable*
_women*
_summerie*
_alterers*
_downlodk*
_bioteerahion*
_salled*
_toren*
_importing*
_hurfy*
_viewers*
_durby*
_fins*
_authorizer*
_bulldence*
_rocks*
_link*
_sundancing*
_tapfued*
_strippers*
_inspected*
_chimpionship*
_copand*
_fulfillment*
_turh*
_selections*
_roommate*
_drawa*
_buddsets*
_ktrides*
_unpeaut*
_physiologiss*
_harder*
_alphobe*
_leaker*
_faeron*
_christi*
_femoding*
_fayle*
_browson*
_nozhare*
_communisment*
_horizon*
_mankigh*
_gift*
