In [1]:
# TODO based on https://www.tensorflow.org/text/tutorials/text_generation

import tensorflow as tf

import numpy as np
import os
import time

print(tf.__version__)

Init Plugin
Init Graph Optimizer
Init Kernel
2.5.0


In [2]:
# Read the raw word list. The context manager closes the file handle as soon as
# the bytes are read, instead of leaving it open until garbage collection.
with open('words_alpha.txt', 'rb') as words_file:
    raw_words = words_file.read().decode(encoding='utf-8')

# Join the whitespace-separated words into one string in which every word is
# wrapped in '*' markers, so each word boundary shows up as '**'.
text = '*' + '**'.join(raw_words.split()) + '*'

print(f'Length of text: {len(text)} characters')
print(text[:250])

Length of text: 4234930 characters
*polyciliate**telemetrograph**trionfo**exclaimer**colotomy**waistless**anthramine**undupable**anthomedusan**jargonising**untruthfulness**fugacious**unluckiest**clatch**singfo**maronian**logogriph**spooky**induplicate**hexaradial**acrodont**gambia**st


In [3]:
vocab = sorted(set(text))
print(f'{len(vocab)} unique characters')
print(vocab)

27 unique characters
['*', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [4]:
# Forward lookup: character -> integer id. No mask token is reserved, but the
# layer still keeps an out-of-vocabulary ([UNK]) slot, which is why '*' maps to 1.
ids_from_chars = tf.keras.layers.experimental.preprocessing.StringLookup(
    mask_token=None,
    vocabulary=list(vocab),
)

# Inverse lookup: integer id -> character, built from the exact vocabulary of the
# forward layer so both directions agree.
chars_from_ids = tf.keras.layers.experimental.preprocessing.StringLookup(
    mask_token=None,
    invert=True,
    vocabulary=ids_from_chars.get_vocabulary(),
)

def text_from_ids(ids):
  """Map integer ids back to characters and join them along the last axis."""
  chars = chars_from_ids(ids)
  return tf.strings.reduce_join(chars, axis=-1)

Metal device set to: Apple M1


2022-01-17 14:24:56.605902: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-01-17 14:24:56.606088: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [5]:
# Split the corpus into single characters, then map every character to its id.
corpus_chars = tf.strings.unicode_split(text, 'UTF-8')
all_ids = ids_from_chars(corpus_chars)

# One dataset element per character id.
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

# Sanity check: show the first ten ids next to the characters they decode to.
for char_id in ids_dataset.take(10):
    decoded_char = chars_from_ids(char_id).numpy().decode('utf-8')
    print(char_id, decoded_char)

tf.Tensor(1, shape=(), dtype=int64) *
tf.Tensor(17, shape=(), dtype=int64) p
tf.Tensor(16, shape=(), dtype=int64) o
tf.Tensor(13, shape=(), dtype=int64) l
tf.Tensor(26, shape=(), dtype=int64) y
tf.Tensor(4, shape=(), dtype=int64) c
tf.Tensor(10, shape=(), dtype=int64) i
tf.Tensor(13, shape=(), dtype=int64) l
tf.Tensor(10, shape=(), dtype=int64) i
tf.Tensor(2, shape=(), dtype=int64) a


In [6]:
# TODO if we don't need a regular sequence length maybe we could make sequences
# always start and end at '*'.
# Not sure if this matters since we always prompt with * later when running the model

# read Karpathy to see how he chooses sequence length and if it's important
# maybe longer is better? but we don't want to keep context between words

# try truncating and padding after the stop token
# sequence length = shortest word length, change the start character into _

seq_length = 15

# Each chunk holds seq_length + 1 ids so that the input and the one-step-shifted
# target derived from it are both seq_length long. A trailing partial chunk is dropped.
chunk_length = seq_length + 1
sequences = ids_dataset.batch(chunk_length, drop_remainder=True)

# Preview the first five chunks as ids and as decoded text.
for chunk in sequences.take(5):
  decoded_chunk = text_from_ids(chunk)
  print(chunk, decoded_chunk)

tf.Tensor([ 1 17 16 13 26  4 10 13 10  2 21  6  1  1 21  6], shape=(16,), dtype=int64) tf.Tensor(b'*polyciliate**te', shape=(), dtype=string)
tf.Tensor([13  6 14  6 21 19 16  8 19  2 17  9  1  1 21 19], shape=(16,), dtype=int64) tf.Tensor(b'lemetrograph**tr', shape=(), dtype=string)
tf.Tensor([10 16 15  7 16  1  1  6 25  4 13  2 10 14  6 19], shape=(16,), dtype=int64) tf.Tensor(b'ionfo**exclaimer', shape=(), dtype=string)
tf.Tensor([ 1  1  4 16 13 16 21 16 14 26  1  1 24  2 10 20], shape=(16,), dtype=int64) tf.Tensor(b'**colotomy**wais', shape=(), dtype=string)
tf.Tensor([21 13  6 20 20  1  1  2 15 21  9 19  2 14 10 15], shape=(16,), dtype=int64) tf.Tensor(b'tless**anthramin', shape=(), dtype=string)


In [7]:
def split_input_target(sequence):
    """Return (input, target) where target is the sequence shifted left by one.

    The input drops the last element and the target drops the first, so
    target[i] is the element that follows input[i] in the original sequence.
    """
    return sequence[:-1], sequence[1:]
# Turn every chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)

# Show the first pair decoded back to text to confirm the one-character shift.
for input_example, target_example in dataset.take(1):
    input_bytes = text_from_ids(input_example).numpy()
    target_bytes = text_from_ids(target_example).numpy()
    print("Input :", input_bytes)
    print("Target:", target_bytes)

Input : b'*polyciliate**t'
Target: b'polyciliate**te'


2022-01-17 14:24:57.979420: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2022-01-17 14:24:57.979570: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


In [8]:
# Number of (input, target) pairs per training batch.
BATCH_SIZE = 64
# Number of elements held in the shuffle buffer that batches are sampled from.
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` rather than from `dataset` itself.
# The previous `dataset = dataset.shuffle(...).batch(...)` form was not idempotent:
# re-running the cell would shuffle and batch the already-batched dataset again.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))


### Define the model
We use an RNN with GRUs defined in rnn_gru_model.py

In [9]:
# The model class lives in the project-local module rnn_gru_model.py.
from rnn_gru_model import RnnGRUModel
# Instantiated with its default constructor arguments (untrained at this point).
model = RnnGRUModel()

### Try the untrained model on the first sequence

In [10]:
# Run a single batch through the untrained model to check the output shape.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    prediction_shape = example_batch_predictions.shape
    print(prediction_shape, "# (batch_size, sequence_length, vocab_size)")

# The vocabulary size is 28, not 27, because the lookup layer reserves an [UNK] slot.
# TODO we don't actually need [UNK] since input involves all possible characters already
model.summary()

(64, 15, 28) # (batch_size, sequence_length, vocab_size)
Model: "rnn_gru_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  7168      
_________________________________________________________________
gru (GRU)                    multiple                  3938304   
_________________________________________________________________
dense (Dense)                multiple                  28700     
_________________________________________________________________
dense_1 (Dense)              multiple                  812       
Total params: 3,974,984
Trainable params: 3,974,984
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Take the predictions for the first example in the batch: one row per timestep.
first_example_logits = example_batch_predictions[0]

# Draw one id per timestep from the predicted distribution, then drop the
# trailing num_samples axis and convert to a NumPy array.
sampled_columns = tf.random.categorical(first_example_logits, num_samples=1)
sampled_indices = tf.squeeze(sampled_columns, axis=-1).numpy()
sampled_indices

array([10, 25, 23,  7, 19, 19, 12,  4, 16, 27, 14, 14,  5,  7, 14])

In [12]:
# Decode the first input sequence and the ids sampled for it back to text.
input_string = text_from_ids(input_example_batch[0]).numpy().decode('utf-8')
predicted_string = text_from_ids(sampled_indices).numpy().decode('utf-8')

print("Input:\n", input_string)
print("Next Char Predictions:\n", predicted_string)

Input:
 corn**larded**h
Next Char Predictions:
 ixvfrrkcozmmdfm


In [13]:
# Integer targets compared against model outputs that are treated as raw logits.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

# Loss of the untrained model on the example batch.
example_batch_loss = loss(target_example_batch, example_batch_predictions)
mean_loss = example_batch_loss.numpy().mean()

prediction_shape = example_batch_predictions.shape
print("Prediction shape: ", prediction_shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", mean_loss)


Prediction shape:  (64, 15, 28)  # (batch_size, sequence_length, vocab_size)
Mean loss:         3.3311896


In [14]:
# A newly initialized model shouldn't be too sure of itself, the output logits should
# all have similar magnitudes. To confirm this you can check that the exponential of the mean
# loss is approximately equal to the vocabulary size. A much higher loss means the model is
# sure of its wrong answers, and is badly initialized:
# Here the value to compare against is 28: the 27 characters plus the [UNK] slot.

tf.exp(mean_loss).numpy()

27.971594

### Train the model

In [None]:
# Configure training: Adam optimizer and the sparse categorical cross-entropy `loss` object.
model.compile(optimizer='adam', loss=loss)

In [None]:
# Checkpoints are written to <checkpoint_dir>/ckpt_<epoch>; the "{epoch}"
# placeholder in the prefix is left for the callback to fill in.
checkpoint_dir = 'training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Store only the model weights in each checkpoint, not the full model.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix,
)

In [None]:
# Number of epochs passed to model.fit.
EPOCHS = 5

In [None]:
# Train on `dataset` for EPOCHS epochs with the checkpoint callback attached;
# `history` keeps the object returned by fit().
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

### Create wrapper which generates and evaluates word plausibility

In [20]:
# RnnWordPlausibilityEvaluator comes from the project-local module rnn_plausiblewords.
from rnn_plausiblewords import RnnWordPlausibilityEvaluator
import logging
# The `logging` module itself is passed as the first positional argument
# (presumably used as the logger — verify against rnn_plausiblewords).
# NOTE(review): `model`, `ids_from_chars` and `chars_from_ids` are NOT passed in (see the
# commented-out arguments below), so the wrapper presumably loads its own saved model and
# lookup layers rather than using the `model` object from this notebook — confirm.
model_wrapper = RnnWordPlausibilityEvaluator(logging, temperature=1)
#                                             model=model,
#                                             ids_from_chars=ids_from_chars,
#                                             chars_from_ids=chars_from_ids,
#                                             temperature=0.8)



In [21]:
# TODO generate some words using Spolling Bree letters and see how the models agree or disagree
def create_words(given_model, num_words, max_chars=100):
    """Generate and print `num_words` words, one character at a time.

    Every word is seeded with the tensor ['**'] and extended by repeatedly
    calling `given_model.generate_one_step(next_char, states=states)`, which
    must return a `(next_char, states)` pair. Generation of a word stops as
    soon as the model emits '*' or after `max_chars` steps, whichever comes
    first. The seed and the closing '*' are included in the printed string.

    Args:
        given_model: object exposing `generate_one_step(chars, states=...)`.
        num_words: number of words to generate and print.
        max_chars: upper bound on generation steps per word (default 100,
            the limit that was previously hard-coded).

    Returns:
        None. Each generated word is printed on its own line.
    """
    for _ in range(num_words):
        # Start every word from a fresh recurrent state and the '**' seed.
        states = None
        next_char = tf.constant(['**'])
        pieces = [next_char]

        for _ in range(max_chars):
            next_char, states = given_model.generate_one_step(next_char, states=states)
            pieces.append(next_char)
            # '*' marks the end of a word.
            if next_char == '*':
                break

        # Concatenate the per-step character tensors element-wise into one string tensor.
        word = tf.strings.join(pieces)
        print(word[0].numpy().decode('utf-8'))

# Print five words sampled from the wrapped model.
create_words(model_wrapper, 5)

**ayuse*
**upweed*
**chastralite*
**petalouportionazeal*
**social*


In [None]:
# Evaluate a known English word with the wrapper. evaluate_word is defined in
# rnn_plausiblewords (not shown here); presumably it returns a plausibility score — confirm.
model_wrapper.evaluate_word('test')

### Save and load the model

In [None]:
## Save the model weights only
# Only weights are written under the "base_model_saved_weights" prefix; presumably an
# RnnGRUModel instance must be re-created before calling load_weights — confirm.
model.save_weights("base_model_saved_weights")

In [None]:
## Save the layers needed to regenerate ModelWrapper
# Each lookup layer is exported with tf.saved_model.save into a directory named after it.
tf.saved_model.save(chars_from_ids, 'chars_from_ids')
tf.saved_model.save(ids_from_chars, 'ids_from_chars')

In [None]:
# loaded_model.load_weights("base_model_saved_weights")