In [51]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import nltk
import os
import time

from datasets import load_dataset

In [52]:
# Hyperparameters
# NOTE: despite its name, block_size is the value passed to `.batch(...)` when the
# training dataset is built, i.e. the number of sequences per training batch.
block_size = 64
buffer_size = 10000 # size of the shuffle buffer passed to `.shuffle(...)` when the training dataset is built

## Get our Dataset ##

In [53]:
# Need to use the Hugging Face CLI to load this dataset
# Also make sure to set the package fsspec==2023.9.2 to avoid errors
# NOTE: the name `dataset` is reassigned to the tf.data training pipeline later in
# this notebook, so this cell must run before the cells that read the splits.
dataset = load_dataset("roneneldan/TinyStories")

Found cached dataset parquet (/Users/jauliegoe/.cache/huggingface/datasets/roneneldan___parquet/roneneldan--TinyStories-a62fc98e062666ca/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 2/2 [00:00<00:00,  4.15it/s]


In [54]:
# Hugging Face Datasets already have a split
# Printing the object lists each split with its features and number of rows
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2119719
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 21990
    })
})


In [55]:
# Pull the two splits out of the DatasetDict and grab the story strings
# of the training split
train_data, val_data = dataset["train"], dataset["validation"]
column_text = train_data['text']
print(f"There are {len(column_text)} tiny stories.")

There are 2119719 tiny stories.


In [56]:
# The model is trained on one long string, so the stories are joined with single spaces.
# Let's take only the first 20,000 stories to train our smaller model.
selected_stories = column_text[:20000]
raw_text = " ".join(selected_stories)
print(raw_text[:100])

One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with


## Encode the Text ##
Now we vectorize the text by encoding the strings with a numerical representation. Each character will be mapped to a value using a function. A decoder function will also map the values back to the same characters.

In [57]:
# Sorted list of every distinct character that occurs in the corpus
chars = sorted(set(raw_text))
vocab_size = len(chars)
print("The size of the vocabulary is", vocab_size, " unique characters.")

# Character -> index lookup table (used by the encoder)
string_to_int = {ch: i for i, ch in enumerate(chars)}

# Index -> character lookup table (used by the decoder)
int_to_string = dict(enumerate(chars))


def encode(s):
    """Map a string to the list of vocabulary indices of its characters.

    Raises:
        KeyError: if `s` contains a character that is not in `chars`.
    """
    return [string_to_int[c] for c in s]


def decode(l):
    """Map an iterable of vocabulary indices back to the string they spell.

    Raises:
        KeyError: if `l` contains an index that is not in `int_to_string`.
    """
    return ''.join(int_to_string[i] for i in l)

The size of the vocabulary is 97  unique characters.


In [58]:
# Let's look at all the unique characters that are in our dataset
# NOTE(review): entries such as 'Â', 'Ã', 'â', '€' and '™' in the output look like
# mis-decoded UTF-8 (mojibake) in the source text — confirm, and consider cleaning.
print(chars)

['\n', ' ', '!', '"', '$', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\xa0', '¡', '¦', '©', '«', '±', '³', '»', 'Â', 'Ã', 'â', 'œ', '˜', '“', '”', '€', '™']


### StringLookup Layer in Tensorflow ###
Alternatively, this can be done with a StringLookup Layer in Tensorflow, which converts each character to a value based on a vocabulary. An inverted StringLookup Layer with the argument `invert=True` will convert a list of values back to the corresponding characters.

In [59]:
# StringLookup Layer
# Forward layer: character -> id. Inverted layer (invert=True): id -> character.
# NOTE: the ids printed for `chars` start at 1; index 0 appears to be reserved for the
# out-of-vocabulary token "[UNK]" — confirm against the StringLookup documentation.
ids_from_chars = tf.keras.layers.StringLookup(vocabulary=chars, mask_token=None)
chars_from_ids = tf.keras.layers.StringLookup(vocabulary=chars, mask_token=None, invert=True)
ids = ids_from_chars(chars)
print(ids)
invert = chars_from_ids(ids)
print(invert)

# Function to join the characters in the Tensor back into a String
# (the characters are joined in the same order as the ids that are passed in)
def text_from_ids(ids):
  """Convert a tensor of character ids into the byte string they spell.

  The ids are mapped to characters with `chars_from_ids`, joined along the last
  axis, and returned as the `.numpy()` value of the joined tensor.
  """
  return tf.strings.reduce_join(chars_from_ids(ids), axis=-1).numpy()

# Round-trip check: characters -> ids -> joined text
word = ['H', 'e', 'l','l','o',' ', 'W', 'o','r','l','d']
word_ids = ids_from_chars(word)
print(word_ids)
print(text_from_ids(word_ids))

tf.Tensor(
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
 97], shape=(97,), dtype=int64)
tf.Tensor(
[b'\n' b' ' b'!' b'"' b'$' b'&' b"'" b'(' b')' b'*' b'+' b',' b'-' b'.'
 b'/' b'0' b'1' b'2' b'3' b'4' b'5' b'6' b'7' b'8' b'9' b':' b';' b'?'
 b'A' b'B' b'C' b'D' b'E' b'F' b'G' b'H' b'I' b'J' b'K' b'L' b'M' b'N'
 b'O' b'P' b'Q' b'R' b'S' b'T' b'U' b'V' b'W' b'X' b'Y' b'Z' b'a' b'b'
 b'c' b'd' b'e' b'f' b'g' b'h' b'i' b'j' b'k' b'l' b'm' b'n' b'o' b'p'
 b'q' b'r' b's' b't' b'u' b'v' b'w' b'x' b'y' b'z' b'\xc2\xa0' b'\xc2\xa1'
 b'\xc2\xa6' b'\xc2\xa9' b'\xc2\xab' b'\xc2\xb1' b'\xc2\xb3' b'\xc2\xbb'
 b'\xc3\x82' b'\xc3\x83' b'\xc3\xa2' b'\xc5\x93' b'\xcb\x9c'
 b'\xe2\x80\x9c' b'\xe2\x80\x9d' b'\xe2\x82\xac' b'\xe2\x84\xa2'], shape=(97,), dtype=str

## Prediction Task ##

### Create an Input Pipeline ###

What is the most probable next character? The input to the model we will create will be a sequence of n characters, and the model will be trained to predict the character at n+1 at each step. We will use the `tf.data.Dataset.from_tensor_slices` function to create an input pipeline from a tensor of character ids called `all_ids`. Each element of the dataset corresponds to a slice of the input tensors.

In [60]:
# Split the corpus into characters and map every character to its id
all_ids = ids_from_chars(
    tf.strings.unicode_split(raw_text, input_encoding='UTF-8'))
# Wrap the ids in a tf.data.Dataset object so they can be batched
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

seq_len = 100
# Each chunk holds seq_len + 1 ids so that the (input, label) pairs
# created later are both of length seq_len
sequences = ids_dataset.batch(seq_len + 1, drop_remainder=True)

# Show the individual characters of the first chunk
for seq in sequences.take(1):
    print(chars_from_ids(seq))

# Show the first five chunks joined back into text
for seq in sequences.take(5):
    print(text_from_ids(seq))

tf.Tensor(
[b'O' b'n' b'e' b' ' b'd' b'a' b'y' b',' b' ' b'a' b' ' b'l' b'i' b't'
 b't' b'l' b'e' b' ' b'g' b'i' b'r' b'l' b' ' b'n' b'a' b'm' b'e' b'd'
 b' ' b'L' b'i' b'l' b'y' b' ' b'f' b'o' b'u' b'n' b'd' b' ' b'a' b' '
 b'n' b'e' b'e' b'd' b'l' b'e' b' ' b'i' b'n' b' ' b'h' b'e' b'r' b' '
 b'r' b'o' b'o' b'm' b'.' b' ' b'S' b'h' b'e' b' ' b'k' b'n' b'e' b'w'
 b' ' b'i' b't' b' ' b'w' b'a' b's' b' ' b'd' b'i' b'f' b'f' b'i' b'c'
 b'u' b'l' b't' b' ' b't' b'o' b' ' b'p' b'l' b'a' b'y' b' ' b'w' b'i'
 b't' b'h' b' '], shape=(101,), dtype=string)
b'One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with '
b'it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on h'
b'er shirt.\n\nLily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew'
b' my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."\n\nTogeth'
b"er, they shared the 

### Create a Dataset ###
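Now that we have a dataset of sequences for our pipeline, we need to create a dataset of (input, label) pairs for training. The input is a sequence with its last character dropped, and the label is the same sequence shifted forward by one character, so each position in the label holds the character that follows the corresponding position in the input.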

In [61]:
def split_input_target(sequence):
    """Turn one sequence into an (input, target) pair offset by one position.

    The input is every element except the last; the target is every element
    except the first, so target[i] is the element that follows input[i].
    """
    return sequence[:-1], sequence[1:]

In [62]:
# Creates a dataset where each sequence maps to another
# sequence of the same size but shifted forward by 1
# NOTE: this rebinds `dataset`, which previously held the Hugging Face DatasetDict;
# cells that read dataset["train"] cannot be re-run after this point without reloading.
dataset = sequences.map(split_input_target)
# Show the first (input, target) pair as text
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example))
    print("Target:", text_from_ids(target_example))

Input : b'One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with'
Target: b'ne day, a little girl named Lily found a needle in her room. She knew it was difficult to play with '


In [63]:
# Shuffle the data and put it into new batches before feeding it to the model.
# The pipeline is rebuilt from `sequences` instead of from `dataset` itself, so
# re-running this cell does not batch data that has already been batched.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(buffer_size)
    # block_size is used as the batch size (number of sequences per batch)
    .batch(block_size, drop_remainder=True)
    # allows tf to prefetch the next batch of data while the current batch
    # is being processed by the model
    .prefetch(tf.data.AUTOTUNE))

### Build the Model ###
Create a Model using a subclass of `tf.keras.Model`. This allows for the model to be customized for this use case. 

This model will have three layers:
- `tf.keras.layers.Embedding`: The input layer. A trainable lookup table that will map each character-ID to a vector. Creating an `embedding_dim` dimensional embedding vector for each character.
- `tf.keras.layers.GRU`: A Gated Recurrent Unit (GRU) Layer is a type of RNN used commonly for sequence modeling. 
- `tf.keras.layers.Dense`: This is the output layer. It produces `vocab_size` outputs, one for each entry in the vocabulary. Each output is a logit: an unnormalized score that a softmax converts into the model's probability for that character.

In [64]:
# Length of the vocabulary in StringLookup Layer
# NOTE: this overwrites the earlier `vocab_size` (the number of unique characters).
# The layer's vocabulary is one entry larger (98 vs. 97 in the outputs shown),
# presumably because of the "[UNK]" token — confirm.
vocab_size = len(ids_from_chars.get_vocabulary())

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [65]:
# Create the Model subclass
class MyModel(tf.keras.Model):
  """Character-level language model built from Embedding, GRU and Dense layers.

  Args:
    vocab_size: Number of distinct ids; used as the embedding input size and
      as the number of logits produced per time step.
    embedding_dim: Size of the embedding vector for each id.
    rnn_units: Number of units in the GRU layer.
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    # Call the base initializer without arguments (as OneStep does); passing
    # `self` explicitly would hand the instance over a second time as a stray
    # positional argument.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True, # returns sequences of outputs for each time step in the input sequence
                                   return_state=True) # returns the hidden state in addition to the output
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the model on a batch of id sequences.

    Args:
      inputs: Batch of character ids fed to the embedding layer.
      states: Optional initial GRU state. When None, the state returned by
        `self.gru.get_initial_state` is used.
      return_state: When True, also return the GRU state after the last step.
      training: Forwarded to each layer.

    Returns:
      The dense layer's output (one logit per vocabulary entry for every time
      step), or a `(logits, states)` tuple when `return_state` is True.
    """
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

In [66]:
# Create the Model itself
# (uses the vocab_size, embedding_dim and rnn_units values set in the hyperparameter cell)
model = MyModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [67]:
# Test the model by sampling a batch
example_batch = dataset.take(1)
# Pull the single (input, target) batch out of the one-element dataset
input_example_batch, target_example_batch = next(iter(example_batch))
# Forward pass through the (still untrained) model
example_batch_predictions = model(input_example_batch)
# The shape printed here is labelled (batch_size, sequence_length, vocab_size) further down
print(example_batch_predictions.shape)

(64, 100, 98)


In [68]:
# Show each layer of the model with its parameter count
model.summary()

Model: "my_model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     multiple                  25088     
                                                                 
 gru_2 (GRU)                 multiple                  3938304   
                                                                 
 dense_2 (Dense)             multiple                  100450    
                                                                 
Total params: 4,063,842
Trainable params: 4,063,842
Non-trainable params: 0
_________________________________________________________________


In [69]:
# Sample one next-character id per time step from the untrained model's logits
# for the first sequence of the sample batch, then drop the trailing sample axis
sampled_indices = tf.squeeze(
    tf.random.categorical(example_batch_predictions[0], num_samples=1),
    axis=-1)
print(sampled_indices)
print("Input:\n", text_from_ids(input_example_batch[0]))
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices))

tf.Tensor(
[18 45 48 86 88 76 92 94  7 38 88 43 24 30  6 65 55 79 50 13 27  4 80 62
 39 55 71 80 33 13 10 63 63 65 52 61 13 60 48 90 54 76  5 11  7  7 29  8
 16 17 23 90 22  7 58 61 34 20 76  7 63 76 56 38 37 50 77 46 49  6 69 16
 27 50 84 96 69  7 44 55 88 33 95  5 92 23 25  1 75 46 15 12 79 94 72 82
 88 18  7 40], shape=(100,), dtype=int64)
Input:
 b'pened. They hope he can fix the net. They want to test it again. Tom and Lily were playing in the pa'

Next Char Predictions:
 b'2QT\xc2\xb1\xc2\xbbv\xc5\x93\xe2\x80\x9c\'J\xc2\xbbO8B&kayV-;"zhKaqzE-*iikXg-fT\xc3\x83Zv$+\'\'A(017\xc3\x836\'dgF4v\'ivbJIVwRU&o0;V\xc2\xa9\xe2\x82\xaco\'Pa\xc2\xbbE\xe2\x80\x9d$\xc5\x9379\nuR/,y\xe2\x80\x9cr\xc2\xa1\xc2\xbb2\'L'


### Train the Model ###
The predictions so far tell us very little since the model has yet to be trained. Given the previous RNN state and the new input from this time step, we will predict the next character.

In [70]:
# Loss function
# from_logits=True tells the loss that the predictions it receives are raw logits
# rather than probabilities
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [71]:
# Evaluate the loss of the untrained model on the sample batch
example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", example_batch_mean_loss)

Prediction shape:  (64, 100, 98)  # (batch_size, sequence_length, vocab_size)
Mean loss:         tf.Tensor(4.5833273, shape=(), dtype=float32)


An untrained, newly initialized model should have output logits that are roughly equal to one another. If this is the case, then the exponential of the mean loss should be approximately the size of the vocabulary. Let's check that this is the case.

In [72]:
# Exponential of the mean loss; for an untrained model this should be close to the vocabulary size
tf.exp(example_batch_mean_loss).numpy()

97.83939

We configure the model's training using `tf.keras.Model.compile`, which sets various training-related options. We define the optimization algorithm to be used, Adam, and the loss function, Sparse Categorical Cross Entropy. The optimization algorithm is responsible for updating the model's weights based on the computed gradients. Adam (Adaptive Moment Estimation) is a popular optimization algorithm for training NNs because it has adaptive learning rates, momentum, bias correction, and is pretty efficient. The loss function (defined above) is the function whose value should be minimized (optimized).

In [73]:
# Attach the Adam optimizer and the loss object defined earlier to the model
model.compile(optimizer='adam', loss=loss)

### Checkpoint Callback ###
A checkpoint callback is a feature used to periodically save the model's weights and training progress during the training process. It allows the creation of backups or checkpoints of your model at specific intervals during training, so that training can later be resumed from those saved points or they can be used for evaluation or inference.

In [74]:
# Ensure checkpoints are saved
checkpoint_dir = './training_checkpoints' # directory where the checkpoints will be saved
# Name of the checkpoint files; "{epoch}" is a formatting placeholder in the file name
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# save_weights_only=True: only the model's weights are written, not the full model
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [75]:
# Train for EPOCHS passes over the dataset; the checkpoint callback is invoked during training
EPOCHS = 10
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Generate Text Using the Model ##
Now, we run the trained model in a loop, keeping track of its internal state as it is executed. Each time we call the model, we pass in some text and an internal state. The model returns a prediction for the next character and its new state. We pass the prediction and state back in to continue generating text.

In [76]:
# A model to make a single step prediction
class OneStep(tf.keras.Model):
  """Wraps a trained model to sample one next character per input string.

  Args:
    model: Model called with `inputs`, `states` and `return_state=True`; it
      must return a `(logits, states)` pair.
    chars_from_ids: Lookup layer mapping ids back to characters.
    ids_from_chars: Lookup layer mapping characters to ids.
    temperature: Divisor applied to the logits before sampling.
  """

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Build an additive mask over the vocabulary that holds -inf at the id of
    # "[UNK]", so that "[UNK]" can never be sampled.
    unk_ids = self.ids_from_chars(['[UNK]'])[:, None]
    vocabulary_length = len(ids_from_chars.get_vocabulary())
    unk_mask = tf.SparseTensor(
        indices=unk_ids,
        values=[-float('inf')] * len(unk_ids),
        dense_shape=[vocabulary_length])
    self.prediction_mask = tf.sparse.to_dense(unk_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for each string in `inputs`.

    Args:
      inputs: Tensor of strings to continue.
      states: Model state from the previous step, or None on the first step.

    Returns:
      A `(predicted_chars, states)` pair: the sampled characters and the
      state returned by the model.
    """
    # Strings -> characters -> ids, converted with .to_tensor() for the model
    split_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    token_ids = self.ids_from_chars(split_chars).to_tensor()

    # logits has shape [batch, char, next_char_logits]
    logits, states = self.model(inputs=token_ids, states=states,
                                return_state=True)

    # Keep only the last time step, scale by the temperature, then add the
    # mask that rules out "[UNK]"
    last_logits = logits[:, -1, :] / self.temperature
    masked_logits = last_logits + self.prediction_mask

    # Draw one id per batch entry and drop the sample axis
    sampled = tf.random.categorical(masked_logits, num_samples=1)
    next_ids = tf.squeeze(sampled, axis=-1)

    # Ids -> characters, returned together with the model state
    return self.chars_from_ids(next_ids), states

In [77]:
# Initialize the single step model
# (temperature is left at its default of 1.0)
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

In [78]:
# Generate text by repeatedly feeding the last sampled character and the
# returned state back into the single step model
start = time.time()
states = None
next_char = tf.constant(['A'])  # seed character
result = [next_char]

# 1000 steps produce a 1000 character text sample after the seed
for step in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

# Concatenate the per-step characters into one string tensor
result = tf.strings.join(result)
end = time.time()
generated_text = result[0].numpy().decode('utf-8')
print(generated_text, '\n\n' + '_'*80)
print('\nRun time:', end - start)

A dog, a book. It was blue and black. Sam did not know how.

"Hey, kids, Sue, to throw his food, but it still freezes."

Lily smiled and said, "Yes, you can play with you. A knob with them on cloth. We can take turns to fix it." They are still sad, but they are sad and angry. They splash and slide and swing. They found some water on a leaf on the table. It was not cold. It called from a pond.

"What is that later, it's full of you? Can you hear her heart," Lily said.

"Let's buy it and play again."

Bun fanters the lemon.

Lily is happy.

Sara and Ben are playing in the park with music. He takes them all around in the sand and sit still ones. One day, they want to play bans. They throw the red view down the slide.

But the dunns will not touch the colors. They swing on the swings. They share the laindres, but they are good.

"Let's play a game without thing, surks," mom says.

"Can you open the box? Wh two wind answers and pictures or lights or that monkeys. We will pull thim."

"Mom, 

In [79]:
# Export the single step generator as a SavedModel in the 'one_step' directory
tf.saved_model.save(one_step_model, 'one_step')





INFO:tensorflow:Assets written to: one_step/assets


INFO:tensorflow:Assets written to: one_step/assets
