# Neural Nourishment

This project draws on examples from the Keras tutorials [Text generation with a miniature GPT](https://keras.io/examples/generative/text_generation_with_miniature_gpt/) and [GPT text generation from scratch with KerasNLP](https://keras.io/examples/generative/text_generation_gpt/), as well as the papers ["Attention is All You Need"](https://arxiv.org/abs/1706.03762) by Vaswani et al. and ["Language Models are Few-Shot Learners"](https://arxiv.org/abs/2005.14165) by Brown et al.

It uses [WordPiece Tokenization](https://research.google/blog/a-fast-wordpiece-tokenization-system/) and is trained on the [RecipeNLG dataset](https://www.kaggle.com/datasets/paultimothymooney/recipenlg) of 2,231,142 cooking recipes.


**Import necessary libraries**

In [None]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"

import keras
import keras_nlp
import pickle
import tensorflow as tf
import tensorflow.data as tf_data
import tensorflow.strings as tf_strings
import tensorflow.io as tf_io

In [None]:
# Train on a TPU when one can be resolved; otherwise use TensorFlow's current
# (default) distribution strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # Treated as "no TPU in this runtime".
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # `tf.distribute.experimental.TPUStrategy` is a deprecated alias of the
    # stable `tf.distribute.TPUStrategy`.
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)
print("GPUS: ", tf.config.list_physical_devices('GPU'))


# Check GPU availability.
# `tf.test.is_gpu_available()` is deprecated; its deprecation notice points to
# `tf.config.list_physical_devices('GPU')`, which returns an empty list when no
# GPU is visible.
print("GPU Available:", bool(tf.config.list_physical_devices('GPU')))

# Check TPU availability: true when any logical device reports type 'TPU'.
devices = tf.config.list_logical_devices()
tpu_available = any(device.device_type == 'TPU' for device in devices)

print("TPU Available:", tpu_available)


---
**Define model constants**

In [None]:
# ---- Data pipeline ----
# Number of recipes grouped into one training batch.
BATCH_SIZE = 64
# Minimum recipe string length; shorter strings are meant to be discarded.
# NOTE(review): not referenced by any code visible in this notebook.
MIN_STRING_LEN = 512
# Length of each training sequence in tokens, i.e. the model's context size.
SEQ_LEN = 512

# ---- Model architecture ----
# Size of each token's embedding vector.
EMBED_DIM = 256
# Width of the feed-forward network inside each transformer decoder.
FEED_FORWARD_DIM = 128
# Number of attention heads per decoder.
NUM_HEADS = 4
# Number of transformer decoders to stack.
NUM_LAYERS = 4
# Size of the token vocabulary.
VOCAB_SIZE = 2048

# ---- Training ----
EPOCHS = 40

# ---- Inference ----
# NOTE(review): not referenced by any code visible in this notebook.
NUM_TOKENS_TO_GENERATE = 80

# ---- Special tokens ----
START_OF_RECIPE = "<|recipe_start|>"
END_OF_RECIPE = "<|recipe_end|>"
PAD = "<|pad|>"
OOV = "<|oov|>"
# PAD is listed first; the model code masks token id 0 as padding.
SPECIAL_TOKENS = [PAD, START_OF_RECIPE, END_OF_RECIPE, OOV]

# ---- File names ----
# Where the trained tokenizer vocabulary is pickled.
VOCAB_FILE = "vocab.pickle"

---
**Define the dataset as strings of full recipes**

To keep training manageable on a laptop, we load the dataset into a TensorFlow `tf.data` dataset object. This allows us to stream data into memory as needed, as opposed to loading it all at once.

In [None]:
def csv_row_to_json(row):
    """Turn one raw CSV line of the recipe dataset into a JSON-formatted string.

    The line is parsed as seven string columns, all of them required (an empty
    `record_defaults` entry marks a column as required). Columns 1, 2, 3 and 6
    are used as the title, ingredients, directions and NER fields; the
    remaining columns are ignored.

    The ingredients, directions and NER values are spliced into the output
    verbatim (presumably they are already JSON arrays in the dataset — TODO
    confirm). The title is emitted as a JSON string literal, so backslashes
    and double quotes inside it are escaped first; without that, a title
    containing `"` would produce malformed JSON.

    Args:
        row: string tensor holding one CSV line.

    Returns:
        A string tensor of the form
        `{"ner": ..., "title": "...", "ingredients": ..., "directions": ...}`.

    Raises:
        Whatever `decode_csv` raises for a line that does not parse into seven
        fields; the dataset pipeline is expected to skip such rows.
    """
    columns = tf_io.decode_csv(
        records=row,
        record_defaults=[tf.constant([], dtype=tf.string)] * 7,
    )

    title = columns[1]
    ingredients = columns[2]
    directions = columns[3]
    ner = columns[6]

    # Escape backslashes before quotes so the backslashes introduced by the
    # quote escaping are not doubled. In the rewrite strings, `\\` denotes one
    # literal backslash.
    title = tf_strings.regex_replace(title, r'\\', r'\\\\')
    title = tf_strings.regex_replace(title, '"', r'\\"')

    return tf_strings.join([
        '{"ner": ', ner, ', ',
        '"title": "', title, '", ',
        '"ingredients": ', ingredients, ', ',
        '"directions": ', directions, '}',
    ])


# Build the raw recipe dataset step by step.
# Read the CSV file one text line at a time.
# (Inside a Kaggle notebook the file lives at
# "/kaggle/input/recipenlg/RecipeNLG_dataset.csv" instead.)
dataset = tf_data.TextLineDataset("RecipeNLG/RecipeNLG_dataset.csv")
# Drop the header row.
dataset = dataset.skip(1)
# Shuffle through a 256-record in-memory buffer.
dataset = dataset.shuffle(buffer_size=256)
# Convert every CSV row into a JSON-formatted recipe string.
dataset = dataset.map(csv_row_to_json)
# Skip rows that fail to parse instead of aborting the pipeline.
# (The `Dataset.ignore_errors()` method form was left disabled by the author.)
dataset = dataset.apply(tf.data.experimental.ignore_errors())
# Group recipes into batches so several records are trained on at once.
dataset = dataset.batch(BATCH_SIZE)

---
**Tokenize the dataset**

We train a WordPiece tokenizer on the dataset, reserving special tokens for the beginning and end of recipes. We can load the vocabulary and use the Keras `WordPieceTokenizer` to tokenize our tensors within the `tf.data` pipeline.

In [None]:
# Learn a WordPiece vocabulary of VOCAB_SIZE entries from the batched recipe
# strings, reserving the special tokens.
vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
    dataset,
    vocabulary_size=VOCAB_SIZE,
    reserved_tokens=SPECIAL_TOKENS,
)

# Persist the vocabulary so this (slow) step can be skipped on later runs.
with open(VOCAB_FILE, "wb") as vocab_out:
    pickle.dump(vocab, vocab_out)

In [None]:
# Read back the vocabulary pickled by the training step.
# NOTE: pickle can execute arbitrary code on load; only open a vocabulary file
# that this notebook produced.
with open(VOCAB_FILE, "rb") as vocab_in:
    vocab = pickle.load(vocab_in)

# Build the tokenizer from the trained vocabulary. Its output is fixed at
# SEQ_LEN tokens per string, and the special tokens are recognized when they
# appear inside input strings.
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab,
    sequence_length=SEQ_LEN,
    oov_token=OOV,
    special_tokens=SPECIAL_TOKENS,
    special_tokens_in_strings=True,
)

# Look up the ids of the marker tokens once.
_start_id = tokenizer.token_to_id(START_OF_RECIPE)
_end_id = tokenizer.token_to_id(END_OF_RECIPE)
_pad_id = tokenizer.token_to_id(PAD)

# Packer that adds the start and end markers and pads to SEQ_LEN.
packer = keras_nlp.layers.StartEndPacker(
    sequence_length=SEQ_LEN,
    start_value=_start_id,
    end_value=_end_id,
    pad_value=_pad_id,
)

# Packer used only to build training pairs. It is one token longer than the
# context size so the packed sequence can be split into an input and a
# next-token target of SEQ_LEN tokens each.
_training_packer = keras_nlp.layers.StartEndPacker(
    sequence_length=SEQ_LEN + 1,
    start_value=tokenizer.token_to_id(START_OF_RECIPE),
    end_value=tokenizer.token_to_id(END_OF_RECIPE),
    pad_value=tokenizer.token_to_id(PAD),
)


def preprocess(recipe_batch):
    """Turn a batch of recipe strings into (features, labels) for next-token training.

    Each recipe is tokenized, wrapped as `START tokens... END` and padded.
    `features` is that sequence without its last position and `labels` is the
    same sequence without its first position, so `labels[:, i]` is the token
    that follows `features[:, i]`.

    The earlier version packed the already-padded tokenizer output, which put
    the end marker in the final slot (after the padding) and used the raw
    tokenizer output as labels, so the end marker was never a training target.

    Args:
        recipe_batch: batch of recipe strings.

    Returns:
        A `(features, labels)` tuple of token-id tensors, each SEQ_LEN wide.
    """
    token_ids = tokenizer(recipe_batch)

    # The tokenizer pads every row to SEQ_LEN. Strip that trailing padding so
    # the end marker lands right after the last real token.
    # Assumes the tokenizer pads with the PAD id (0, as `mask_zero=True` in the
    # model also assumes) — TODO confirm.
    unpadded = tf.RaggedTensor.from_tensor(
        token_ids, padding=tokenizer.token_to_id(PAD)
    )

    packed = _training_packer(unpadded)

    features = packed[:, :-1]
    labels = packed[:, 1:]
    return features, labels

# Tokenize batches in parallel, then prefetch so preprocessing overlaps training.
dataset = dataset.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
dataset = dataset.prefetch(tf.data.AUTOTUNE)

---
**Define the model's architecture**

In [None]:
def create_model():
    """Build and compile the decoder-only transformer language model.

    Architecture: token-and-position embedding (token id 0 is masked), then
    NUM_LAYERS transformer decoder blocks, then a dense projection to
    VOCAB_SIZE logits per position.

    Each decoder block is its own layer instance with its own weights. The
    earlier version created a single `TransformerDecoder` and called it
    NUM_LAYERS times, which reuses one set of weights at every depth.

    Returns:
        A compiled `keras.Model` using the Adam optimizer, sparse categorical
        cross-entropy on logits, and a perplexity metric that ignores token
        id 0.
    """
    inputs = keras.layers.Input(shape=(None,), dtype="int32")

    # Token + position embedding.
    embedding_layer = keras_nlp.layers.TokenAndPositionEmbedding(
        vocabulary_size=VOCAB_SIZE,
        sequence_length=SEQ_LEN,
        embedding_dim=EMBED_DIM,
        mask_zero=True,
    )
    x = embedding_layer(inputs)

    # Stack NUM_LAYERS independent decoder blocks; a new layer object per
    # iteration gives every depth its own parameters.
    for _ in range(NUM_LAYERS):
        decoder_layer = keras_nlp.layers.TransformerDecoder(
            num_heads=NUM_HEADS,
            intermediate_dim=FEED_FORWARD_DIM,
            dropout=0.1,
        )
        x = decoder_layer(x)

    # Project each position to vocabulary logits.
    outputs = keras.layers.Dense(VOCAB_SIZE)(x)
    model = keras.Model(inputs=inputs, outputs=outputs)

    loss_function = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    perplexity = keras_nlp.metrics.Perplexity(from_logits=True, mask_token_id=0)

    model.compile(optimizer="adam", loss=loss_function, metrics=[perplexity])

    return model

In [None]:
# Build the model inside the distribution strategy chosen earlier. Without the
# scope a TPU strategy has no effect on where the model's variables are
# created; with the default strategy the scope changes nothing.
with strategy.scope():
    model = create_model()
model.summary()

---
**Define a custom callback for text generation**

In [None]:
class TopKTextGenerator(keras.callbacks.Callback):
    """Callback that prints a top-k sampled recipe at the end of every epoch.

    Relies on the module-level `model`, `tokenizer` and `packer` objects.
    """

    def __init__(self, k):
        """Create the sampler and the prompt.

        Args:
            k: value passed to `TopKSampler`.
        """
        # Initialize the base Callback state before adding our own attributes.
        super().__init__()
        self.sampler = keras_nlp.samplers.TopKSampler(k)
        # Prompt: the start-of-recipe string, tokenized and packed.
        self.prompt_tokens = packer(tokenizer([START_OF_RECIPE]))

    def _next(self, prompt, cache, index):
        """Return the logits for position `index`, as the sampler expects.

        Args:
            prompt: token ids generated so far.
            cache: passed through unchanged.
            index: position of the token to generate next.

        Returns:
            A `(logits, hidden_states, cache)` tuple; `hidden_states` is
            always None.
        """
        # The prediction for position `index` is read at position `index - 1`.
        logits = model(prompt)[:, index - 1, :]
        # Previously written with a stray trailing comma, which made this the
        # one-element tuple `(None,)` instead of None.
        hidden_states = None
        return logits, hidden_states, cache

    def on_epoch_end(self, epoch, logs=None):
        """Sample from the model and print the detokenized result."""
        output_tokens = self.sampler(
            next=self._next,
            prompt=self.prompt_tokens,
            index=1,  # position 0 holds the start token; sampling begins at 1
        )
        txt = tokenizer.detokenize(output_tokens)
        print(f"Top-K search generated text: \n{txt}\n")

---
**Fit the model!**

In [None]:
# For reference: one epoch takes roughly 5 hours 20 minutes on the author's
# M1 MacBook Pro with 16GB of RAM ... :(

# Sample-text callback. It is created here but not registered below, so no
# text is generated during training; add it to `callbacks` to enable it.
text_generation_callback = TopKTextGenerator(k=10)

# Write a checkpoint file named after the epoch number, regardless of whether
# the model improved.
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath='checkpoints/checkpoint_{epoch:02d}.keras',
    save_best_only=False,
)

callbacks = [checkpoint_callback]

model.fit(dataset, epochs=EPOCHS, callbacks=callbacks)