# Neural Nourishment

This project draws on examples from the Keras tutorials [Text generation with a miniature GPT](https://keras.io/examples/generative/text_generation_with_miniature_gpt/) and [GPT text generation from scratch with KerasNLP](https://keras.io/examples/generative/text_generation_gpt/), as well as the papers ["Attention is All You Need"](https://arxiv.org/abs/1706.03762) by Vaswani et al. and ["Language Models are Few-Shot Listeners"](https://arxiv.org/abs/2005.14165) by Brown et al.

It uses a custom-trained [byte pair encoding tokenization algorithm](https://en.wikipedia.org/wiki/Byte_pair_encoding) and is trained on the [RecipeNLG dataset](https://www.kaggle.com/datasets/paultimothymooney/recipenlg) of 2,231,142 cooking recipes.


**Import neccesary libraries and the Tokenizer class**

In [15]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"

import csv
import json
import keras
from keras import layers
from keras import ops
import keras_nlp
import numpy as np
import tensorflow as tf
import tensorflow.data as tf_data
import tensorflow.strings as tf_strings

from tokenizer import Tokenizer

---
**Define hyperparameters**

In [16]:
# Data
BATCH_SIZE = 64
MIN_STRING_LEN = 512  # Strings shorter than this will be discarded
SEQ_LEN = 128  # Length of training sequences, in tokens

# Model
EMBED_DIM = 256
FEED_FORWARD_DIM = 128
NUM_HEADS = 3
NUM_LAYERS = 2
VOCAB_SIZE = 4096  # Limits parameters in model.

# Training
EPOCHS = 5
TOKENIZER_TRAINING_SIZE = 256

# Inference
NUM_TOKENS_TO_GENERATE = 80

# Special tokens
START_OF_RECIPE = "<|recipe_start|>"
END_OF_RECIPE = "<|recipe_end|>"
SPECIAL_TOKENS = {
    START_OF_RECIPE: VOCAB_SIZE - 1,
    END_OF_RECIPE: VOCAB_SIZE - 2,
}


---
**Define the dataset as strings of full recipes**

To keep training managable for a laptop, we load the dataset into a tensorflow dataset object. This allows us to load data into memory as needed, opposed to all at once. 

In [17]:
@tf.py_function(Tout=tf.string)
def parse_csv_row(row):
    row = tf_strings.as_string(row)
    row_values = next(csv.reader([row.numpy().decode('utf-8')]))

    ner = eval(row_values[6])
    title = row_values[1]
    ingredients = eval(row_values[2])
    directions = eval(row_values[3])

    stringified_recipe = json.dumps({
        'ner': ner,
        'title': title,
        'ingredients': ingredients,
        'directions': directions,
    })

    return START_OF_RECIPE + stringified_recipe + END_OF_RECIPE

# load in the csv file line by line
dataset = (
    tf_data.TextLineDataset("RecipeNLG/RecipeNLG_dataset.csv") # load the csv file line by line
    .skip(1) # skip the header row
    .shuffle(buffer_size=256) # store 256 shuffled records in memory at a time before reshuffling and refetching
    .map(lambda row: parse_csv_row(row)) # map each row of the csv to a stringified recipe
)

---
**Train the BPE tokenizer**

On a subset of our data, we train a custom byte-pair-encoding tokenizer. Special tokens are used for denoting the beginning and end of recipes.

In [4]:
tokenizer = Tokenizer()

tokenizer_dataset = ""
for recipe in dataset.take(TOKENIZER_TRAINING_SIZE):
    string_recipe = recipe.numpy().decode('utf-8')
    tokenizer_dataset += string_recipe

tokenizer.register_special_tokens(SPECIAL_TOKENS)
tokenizer.train(tokenizer_dataset, VOCAB_SIZE - len(SPECIAL_TOKENS))

2024-04-25 15:17:16.609816: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


---
**Tokenize the dataset**

Recipes are batched, tokenized representation, and prepared for training

In [18]:
@tf.py_function(Tout=(tf.int32, tf.int32))
def prepare_model_inputs(recipe):
    recipe = tf_strings.as_string(recipe).numpy().decode('utf-8') # convert the tensor to a string
    tokenized_recipe = tokenizer.encode(recipe) # tokenize the string
    if len(tokenized_recipe) > SEQ_LEN:
        X = tokenized_recipe[:SEQ_LEN]
        y = tokenized_recipe[1:SEQ_LEN + 1]
    else:
        padding = [0] * (SEQ_LEN - len(tokenized_recipe) + 1) # pad shorter token sequences with zeros to ensure uniform size
        X = tokenized_recipe[:] + padding[:-1]
        y = tokenized_recipe[1:] + padding[:]
    
    # X is a list of tokens, y is the same list shifted by one token. Both lists are of length SEQ_LEN
    return X, y

dataset = dataset.map(prepare_model_inputs, num_parallel_calls=tf.data.AUTOTUNE)
dataset = dataset.prefetch(tf.data.AUTOTUNE).batch(BATCH_SIZE) # batch so the model trains on multiple examples at once

In [19]:
def create_model():
    inputs = keras.layers.Input(shape=(BATCH_SIZE, SEQ_LEN), dtype="int32")

    # token embedding layer
    embedding_layer = keras_nlp.layers.TokenAndPositionEmbedding(
        vocabulary_size=VOCAB_SIZE,
        sequence_length=SEQ_LEN,
        embedding_dim=EMBED_DIM,
        mask_zero=True, # might throw an error if index zero is in the vocabulary(??)
    )

    # transformer decoders
    decoder_layer = keras_nlp.layers.TransformerDecoder(
        num_heads=NUM_HEADS,
        intermediate_dim=FEED_FORWARD_DIM,
    )

    # output layer
    output_layer = keras.layers.Dense(VOCAB_SIZE)

    x = embedding_layer(inputs)
    for _ in range(NUM_LAYERS):
        x = decoder_layer(x)
    outputs = output_layer(x)

    model = keras.Model(inputs=inputs, outputs=outputs)
    loss_function = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    perplexity = keras_nlp.metrics.Perplexity(from_logits=True, mask_token_id=0)

    model.compile(optimizer="adam", loss=loss_function, metrics=[perplexity])
    
    return model

In [20]:
model = create_model()
model.summary()

---
**Fit the model**

In [21]:
model.fit(
    dataset,
    verbose=2,
    epochs=EPOCHS,
    callbacks=[
        keras.callbacks.ModelCheckpoint("drop_save_at_{epoch}.keras"),
    ],
)

Epoch 1/5


ValueError: as_list() is not defined on an unknown TensorShape.

In [22]:
for X, y in dataset.take(1):
    print(f"---- X:")
    print(X.numpy())
    print(X.shape)
    print(f"---- y:")
    print(y.numpy())
    print(y.shape)


---- X:
[[4095  123   34 ...    0    0    0]
 [4095  123   34 ...    0    0    0]
 [4095  123   34 ... 1242  274  256]
 ...
 [4095  123   34 ...  518 1959  274]
 [4095  123   34 ...  780  257  256]
 [4095  123   34 ...  822  274  256]]
(64, 128)
---- y:
[[ 123   34  341 ...    0    0    0]
 [ 123   34  341 ...    0    0    0]
 [ 123   34  341 ...  274  256 1791]
 ...
 [ 123   34  341 ... 1959  274  256]
 [ 123   34  341 ...  257  256   50]
 [ 123   34  341 ...  274  256 2312]]
(64, 128)


2024-04-25 15:27:49.081936: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
