**Import neccesary libraries and the Tokenizer class**

In [47]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"

import csv
import json
import keras
from keras import layers
from keras import ops
import numpy as np
import tensorflow as tf
import tensorflow.data as tf_data
import tensorflow.strings as tf_strings

from tokenizer import Tokenizer


---
**Define the dataset as strings of full recipes**

To keep training managable for a laptop, we load the dataset into a tensorflow dataset object. This allows us to load data into memory as needed, opposed to all at once. 

In [48]:
START_OF_RECIPE = "<|recipe_start|>"
END_OF_RECIPE = "<|recipe_end|>"

@tf.py_function(Tout=tf.string)
def parse_csv_row(row):
    row = tf_strings.as_string(row)
    row_values = next(csv.reader([row.numpy().decode('utf-8')]))

    ner = eval(row_values[6])
    title = row_values[1]
    ingredients = eval(row_values[2])
    directions = eval(row_values[3])

    stringified_recipe = json.dumps({
        'ner': ner,
        'title': title,
        'ingredients': ingredients,
        'directions': directions,
    })

    return START_OF_RECIPE + stringified_recipe + END_OF_RECIPE

# load in the csv file line by line
dataset = tf_data.TextLineDataset("RecipeNLG/RecipeNLG_dataset.csv")
 # skip the header row
dataset = dataset.skip(1)
# shuffles the ordering of the dataset. Stores 256 shuffled records in memory at a time before reshuffling
dataset = dataset.shuffle(buffer_size=256)
# map each row of the csv to a stringified recipe
dataset = dataset.map(lambda row: parse_csv_row(row))

---
**Train the BPE tokenizer**

On a subset of our data, we train a custom byte-pair-encoding tokenizer. Special tokens are used for denoting the beginning and end of recipes.

In [49]:
tokenizer = Tokenizer()

VOCAB_SIZE = 2048

SPECIAL_TOKENS = {
    START_OF_RECIPE: VOCAB_SIZE - 1,
    END_OF_RECIPE: VOCAB_SIZE - 2,
}

tokenizer_dataset = ""
limit = 250
for recipe in dataset:
    if limit == 0: break
    tokenizer_dataset += recipe.numpy().decode('utf-8')
    limit -= 1

tokenizer.register_special_tokens(SPECIAL_TOKENS)
tokenizer.train(tokenizer_dataset, VOCAB_SIZE - len(SPECIAL_TOKENS))


---
**Tokenize the dataset**

Recipes are batched, tokenized representation, and prepared for training

In [50]:
BATCH_SIZE = 4 # defines the number of recipes we will process at a time
MAX_LEN = 4096  # Max sequence size

dataset = dataset.batch(batch_size=BATCH_SIZE) 

@tf.py_function(Tout=(tf.int32, tf.int32))
def prepare_model_inputs(batch_of_recipes):
    tokens = []
    
    for batch in batch_of_recipes:
        recipe = tf_strings.as_string(batch) # convert the tensor to a string
        tokenized_recipe = tokenizer.encode(recipe.numpy().decode('utf-8')) # tokenize the string
        tokens.append(tokenized_recipe)
        
    X, y = [], []
    for t in tokens:
        zeros = [0] * (MAX_LEN - len(t)) # pad shorter token sequences with zeros to ensure uniform size
        X.append(t[:-1] + zeros)
        y.append(t[1:] + zeros)

    return X, y

dataset = dataset.map(prepare_model_inputs, num_parallel_calls=tf.data.AUTOTUNE)
dataset = dataset.prefetch(tf.data.AUTOTUNE)

---
**Implement a TransformerBlock layer and a TokenAndPositionEmbedding layer.**

Based on the examples from [this Keras tutorial](https://keras.io/examples/generative/text_generation_with_miniature_gpt/) and the paper ["Attention is All You Need"](https://arxiv.org/abs/1706.03762) by Vaswani et al.


In [51]:
def causal_attention_mask(batch_size, n_dest, n_src, dtype):
    """
    Mask the upper half of the dot product matrix in self attention.
    This prevents flow of information from future tokens to current token.
    1's in the lower triangle, counting from the lower right corner.
    """
    i = ops.arange(n_dest)[:, None]
    j = ops.arange(n_src)
    m = i >= j - n_src + n_dest
    mask = ops.cast(m, dtype)
    mask = ops.reshape(mask, [1, n_dest, n_src])
    mult = ops.concatenate(
        [ops.expand_dims(batch_size, -1), ops.convert_to_tensor([1, 1])], 0
    )
    return ops.tile(mask, mult)


class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads, embed_dim)
        self.ffn = keras.Sequential(
            [
                layers.Dense(ff_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs):
        input_shape = ops.shape(inputs)
        batch_size = input_shape[0]
        seq_len = input_shape[1]
        causal_mask = causal_attention_mask(batch_size, seq_len, seq_len, "bool")
        attention_output = self.att(inputs, inputs, attention_mask=causal_mask)
        attention_output = self.dropout1(attention_output)
        out1 = self.layernorm1(inputs + attention_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)
    

class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = ops.shape(x)[-1]
        positions = ops.arange(0, maxlen, 1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

---
**Define the model architecture and hyperparameters**

Again, based on the examples from [this Keras tutorial](https://keras.io/examples/generative/text_generation_with_miniature_gpt/) and the paper ["Attention is All You Need"](https://arxiv.org/abs/1706.03762) by Vaswani et al.

In [52]:
EMBEDDING_SIZE = 256 # Embedding size for each token
NUM_ATTENTION_HEADS = 2 # Number of attention heads
FEED_FORWARD_SIZE = 256 # Feed forward size in each transformer block

def create_model():
    inputs = layers.Input(shape=(MAX_LEN,), dtype="int32")
    embedding_layer = TokenAndPositionEmbedding(MAX_LEN, VOCAB_SIZE, EMBEDDING_SIZE)
    x = embedding_layer(inputs)
    transformer_block = TransformerBlock(EMBEDDING_SIZE, NUM_ATTENTION_HEADS, FEED_FORWARD_SIZE)
    x = transformer_block(x)
    outputs = layers.Dense(VOCAB_SIZE)(x)
    model = keras.Model(inputs=inputs, outputs=[outputs, x])
    loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(
        "adam",
        loss=[loss_fn, None],
    )  # No loss and optimization based on word embeddings from transformer block
    return model


**Define our RecipeGenerator class**

Inspired by the example in [the Keras tutorial](https://keras.io/examples/generative/text_generation_with_miniature_gpt/) but adapted to fit this use case

In [53]:
class RecipeGenerator(keras.callbacks.Callback):
    """A callback to generate a recipe from our trained model.
    1. Feed some starting prompt to the model
    2. Predict probabilities for the next token
    3. Sample the next token and add it to the next input 

    Arguments:
        start_tokens: List of integers, the tokens for the starting prompt.
        top_k: Integer, sample from the `top_k` token predictions. Defaults to 10.
        print_every: Integer, print after this many epochs. Defaults to 1.
        max_tokens: Integer, the maximum number of tokens to be generated after prompt. 
            Generation will end early when we reach the `<|recipe_end|>` token. Defaults to 300.
    """
    def __init__(self, start_tokens, top_k=10, print_every=1, max_tokens=300):
        self.start_tokens = start_tokens
        self.top_k = top_k
        self.print_every = print_every
        self.max_tokens = max_tokens

    def sample_from(self, logits):
        logits, idxs = ops.top_k(logits, k=self.top_k, sorted=True)
        idsx = np.asarray(idxs).astype("int32")
        predictions = keras.activations.softmax(ops.expand_dims(logits, 0))[0]
        predictions = np.asarray(predictions).astype("float32")
        return np.random.choice(idxs, p=predictions)
    
    def on_epoch_end(self, epoch):
        if (epoch + 1) % self.print_every != 0: return
        
        start_tokens = [token for token in self.start_tokens]
        tokens_generated = []

        while len(tokens_generated) <= self.max_tokens and tokens_generated[-1] != SPECIAL_TOKENS[END_OF_RECIPE]:
            pad_len = MAX_LEN - len(start_tokens)
            sample_index = len(start_tokens) - 1

            if pad_len < 0:
                x = start_tokens[:MAX_LEN]
                sample_index = MAX_LEN - 1
            elif pad_len > 0:
                x = start_tokens + [0] * pad_len
            else:
                x = start_tokens

            x = np.array([x])
            y, _ = self.model.predict(x, verbose=0)
            sample_token = self.sample_from(y[0][sample_index])
            tokens_generated.append(sample_token)
            start_tokens.append(sample_token)
        
        generated_text = tokenizer.decode(self.start_tokens + tokens_generated)
        print(f"Generated text on epoch {epoch + 1}:\n{generated_text}")
        try:
            formatted_recipe = json.loads(generated_text)
            print(json.dumps(formatted_recipe, indent=3))
        except:
            print("Something went wrong, failed to parse recipe into a JSON object.")

In [46]:
start_prompt = START_OF_RECIPE +'{"ner": ["garlic",'
start_tokens = tokenizer.encode(start_prompt)
recipe_gen_callback = RecipeGenerator(start_tokens)

model = create_model()

model.fit(
    dataset,
    verbose=2,
    epochs=25,
    callbacks=[
        recipe_gen_callback,
        keras.callbacks.ModelCheckpoint("drop_save_at_{epoch}.keras"),
    ],
)

Epoch 1/25


ValueError: as_list() is not defined on an unknown TensorShape.