**Import necessary libraries**

In [1]:
import ast
import json
import os

# Keras picks its backend when it is first imported, so KERAS_BACKEND has to be
# set before `import keras` for the setting to take effect.
os.environ["KERAS_BACKEND"] = "tensorflow"

import keras
from keras import layers
from keras import ops
import numpy as np
import pandas as pd
import tiktoken

---
**Define the `DEBUG` variable**

Set `DEBUG` to `False` when you're ready to run and train the full model. To speed up debugging, we operate on a subset of the total dataset.


In [12]:
# When True, the data-loading cell keeps only a small random sample of the
# recipes instead of the full (shuffled) dataset, which speeds up iteration.
DEBUG = True

---
**Load the data and drop unwanted recipe sources**

To keep training manageable for a laptop, we drop some recipe sources from the dataset. We also reshuffle the recipes to ensure the model learns evenly from all remaining recipe sources.

In [3]:
# Read the RecipeNLG CSV into a DataFrame and report how many rows it holds
# before any filtering is applied.
dataset_path = 'RecipeNLG/RecipeNLG_dataset.csv'
df = pd.read_csv(dataset_path)
print(f'df size before: {len(df)}')

df size before: 2231142


In [13]:
# NOTE: this cell reassigns `df` in place, so it is not idempotent. Re-run the
# loading cell first if you need to start again from the full dataset;
# otherwise the filters below operate on the already-sampled frame.
#
# `regex=False` matches the host names literally (as a regex, "." would match
# any character). `na=False` treats rows with a missing link as "no match" so
# the boolean mask never contains NaN.
df = df[~df['link'].str.contains('www.cookbooks.com', regex=False, na=False)]
print(f'df size without cookbooks.com: {len(df)}')
df = df[~df['link'].str.contains('www.allrecipes.com', regex=False, na=False)]
print(f'df size without allrecipes.com: {len(df)}')

# DEBUG: keep a small random subset (capped at the number of rows available so
# sampling cannot fail on a small frame). Otherwise shuffle every row.
df = df.sample(min(1000, len(df))) if DEBUG else df.sample(frac=1)


df size without cookbooks.com: 1000
df size without allrecipes.com: 1000


---
**Transform the recipe dataset into a tokenization-ready string**

Recipes are transformed into the string representation of a JSON object so that generated text can be parsed back easily. Special tokens marking the start and end of a recipe are added to both ends of the stringified recipe.

In [5]:
# Marker strings wrapped around every stringified recipe.
START_OF_RECIPE = "<|recipe_start|>"
END_OF_RECIPE = "<|recipe_end|>"

def stringify_recipe(recipe):
    """Serialize one recipe row as a JSON string wrapped in start/end markers.

    Arguments:
        recipe: Mapping-like row (e.g. a DataFrame row) with the keys
            'title', 'ingredients', 'directions' and 'NER'. The last three are
            strings holding Python literals (presumably lists of strings --
            confirm against the dataset).

    Returns:
        START_OF_RECIPE + JSON object + END_OF_RECIPE. The JSON object has the
        keys 'ner', 'title', 'ingredients', 'directions', in that order.

    Raises:
        ValueError or SyntaxError if a literal column does not contain a valid
        Python literal.
    """
    title = recipe['title']
    # ast.literal_eval only accepts literals (lists, strings, numbers, ...).
    # Unlike eval(), it cannot execute code embedded in the CSV file.
    ingredients = ast.literal_eval(recipe['ingredients'])
    directions = ast.literal_eval(recipe['directions'])
    ner = ast.literal_eval(recipe['NER'])

    stringified_recipe = json.dumps({
        'ner': ner,
        'title': title,
        'ingredients': ingredients,
        'directions': directions,
    })
    return START_OF_RECIPE + stringified_recipe + END_OF_RECIPE

# Apply stringify_recipe to each row; the result is a Series with one
# stringified recipe per row of `df`.
stringified_recipes = df.apply(stringify_recipe, axis="columns")

---
**Tokenize the recipes**

Using OpenAI's Byte Pair Encoding tokenizer, [tiktoken](https://github.com/openai/tiktoken?tab=readme-ov-file), we tokenize the recipe dataset.

In [6]:
cl100k_base = tiktoken.get_encoding("cl100k_base")

# Token ids assigned to the recipe start/end markers.
SPECIAL_TOKENS = {
    START_OF_RECIPE: 100264,
    END_OF_RECIPE: 100265,
}

# Same encoding as cl100k_base, extended with the recipe delimiter tokens.
enc = tiktoken.Encoding(
    name="cl100k_im",
    pat_str=cl100k_base._pat_str,
    mergeable_ranks=cl100k_base._mergeable_ranks,
    special_tokens={
        **cl100k_base._special_tokens,
        **SPECIAL_TOKENS,
    }
)

# Encode each recipe separately and join once at the end. Calling np.append
# inside the loop would copy the whole accumulated array on every iteration.
encoded_chunks = [
    np.asarray(enc.encode(recipe, allowed_special="all"), dtype=np.int64)
    for recipe in stringified_recipes
]
encoded_recipes = (
    np.concatenate(encoded_chunks)
    if encoded_chunks
    else np.array([], dtype=np.int64)
)

# Raw token ids are fed straight to the model, so the vocabulary size must
# cover every id the tokenizer can emit (largest id + 1). Counting the distinct
# tokens in the corpus would be too small and lead to out-of-range ids.
# This is far larger than the number of distinct tokens actually used;
# remapping ids to a compact range would shrink the model, but the ids would
# then have to be mapped back before decoding.
VOCAB_SIZE = enc.n_vocab

# Number of leading tokens shown in the previews below.
PREVIEW_TOKEN_COUNT = 100

print(type(encoded_recipes), encoded_recipes.shape)

preview_tokens = encoded_recipes[:PREVIEW_TOKEN_COUNT].tolist()

# Preview of the raw token ids.
print("[", end='')
for token in preview_tokens:
    print(f"  {token}  , ", end='')
print("]")

# The same tokens decoded back to text, one token at a time.
print("[", end='')
for token in preview_tokens:
    print(f"  {enc.decode([token])}  , ", end='')
print("]")


<class 'numpy.ndarray'> (315768,)
[  100264  ,   5018  ,   1215  ,   794  ,   4482  ,   12440  ,   416  ,   498  ,   330  ,   337  ,   535  ,   5707  ,   498  ,   330  ,   566  ,   309  ,   23661  ,   498  ,   330  ,   273  ,   1677  ,   23661  ,   498  ,   330  ,   29468  ,   94582  ,   2857  ,   498  ,   330  ,   73480  ,   3258  ,   498  ,   330  ,   83  ,   1138  ,   6241  ,   498  ,   330  ,   72408  ,   523  ,   1924  ,   498  ,   330  ,   277  ,   773  ,   5724  ,   8073  ,   330  ,   2150  ,   794  ,   330  ,   1542  ,   1636  ,   2522  ,   37037  ,   1725  ,   449  ,   12093  ,   15386  ,   12279  ,   277  ,   416  ,   21252  ,   352  ,   263  ,   498  ,   330  ,   39220  ,   794  ,   4482  ,   17  ,   14971  ,   315  ,   31735  ,   498  ,   330  ,   19  ,   93200  ,   33213  ,   5707  ,   498  ,   330  ,   16  ,   220  ,   16  ,   14  ,   19  ,   16701  ,   9581  ,   24964  ,   54789  ,   477  ,   23542  ,   24964  ,   54789  ,   498  ,   330  ,   16  ,   14  ,   17  ,   1074

---
**Split into training and validation datasets**

In [7]:
TRAIN_VAL_SPLIT = 0.9  # 90% training, 10% validation

# Index of the first validation token: everything before it is training data,
# everything from it onwards is validation data.
n = int(len(encoded_recipes) * TRAIN_VAL_SPLIT)
training_data, validation_data = encoded_recipes[:n], encoded_recipes[n:]

---
**Define the data batching function**

The `X` term is a random slice of length `CONTEXT_SIZE` from the specified data source (training or validation). The `Y` term is the same but offset by one.

In [8]:
CONTEXT_SIZE = 5

def get_batch(split):
    """Draw one random window of CONTEXT_SIZE tokens and its shifted targets.

    Arguments:
        split: "train" selects `training_data`; any other value selects
            `validation_data`.

    Returns:
        A pair (X, y): X is a slice of CONTEXT_SIZE consecutive tokens and y is
        the slice of the same length starting one position later.
    """
    if split == "train":
        source = training_data
    else:
        source = validation_data

    # The exclusive upper bound leaves room for the one-token target shift.
    offset = np.random.randint(len(source) - CONTEXT_SIZE)
    stop = offset + CONTEXT_SIZE

    inputs = source[offset:stop]
    targets = source[offset + 1:stop + 1]
    return inputs, targets

# Draw one sample and show how each growing prefix of X is paired with the
# token that follows it.
Xb, yb = get_batch("train")
for sample in (Xb, yb):
    print(sample)

for c, target in enumerate(yb[:CONTEXT_SIZE]):
    context = Xb[:c+1]
    print(f"with context={context.tolist()} the target is {target}")



[91178  3156 21411   389  1855]
[ 3156 21411   389  1855  3185]
with context=[91178] the target is 3156
with context=[91178, 3156] the target is 21411
with context=[91178, 3156, 21411] the target is 389
with context=[91178, 3156, 21411, 389] the target is 1855
with context=[91178, 3156, 21411, 389, 1855] the target is 3185


---
**Implement a TransformerBlock layer and a TokenAndPositionEmbedding layer.**

Based on the examples from [this Keras tutorial](https://keras.io/examples/generative/text_generation_with_miniature_gpt/) and the paper ["Attention is All You Need"](https://arxiv.org/abs/1706.03762) by Vaswani et al.


In [9]:
def causal_attention_mask(batch_size, n_dest, n_src, dtype):
    """
    Build a (batch_size, n_dest, n_src) mask for causal self attention.

    Entry [b, i, j] is truthy when i >= j - n_src + n_dest, i.e. the ones fill
    the lower triangle counted from the lower right corner. This stops
    information flowing from future tokens to the current token.
    """
    rows = ops.arange(n_dest)[:, None]
    cols = ops.arange(n_src)
    allowed = rows >= cols - n_src + n_dest
    single_mask = ops.reshape(ops.cast(allowed, dtype), [1, n_dest, n_src])
    # Repeat the single mask batch_size times along the leading axis only.
    repeats = ops.concatenate(
        [ops.expand_dims(batch_size, -1), ops.convert_to_tensor([1, 1])], 0
    )
    return ops.tile(single_mask, repeats)


class TransformerBlock(layers.Layer):
    """Causal self-attention followed by a two-layer feed-forward network.

    Each sub-layer's output goes through dropout, is added back to that
    sub-layer's input, and is then layer-normalized.

    Arguments:
        embed_dim: Passed as the attention key size and used as the output
            width of the feed-forward network.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward layer.
        rate: Dropout rate applied after each sub-layer. Defaults to 0.1.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs):
        """Run the block on `inputs` and return the normalized result."""
        shape = ops.shape(inputs)
        batch_size, seq_len = shape[0], shape[1]
        mask = causal_attention_mask(batch_size, seq_len, seq_len, "bool")

        # Attention sub-layer: attend, drop out, add the residual, normalize.
        attended = self.dropout1(self.att(inputs, inputs, attention_mask=mask))
        attention_normed = self.layernorm1(inputs + attended)

        # Feed-forward sub-layer with the same residual pattern.
        projected = self.dropout2(self.ffn(attention_normed))
        return self.layernorm2(attention_normed + projected)
    

class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Arguments:
        maxlen: Number of rows in the position embedding table.
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Width of both embeddings.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed the token ids in `x` and add the embedding of each position."""
        seq_len = ops.shape(x)[-1]
        position_vectors = self.pos_emb(ops.arange(0, seq_len, 1))
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors

---
**Define the model architecture and hyperparameters**

Again, based on the examples from [this Keras tutorial](https://keras.io/examples/generative/text_generation_with_miniature_gpt/) and the paper ["Attention is All You Need"](https://arxiv.org/abs/1706.03762) by Vaswani et al.

In [10]:
maxlen = 80  # Max sequence size
EMBEDDING_SIZE = 256
NUM_ATTENTION_HEADS = 2
FEED_FORWARD_SIZE = 256


def create_model():
    """Build and compile the single-block transformer language model.

    Returns:
        A compiled keras.Model that takes `maxlen` int32 token ids and has two
        outputs: the per-position logits over the vocabulary and the
        transformer block's output. Only the logits carry a loss.
    """
    token_ids = layers.Input(shape=(maxlen,), dtype="int32")
    embedded = TokenAndPositionEmbedding(maxlen, VOCAB_SIZE, EMBEDDING_SIZE)(token_ids)
    hidden = TransformerBlock(
        EMBEDDING_SIZE, NUM_ATTENTION_HEADS, FEED_FORWARD_SIZE
    )(embedded)
    logits = layers.Dense(VOCAB_SIZE)(hidden)

    model = keras.Model(inputs=token_ids, outputs=[logits, hidden])
    # No loss and optimization based on word embeddings from transformer block
    model.compile(
        "adam",
        loss=[keras.losses.SparseCategoricalCrossentropy(from_logits=True), None],
    )
    return model


**Define our RecipeGenerator class**

Inspired by the example in [the Keras tutorial](https://keras.io/examples/generative/text_generation_with_miniature_gpt/) but adapted to fit this use case

In [11]:
class RecipeGenerator(keras.callbacks.Callback):
    """A callback to generate a recipe from our trained model.
    1. Feed some starting prompt to the model
    2. Predict probabilities for the next token
    3. Sample the next token and add it to the next input

    Arguments:
        start_tokens: Non-empty list of integers, the tokens for the starting prompt.
        top_k: Integer, sample from the `top_k` token predictions. Defaults to 10.
        print_every: Integer, print after this many epochs. Defaults to 1.
        max_tokens: Integer, the maximum number of tokens to be generated after prompt.
            Generation will end early when we reach the `<|recipe_end|>` token. Defaults to 300.

    Raises:
        ValueError: if `start_tokens` is empty.
    """
    def __init__(self, start_tokens, top_k=10, print_every=1, max_tokens=300):
        super().__init__()
        # Store plain Python ints so list concatenation and decoding behave the
        # same whatever sequence type the caller passed in.
        self.start_tokens = [int(token) for token in start_tokens]
        if not self.start_tokens:
            # With no prompt there is no position to read a prediction from.
            raise ValueError("start_tokens must contain at least one token")
        self.top_k = top_k
        self.print_every = print_every
        self.max_tokens = max_tokens

    def sample_from(self, logits):
        """Sample one token id from the `top_k` highest-scoring logits."""
        logits, idxs = ops.top_k(logits, k=self.top_k, sorted=True)
        idxs = np.asarray(idxs).astype("int32")
        predictions = keras.activations.softmax(ops.expand_dims(logits, 0))[0]
        predictions = np.asarray(predictions).astype("float32")
        return np.random.choice(idxs, p=predictions)

    def _generate_tokens(self):
        """Sample up to `max_tokens` tokens that continue the starting prompt."""
        end_token = SPECIAL_TOKENS[END_OF_RECIPE]
        context = list(self.start_tokens)
        tokens_generated = []

        while len(tokens_generated) < self.max_tokens:
            # The model only accepts `maxlen` tokens: once the sequence is
            # longer, feed the most recent `maxlen` so the context keeps moving.
            window = context[-maxlen:]
            sample_index = len(window) - 1
            # Right-pad short prompts to the fixed input length.
            x = np.array([window + [0] * (maxlen - len(window))])

            y, _ = self.model.predict(x, verbose=0)
            sample_token = int(self.sample_from(y[0][sample_index]))
            tokens_generated.append(sample_token)
            context.append(sample_token)

            if sample_token == end_token:
                break

        return tokens_generated

    def on_epoch_end(self, epoch, logs=None):
        """Generate and print a recipe every `print_every` epochs.

        Arguments:
            epoch: Integer, zero-based index of the epoch that just finished.
            logs: Metrics dict passed by Keras; unused.

        Returns:
            The generated recipe text (prompt included), or None on epochs
            where nothing is generated.
        """
        if (epoch + 1) % self.print_every != 0:
            return None

        tokens_generated = self._generate_tokens()
        recipe_text = enc.decode(self.start_tokens + tokens_generated)
        print(f"generated recipe:\n{recipe_text}\n")
        return recipe_text