In [1]:
import json
import numpy as np
import pandas as pd
import tiktoken
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Set to True when ready to run the full model, to speed up debugging we can operate on a subset.
DEBUG = True

# Set to True to print information as the code runs. Automatically set to False if DEBUG is False. Override at your own risk
VERBOSE = True 
if not DEBUG: VERBOSE = False

def print(*args, **kwargs):
    if VERBOSE:
        __builtins__.print(*args, **kwargs)

In [3]:
df = pd.read_csv('RecipeNLG/RecipeNLG_dataset.csv')
print(f'df size before: {len(df)}')
if VERBOSE: df.head()

df size before: 2231142


In [4]:
df = df[~df['link'].str.contains('www.cookbooks.com')]
print(f'df size without cookbooks.com: {len(df)}')
df = df[~df['link'].str.contains('www.allrecipes.com')]
print(f'df size without allrecipes.com: {len(df)}')
if DEBUG: df = df.sample(1000)

if VERBOSE: df.head()


df size without cookbooks.com: 1334801
df size without allrecipes.com: 1273403


In [5]:
START_OF_RECIPE = "<|recipe_start|>"
END_OF_RECIPE = "<|recipe_end|>"

def stringify_recipe(recipe):
    title = recipe['title']
    ingredients = eval(recipe['ingredients'])
    directions = eval(recipe['directions'])
    ner = eval(recipe['NER'])

    stringified_recipe = json.dumps({
        'ner': ner,
        'title': title,
        'ingredients': ingredients,
        'directions': directions,
    })
    return START_OF_RECIPE + stringified_recipe + END_OF_RECIPE

stringified_recipes = df.apply(stringify_recipe, axis=1)

In [6]:
cl100k_base = tiktoken.get_encoding("cl100k_base")
enc = tiktoken.Encoding(
    name="cl100k_im",
    pat_str=cl100k_base._pat_str,
    mergeable_ranks=cl100k_base._mergeable_ranks,
    special_tokens={
        **cl100k_base._special_tokens,
        START_OF_RECIPE: 100264,
        END_OF_RECIPE: 100265,
    }
)

encoded_recipes = np.array([], dtype=np.int64)
for recipe in stringified_recipes:
    encoded_recipe = enc.encode(recipe, allowed_special="all")
    encoded_recipes = np.append(encoded_recipes, encoded_recipe)

VOCAB_SIZE = len(set(encoded_recipes))

print(type(encoded_recipes), encoded_recipes.shape)

print("[", end='')
for token in encoded_recipes[:1000]:
    print(f"  {token}  , ", end='')
print("]")


print("[", end='')
for token in encoded_recipes[:1000]:
    print(f"  {enc.decode([token])}  , ", end='')
print("]")


<class 'numpy.ndarray'> (322154,)
[  100264  ,   5018  ,   1215  ,   794  ,   4482  ,   5755  ,   498  ,   330  ,   4071  ,   466  ,   498  ,   330  ,   46  ,   25859  ,   15895  ,   498  ,   330  ,   45030  ,   416  ,   498  ,   330  ,   47  ,   1590  ,   3258  ,   3061  ,   2094  ,   498  ,   330  ,   41211  ,   63524  ,   45419  ,   498  ,   330  ,   44  ,   9700  ,   89  ,   76031  ,   45419  ,   8073  ,   330  ,   2150  ,   794  ,   330  ,   31631  ,   54424  ,   8602  ,   65741  ,   95825  ,   386  ,   1386  ,   1354  ,   498  ,   330  ,   39220  ,   794  ,   4482  ,   16  ,   4459  ,   26371  ,   1789  ,   35918  ,   90395  ,   320  ,   40  ,   5560  ,   3092  ,   40745  ,   38091  ,   6207  ,   763  ,   3092  ,   26371  ,   8425  ,   11844  ,   330  ,   23  ,   6771  ,   2203  ,   29639  ,   31457  ,   11  ,   386  ,   3903  ,   291  ,   498  ,   330  ,   17  ,   6771  ,   2203  ,   29639  ,   47814  ,   15895  ,   498  ,   330  ,   18  ,   85388  ,   95825  ,   11  ,   3468  ,

In [7]:
TRAIN_VAL_SPLIT = 0.9 # 90% training, 10% validation

n = int(TRAIN_VAL_SPLIT * len(encoded_recipes))
training_data = encoded_recipes[:n]
validation_data = encoded_recipes[n:]

CONTEXT_SIZE = 5

def get_batch(split):
    data = training_data if split == "train" else validation_data
    start_i = np.random.randint(len(data) - CONTEXT_SIZE) 

    X = data[start_i:start_i+CONTEXT_SIZE]
    y = data[start_i+1:start_i+CONTEXT_SIZE+1]

    return X, y

In [8]:
Xb, yb = get_batch("train")
print(Xb)
print(yb)

for c in range(CONTEXT_SIZE):
    context = Xb[:c+1]
    target = yb[c]
    print(f"with context={context.tolist()} the target is {target}")

[5061 5061 5061  220   16]
[5061 5061  220   16   14]
with context=[5061] the target is 5061
with context=[5061, 5061] the target is 5061
with context=[5061, 5061, 5061] the target is 220
with context=[5061, 5061, 5061, 220] the target is 16
with context=[5061, 5061, 5061, 220, 16] the target is 14
