In [39]:
import pandas as pd
import numpy as np
import pickle
import tqdm
import tensorflow as tf
import ast

Load in the ingredient map (maps ingredient IDs to the corresponding string) and the recipes:

In [40]:
# The ingredient map is a pickled pandas DataFrame, so read it with pandas' own
# loader instead of relying on np.load's allow_pickle fallback.
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
ingredient_map = pd.read_pickle("../../data/tokenised_recipes/ingr_map.pkl")

In [41]:
# Show the ingredient map; the rendered output below elides the middle rows.
ingredient_map

Unnamed: 0,raw_ingr,raw_words,processed,len_proc,replaced,count,id
0,"medium heads bibb or red leaf lettuce, washed,...",13,"medium heads bibb or red leaf lettuce, washed,...",73,lettuce,4507,4308
1,mixed baby lettuces and spring greens,6,mixed baby lettuces and spring green,36,lettuce,4507,4308
2,romaine lettuce leaf,3,romaine lettuce leaf,20,lettuce,4507,4308
3,iceberg lettuce leaf,3,iceberg lettuce leaf,20,lettuce,4507,4308
4,red romaine lettuce,3,red romaine lettuce,19,lettuce,4507,4308
...,...,...,...,...,...,...,...
11654,soybeans,1,soybean,7,soybean,31,6702
11655,goose,1,goose,5,goose,8,3318
11656,ajwain,1,ajwain,6,ajwain,13,47
11657,brinjals,1,brinjal,7,brinjal,2,750


In [42]:
# One name per ingredient ID: the first "replaced" string found in each "id" group.
# NOTE(review): using array positions as IDs later on assumes the IDs run 0..N-1 with no gaps — confirm.
first_name_per_id = ingredient_map.groupby("id")["replaced"].first()
vocab = first_name_per_id.values
# number of distinct ingredient IDs (computed before any padding entry is added)
vocab_size = len(vocab)

In [43]:
def vocabIndex(query):
    """Return the position of the first entry of the global ``vocab`` equal to ``query``.

    Note: when nothing matches, the comparison mask is all False and
    ``np.argmax`` returns 0, so a missing query cannot be told apart from a
    match at position 0.
    """
    matches = vocab == query
    return np.argmax(matches)

Prepend an empty string to the vocabulary so that index 0 acts as the null padding entry:

In [44]:
# Prepend "" so that position 0 of vocab is the padding entry; every original entry moves up by one.
# NOTE(review): vocab_size was computed before this insert and is not refreshed here.
vocab = np.insert(vocab, 0, "")

In [45]:
# Load the recipes table from the tokenised_recipes data directory.
recipes = pd.read_csv("../../data/tokenised_recipes/PP_recipes.csv")

In [46]:
# Peek at the ingredient IDs of the row labelled 0.
recipes.loc[0, "ingredient_ids"]

'[389, 7655, 6270, 1527, 3406]'

Map the string list of ingredients to an actual list:

In [47]:
# function to parse strings of lists as Python lists
def parseTupleFunc(tuple_str: str):
    """Parse the text of a Python literal (e.g. ``'[389, 7655]'``) into the literal itself.

    Parameters
    ----------
    tuple_str : str
        Text holding a Python literal. A value that is already a list or tuple
        is returned unchanged, so applying the function to an already-converted
        column (e.g. when the cell is re-run) is harmless.

    Returns
    -------
    The object produced by ``ast.literal_eval``, or the input itself when it
    is already a list or tuple.

    Raises
    ------
    ValueError
        If the value cannot be parsed. The message names the offending value,
        and the original exception is chained as the cause.
    """
    # already parsed: nothing to do
    if isinstance(tuple_str, (list, tuple)):
        return tuple_str

    try:
        return ast.literal_eval(tuple_str)
    except Exception as e:
        # Fail here with the offending value instead of printing and returning
        # None, which only surfaces later as an unrelated error in len(None).
        raise ValueError(
            f"Could not parse {tuple_str!r} as a Python literal: {e}"
        ) from e
        
# parse every ingredient string into a real list (overwrites the column in place)
recipes["ingredient_ids"] = recipes["ingredient_ids"].map(parseTupleFunc)
# number of ingredients in each recipe
recipes["ingredient_len"] = recipes["ingredient_ids"].map(len)

# keep only the ingredient lists of recipes that have more than one ingredient
has_multiple_ingredients = recipes["ingredient_len"] > 1
recipes_ingredients = recipes.loc[has_multiple_ingredients, "ingredient_ids"]

Shift every ingredient ID up by one so that ID 0 is reserved for the null padding entry:

In [48]:
def id_updater(ingredients):
    """Return a new list in which every ingredient ID is increased by one."""
    shifted = []
    for ingredient_id in ingredients:
        shifted.append(ingredient_id + 1)
    return shifted

# Apply the +1 shift to every recipe's ingredient list.
# NOTE(review): this reassigns recipes_ingredients, so running the cell twice shifts the IDs twice.
recipes_ingredients = recipes_ingredients.apply(id_updater)

Pad the recipes to have the same length:

In [49]:
# the longest ingredient list in the table sets the padded width
max_num_ingredients = max(map(len, recipes["ingredient_ids"]))

# start from an all-zero matrix with one row per kept recipe
num_recipes = len(recipes_ingredients)
padded_recipes = np.zeros((num_recipes, max_num_ingredients), dtype=np.int64)

# add each recipe's IDs onto the left of its zero row; the remainder stays 0
for row_idx, ingredient_ids in enumerate(recipes_ingredients):
    padded_recipes[row_idx, :len(ingredient_ids)] += ingredient_ids

In [50]:
window_size = 4
# ratio of negative to positive pairs requested from skipgrams
# (1 would mean as many negatives as positives)
num_negative_samples = 5

X = []
Y = []

# skipgrams expects vocabulary_size to be "maximum possible word index + 1" and
# draws negative samples from 1..vocabulary_size - 1. The ingredient IDs were
# shifted up by one to make room for the padding entry, so the size must count
# that entry too: use the padded vocab rather than the pre-padding vocab_size,
# which would leave the highest ingredient ID out of negative sampling.
# NOTE(review): assumes the raw IDs index directly into the unpadded vocab — confirm.
skipgram_vocab_size = len(vocab)

# for each recipe
for recipe in tqdm.tqdm(padded_recipes):

    # generate positive and negative pairs with the given window size;
    # skipgrams treats index 0 as a non-word, so the zero padding is skipped
    skip_grams, recipe_labels = tf.keras.preprocessing.sequence.skipgrams(
        recipe,
        vocabulary_size=skipgram_vocab_size,
        window_size=window_size,
        negative_samples=num_negative_samples
    )

    # turn the list of pairs into an array with one row per pair position
    recipe_features = np.array(list(zip(*skip_grams)))

    # surface any recipe that produced no pairs at all
    if len(recipe_features) == 0:
        print(recipe)

    X.append(recipe_features)

    Y.append(recipe_labels)

100%|██████████| 178263/178263 [01:52<00:00, 1582.63it/s]


In [54]:
# Persist the skip-gram pairs, their labels and the vocabulary, one pickle file per object.
for filename, payload in (("X.pkl", X), ("Y.pkl", Y), ("vocab.pkl", vocab)):
    with open(filename, "wb") as f:
        pickle.dump(payload, f)