# Recipe database
- Downloaded data from [Kaggle](https://www.kaggle.com/datasets/shuyangli94/food-com-recipes-and-user-interactions?resource=download)
- Most of these files are too big for git so I've added the folder to my .gitignore
- If you check out that Kaggle data, you'll see there are files for the recipes, ingredients and user interactions. I'm not currently
interested in the user interaction data, only the recipes & ingredients.

## what's in this notebook:
- raw recipe data
- pre-processed recipes
- ingredient map
- understanding the processed data
- search map for ingredients

In [1]:
import re
import pandas as pd

## raw recipe data

In [81]:
# Load the raw Food.com recipes CSV and check its (rows, columns) shape.
raw_recipes = pd.read_csv("data/kaggle_food_dot_com/RAW_recipes.csv")
raw_recipes.shape

(231637, 12)

In [82]:
# List the column names of the raw recipes frame.
raw_recipes.columns

Index(['name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags',
       'nutrition', 'n_steps', 'steps', 'description', 'ingredients',
       'n_ingredients'],
      dtype='object')

In [4]:
# Preview the first rows of the raw recipes.
raw_recipes.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [5]:
# Mean of the n_ingredients column across all raw recipes.
raw_recipes["n_ingredients"].mean()

9.051153313158087

In [6]:
# Mean of the n_steps column across all raw recipes.
raw_recipes["n_steps"].mean()

9.7654994668382

In [8]:
# Mean of the minutes column across all raw recipes.
raw_recipes["minutes"].mean()
# that seems like a lot (~9,400 minutes is roughly 6.5 days)
# NOTE: the mean is dominated by outliers -- the single max row inspected in the next
#  cell (2147483647 minutes over 231,637 recipes) contributes ~9,271 of it by itself.

9398.546009488984

In [11]:
# Show the row(s) whose minutes value equals the column maximum.
raw_recipes.loc[raw_recipes["minutes"] == raw_recipes["minutes"].max()]
# that's ~4,085 years (2147483647 minutes / 525600 minutes per year)
# NOTE(review): 2147483647 is 2**31 - 1, the largest signed 32-bit integer -- presumably a
#  placeholder/overflow value rather than a real cooking time; confirm before using this column.

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
144074,no bake granola balls,261647,2147483647,464080,2007-10-26,"['60-minutes-or-less', 'time-to-make', 'course...","[330.3, 23.0, 110.0, 4.0, 15.0, 24.0, 15.0]",9,"['preheat the oven to 350 degrees', 'spread oa...",healthy snacks that kids (and grown ups) will ...,"['rolled oats', 'unsweetened dried shredded co...",8


In [13]:
# Recipes that claim to take longer than a year.
MINUTES_PER_YEAR = 525_600  # 365 * 24 * 60 -- one year, as we know from Broadway
longer_than_a_year = raw_recipes["minutes"] > MINUTES_PER_YEAR
raw_recipes.loc[longer_than_a_year]
# LOL

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
109624,how to preserve a husband,447963,1051200,576273,2011-02-01,"['time-to-make', 'course', 'preparation', 'for...","[407.4, 57.0, 50.0, 1.0, 7.0, 115.0, 5.0]",9,"['be careful in your selection', ""don't choose...","found this in a local wyoming cookbook ""a coll...","['cream', 'peach']",2
144074,no bake granola balls,261647,2147483647,464080,2007-10-26,"['60-minutes-or-less', 'time-to-make', 'course...","[330.3, 23.0, 110.0, 4.0, 15.0, 24.0, 15.0]",9,"['preheat the oven to 350 degrees', 'spread oa...",healthy snacks that kids (and grown ups) will ...,"['rolled oats', 'unsweetened dried shredded co...",8


In [31]:
# from stack overflow: https://stackoverflow.com/questions/58528989/pandas-get-unique-values-from-column-of-lists
# NOTE: explode() only splits list-like cells. As the output shows, each tags value is still
#  one whole string, so this returns unique tag-list strings rather than unique tags.
raw_recipes["tags"].explode().unique()

array(["['60-minutes-or-less', 'time-to-make', 'course', 'main-ingredient', 'cuisine', 'preparation', 'occasion', 'north-american', 'side-dishes', 'vegetables', 'mexican', 'easy', 'fall', 'holiday-event', 'vegetarian', 'winter', 'dietary', 'christmas', 'seasonal', 'squash']",
       "['30-minutes-or-less', 'time-to-make', 'course', 'main-ingredient', 'cuisine', 'preparation', 'occasion', 'north-american', 'breakfast', 'main-dish', 'pork', 'american', 'oven', 'easy', 'kid-friendly', 'pizza', 'dietary', 'northeastern-united-states', 'meat', 'equipment']",
       "['time-to-make', 'course', 'preparation', 'main-dish', 'chili', 'crock-pot-slow-cooker', 'dietary', 'equipment', '4-hours-or-less']",
       ...,
       "['60-minutes-or-less', 'time-to-make', 'course', 'main-ingredient', 'preparation', 'appetizers', 'eggs-dairy', 'easy', 'finger-food', 'eggs', 'presentation', 'served-cold', '3-steps-or-less']",
       "['30-minutes-or-less', 'time-to-make', 'course', 'preparation', 'for-large-g

In [40]:
# ugh, that's a list wrapped in a string
# (look at one tags value on its own to confirm)
raw_recipes["tags"][0]

"['60-minutes-or-less', 'time-to-make', 'course', 'main-ingredient', 'cuisine', 'preparation', 'occasion', 'north-american', 'side-dishes', 'vegetables', 'mexican', 'easy', 'fall', 'holiday-event', 'vegetarian', 'winter', 'dietary', 'christmas', 'seasonal', 'squash']"

In [61]:
# Confirm the Python type of a single tags value.
type(raw_recipes["tags"][0])

str

In [79]:
# regex for the win!!
# Pull each tag out of the stringified list: a tag is a run of word characters and hyphens.
# The hyphen sits last in the character class so it is a literal "-". Written as `+-?` it
#  forms a range from "+" to "?" that would also match "," "." "/" ":" ";" "<" "=" ">" "?".
re.findall(r"\b[\w-]+", raw_recipes["tags"][0])

['60-minutes-or-less',
 'time-to-make',
 'course',
 'main-ingredient',
 'cuisine',
 'preparation',
 'occasion',
 'north-american',
 'side-dishes',
 'vegetables',
 'mexican',
 'easy',
 'fall',
 'holiday-event',
 'vegetarian',
 'winter',
 'dietary',
 'christmas',
 'seasonal',
 'squash']

In [43]:
# Re-check the raw column list (same twelve columns as right after loading).
raw_recipes.columns

Index(['name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags',
       'nutrition', 'n_steps', 'steps', 'description', 'ingredients',
       'n_ingredients'],
      dtype='object')

---
## pre-processed recipes
- The Kaggle dataset also includes pre-tokenized recipes: the text has already been run through the GPT subword tokenizer, with special tokens (start-of-step, etc.) added.
- See [this blog post](https://techcommunity.microsoft.com/t5/healthcare-and-life-sciences/unlocking-the-power-of-tokens-optimizing-token-usage-in-gpt-for/ba-p/3826665) about the GPT tokenizer and [this article](https://towardsdatascience.com/byte-pair-encoding-subword-based-tokenization-algorithm-77828a70bee0)
about byte-pair encoding, which is part of that GPT tokenizing process.

In [14]:
# Load the pre-processed (tokenized) recipes and preview the first rows.
p_recipes = pd.read_csv("data/kaggle_food_dot_com/PP_recipes.csv")
p_recipes.head()

Unnamed: 0,id,i,name_tokens,ingredient_tokens,steps_tokens,techniques,calorie_level,ingredient_ids
0,424415,23,"[40480, 37229, 2911, 1019, 249, 6878, 6878, 28...","[[2911, 1019, 249, 6878], [1353], [6953], [153...","[40480, 40482, 21662, 481, 6878, 500, 246, 161...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[389, 7655, 6270, 1527, 3406]"
1,146223,96900,"[40480, 18376, 7056, 246, 1531, 2032, 40481]","[[17918], [25916], [2507, 6444], [8467, 1179],...","[40480, 40482, 729, 2525, 10906, 485, 43, 8393...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[2683, 4969, 800, 5298, 840, 2499, 6632, 7022,..."
2,312329,120056,"[40480, 21044, 16954, 8294, 556, 10837, 40481]","[[5867, 24176], [1353], [6953], [1301, 11332],...","[40480, 40482, 8240, 481, 24176, 296, 1353, 66...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",1,"[1257, 7655, 6270, 590, 5024, 1119, 4883, 6696..."
3,74301,168258,"[40480, 10025, 31156, 40481]","[[1270, 1645, 28447], [21601], [27952, 29471, ...","[40480, 40482, 5539, 21601, 1073, 903, 2324, 4...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[7940, 3609, 7060, 6265, 1170, 6654, 5003, 3561]"
4,76272,109030,"[40480, 17841, 252, 782, 2373, 1641, 2373, 252...","[[1430, 11434], [1430, 17027], [1615, 23, 695,...","[40480, 40482, 14046, 1430, 11434, 488, 17027,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[3484, 6324, 7594, 243]"


In [15]:
# (rows, columns) of the pre-processed recipes -- fewer rows than the 231,637 raw recipes,
#  so not every raw recipe has a pre-processed counterpart.
p_recipes.shape

(178265, 8)

---
## ingredient map

In [17]:
# Load the ingredient map and preview the first rows.
# NOTE(review): unpickling can execute arbitrary code -- only load this file if the
#  downloaded dataset is trusted.
ingr_map = pd.read_pickle("data/kaggle_food_dot_com/ingr_map.pkl")
ingr_map.head()

Unnamed: 0,raw_ingr,raw_words,processed,len_proc,replaced,count,id
0,"medium heads bibb or red leaf lettuce, washed,...",13,"medium heads bibb or red leaf lettuce, washed,...",73,lettuce,4507,4308
1,mixed baby lettuces and spring greens,6,mixed baby lettuces and spring green,36,lettuce,4507,4308
2,romaine lettuce leaf,3,romaine lettuce leaf,20,lettuce,4507,4308
3,iceberg lettuce leaf,3,iceberg lettuce leaf,20,lettuce,4507,4308
4,red romaine lettuce,3,red romaine lettuce,19,lettuce,4507,4308


In [18]:
# (rows, columns) of the ingredient map.
ingr_map.shape

(11659, 7)

In [19]:
# Number of distinct values in the `replaced` column (a NaN, if present, counts as one value).
ingr_map["replaced"].nunique(dropna=False)

8023

In [20]:
# Number of distinct values in the `id` column (a NaN, if present, counts as one value).
# It matches the distinct `replaced` count, so presumably each replaced name has one id
#  (not verified here).
ingr_map["id"].nunique(dropna=False)

8023

In [21]:
# Number of distinct values in the `count` column (a NaN, if present, counts as one value).
ingr_map["count"].nunique(dropna=False)

840

---
## understanding the processed data

In [66]:
# Recipe id and ingredient ids of the first pre-processed recipe.
p_recipes[["id", "ingredient_ids"]].head(1)

Unnamed: 0,id,ingredient_ids
0,424415,"[389, 7655, 6270, 1527, 3406]"


In [70]:
# Raw ingredient list for the recipe id taken from the first pre-processed row.
recipe_mask = raw_recipes["id"] == 424415
print(raw_recipes.loc[recipe_mask, "ingredients"])

8586    ['basmati rice', 'water', 'salt', 'cinnamon st...
Name: ingredients, dtype: object


In [75]:
# For the first four ingredient ids of recipe 424415, print every raw ingredient
#  spelling in ingr_map that maps to that id (one loop instead of four pasted prints).
for ingr_id in (389, 7655, 6270, 1527):
    raw_names = ingr_map.loc[ingr_map["id"] == ingr_id, "raw_ingr"]
    print(f"ingr {ingr_id}: {raw_names}\n")

ingr 389: 9485    basmati rice
Name: raw_ingr, dtype: object

ingr 7655: 10946    water
Name: raw_ingr, dtype: object

ingr 6270: 902    aunt jane's krazy mixed up salt
903                    low sodium salt
904                               salt
Name: raw_ingr, dtype: object

ingr 1527: 8933     cinnamon stick
8934    cinnamon sticks
Name: raw_ingr, dtype: object



In [76]:
# My reading of the tokenizer (unverified): every inner list holds the tokens produced from
#  one ingredient. Very common ingredients such as "salt" or "water" may be a single token,
#  while rarer ones such as "basmati rice" end up split into several tokens.
p_recipes[["ingredient_tokens"]].head(1)

Unnamed: 0,ingredient_tokens
0,"[[2911, 1019, 249, 6878], [1353], [6953], [153..."


---
## search map for ingredients
- using the ingredients from the recipes I pulled earlier from the API

In [25]:
search_ingr = [
    "egg", "eggs", "spinach", "tomato", "tomatoes", "cherry tomato", "cherry tomatoes",
    "rocket", "arugula", "olive oil", "oregano", "chili", "chilli", "bread", "sourdough",
    "sourdough bread",
]

# Lower-case the `replaced` names once, then count the ingr_map rows whose value is exactly
#  each search term. Matching is exact, so plural forms such as "eggs" can come back as 0.
replaced_lower = ingr_map["replaced"].str.lower()
results = {term: ingr_map.loc[replaced_lower == term].shape[0] for term in search_ingr}

results

{'egg': 2,
 'eggs': 0,
 'spinach': 1,
 'tomato': 16,
 'tomatoes': 0,
 'cherry tomato': 1,
 'cherry tomatoes': 0,
 'rocket': 1,
 'arugula': 1,
 'olive oil': 29,
 'oregano': 1,
 'chili': 4,
 'chilli': 0,
 'bread': 7,
 'sourdough': 0,
 'sourdough bread': 1}