# Recipe database
- trying a different approach: pull in a recipe database & use that to build a predictor model
- downloaded data from [kaggle](https://www.kaggle.com/datasets/shuyangli94/food-com-recipes-and-user-interactions?resource=download)
- as with the foodb data, some of these files are too big for git so I've added the folder to my .gitignore

In [1]:
import re
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

---
## pre-processed recipes
- kaggle includes the tokenized data, after going through a BPE tokenizer
- see [this article](https://towardsdatascience.com/byte-pair-encoding-subword-based-tokenization-algorithm-77828a70bee0)

In [2]:
df = pd.read_csv("data/kaggle_food_dot_com/PP_recipes.csv")
df.head(3)

Unnamed: 0,id,i,name_tokens,ingredient_tokens,steps_tokens,techniques,calorie_level,ingredient_ids
0,424415,23,"[40480, 37229, 2911, 1019, 249, 6878, 6878, 28...","[[2911, 1019, 249, 6878], [1353], [6953], [153...","[40480, 40482, 21662, 481, 6878, 500, 246, 161...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[389, 7655, 6270, 1527, 3406]"
1,146223,96900,"[40480, 18376, 7056, 246, 1531, 2032, 40481]","[[17918], [25916], [2507, 6444], [8467, 1179],...","[40480, 40482, 729, 2525, 10906, 485, 43, 8393...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[2683, 4969, 800, 5298, 840, 2499, 6632, 7022,..."
2,312329,120056,"[40480, 21044, 16954, 8294, 556, 10837, 40481]","[[5867, 24176], [1353], [6953], [1301, 11332],...","[40480, 40482, 8240, 481, 24176, 296, 1353, 66...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",1,"[1257, 7655, 6270, 590, 5024, 1119, 4883, 6696..."


In [3]:
df.shape

(178265, 8)

In [4]:
df.dtypes

id                    int64
i                     int64
name_tokens          object
ingredient_tokens    object
steps_tokens         object
techniques           object
calorie_level         int64
ingredient_ids       object
dtype: object

In [5]:
df["ingredient_tokens"][0]

'[[2911, 1019, 249, 6878], [1353], [6953], [15341, 3261], [2056, 857, 643, 1631, 20480]]'

In [6]:
df["ingredient_ids"][0]

'[389, 7655, 6270, 1527, 3406]'

- I want to build a model that learns which ingredients tend to occur together
- then use that model to predict an ingredient, given the presence of other ingredients
- first I need to get these lists to act like lists instead of strings

In [7]:
result = []
for element in df["ingredient_tokens"][0].split("]"):
    tokens = re.findall(r"\b\d+", element)
    if len(tokens) > 0:
        tokens = [ int(token) for token in tokens ]
        result.append(tokens)
result

[[2911, 1019, 249, 6878],
 [1353],
 [6953],
 [15341, 3261],
 [2056, 857, 643, 1631, 20480]]

In [8]:
[ int(id) for id in df["ingredient_ids"][0].replace("[", "").replace("]", "").split(", ") ]

[389, 7655, 6270, 1527, 3406]

In [9]:
# confirming that `split` doesn't care if it has anything to split
[ int(id) for id in "[17]".replace("[", "").replace("]", "").split(", ") ]

[17]

In [10]:
def ingr_ids_to_list(id_list):
    return [ int(id) for id in  id_list.replace("[", "").replace("]","").split(", ") ]

df["ingr_ints"] = df["ingredient_ids"].apply(ingr_ids_to_list)

In [11]:
df.head(3)

Unnamed: 0,id,i,name_tokens,ingredient_tokens,steps_tokens,techniques,calorie_level,ingredient_ids,ingr_ints
0,424415,23,"[40480, 37229, 2911, 1019, 249, 6878, 6878, 28...","[[2911, 1019, 249, 6878], [1353], [6953], [153...","[40480, 40482, 21662, 481, 6878, 500, 246, 161...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[389, 7655, 6270, 1527, 3406]","[389, 7655, 6270, 1527, 3406]"
1,146223,96900,"[40480, 18376, 7056, 246, 1531, 2032, 40481]","[[17918], [25916], [2507, 6444], [8467, 1179],...","[40480, 40482, 729, 2525, 10906, 485, 43, 8393...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[2683, 4969, 800, 5298, 840, 2499, 6632, 7022,...","[2683, 4969, 800, 5298, 840, 2499, 6632, 7022,..."
2,312329,120056,"[40480, 21044, 16954, 8294, 556, 10837, 40481]","[[5867, 24176], [1353], [6953], [1301, 11332],...","[40480, 40482, 8240, 481, 24176, 296, 1353, 66...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",1,"[1257, 7655, 6270, 590, 5024, 1119, 4883, 6696...","[1257, 7655, 6270, 590, 5024, 1119, 4883, 6696..."


- these integers don't have any meaning -- ie. they're still just ids to map to the ingredients
- binarize them so the model just has flags for which ones are present

In [12]:
map_df = pd.read_pickle("data/kaggle_food_dot_com/ingr_map.pkl")
map_df.head()

Unnamed: 0,raw_ingr,raw_words,processed,len_proc,replaced,count,id
0,"medium heads bibb or red leaf lettuce, washed,...",13,"medium heads bibb or red leaf lettuce, washed,...",73,lettuce,4507,4308
1,mixed baby lettuces and spring greens,6,mixed baby lettuces and spring green,36,lettuce,4507,4308
2,romaine lettuce leaf,3,romaine lettuce leaf,20,lettuce,4507,4308
3,iceberg lettuce leaf,3,iceberg lettuce leaf,20,lettuce,4507,4308
4,red romaine lettuce,3,red romaine lettuce,19,lettuce,4507,4308


- I think I want to dummify these ingredients

In [58]:
test_df = pd.DataFrame()
test_series = pd.Series([[12, 4308, 5], [3, 29, 12]])
test_ids = [4308, 5, 29, 12]

for id_col in test_ids:
    col = []
    for recipe_row in range(0, test_series.shape[0]):
        flag = 1 if id_col in test_series.iloc[recipe_row] else 0
        col.append(flag)
    # print(col)
    test_df[id_col] = col
    
test_df.head()

# this is not terribly efficient but let's see if that's a problem with this data

Unnamed: 0,4308,5,29,12
0,1,1,0,1
1,0,0,1,1


In [57]:
ingr_ids = map_df["id"].unique()
len(ingr_ids)

8023

In [61]:
flags_df = pd.DataFrame()
cols = []
for id_col in ingr_ids:
    col = []
    for recipe_row in range(0, df.shape[0]):
        flag = 1 if id_col in df["ingr_ints"].iloc[recipe_row] else 0
        col.append(flag)
    cols.append(col)
 
flags_df = pd.concat(cols, axis=1)    
flags_df.shape

TypeError: cannot concatenate object of type '<class 'list'>'; only Series and DataFrame objs are valid

In [63]:
col_series = []
for col in cols:
    col_series.append(pd.Series(col))

In [64]:
flags_df = pd.concat(col_series, axis=1)    
flags_df.shape

(178265, 8023)

In [65]:
file_name = "data/ingr_dummies.parquet"
flags_df.to_parquet(file_name, engine="pyarrow", compression="gzip")