In [1]:
import pickle
import re

import pandas as pd
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score

In [2]:
flags_df = pd.read_parquet("data/generated_data/ingr_dummies.parquet")
flags_df.shape

(178265, 8023)

In [3]:
sample_flags = flags_df.sample(frac=0.25, replace=False)
sample_flags.shape

(44566, 8023)

In [5]:
kmeans_params = {
    "clusters": range(5, 31, 5),
    "max_iter": [300, 500]
}

dbscan_params = {
    "eps": [0.1, 0.25, 0.5, 1.0, 1.25],
    "algorithm": ["auto", "kd_tree"]
}

results = {}

In [7]:
kmeans_models = {}

for cluster in kmeans_params["clusters"]:
    for max_i in kmeans_params["max_iter"]:
        model_name = f"kmeans_{cluster}c_{max_i}i"
        details = {}
        details["clusters"] = cluster
        details["max_iter"] = max_i
        km = KMeans(n_clusters=cluster, max_iter=max_i, n_init="auto", random_state=42)
        km.fit(sample_flags)
        
        sil_score = silhouette_score(sample_flags, km.labels_)
        print(f"{model_name} score = {sil_score}")
        details["score"] = sil_score
        details["inertia"] = km.inertia_
        
        kmeans_models[model_name] = details
        
        with open(f"models/{model_name}.pkl", "wb") as f:
            pickle.dump(km, f)
        

kmeans_5c_300i score = 0.025359556961519467
kmeans_5c_500i score = 0.025359556961519467
kmeans_10c_300i score = 0.01024073806683298
kmeans_10c_500i score = 0.01024073806683298
kmeans_15c_300i score = 0.012603195125328414
kmeans_15c_500i score = 0.012603195125328414
kmeans_20c_300i score = 0.006635561539243761
kmeans_20c_500i score = 0.006635561539243761
kmeans_25c_300i score = 0.00571858492315883
kmeans_25c_500i score = 0.00571858492315883
kmeans_30c_300i score = 0.007765257017801726
kmeans_30c_500i score = 0.007765257017801726


In [8]:
dbscan_models = {}

for eps in dbscan_params["eps"]:
    for algo in dbscan_params["algorithm"]:
        model_name = f"dbscan_{eps}e_{algo}a"
        details = {}
        details["eps"] = eps
        details["algorithm"] = algo
        dbscan = DBSCAN(eps=eps, algorithm=algo, n_jobs=-1)
        dbscan.fit(sample_flags)
        
        sil_score = silhouette_score(sample_flags, dbscan.labels_)
        print(f"{model_name} score = {sil_score}")
        details["score"] = sil_score
        
        dbscan_models[model_name] = details
        
        with open(f"models/{model_name}", "wb") as f:
            pickle.dump(dbscan, f)

dbscan_0.1e_autoa score = -0.1137818994288678
dbscan_0.1e_kd_treea score = -0.1137818994288678
dbscan_0.25e_autoa score = -0.1137818994288678
dbscan_0.25e_kd_treea score = -0.1137818994288678
dbscan_0.5e_autoa score = -0.1137818994288678
dbscan_0.5e_kd_treea score = -0.1137818994288678
dbscan_1.0e_autoa score = -0.2008541704716405
dbscan_1.0e_kd_treea score = -0.2008541704716405
dbscan_1.25e_autoa score = -0.2008541704716405
dbscan_1.25e_kd_treea score = -0.2008541704716405


In [9]:
for cluster in range(35, 101, 5):
    max_i = "default_"
    model_name = f"kmeans_{cluster}c_{max_i}i"
    details = {}
    details["clusters"] = cluster
    details["max_iter"] = max_i
    km = KMeans(n_clusters=cluster, n_init="auto", random_state=42)
    km.fit(sample_flags)
    
    sil_score = silhouette_score(sample_flags, km.labels_)
    print(f"{model_name} score = {sil_score}")
    details["score"] = sil_score
    details["inertia"] = km.inertia_
    
    kmeans_models[model_name] = details
    
    with open(f"models/{model_name}.pkl", "wb") as f:
        pickle.dump(km, f)

kmeans_35c_default_i score = 0.008281905915136617
kmeans_40c_default_i score = 0.007494932185157251
kmeans_45c_default_i score = 0.007244433685604815
kmeans_50c_default_i score = 0.0076677207469152
kmeans_55c_default_i score = 0.004070570047512586
kmeans_60c_default_i score = 0.005761999899310619
kmeans_65c_default_i score = 0.0065947546451767345
kmeans_70c_default_i score = 0.005518304051742554
kmeans_75c_default_i score = 0.0054834518212471895
kmeans_80c_default_i score = 0.005751633972537051
kmeans_85c_default_i score = 0.005776518868208993
kmeans_90c_default_i score = 0.005115344110135365
kmeans_95c_default_i score = 0.00500923247561521
kmeans_100c_default_i score = 0.005285331211656954


In [10]:
dbscan_eps = [0.0001, 0.001, 0.01]
powers = [2, 3]

for eps in dbscan_eps:
    for power in powers:
        model_name = f"dbscan_{eps}e_{power}p"
        details = {}
        details["eps"] = eps
        details["power"] = power
        dbscan = DBSCAN(eps=eps, p=power, n_jobs=-1)
        dbscan.fit(sample_flags)
        
        sil_score = silhouette_score(sample_flags, dbscan.labels_)
        print(f"{model_name} score = {sil_score}")
        details["score"] = sil_score
        
        dbscan_models[model_name] = details
        
        with open(f"models/{model_name}", "wb") as f:
            pickle.dump(dbscan, f)

dbscan_0.0001e_2p score = -0.1137818994288678
dbscan_0.0001e_3p score = -0.1137818994288678
dbscan_0.001e_2p score = -0.1137818994288678
dbscan_0.001e_3p score = -0.1137818994288678
dbscan_0.01e_2p score = -0.1137818994288678
dbscan_0.01e_3p score = -0.1137818994288678


----
## filter for main dishes only
- the full recipe database has all kinds of recipes so it's not too surprising that these models have been unimpressive
- let's see if filtering the recipes to just "main course" will improve the results

In [2]:
raw = pd.read_csv("data/kaggle_food_dot_com/RAW_recipes.csv")
raw.shape

(231637, 12)

In [3]:
raw.head(3)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13


In [4]:
# reminder that this is a list wrapped in a string
raw["tags"][0]

"['60-minutes-or-less', 'time-to-make', 'course', 'main-ingredient', 'cuisine', 'preparation', 'occasion', 'north-american', 'side-dishes', 'vegetables', 'mexican', 'easy', 'fall', 'holiday-event', 'vegetarian', 'winter', 'dietary', 'christmas', 'seasonal', 'squash']"

In [5]:
# regex for the win!!
re.findall(r"\b[\w+-?]+", raw["tags"][0])

['60-minutes-or-less',
 'time-to-make',
 'course',
 'main-ingredient',
 'cuisine',
 'preparation',
 'occasion',
 'north-american',
 'side-dishes',
 'vegetables',
 'mexican',
 'easy',
 'fall',
 'holiday-event',
 'vegetarian',
 'winter',
 'dietary',
 'christmas',
 'seasonal',
 'squash']

In [6]:
def tags_to_list(tag_list):
    return re.findall(r"\b[\w+-?]+", tag_list)

raw["tag_list"] = raw["tags"].apply(tags_to_list)
raw["tag_list"].head()

0    [60-minutes-or-less, time-to-make, course, mai...
1    [30-minutes-or-less, time-to-make, course, mai...
2    [time-to-make, course, preparation, main-dish,...
3    [60-minutes-or-less, time-to-make, course, mai...
4    [weeknight, time-to-make, course, main-ingredi...
Name: tag_list, dtype: object

In [7]:
tags = raw["tag_list"].explode().unique()

# getting an error trying to write these tags to a file: TypeError: write() argument must be str, not float
# also need a `\n` between each element so the file is readable
tags = [ f"{tag}\n" for tag in tags if type(tag) == str]
len(tags)

560

In [8]:
# I need to get a good look at these tags so I know what to filter on
f = open("data/generated_data/tags.txt", "a")  # from https://www.w3schools.com/python/ref_file_writelines.asp
f.writelines(tags)
f.close()

In [9]:
# filtering for main course tags should include: ["*-main-dish-*", "main-dish", "dinner-party"] (searched tags for "main" and "dinner")
test_tags = ["middle-eastern-main-dish", "dinner-party", "main-dish-chicken", "pasta"]

for _ in test_tags:
    matches = re.search(r"main-dish|dinner", _)
    if matches != None:
        print(matches)

<re.Match object; span=(15, 24), match='main-dish'>
<re.Match object; span=(0, 6), match='dinner'>
<re.Match object; span=(0, 9), match='main-dish'>


In [10]:
def flag_main(tag_list):
    matches = re.search(r"main-dish|dinner", " ".join(tag_list))
    return 1 if matches != None else 0

raw["main_course"] = raw["tag_list"].apply(flag_main)

In [11]:
sum(raw["main_course"])

96201

In [13]:
mains = raw.loc[raw["main_course"] == 1]
mains.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,tag_list,main_course
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6,"[30-minutes-or-less, time-to-make, course, mai...",1
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13,"[time-to-make, course, preparation, main-dish,...",1
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11,"[60-minutes-or-less, time-to-make, course, mai...",1
6,aww marinated olives,25274,15,21730,2002-04-14,"['15-minutes-or-less', 'time-to-make', 'course...","[380.7, 53.0, 7.0, 24.0, 6.0, 24.0, 6.0]",4,['toast the fennel seeds and lightly crush the...,my italian mil was thoroughly impressed by my ...,"['fennel seeds', 'green olives', 'ripe olives'...",9,"[15-minutes-or-less, time-to-make, course, mai...",1
7,backyard style barbecued ribs,67888,120,10404,2003-07-30,"['weeknight', 'time-to-make', 'course', 'main-...","[1109.5, 83.0, 378.0, 275.0, 96.0, 86.0, 36.0]",10,['in a medium saucepan combine all the ingredi...,this recipe is posted by request and was origi...,"['pork spareribs', 'soy sauce', 'fresh garlic'...",22,"[weeknight, time-to-make, course, main-ingredi...",1


In [18]:
main_course_ids = list(mains["id"])
len(main_course_ids)

96201

In [20]:
type(main_course_ids[0])

int

- pull in the processed recipes to match the recipe ids to this filtered data

In [21]:
proc_df = pd.read_parquet("data/generated_data/processed_recipes.parquet")
proc_df.head(3)

Unnamed: 0,id,i,name_tokens,ingredient_tokens,steps_tokens,techniques,calorie_level,ingredient_ids,ingr_ints
0,424415,23,"[40480, 37229, 2911, 1019, 249, 6878, 6878, 28...","[[2911, 1019, 249, 6878], [1353], [6953], [153...","[40480, 40482, 21662, 481, 6878, 500, 246, 161...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[389, 7655, 6270, 1527, 3406]","[389, 7655, 6270, 1527, 3406]"
1,146223,96900,"[40480, 18376, 7056, 246, 1531, 2032, 40481]","[[17918], [25916], [2507, 6444], [8467, 1179],...","[40480, 40482, 729, 2525, 10906, 485, 43, 8393...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[2683, 4969, 800, 5298, 840, 2499, 6632, 7022,...","[2683, 4969, 800, 5298, 840, 2499, 6632, 7022,..."
2,312329,120056,"[40480, 21044, 16954, 8294, 556, 10837, 40481]","[[5867, 24176], [1353], [6953], [1301, 11332],...","[40480, 40482, 8240, 481, 24176, 296, 1353, 66...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",1,"[1257, 7655, 6270, 590, 5024, 1119, 4883, 6696...","[1257, 7655, 6270, 590, 5024, 1119, 4883, 6696..."


In [25]:
type(proc_df["id"][0])

numpy.int64

In [27]:
proc_mains = proc_df.loc[proc_df["id"].isin(main_course_ids)]

In [28]:
proc_mains.shape

(75297, 9)

In [29]:
ingredients = proc_mains["ingr_ints"].explode().unique()
len(ingredients)

6863

- main course has 6,863 ingredients; the "ingr_map" has 8,023 ingredients, seems reasonable that this filtered list would have about 85% of those
- dummify these recipes

In [30]:
cols = []
for id_col in ingredients:
    col = []
    for recipe_row in range(0, proc_mains.shape[0]):
        flag = 1 if id_col in proc_mains["ingr_ints"].iloc[recipe_row] else 0
        col.append(flag)
    cols.append(pd.Series(col))
    
flag_proc_df = pd.concat(cols, axis=1)
flag_proc_df.shape

(75297, 6863)

In [32]:
file_name = "data/generated_data/main_dish_dummies.parquet"
flag_proc_df.to_parquet(file_name, engine="pyarrow", compression="gzip")

- please be better, please be better, please be better

In [34]:
models_details = []

for cluster in range(5, 51, 5):
    trial_details = {}
    trial_details["model_type"] = "kmeans"
    trial_details["clusters"] = cluster
    km = KMeans(n_clusters=cluster, n_init="auto", random_state=42)
    km.fit(flag_proc_df)
    
    sil_score = silhouette_score(flag_proc_df, km.labels_)
    trial_details["score"] = sil_score
    trial_details["inertia"] = km.inertia_
    
    models_details.append(trial_details)
    
    model_name = f"kmeans_{cluster}c_mains"
    print(f"{model_name} score = {sil_score}")
    
    with open(f"models/main_course_recipes_models/{model_name}.pkl", "wb") as f:
        pickle.dump(km, f)

kmeans_5c_mains score = 0.02611545207525439
kmeans_10c_mains score = 0.01898906917349764
kmeans_15c_mains score = 0.01550170395454996
kmeans_20c_mains score = 0.012670817726076799
kmeans_25c_mains score = 0.009665977288290393
kmeans_30c_mains score = 0.008708792730123028
kmeans_35c_mains score = 0.007397730183817275
kmeans_40c_mains score = 0.006950586773818742
kmeans_45c_mains score = 0.004783978207344153
kmeans_50c_mains score = 0.005927257987330646


In [35]:
kmeans_models = pd.DataFrame.from_dict(models_details)
kmeans_models.to_parquet("data/generated_data/filtered_kmeans_model_trials.parquet", engine="pyarrow", compression= "gzip")

In [36]:
dbscan_eps = [0.0001, 0.001, 0.01, 0.1, 0.25, 0.5, 1.0, 1.25]
dbscan_model_details = []

for eps in dbscan_eps:
    trial_details = {}
    trial_details["eps"] = eps
    dbscan = DBSCAN(eps=eps, n_jobs=-1)
    dbscan.fit(flag_proc_df)
    
    sil_score = silhouette_score(flag_proc_df, dbscan.labels_)
    trial_details["score"] = sil_score
    
    dbscan_model_details.append(trial_details)
    
    model_name = f"dbscan_{eps}e_mains"
    print(f"{model_name} score = {sil_score}")
    
    with open(f"models/main_course_recipes_models/{model_name}", "wb") as f:
        pickle.dump(dbscan, f)

dbscan_0.0001e_mains score = -0.11207505922147183
dbscan_0.001e_mains score = -0.11207505922147183
dbscan_0.01e_mains score = -0.11207505922147183
dbscan_0.1e_mains score = -0.11207505922147183
dbscan_0.25e_mains score = -0.11207505922147183
dbscan_0.5e_mains score = -0.11207505922147183
dbscan_1.0e_mains score = -0.18771775409269265
dbscan_1.25e_mains score = -0.18771775409269265


In [37]:
dbscan_models = pd.DataFrame.from_dict(dbscan_model_details)
dbscan_models.to_parquet("data/generated_data/filtered_dbscan_model_trials.parquet", engine="pyarrow", compression= "gzip")

---
## filter most frequent ingredients
- maybe 6,863 ingredients is too many
- let's try with fewer ingredients

In [39]:
# this is stored at "data/generated_data/main_dish_dummies.parquet"
flag_proc_df.shape

(75297, 6863)