In [1]:
import importlib
import pickle
import re

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

import modeling

In [77]:
importlib.reload(modeling);

In [2]:
flags_df = pd.read_parquet("data/generated_data/ingr_dummies.parquet")
flags_df.shape

(178265, 8023)

In [3]:
sample_flags = flags_df.sample(frac=0.25, replace=False)
sample_flags.shape

(44566, 8023)

In [5]:
kmeans_params = {
    "clusters": range(5, 31, 5),
    "max_iter": [300, 500]
}

dbscan_params = {
    "eps": [0.1, 0.25, 0.5, 1.0, 1.25],
    "algorithm": ["auto", "kd_tree"]
}

results = {}

In [7]:
kmeans_models = {}

for cluster in kmeans_params["clusters"]:
    for max_i in kmeans_params["max_iter"]:
        model_name = f"kmeans_{cluster}c_{max_i}i"
        details = {}
        details["clusters"] = cluster
        details["max_iter"] = max_i
        km = KMeans(n_clusters=cluster, max_iter=max_i, n_init="auto", random_state=42)
        km.fit(sample_flags)
        
        sil_score = silhouette_score(sample_flags, km.labels_)
        print(f"{model_name} score = {sil_score}")
        details["score"] = sil_score
        details["inertia"] = km.inertia_
        
        kmeans_models[model_name] = details
        
        with open(f"models/{model_name}.pkl", "wb") as f:
            pickle.dump(km, f)
        

kmeans_5c_300i score = 0.025359556961519467
kmeans_5c_500i score = 0.025359556961519467
kmeans_10c_300i score = 0.01024073806683298
kmeans_10c_500i score = 0.01024073806683298
kmeans_15c_300i score = 0.012603195125328414
kmeans_15c_500i score = 0.012603195125328414
kmeans_20c_300i score = 0.006635561539243761
kmeans_20c_500i score = 0.006635561539243761
kmeans_25c_300i score = 0.00571858492315883
kmeans_25c_500i score = 0.00571858492315883
kmeans_30c_300i score = 0.007765257017801726
kmeans_30c_500i score = 0.007765257017801726


In [8]:
dbscan_models = {}

for eps in dbscan_params["eps"]:
    for algo in dbscan_params["algorithm"]:
        model_name = f"dbscan_{eps}e_{algo}a"
        details = {}
        details["eps"] = eps
        details["algorithm"] = algo
        dbscan = DBSCAN(eps=eps, algorithm=algo, n_jobs=-1)
        dbscan.fit(sample_flags)
        
        sil_score = silhouette_score(sample_flags, dbscan.labels_)
        print(f"{model_name} score = {sil_score}")
        details["score"] = sil_score
        
        dbscan_models[model_name] = details
        
        with open(f"models/{model_name}", "wb") as f:
            pickle.dump(dbscan, f)

dbscan_0.1e_autoa score = -0.1137818994288678
dbscan_0.1e_kd_treea score = -0.1137818994288678
dbscan_0.25e_autoa score = -0.1137818994288678
dbscan_0.25e_kd_treea score = -0.1137818994288678
dbscan_0.5e_autoa score = -0.1137818994288678
dbscan_0.5e_kd_treea score = -0.1137818994288678
dbscan_1.0e_autoa score = -0.2008541704716405
dbscan_1.0e_kd_treea score = -0.2008541704716405
dbscan_1.25e_autoa score = -0.2008541704716405
dbscan_1.25e_kd_treea score = -0.2008541704716405


In [9]:
for cluster in range(35, 101, 5):
    max_i = "default_"
    model_name = f"kmeans_{cluster}c_{max_i}i"
    details = {}
    details["clusters"] = cluster
    details["max_iter"] = max_i
    km = KMeans(n_clusters=cluster, n_init="auto", random_state=42)
    km.fit(sample_flags)
    
    sil_score = silhouette_score(sample_flags, km.labels_)
    print(f"{model_name} score = {sil_score}")
    details["score"] = sil_score
    details["inertia"] = km.inertia_
    
    kmeans_models[model_name] = details
    
    with open(f"models/{model_name}.pkl", "wb") as f:
        pickle.dump(km, f)

kmeans_35c_default_i score = 0.008281905915136617
kmeans_40c_default_i score = 0.007494932185157251
kmeans_45c_default_i score = 0.007244433685604815
kmeans_50c_default_i score = 0.0076677207469152
kmeans_55c_default_i score = 0.004070570047512586
kmeans_60c_default_i score = 0.005761999899310619
kmeans_65c_default_i score = 0.0065947546451767345
kmeans_70c_default_i score = 0.005518304051742554
kmeans_75c_default_i score = 0.0054834518212471895
kmeans_80c_default_i score = 0.005751633972537051
kmeans_85c_default_i score = 0.005776518868208993
kmeans_90c_default_i score = 0.005115344110135365
kmeans_95c_default_i score = 0.00500923247561521
kmeans_100c_default_i score = 0.005285331211656954


In [10]:
dbscan_eps = [0.0001, 0.001, 0.01]
powers = [2, 3]

for eps in dbscan_eps:
    for power in powers:
        model_name = f"dbscan_{eps}e_{power}p"
        details = {}
        details["eps"] = eps
        details["power"] = power
        dbscan = DBSCAN(eps=eps, p=power, n_jobs=-1)
        dbscan.fit(sample_flags)
        
        sil_score = silhouette_score(sample_flags, dbscan.labels_)
        print(f"{model_name} score = {sil_score}")
        details["score"] = sil_score
        
        dbscan_models[model_name] = details
        
        with open(f"models/{model_name}", "wb") as f:
            pickle.dump(dbscan, f)

dbscan_0.0001e_2p score = -0.1137818994288678
dbscan_0.0001e_3p score = -0.1137818994288678
dbscan_0.001e_2p score = -0.1137818994288678
dbscan_0.001e_3p score = -0.1137818994288678
dbscan_0.01e_2p score = -0.1137818994288678
dbscan_0.01e_3p score = -0.1137818994288678


----
## filter for main dishes only
- the full recipe database has all kinds of recipes so it's not too surprising that these models have been unimpressive
- let's see if filtering the recipes to just "main course" will improve the results

In [6]:
raw = pd.read_csv("data/kaggle_food_dot_com/RAW_recipes.csv")
raw.shape

(231637, 12)

In [7]:
raw.head(3)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13


In [8]:
# reminder that this is a list wrapped in a string
raw["tags"][0]

"['60-minutes-or-less', 'time-to-make', 'course', 'main-ingredient', 'cuisine', 'preparation', 'occasion', 'north-american', 'side-dishes', 'vegetables', 'mexican', 'easy', 'fall', 'holiday-event', 'vegetarian', 'winter', 'dietary', 'christmas', 'seasonal', 'squash']"

In [9]:
# regex for the win!!
re.findall(r"\b[\w+-?]+", raw["tags"][0])

['60-minutes-or-less',
 'time-to-make',
 'course',
 'main-ingredient',
 'cuisine',
 'preparation',
 'occasion',
 'north-american',
 'side-dishes',
 'vegetables',
 'mexican',
 'easy',
 'fall',
 'holiday-event',
 'vegetarian',
 'winter',
 'dietary',
 'christmas',
 'seasonal',
 'squash']

In [10]:
def tags_to_list(tag_list):
    return re.findall(r"\b[\w+-?]+", tag_list)

raw["tag_list"] = raw["tags"].apply(tags_to_list)
raw["tag_list"].head()

0    [60-minutes-or-less, time-to-make, course, mai...
1    [30-minutes-or-less, time-to-make, course, mai...
2    [time-to-make, course, preparation, main-dish,...
3    [60-minutes-or-less, time-to-make, course, mai...
4    [weeknight, time-to-make, course, main-ingredi...
Name: tag_list, dtype: object

In [11]:
tags = raw["tag_list"].explode().unique()

# getting an error trying to write these tags to a file: TypeError: write() argument must be str, not float
# also need a `\n` between each element so the file is readable
tags = [ f"{tag}\n" for tag in tags if type(tag) == str]
len(tags)

560

In [13]:
# I need to get a good look at these tags so I know what to filter on
f = open("data/generated_data/tags.txt", "a")  # from https://www.w3schools.com/python/ref_file_writelines.asp
f.writelines(tags)
f.close()

In [14]:
# filtering for main course tags should include: ["*-main-dish-*", "main-dish", "dinner-party"] (searched tags for "main" and "dinner")
test_tags = ["middle-eastern-main-dish", "dinner-party", "main-dish-chicken", "pasta"]

for _ in test_tags:
    matches = re.search(r"main-dish|dinner", _)
    if matches != None:
        print(matches)

<re.Match object; span=(15, 24), match='main-dish'>
<re.Match object; span=(0, 6), match='dinner'>
<re.Match object; span=(0, 9), match='main-dish'>


In [15]:
def flag_main(tag_list):
    matches = re.search(r"main-dish|dinner", " ".join(tag_list))
    return 1 if matches != None else 0

raw["main_course"] = raw["tag_list"].apply(flag_main)

In [16]:
sum(raw["main_course"])

96201

In [17]:
mains = raw.loc[raw["main_course"] == 1]
mains.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,tag_list,main_course
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6,"[30-minutes-or-less, time-to-make, course, mai...",1
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13,"[time-to-make, course, preparation, main-dish,...",1
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11,"[60-minutes-or-less, time-to-make, course, mai...",1
6,aww marinated olives,25274,15,21730,2002-04-14,"['15-minutes-or-less', 'time-to-make', 'course...","[380.7, 53.0, 7.0, 24.0, 6.0, 24.0, 6.0]",4,['toast the fennel seeds and lightly crush the...,my italian mil was thoroughly impressed by my ...,"['fennel seeds', 'green olives', 'ripe olives'...",9,"[15-minutes-or-less, time-to-make, course, mai...",1
7,backyard style barbecued ribs,67888,120,10404,2003-07-30,"['weeknight', 'time-to-make', 'course', 'main-...","[1109.5, 83.0, 378.0, 275.0, 96.0, 86.0, 36.0]",10,['in a medium saucepan combine all the ingredi...,this recipe is posted by request and was origi...,"['pork spareribs', 'soy sauce', 'fresh garlic'...",22,"[weeknight, time-to-make, course, main-ingredi...",1


In [18]:
main_course_ids = list(mains["id"])
len(main_course_ids)

96201

In [19]:
type(main_course_ids[0])

int

- pull in the processed recipes to match the recipe ids to this filtered data

In [20]:
proc_df = pd.read_parquet("data/generated_data/processed_recipes.parquet")
proc_df.head(3)

Unnamed: 0,id,i,name_tokens,ingredient_tokens,steps_tokens,techniques,calorie_level,ingredient_ids,ingr_ints
0,424415,23,"[40480, 37229, 2911, 1019, 249, 6878, 6878, 28...","[[2911, 1019, 249, 6878], [1353], [6953], [153...","[40480, 40482, 21662, 481, 6878, 500, 246, 161...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[389, 7655, 6270, 1527, 3406]","[389, 7655, 6270, 1527, 3406]"
1,146223,96900,"[40480, 18376, 7056, 246, 1531, 2032, 40481]","[[17918], [25916], [2507, 6444], [8467, 1179],...","[40480, 40482, 729, 2525, 10906, 485, 43, 8393...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[2683, 4969, 800, 5298, 840, 2499, 6632, 7022,...","[2683, 4969, 800, 5298, 840, 2499, 6632, 7022,..."
2,312329,120056,"[40480, 21044, 16954, 8294, 556, 10837, 40481]","[[5867, 24176], [1353], [6953], [1301, 11332],...","[40480, 40482, 8240, 481, 24176, 296, 1353, 66...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",1,"[1257, 7655, 6270, 590, 5024, 1119, 4883, 6696...","[1257, 7655, 6270, 590, 5024, 1119, 4883, 6696..."


In [21]:
type(proc_df["id"][0])

numpy.int64

In [22]:
proc_mains = proc_df.loc[proc_df["id"].isin(main_course_ids)]

In [23]:
proc_mains.shape

(75297, 9)

In [24]:
ingredients = proc_mains["ingr_ints"].explode().unique()
len(ingredients)

6863

- main course has 6,863 ingredients; the "ingr_map" has 8,023 ingredients, seems reasonable that this filtered list would have about 85% of those
- dummify these recipes

In [25]:
cols = []
for id_col in ingredients:
    col = []
    for recipe_row in range(0, proc_mains.shape[0]):
        flag = 1 if id_col in proc_mains["ingr_ints"].iloc[recipe_row] else 0
        col.append(flag)
    cols.append(pd.Series(col))
    
flag_proc_df = pd.concat(cols, axis=1)
flag_proc_df.shape

(75297, 6863)

- I need to keep the recipe id with the flagged data

In [26]:
flag_proc_df["recipe_id"] = proc_mains["id"]

In [27]:
file_name = "data/generated_data/main_dish_dummies.parquet"
flag_proc_df.to_parquet(file_name, engine="pyarrow", compression="gzip")

  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


---
## mysterious nulls in the "recipe_id" column

In [29]:
flag_proc_df["recipe_id"].isna().sum()

43339

- huh?

In [31]:
proc_mains.shape[0] == flag_proc_df.shape[0]

True

In [42]:
proc_mains["id"].isna().sum()

0

- there are no nulls in the un-dummified data and 43,339 (57%!) nulls in the dummified data

In [56]:
# I don't know why adding that column in a normal way is giving me nulls but I don't have time to figure it out
for idx in range(proc_mains.shape[0]):
    flag_proc_df.iloc[idx]["recipe_id"] = proc_mains.iloc[idx]["id"]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flag_proc_df.iloc[idx]["recipe_id"] = proc_mains.iloc[idx]["id"]


In [57]:
flag_proc_df["recipe_id"].isna().sum()

43339

- yup, still all those nulls

In [62]:
suspect_ids = [ f"{id}\n" for id in flag_proc_df["recipe_id"] ]
f = open("data/generated_data/ids.txt", "a")  # from https://www.w3schools.com/python/ref_file_writelines.asp
f.writelines(suspect_ids)
f.close()

In [63]:
orig_ids = [ f"{id}\n" for id in proc_mains["id"] ]
f = open("data/generated_data/orig_ids.txt", "a")
f.writelines(orig_ids)
f.close()

- okay I still don't know why this is happening but those two files are...interesting

In [64]:
flag_proc_df["recipe_id"] = [int(id) for id in proc_mains["id"]]
flag_proc_df["recipe_id"].isna().sum()

0

- I'm sure there's some dumb pandas reason why I had to do it that way

In [66]:
file_name = "data/generated_data/main_dish_dummies.parquet"
flag_proc_df.to_parquet(file_name, engine="pyarrow", compression="gzip")

  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


- this warning is actually about the COLUMN names and not the row types but I'm glad it came up earlier because I hadn't even thought to check for nulls in those recipe ids

---
## kmeans with main-dish recipes only
- please be better, please be better, please be better

In [78]:
new_model_details = pd.DataFrame()

kmeans_mains = modeling.try_kmeans_models(cluster_range=range(5, 50, 7), name_modifier="all_mains", 
                                          data_df=flag_proc_df.drop(columns=["recipe_id"]), models_df=new_model_details)

kmeans_5c_mains_all_mains score = 0.02611545207525439
kmeans_12c_mains_all_mains score = 0.014283476656228896
kmeans_19c_mains_all_mains score = 0.012512314175224897
kmeans_26c_mains_all_mains score = 0.008327280580775554
kmeans_33c_mains_all_mains score = 0.008020459279035021
kmeans_40c_mains_all_mains score = 0.006950586773818742
kmeans_47c_mains_all_mains score = 0.005277396017073416


In [79]:
kmeans_mains

Unnamed: 0,model_type,clusters,score,inertia,name
0,kmeans,5,0.026115,636096.272441,kmeans_5c_mains_all_mains
1,kmeans,12,0.014283,607286.457808,kmeans_12c_mains_all_mains
2,kmeans,19,0.012512,593800.245345,kmeans_19c_mains_all_mains
3,kmeans,26,0.008327,585401.72676,kmeans_26c_mains_all_mains
4,kmeans,33,0.00802,579045.612712,kmeans_33c_mains_all_mains
5,kmeans,40,0.006951,574295.436875,kmeans_40c_mains_all_mains
6,kmeans,47,0.005277,568123.103745,kmeans_47c_mains_all_mains


In [36]:
dbscan_eps = [0.0001, 0.001, 0.01, 0.1, 0.25, 0.5, 1.0, 1.25]
dbscan_model_details = []

for eps in dbscan_eps:
    trial_details = {}
    trial_details["eps"] = eps
    dbscan = DBSCAN(eps=eps, n_jobs=-1)
    dbscan.fit(flag_proc_df)
    
    sil_score = silhouette_score(flag_proc_df, dbscan.labels_)
    trial_details["score"] = sil_score
    
    dbscan_model_details.append(trial_details)
    
    model_name = f"dbscan_{eps}e_mains"
    print(f"{model_name} score = {sil_score}")
    
    with open(f"models/main_course_recipes_models/{model_name}", "wb") as f:
        pickle.dump(dbscan, f)

dbscan_0.0001e_mains score = -0.11207505922147183
dbscan_0.001e_mains score = -0.11207505922147183
dbscan_0.01e_mains score = -0.11207505922147183
dbscan_0.1e_mains score = -0.11207505922147183
dbscan_0.25e_mains score = -0.11207505922147183
dbscan_0.5e_mains score = -0.11207505922147183
dbscan_1.0e_mains score = -0.18771775409269265
dbscan_1.25e_mains score = -0.18771775409269265


In [37]:
dbscan_models = pd.DataFrame.from_dict(dbscan_model_details)
dbscan_models.to_parquet("data/generated_data/filtered_dbscan_model_trials.parquet", engine="pyarrow", compression= "gzip")

---
## filter most frequent ingredients
- maybe 6,863 ingredients is too many
- let's try with fewer ingredients

In [2]:
flag_proc_df = pd.read_parquet("data/generated_data/main_dish_dummies.parquet")
flag_proc_df.shape

(75297, 6863)

In [71]:
# use only ingredients that show up in more than 5 recipes from: https://www.geeksforgeeks.org/pandas-filter-a-dataframe-by-the-sum-of-rows-or-columns/
df = flag_proc_df.loc[:, flag_proc_df.sum(axis=0) > 5]  # that 5 is totally arbitrary

In [80]:
df.shape

(75297, 3526)

In [82]:
updated_kmeans_results = modeling.try_kmeans_models(cluster_range=range(5,50,7), name_modifier="half_features", 
                                                    data_df=df.drop(columns=["recipe_id"]), models_df=kmeans_mains)

kmeans_5c_mains_half_features score = 0.029910841250235503
kmeans_12c_mains_half_features score = 0.013916367465081743
kmeans_19c_mains_half_features score = 0.010370923066061931
kmeans_26c_mains_half_features score = 0.003903981046639545
kmeans_33c_mains_half_features score = 0.005742282437713488
kmeans_40c_mains_half_features score = 0.0016878940560285907
kmeans_47c_mains_half_features score = 0.0035814081637385695


- that was no better than the results of all the main dishes
- I also don't know why my `range(5,55,7)` was stepping by 5 instead of 7?? 
    - figure out that it was because I hadn't reimported modeling.py after changing the loop in that function from the hard-coded range

---
## PCA
- instead of me deciding how many features to include, let's let sklearn actually figure out what's important
- this is from lesson 2.23

In [23]:
# per docs, "randomized" solver is used for input data larger than 500x500, 
#  I'm specifying it here so I remember that later
#  first ran this cell when df did not include the recipe_id column
pca = PCA(svd_solver="randomized", random_state=42)
pca.fit(df)
trained = pca.transform(df)

In [25]:
# Pull the explained variance attribute.
# shows how much of the variance is explained by each PCA, these values will always add up to 1
var_exp = pca.explained_variance_ratio_
print(f"Explained variance (first 20 components): {np.round(var_exp[:20], 3)}\n")

# Generate the cumulative explained variance.
cum_var_exp = np.cumsum(var_exp)
print(f'Cumulative explained variance (first 30 components): {np.round(cum_var_exp[:30], 3)}') 

Explained variance (first 20 components): [0.034 0.031 0.025 0.021 0.018 0.015 0.014 0.013 0.012 0.011 0.011 0.01
 0.009 0.009 0.009 0.009 0.008 0.008 0.008 0.007]

Cumulative explained variance (first 30 components): [0.034 0.065 0.09  0.111 0.129 0.144 0.159 0.172 0.184 0.195 0.206 0.216
 0.225 0.235 0.244 0.252 0.261 0.268 0.276 0.283 0.291 0.298 0.304 0.311
 0.317 0.324 0.33  0.336 0.341 0.346]


In [33]:
# how many features do I want to include? found this at: https://www.geeksforgeeks.org/python-get-the-index-of-first-element-greater-than-k/
i_90 = next(x for x, val in enumerate(cum_var_exp)if val > 0.9)
print(f"the first {i_90-1} features out of {len(cum_var_exp)} explain 90% of the variance")

the first 827 features out of 3525 explain 90% of the variance


In [84]:
pca_90 = PCA(svd_solver="randomized", random_state=42, n_components=827)
pca_90.fit(df.drop(columns=["recipe_id"]))
trained = pca_90.transform(df.drop(columns=["recipe_id"]))

In [85]:
pca_kmeans_results = modeling.try_kmeans_models(cluster_range=range(5, 20, 7), name_modifier="pca_90", 
                                                data_df=trained, models_df=updated_kmeans_results)

kmeans_5c_mains_pca_90 score = 0.026829644684107607
kmeans_12c_mains_pca_90 score = 0.011722777805553905
kmeans_19c_mains_pca_90 score = 0.004633226682016011


In [87]:
kmeans_results_file = "data/generated_data/filtered_kmeans_model_trials.parquet"
pca_kmeans_results.to_parquet(kmeans_results_file, engine="pyarrow", compression= "gzip")

In [90]:
pca_kmeans_results["name"]

0         kmeans_5c_mains_all_mains
1        kmeans_12c_mains_all_mains
2        kmeans_19c_mains_all_mains
3        kmeans_26c_mains_all_mains
4        kmeans_33c_mains_all_mains
5        kmeans_40c_mains_all_mains
6        kmeans_47c_mains_all_mains
0     kmeans_5c_mains_half_features
1    kmeans_12c_mains_half_features
2    kmeans_19c_mains_half_features
3    kmeans_26c_mains_half_features
4    kmeans_33c_mains_half_features
5    kmeans_40c_mains_half_features
6    kmeans_47c_mains_half_features
0            kmeans_5c_mains_pca_90
1           kmeans_12c_mains_pca_90
2           kmeans_19c_mains_pca_90
Name: name, dtype: object

---
## AgglomerativeClustering
- suggestion from Sumit to use hierarchical clustering

In [48]:
# starting with all default values & fitting on the non-PCA subset of features
agg_c = AgglomerativeClustering()
agg_c.fit(df)

: 

In [None]:
# getting data after kernel crashed
flag_proc_df = pd.read_parquet("data/generated_data/main_dish_dummies.parquet")

# we'll try this on the smaller, PCA data set
pca_90 = PCA(svd_solver="randomized", random_state=42, n_components=827)
pca_90.fit(flag_proc_df)
trained = pca_90.transform(flag_proc_df)

In [6]:
# starting with all default values & fitting on the PCA subset of features
agg_c = AgglomerativeClustering()
agg_c.fit(trained)

: 

- [the article](https://towardsdatascience.com/a-practical-introduction-to-hierarchical-clustering-from-scikit-learn-ffaf8ee2670c) I read about AgglomerativeClustering says that you can't use the silhouette score to evaluate the model
- according to that article, you'd have to rely on your domain knowledge to say if the model is generating anything useful
- I'm going to calculate it anyway

In [None]:
silhouette_score(trained, agg_c.labels_)