In [6]:
import pickle

import pandas as pd
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score

In [2]:
# Load the flag table from its parquet file; the trailing expression shows
# (rows, columns) as the cell output.
# NOTE(review): presumably one 0/1 dummy column per ingredient -- confirm
# against whatever produced this file.
FLAGS_PARQUET = "data/ingr_dummies.parquet"
flags_df = pd.read_parquet(FLAGS_PARQUET)
flags_df.shape

(178265, 8023)

In [3]:
# Work on a 25% random subset of the rows, drawn without replacement.
# random_state pins the draw so that a Restart & Run All fits and scores every
# model on the same rows; 42 matches the seed used for the KMeans fits.
sample_flags = flags_df.sample(frac=0.25, replace=False, random_state=42)
sample_flags.shape

(44566, 8023)

In [5]:
# Hyper-parameter grids for the two clustering sweeps.

# K-Means: cluster counts 5, 10, ..., 30 crossed with two iteration caps.
kmeans_params = dict(
    clusters=range(5, 31, 5),
    max_iter=[300, 500],
)

# DBSCAN: neighbourhood radii crossed with two neighbour-search algorithms.
dbscan_params = dict(
    eps=[0.1, 0.25, 0.5, 1.0, 1.25],
    algorithm=["auto", "kd_tree"],
)

# Empty container; nothing in this cell populates it.
results = dict()

In [7]:
# K-Means sweep: fit one model per (cluster count, max_iter) pair, score it
# with the silhouette coefficient on the same sample it was fitted on, record
# the metrics in `kmeans_models`, and pickle the fitted estimator.
kmeans_models = {}

# Flatten the grid; cluster count varies slowest, as in a nested loop.
kmeans_grid = [
    (cluster, max_i)
    for cluster in kmeans_params["clusters"]
    for max_i in kmeans_params["max_iter"]
]

for cluster, max_i in kmeans_grid:
    model_name = f"kmeans_{cluster}c_{max_i}i"

    # fit() returns the estimator itself, so construction and fitting chain.
    km = KMeans(
        n_clusters=cluster,
        max_iter=max_i,
        n_init="auto",
        random_state=42,
    ).fit(sample_flags)

    sil_score = silhouette_score(sample_flags, km.labels_)
    print(f"{model_name} score = {sil_score}")

    # One record per model, keyed by the same name used for the pickle file.
    details = {
        "clusters": cluster,
        "max_iter": max_i,
        "score": sil_score,
        "inertia": km.inertia_,
    }
    kmeans_models[model_name] = details

    with open(f"models/{model_name}.pkl", "wb") as f:
        pickle.dump(km, f)
        

kmeans_5c_300i score = 0.025359556961519467
kmeans_5c_500i score = 0.025359556961519467
kmeans_10c_300i score = 0.01024073806683298
kmeans_10c_500i score = 0.01024073806683298
kmeans_15c_300i score = 0.012603195125328414
kmeans_15c_500i score = 0.012603195125328414
kmeans_20c_300i score = 0.006635561539243761
kmeans_20c_500i score = 0.006635561539243761
kmeans_25c_300i score = 0.00571858492315883
kmeans_25c_500i score = 0.00571858492315883
kmeans_30c_300i score = 0.007765257017801726
kmeans_30c_500i score = 0.007765257017801726


In [8]:
# DBSCAN sweep: fit one model per (eps, algorithm) pair, score it with the
# silhouette coefficient, record the metrics in `dbscan_models`, and pickle
# the fitted estimator.
dbscan_models = {}

for eps in dbscan_params["eps"]:
    for algo in dbscan_params["algorithm"]:
        model_name = f"dbscan_{eps}e_{algo}a"
        details = {}
        details["eps"] = eps
        details["algorithm"] = algo
        dbscan = DBSCAN(eps=eps, algorithm=algo, n_jobs=-1)
        dbscan.fit(sample_flags)

        # silhouette_score only accepts 2 <= n_labels <= n_samples - 1 and
        # raises ValueError otherwise (e.g. every point flagged as noise, or
        # everything merged into one cluster).  Record NaN for such a
        # degenerate labelling instead of aborting the rest of the sweep.
        # NOTE: DBSCAN labels noise points -1, and silhouette_score treats
        # that label as one more cluster, so the score includes the noise.
        n_labels = len(set(dbscan.labels_))
        if 2 <= n_labels <= len(dbscan.labels_) - 1:
            sil_score = silhouette_score(sample_flags, dbscan.labels_)
        else:
            sil_score = float("nan")
        print(f"{model_name} score = {sil_score}")
        details["score"] = sil_score

        dbscan_models[model_name] = details

        # NOTE(review): unlike the K-Means pickles, these files are written
        # without a ".pkl" suffix -- path kept as-is so existing files and
        # any loaders keep working.
        with open(f"models/{model_name}", "wb") as f:
            pickle.dump(dbscan, f)

dbscan_0.1e_autoa score = -0.1137818994288678
dbscan_0.1e_kd_treea score = -0.1137818994288678
dbscan_0.25e_autoa score = -0.1137818994288678
dbscan_0.25e_kd_treea score = -0.1137818994288678
dbscan_0.5e_autoa score = -0.1137818994288678
dbscan_0.5e_kd_treea score = -0.1137818994288678
dbscan_1.0e_autoa score = -0.2008541704716405
dbscan_1.0e_kd_treea score = -0.2008541704716405
dbscan_1.25e_autoa score = -0.2008541704716405
dbscan_1.25e_kd_treea score = -0.2008541704716405


In [9]:
# Extended K-Means sweep: cluster counts 35, 40, ..., 100 with the estimator's
# default iteration cap.  Results are added to the existing `kmeans_models`
# dict and each fitted model is pickled under models/.
for cluster in range(35, 101, 5):
    # Only used as a tag in the model / file name; max_iter is not passed to
    # KMeans here, so the library default applies.
    max_i = "default_"
    model_name = f"kmeans_{cluster}c_{max_i}i"
    km = KMeans(n_clusters=cluster, n_init="auto", random_state=42)
    km.fit(sample_flags)

    details = {}
    details["clusters"] = cluster
    # Record the numeric iteration cap the estimator actually used rather than
    # the "default_" tag, so this field stays an int like the entries written
    # by the first K-Means sweep.
    details["max_iter"] = km.max_iter

    sil_score = silhouette_score(sample_flags, km.labels_)
    print(f"{model_name} score = {sil_score}")
    details["score"] = sil_score
    details["inertia"] = km.inertia_

    kmeans_models[model_name] = details

    with open(f"models/{model_name}.pkl", "wb") as f:
        pickle.dump(km, f)

kmeans_35c_default_i score = 0.008281905915136617
kmeans_40c_default_i score = 0.007494932185157251
kmeans_45c_default_i score = 0.007244433685604815
kmeans_50c_default_i score = 0.0076677207469152
kmeans_55c_default_i score = 0.004070570047512586
kmeans_60c_default_i score = 0.005761999899310619
kmeans_65c_default_i score = 0.0065947546451767345
kmeans_70c_default_i score = 0.005518304051742554
kmeans_75c_default_i score = 0.0054834518212471895
kmeans_80c_default_i score = 0.005751633972537051
kmeans_85c_default_i score = 0.005776518868208993
kmeans_90c_default_i score = 0.005115344110135365
kmeans_95c_default_i score = 0.00500923247561521
kmeans_100c_default_i score = 0.005285331211656954


In [10]:
# Second DBSCAN sweep: much smaller eps values crossed with the Minkowski
# power p.  Results are added to the existing `dbscan_models` dict and each
# fitted model is pickled under models/.
smaller_eps = [0.0001, 0.001, 0.01]
powers = [2, 3]

for eps in smaller_eps:
    for power in powers:
        model_name = f"dbscan_{eps}e_{power}p"
        details = {}
        details["eps"] = eps
        details["power"] = power
        # `p` is only honoured by the Minkowski metric.  With DBSCAN's default
        # metric="euclidean" it is ignored, which made the 2p and 3p runs the
        # same model.  Request Minkowski explicitly so the power takes effect
        # (p=2 is still plain Euclidean distance).
        # NOTE(review): p=3 has no fast Euclidean path, so expect those fits
        # to be slower on a table this wide.
        dbscan = DBSCAN(eps=eps, metric="minkowski", p=power, n_jobs=-1)
        dbscan.fit(sample_flags)

        # silhouette_score only accepts 2 <= n_labels <= n_samples - 1 and
        # raises ValueError otherwise; record NaN for a degenerate labelling
        # instead of aborting the rest of the sweep.
        n_labels = len(set(dbscan.labels_))
        if 2 <= n_labels <= len(dbscan.labels_) - 1:
            sil_score = silhouette_score(sample_flags, dbscan.labels_)
        else:
            sil_score = float("nan")
        print(f"{model_name} score = {sil_score}")
        details["score"] = sil_score

        dbscan_models[model_name] = details

        with open(f"models/{model_name}", "wb") as f:
            pickle.dump(dbscan, f)

dbscan_0.0001e_2p score = -0.1137818994288678
dbscan_0.0001e_3p score = -0.1137818994288678
dbscan_0.001e_2p score = -0.1137818994288678
dbscan_0.001e_3p score = -0.1137818994288678
dbscan_0.01e_2p score = -0.1137818994288678
dbscan_0.01e_3p score = -0.1137818994288678
