In [1]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score
from kmodes.kprototypes import KPrototypes
from kmodes.kmodes import KModes
from sklearn.cluster import AgglomerativeClustering
import gc

from load_datasets import *
from duped_modules.gower_duped import gower_matrix as gower_matrix_duped

In [2]:
# Load every benchmark dataset once, in a fixed order; all clustering cells
# below iterate over this list.
datasets = [
    load()
    for load in (
        load_soybean_disease,
        load_heart_disease,
        load_breast_cancer,
        load_bank_marketing,
        load_census_income,
        load_credit_approval,
    )
]

# Results store: dataset name -> {method label -> NMI score}, filled in by
# the clustering cells.
nmis = dict((ds.name, {}) for ds in datasets)

In [3]:
# Naive KMeans: every column (including categorical codes) is fed to k-means
# as an ordinary numeric feature.
NAIVE_KMEANS_RUNS = 100  # number of k-means fits averaged per dataset

for d in datasets:
    print(f"Calculating for {d.name}...")
    df = d.df.copy()
    n_clusters = len(np.unique(d.y))  # one cluster per ground-truth class

    # NOTE(review): only the categorical columns are standardized here; the
    # continuous columns are used exactly as loaded — confirm this is intended.
    df[d.cat_cols] = StandardScaler().fit_transform(df[d.cat_cols])
    features = df.values  # loop-invariant, so build the array once

    total_nmi = 0
    for run in range(NAIVE_KMEANS_RUNS):
        # Seeding each fit with its run index keeps the runs distinct while
        # making the averaged score reproducible after a kernel restart.
        kmeans = KMeans(n_clusters=n_clusters, n_init="auto", random_state=run).fit(features)
        total_nmi += normalized_mutual_info_score(d.y, kmeans.labels_)

    nmis[d.name]["Naive k-means"] = total_nmi / NAIVE_KMEANS_RUNS

Calculating for Soybean Disease...
Calculating for Heart Disease...
Calculating for Breast Cancer...
Calculating for Bank Marketing...
Calculating for Census Income...
Calculating for Credit Approval...


In [4]:
# KMeans with One-Hot Encoding
def encode_feature(df, feature_to_encode):
    """Return a new frame in which one column is replaced by one-hot columns.

    The values of ``feature_to_encode`` are cast to ``str`` and expanded into
    float indicator columns named ``<feature_to_encode>_<value>``. These are
    appended after the remaining columns, and the original column is removed.
    The input frame itself is not modified.

    Args:
        df: Source DataFrame containing ``feature_to_encode``.
        feature_to_encode: Label of the column to expand.

    Returns:
        A DataFrame with the untouched columns first, followed by the
        indicator columns.
    """
    indicator_cols = pd.get_dummies(
        df[feature_to_encode].astype(str),
        prefix=feature_to_encode,
        dtype=float,
    )
    remaining_cols = df.drop(columns=feature_to_encode)
    return pd.concat([remaining_cols, indicator_cols], axis=1)

ONE_HOT_KMEANS_RUNS = 100  # number of k-means fits averaged per dataset

for d in datasets:
    print(f"Calculating for {d.name}...")
    df = d.df.copy(deep=True)
    n_clusters = len(np.unique(d.y))  # one cluster per ground-truth class

    # Replace each categorical column with its one-hot indicator columns.
    for col in d.cat_cols:
        df = encode_feature(df, col)
    features = df.values  # loop-invariant, so build the array once

    total_nmi = 0
    for run in range(ONE_HOT_KMEANS_RUNS):
        # Seeding each fit with its run index keeps the runs distinct while
        # making the averaged score reproducible after a kernel restart.
        kmeans = KMeans(n_clusters=n_clusters, n_init="auto", random_state=run).fit(features)
        total_nmi += normalized_mutual_info_score(d.y, kmeans.labels_)

    nmis[d.name]["k-means one-hot"] = total_nmi / ONE_HOT_KMEANS_RUNS

Calculating for Soybean Disease...
Calculating for Heart Disease...
Calculating for Breast Cancer...
Calculating for Bank Marketing...
Calculating for Census Income...
Calculating for Credit Approval...


In [5]:
# KPrototypes (KModes is used instead when a dataset has no continuous columns)
KPROTOTYPES_RUNS = 1  # fits averaged per dataset; a single run is used here

for d in datasets:
    print(f"Calculating for {d.name}...")
    df = d.df.copy(deep=True)
    n_clusters = len(np.unique(d.y))  # one cluster per ground-truth class
    total_nmi = 0

    for run in range(KPROTOTYPES_RUNS):
        # The run index doubles as the seed so results are reproducible.
        if not d.cont_cols:
            clust = KModes(
                n_clusters=n_clusters, verbose=0, n_jobs=-1, random_state=run
            ).fit(df.values)
        else:
            # KPrototypes needs the positional indices of the categorical columns.
            categorical_idx = [df.columns.get_loc(col) for col in d.cat_cols]
            clust = KPrototypes(
                n_clusters=n_clusters, verbose=0, n_jobs=-1, random_state=run
            ).fit(df.values, categorical=categorical_idx)
        total_nmi += normalized_mutual_info_score(d.y, clust.labels_)

    nmis[d.name]["k-prototypes"] = total_nmi / KPROTOTYPES_RUNS

Calculating for Soybean Disease...
Calculating for Heart Disease...
Calculating for Breast Cancer...
Calculating for Bank Marketing...
Calculating for Census Income...
Calculating for Credit Approval...


In [8]:
# Gower Distance + Agglomerative Clustering
GOWER_ROW_THRESHOLD = 10000  # datasets with more rows than this are truncated
GOWER_SUBSAMPLE_ROWS = 2000  # rows kept for truncated datasets

for d in datasets:
    print(f"Calculating for {d.name}...")
    df = d.df.copy(deep=True)
    # Work on a local view of the labels. Assigning the truncated labels back
    # to `d.y` would shorten them for every other cell, so re-running an
    # earlier cell would pair a full-length `d.df` with truncated labels.
    y = np.asarray(d.y)
    if df.shape[0] > GOWER_ROW_THRESHOLD:
        # The pairwise distance matrix grows quadratically with the row count,
        # so only the first rows are kept.
        # NOTE(review): `head` takes the leading rows, not a random sample.
        df = df.head(GOWER_SUBSAMPLE_ROWS)
        y = y[:GOWER_SUBSAMPLE_ROWS]

    n_clusters = len(np.unique(y))  # one cluster per class present in `y`

    distance_matrix = gower_matrix_duped(df.astype(float))
    labels = AgglomerativeClustering(
        n_clusters=n_clusters, metric="precomputed", linkage="average"
    ).fit_predict(distance_matrix)
    nmis[d.name]["Gower Distance + Agglomerative Clustering"] = normalized_mutual_info_score(y, labels)

    # Release the distance matrix before the next dataset is processed.
    del distance_matrix, labels
    gc.collect()

Calculating for Soybean Disease...


100%|██████████| 562/562 [00:00<00:00, 13943.22it/s]


Calculating for Heart Disease...


100%|██████████| 299/299 [00:00<00:00, 42085.20it/s]


Calculating for Breast Cancer...


100%|██████████| 683/683 [00:00<00:00, 31513.92it/s]


Calculating for Bank Marketing...


100%|██████████| 2000/2000 [00:00<00:00, 12768.73it/s]


Calculating for Census Income...


100%|██████████| 2000/2000 [00:00<00:00, 12287.76it/s]


Calculating for Credit Approval...


100%|██████████| 653/653 [00:00<00:00, 28326.70it/s]


In [9]:
# Summary table: one row per dataset, one column per clustering method.
pd.DataFrame(list(nmis.values()), index=list(nmis.keys()))

Unnamed: 0,Naive k-means,k-means one-hot,k-prototypes,Gower Distance + Agglomerative Clustering
Soybean Disease,0.682423,0.678341,0.694658,0.669526
Heart Disease,0.199321,0.192299,0.189973,0.140792
Breast Cancer,0.750027,0.735109,0.58796,0.537113
Bank Marketing,0.016164,0.029781,0.017566,0.000356
Census Income,0.075318,0.153712,0.033596,0.000507
Credit Approval,0.180201,0.159079,0.114664,0.003465
