In [1]:
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score
from kmodes.kprototypes import KPrototypes
from kmodes.kmodes import KModes

from load_datasets import *
from duped_modules.gower_duped import gower_matrix as gower_matrix_duped

In [2]:
datasets = [
    load_soybean_disease(),
    load_heart_disease(),
    load_breast_cancer(),
    load_bank_marketing(),
    load_census_income(),
    load_credit_approval()
]

nmis = {d.name: {} for d in datasets}

In [3]:
# Naive KMeans
for d in datasets:
    print(f"Calculating for {d.name}...")
    total_nmi = 0
    df = d.df.copy()
    
    df[d.cat_cols] = StandardScaler().fit_transform(df[d.cat_cols])

    for i in range(100):
        kmeans = KMeans(n_clusters=len(np.unique(d.y)), n_init="auto").fit(df.values)
        total_nmi += normalized_mutual_info_score(d.y, kmeans.labels_)
        
    nmis[d.name]["Naive KMeans"] = total_nmi / 100

Calculating for Soybean Disease...
Calculating for Heart Disease...
Calculating for Breast Cancer...
Calculating for Bank Marketing...
Calculating for Census Income...
Calculating for Credit Approval...


In [4]:
# KMeans with One-Hot Encoding
def encode_feature(df, feature_to_encode):
    dummies = pd.get_dummies(df[feature_to_encode].astype(str), dtype=float, prefix=feature_to_encode)
    result_df = pd.concat([df, dummies], axis=1)
    result_df.drop(columns=feature_to_encode, inplace=True)
    return result_df

for d in datasets:
    print(f"Calculating for {d.name}...")
    total_nmi = 0
    df = d.df.copy()

    for col in d.cat_cols:
        df = encode_feature(df, col)

    for i in range(100):
        kmeans = KMeans(n_clusters=len(np.unique(d.y)), n_init="auto").fit(df.values)
        total_nmi += normalized_mutual_info_score(d.y, kmeans.labels_)

    nmis[d.name]["k-means One-Hot"] = total_nmi / 100

Calculating for Soybean Disease...
Calculating for Heart Disease...
Calculating for Breast Cancer...
Calculating for Bank Marketing...
Calculating for Census Income...
Calculating for Credit Approval...


In [5]:
# KPrototypes
for d in datasets:
    print(f"Calculating for {d.name}...")
    total_nmi = 0
    df = d.df.copy()

    for i in range(1):
        if not d.cont_cols:
            k = KModes(n_clusters=len(np.unique(d.y)), verbose=None, n_jobs=-1).fit(df.values)
        else:
            k = KPrototypes(n_clusters=len(np.unique(d.y)), verbose=None, n_jobs=-1).fit(df.values, categorical=[df.columns.get_loc(col) for col in d.cat_cols])
        total_nmi += normalized_mutual_info_score(d.y, k.labels_)

    nmis[d.name]["k-Prototypes"] = total_nmi / 100

Calculating for Soybean Disease...
Calculating for Heart Disease...
Calculating for Breast Cancer...
Calculating for Bank Marketing...
Calculating for Census Income...
Calculating for Credit Approval...


In [6]:
pd.DataFrame(nmis.values(), index=nmis.keys())

Unnamed: 0,Naive KMeans,k-means One-Hot,k-Prototypes
Soybean Disease,0.686038,0.681343,0.006947
Heart Disease,0.200956,0.192618,0.001833
Breast Cancer,0.749538,0.735159,0.00588
Bank Marketing,0.01738,0.023863,0.000176
Census Income,0.081302,0.147739,0.001349
Credit Approval,0.203933,0.157413,0.00109
