In [1]:
from copy import deepcopy
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from kmodes.kprototypes import KPrototypes

from load_datasets import *
from duped_modules.gower_duped import gower_matrix as gower_matrix_duped

In [2]:
# Load every dataset by calling its loader; the list keeps the loaders' order.
datasets = [
    load()
    for load in (
        load_soybean_disease,
        load_heart_disease,
        load_breast_cancer,
        load_bank_marketing,
        load_census_income,
        load_credit_approval,
    )
]

# Result store keyed by dataset name; each dataset gets its own (initially empty)
# dict that later cells fill with one score per clustering method.
nmis = dict((d.name, {}) for d in datasets)

In [3]:
# Naive KMeans: feed the categorical columns to KMeans as if they were numeric.
n_runs = 100  # number of KMeans fits whose NMI is averaged per dataset

for d in datasets:
    print(f"Calculating for {d.name}...")
    df = d.df.copy()  # work on a copy so the loaded dataset is left untouched

    # Standardize the categorical columns so they can be used as plain numeric features.
    # NOTE(review): presumably the loaders already integer-code cat_cols — confirm.
    df[d.cat_cols] = StandardScaler().fit_transform(df[d.cat_cols])

    # Loop-invariant inputs, computed once instead of on every run.
    features = df.values
    n_clusters = len(np.unique(d.y))  # one cluster per distinct label in d.y

    total_nmi = 0
    for run in range(n_runs):
        # Seed each run with its own index: the runs still differ from one
        # another, but the averaged score is identical on every re-execution.
        kmeans = KMeans(n_clusters=n_clusters, n_init="auto", random_state=run).fit(features)
        total_nmi += normalized_mutual_info_score(d.y, kmeans.labels_)

    # Mean NMI over all runs, stored under this method's name.
    nmis[d.name]["Naive KMeans"] = total_nmi / n_runs

Calculating for Soybean Disease...
Calculating for Heart Disease...
Calculating for Breast Cancer...
Calculating for Bank Marketing...
Calculating for Census Income...
Calculating for Credit Approval...


In [11]:
# KMeans with One-Hot Encoding
def encode_feature(df, feature_to_encode):
    """Return a copy of ``df`` with one column replaced by one-hot indicators.

    The values of ``feature_to_encode`` are cast to ``str`` and expanded with
    ``pd.get_dummies`` into float columns named ``<feature_to_encode>_<value>``.
    Those indicator columns are appended after the remaining original columns.
    The input frame itself is not modified.
    """
    indicator_cols = pd.get_dummies(
        df[feature_to_encode].astype(str),
        prefix=feature_to_encode,
        dtype=float,
    )
    remaining_cols = df.drop(columns=feature_to_encode)
    return pd.concat([remaining_cols, indicator_cols], axis=1)

n_runs = 100  # number of KMeans fits whose NMI is averaged per dataset

for d in datasets:
    print(f"Calculating for {d.name}...")

    # One-hot encode every categorical column of a copy of the dataset, one
    # column at a time. The result stays bound to `df` because the following
    # cell displays that name.
    df = d.df.copy()
    for col in d.cat_cols:
        df = encode_feature(df, col)

    # Loop-invariant inputs, computed once instead of on every run.
    features = df.values
    n_clusters = len(np.unique(d.y))  # one cluster per distinct label in d.y

    total_nmi = 0
    for run in range(n_runs):
        # Seed each run with its own index: the runs still differ from one
        # another, but the averaged score is identical on every re-execution.
        kmeans = KMeans(n_clusters=n_clusters, n_init="auto", random_state=run).fit(features)
        total_nmi += normalized_mutual_info_score(d.y, kmeans.labels_)

    # Mean NMI over all runs, stored under this method's name.
    nmis[d.name]["KMeans One-Hot"] = total_nmi / n_runs

Calculating for Soybean Disease...
Calculating for Heart Disease...
Calculating for Breast Cancer...
Calculating for Bank Marketing...
Calculating for Census Income...
Calculating for Credit Approval...


In [12]:
# Inspect the frame left in `df` by the loop in the previous cell: the one-hot
# encoded copy of the last dataset that loop processed.
df

Unnamed: 0,A2,A3,A8,A11,A14,A15,A1_0,A1_1,A4_0,A4_1,...,A7_8,A9_0,A9_1,A10_0,A10_1,A12_0,A12_1,A13_0,A13_1,A13_2
0,-0.056962,-0.961440,-0.295171,-0.302596,0.128682,-0.193125,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
1,2.296536,-0.073565,0.236217,0.704516,-0.816802,-0.086443,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
2,-0.592078,-0.861903,-0.220955,-0.504019,0.592504,-0.036150,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,-0.310572,-0.654865,0.446990,0.503093,-0.477855,-0.192553,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
4,-0.958122,0.158358,-0.158613,-0.504019,-0.358926,-0.193125,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,-0.881193,1.046233,-0.295171,-0.504019,0.473575,-0.193125,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
686,-0.746780,-0.812134,-0.072523,-0.101174,0.116789,-0.118066,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
687,-0.528676,1.726075,-0.072523,-0.302596,0.116789,-0.192934,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
688,-1.148329,-0.920630,-0.654378,-0.504019,0.592504,-0.050247,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


In [13]:
# Tabulate the collected scores: one row per dataset, one column per method.
pd.DataFrame(data=list(nmis.values()), index=list(nmis))

Unnamed: 0,Naive KMeans,KMeans One-Hot
Soybean Disease,0.690885,0.681042
Heart Disease,0.201467,0.192174
Breast Cancer,0.750728,0.734709
Bank Marketing,0.015489,0.025165
Census Income,0.077428,0.146353
Credit Approval,0.222717,0.164793
