In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score
from sklearn.preprocessing import StandardScaler
from kmodes.kprototypes import KPrototypes
from kmodes.kmodes import KModes
from sklearn.cluster import AgglomerativeClustering
import gc

from utils import *
from load_datasets import load_all_datasets
from duped_modules.gower_duped import gower_matrix as gower_matrix_duped

In [2]:
# Seed NumPy's global RNG so anything drawing from it is repeatable.
np.random.seed(0)

# Load the benchmark datasets (max_rows=5000 presumably caps the rows kept
# per dataset -- see load_datasets for the exact sampling behaviour).
datasets = load_all_datasets(max_rows=5000)

# Result tables laid out as {dataset name: {method name: score}}; the inner
# dicts start empty and are filled by the clustering cells further down.
accuracies = {}
nmis = {}
for d in datasets:
    accuracies[d.name] = {}
    nmis[d.name] = {}
    print(f"{d.name}: Input dim: {d.input_dim}; Cat dim: {d.cat_dim}; Cont dim: {d.cont_dim}")

Abalone: Input dim: 9; Cat dim: 2; Cont dim: 7
Auction Verification: Input dim: 13; Cat dim: 12; Cont dim: 1
Bank Marketing: Input dim: 22; Cat dim: 17; Cont dim: 5
Breast Cancer: Input dim: 45; Cat dim: 45; Cont dim: 0
Census Income: Input dim: 58; Cat dim: 52; Cont dim: 6
Credit Approval: Input dim: 28; Cat dim: 22; Cont dim: 6
Heart Disease: Input dim: 17; Cat dim: 11; Cont dim: 6
Soybean Disease: Input dim: 55; Cat dim: 55; Cont dim: 0


In [3]:
# Naive KMeans
# Baseline: the categorical columns are fed to k-means as if they were
# ordinary numeric features.
for d in datasets:
    print(f"Calculating for {d.name}...")

    # Work on a private copy so the dataset's own frame is never modified.
    df = d.df.copy(deep=True)

    # NOTE(review): only the categorical columns are standardised here; the
    # continuous columns go in exactly as loaded -- confirm they are already
    # scaled upstream.
    scaler = StandardScaler()
    df[d.cat_cols] = scaler.fit_transform(df[d.cat_cols])

    # Single run with random initialisation and a fixed seed.
    kmeans = KMeans(
        n_clusters=d.n_targets,
        init="random",
        n_init=1,
        max_iter=300,
        algorithm="lloyd",
        random_state=0,
    )
    kmeans.fit(df.values)

    # Score the cluster assignment against the ground-truth labels.
    nmis[d.name]["Naive k-means"] = normalized_mutual_info_score(d.y, kmeans.labels_)
    accuracies[d.name]["Naive k-means"] = cluster_accuracy(d.y, kmeans.labels_)

Calculating for Abalone...
Calculating for Auction Verification...
Calculating for Bank Marketing...
Calculating for Breast Cancer...
Calculating for Census Income...
Calculating for Credit Approval...
Calculating for Heart Disease...
Calculating for Soybean Disease...


In [4]:
# KMeans with One-Hot Encoding
def encode_feature(df, feature_to_encode):
    """Replace one column of ``df`` with one-hot indicator columns.

    The column is cast to ``str`` before encoding, so every distinct string
    representation becomes its own category. Indicator columns are float
    valued, named ``<feature_to_encode>_<value>``, and appended after the
    remaining original columns.

    Args:
        df: Input frame; it is not modified.
        feature_to_encode: Label of the column to expand.

    Returns:
        A new DataFrame without the original column and with the indicator
        columns added on the right.
    """
    as_text = df[feature_to_encode].astype(str)
    indicator_cols = pd.get_dummies(as_text, prefix=feature_to_encode, dtype=float)
    widened = pd.concat([df, indicator_cols], axis=1)
    return widened.drop(columns=feature_to_encode)

for d in datasets:
    print(f"Calculating for {d.name}...")

    # Start from a private copy, then expand the categorical columns into
    # indicator columns one at a time.
    df = d.df.copy(deep=True)
    for col in d.cat_cols:
        df = encode_feature(df, col)

    # Same k-means configuration as the naive baseline: one run, random
    # initialisation, fixed seed.
    kmeans = KMeans(
        n_clusters=d.n_targets,
        init="random",
        n_init=1,
        max_iter=300,
        algorithm="lloyd",
        random_state=0,
    )
    kmeans.fit(df.values)

    # Score the cluster assignment against the ground-truth labels.
    nmis[d.name]["k-means one-hot"] = normalized_mutual_info_score(d.y, kmeans.labels_)
    accuracies[d.name]["k-means one-hot"] = cluster_accuracy(d.y, kmeans.labels_)

Calculating for Abalone...
Calculating for Auction Verification...
Calculating for Bank Marketing...
Calculating for Breast Cancer...
Calculating for Census Income...
Calculating for Credit Approval...
Calculating for Heart Disease...
Calculating for Soybean Disease...


In [5]:
# KPrototypes
# Uses k-prototypes when the dataset has continuous columns, and falls back
# to k-modes when every column is categorical.
for d in datasets:
    print(f"Calculating for {d.name}...")
    df = d.df.copy(deep=True)

    # NOTE(review): the cluster count is taken from the distinct labels in
    # d.y here, whereas the other cells use d.n_targets -- confirm the two
    # always agree.
    n_clusters = len(np.unique(d.y))

    # Constructor arguments shared by every estimator built in this cell.
    common_kwargs = dict(n_init=1, n_clusters=n_clusters, verbose=None, n_jobs=-1, random_state=0)

    if d.cont_cols:
        # Positional indices of the categorical / continuous columns in df.
        cat_cols_indices = [df.columns.get_loc(col) for col in d.cat_cols]
        cont_cols_indices = [df.columns.get_loc(col) for col in d.cont_cols]
        proto_kwargs = dict(common_kwargs, max_iter=300)
        try:
            clust = KPrototypes(init="Huang", **proto_kwargs).fit(df.values, categorical=cat_cols_indices)
        except ValueError:
            # The built-in initialization can fail for large k, so we
            # initialize the centroids ourselves from the first n_clusters
            # rows (continuous part first, then categorical part).
            init_centroids = [
                df.values[:, cont_cols_indices][:n_clusters],
                df.values[:, cat_cols_indices][:n_clusters],
            ]
            clust = KPrototypes(init=init_centroids, **proto_kwargs).fit(df.values, categorical=cat_cols_indices)
    else:
        # Purely categorical data: plain k-modes. No max_iter is passed
        # here, so the library default applies.
        clust = KModes(init="Huang", **common_kwargs).fit(df.values)

    # Score the cluster assignment against the ground-truth labels.
    nmis[d.name]["k-prototypes"] = normalized_mutual_info_score(d.y, clust.labels_)
    accuracies[d.name]["k-prototypes"] = cluster_accuracy(d.y, clust.labels_)

Calculating for Abalone...
Calculating for Auction Verification...
Calculating for Bank Marketing...
Calculating for Breast Cancer...
Calculating for Census Income...
Calculating for Credit Approval...
Calculating for Heart Disease...
Calculating for Soybean Disease...


In [6]:
# Gower Distance + Agglomerative Clustering
# Builds a pairwise distance matrix per dataset and clusters on it with
# average-linkage agglomerative clustering.
for d in datasets:
    print(f"Calculating for {d.name}...")
    df = d.df.copy(deep=True)

    # Every column is cast to float before the distance computation.
    distance_matrix = gower_matrix_duped(df.astype(float))

    # metric="precomputed": the input is treated as distances, not features.
    linkage_model = AgglomerativeClustering(
        n_clusters=d.n_targets,
        metric="precomputed",
        linkage="average",
    )
    clust = linkage_model.fit_predict(distance_matrix)

    # Score the cluster assignment against the ground-truth labels.
    nmis[d.name]["Gower distance"] = normalized_mutual_info_score(d.y, clust)
    accuracies[d.name]["Gower distance"] = cluster_accuracy(d.y, clust)

    # Drop the per-dataset intermediates and collect before the next
    # iteration allocates a new distance matrix.
    del distance_matrix, clust, linkage_model
    gc.collect()

Calculating for Abalone...


  0%|          | 0/4177 [00:00<?, ?it/s]

100%|██████████| 4177/4177 [00:01<00:00, 2674.56it/s]


Calculating for Auction Verification...


100%|██████████| 2043/2043 [00:00<00:00, 6040.01it/s]


Calculating for Bank Marketing...


100%|██████████| 5000/5000 [00:03<00:00, 1549.12it/s]


Calculating for Breast Cancer...


100%|██████████| 683/683 [00:00<00:00, 7873.91it/s]


Calculating for Census Income...


100%|██████████| 5000/5000 [00:02<00:00, 2300.48it/s]


Calculating for Credit Approval...


100%|██████████| 653/653 [00:00<00:00, 13457.95it/s]

Calculating for Heart Disease...



100%|██████████| 299/299 [00:00<00:00, 17972.93it/s]


Calculating for Soybean Disease...


100%|██████████| 562/562 [00:00<00:00, 11572.48it/s]


In [7]:
# NMI scores: one row per dataset, one column per method, rounded to 4 decimals.
nmi_table = pd.DataFrame(list(nmis.values()), index=list(nmis.keys()))
nmi_table.round(4)

Unnamed: 0,Naive k-means,k-means one-hot,k-prototypes,Gower distance
Abalone,0.1718,0.174,0.1716,0.1614
Auction Verification,0.0162,0.0071,0.0077,0.0062
Bank Marketing,0.0198,0.0261,0.0195,0.0013
Breast Cancer,0.7468,0.7363,0.5925,0.5537
Census Income,0.108,0.185,0.1417,0.0043
Credit Approval,0.3131,0.171,0.1166,0.0035
Heart Disease,0.2046,0.1645,0.1893,0.1408
Soybean Disease,0.6722,0.7102,0.5676,0.6695


In [8]:
# Clustering accuracies: one row per dataset, one column per method, rounded to 4 decimals.
accuracy_table = pd.DataFrame(list(accuracies.values()), index=list(accuracies.keys()))
accuracy_table.round(4)

Unnamed: 0,Naive k-means,k-means one-hot,k-prototypes,Gower distance
Abalone,0.1353,0.1314,0.1343,0.1954
Auction Verification,0.6647,0.5761,0.5805,0.8008
Bank Marketing,0.7796,0.7866,0.7872,0.8842
Breast Cancer,0.9605,0.9502,0.9151,0.9004
Census Income,0.6082,0.6976,0.6256,0.7684
Credit Approval,0.8086,0.706,0.6662,0.5482
Heart Disease,0.3344,0.3211,0.4247,0.5652
Soybean Disease,0.5765,0.5996,0.4715,0.5018
