In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score
from sklearn.preprocessing import StandardScaler
from kmodes.kprototypes import KPrototypes
from kmodes.kmodes import KModes
from sklearn.cluster import AgglomerativeClustering
from scipy.optimize import linear_sum_assignment
import gc

from load_datasets import load_all_datasets
from duped_modules.gower_duped import gower_matrix as gower_matrix_duped

In [2]:
def cluster_accuracy(y_true, y_pred):
    """Return clustering accuracy under the best one-to-one label matching.

    Cluster ids carry no meaning of their own, so the predicted clusters are
    matched to the true classes by solving a linear assignment problem on the
    contingency table (bipartite matching that maximises the number of
    agreeing samples).

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth class labels. Any hashable/sortable label values are
        accepted; they do not have to be the integers ``0..k-1``.
    y_pred : array-like of shape (n_samples,)
        Predicted cluster labels, same length as ``y_true``.

    Returns
    -------
    float
        Fraction of samples whose cluster is matched to their true class.

    Raises
    ------
    ValueError
        If the inputs are empty or have different lengths.
    """
    # np.asarray makes the indexing positional even for pandas Series whose
    # index is not 0..n-1.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape[0] != y_pred.shape[0]:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {y_true.shape[0]} and {y_pred.shape[0]}"
        )
    if y_true.size == 0:
        raise ValueError("cluster_accuracy requires at least one sample")

    # Re-index both labelings to 0..n_unique-1 so that arbitrary label values
    # (1-based, gapped, negative, strings) are valid matrix indices.
    true_classes, true_idx = np.unique(y_true, return_inverse=True)
    pred_classes, pred_idx = np.unique(y_pred, return_inverse=True)

    # Square matrix so the assignment is well defined even when the number of
    # clusters differs from the number of classes.
    k = max(len(true_classes), len(pred_classes))
    contingency = np.zeros((k, k))
    # add.at accumulates repeated (row, col) pairs, unlike fancy-index "+=".
    np.add.at(contingency, (true_idx, pred_idx), 1)

    row_ind, col_ind = linear_sum_assignment(contingency, maximize=True)
    return contingency[row_ind, col_ind].sum() / y_pred.size

In [3]:
# Cap on the number of rows requested from the loader for each dataset.
MAX_ROWS = 5000
datasets = load_all_datasets(max_rows=MAX_ROWS)

# Result containers: {dataset name: {method name: score}}, filled in by the
# clustering cells below.
dataset_names = [d.name for d in datasets]
accuracies = {name: {} for name in dataset_names}
nmis = {name: {} for name in dataset_names}

In [4]:
# Naive KMeans
# The columns listed in d.cat_cols are standardised and then clustered
# together with every other column as if they were ordinary numeric features.
for d in datasets:
    print(f"Calculating for {d.name}...")

    df = d.df.copy(deep=True)
    scaled_cat_values = StandardScaler().fit_transform(df[d.cat_cols])
    df[d.cat_cols] = scaled_cat_values

    # Single random-initialised Lloyd run with a fixed seed.
    kmeans = KMeans(
        n_clusters=d.n_targets,
        init="random",
        n_init=1,
        max_iter=300,
        algorithm="lloyd",
        random_state=0,
    )
    kmeans.fit(df.values)

    nmis[d.name]["Naive k-means"] = normalized_mutual_info_score(d.y, kmeans.labels_)
    accuracies[d.name]["Naive k-means"] = cluster_accuracy(d.y, kmeans.labels_)

Calculating for Abalone...
Calculating for Auction Verification...
Calculating for Bank Marketing...
Calculating for Breast Cancer...
Calculating for Census Income...
Calculating for Credit Approval...
Calculating for Heart Disease...
Calculating for Soybean Disease...


In [5]:
# KMeans with One-Hot Encoding
def encode_feature(df, feature_to_encode):
    """Replace one column of ``df`` with its one-hot indicator columns.

    The column's values are cast to ``str`` before encoding, and each
    indicator column is float-valued and named
    ``<feature_to_encode>_<value>``. The indicator columns are appended after
    the remaining columns of ``df``; a new frame is returned and the input
    frame is left unchanged.
    """
    values_as_text = df[feature_to_encode].astype(str)
    indicator_cols = pd.get_dummies(values_as_text, prefix=feature_to_encode, dtype=float)
    remaining_cols = df.drop(columns=feature_to_encode)
    return pd.concat([remaining_cols, indicator_cols], axis=1)

# Expand every column listed in d.cat_cols into indicator columns, then run
# the same single-start k-means configuration on the widened frame.
for d in datasets:
    print(f"Calculating for {d.name}...")

    df = d.df.copy(deep=True)
    for col in d.cat_cols:
        df = encode_feature(df, col)

    kmeans = KMeans(
        n_clusters=d.n_targets,
        init="random",
        n_init=1,
        max_iter=300,
        algorithm="lloyd",
        random_state=0,
    )
    kmeans.fit(df.values)

    nmis[d.name]["k-means one-hot"] = normalized_mutual_info_score(d.y, kmeans.labels_)
    accuracies[d.name]["k-means one-hot"] = cluster_accuracy(d.y, kmeans.labels_)

Calculating for Abalone...
Calculating for Auction Verification...
Calculating for Bank Marketing...
Calculating for Breast Cancer...
Calculating for Census Income...
Calculating for Credit Approval...
Calculating for Heart Disease...
Calculating for Soybean Disease...


In [6]:
# KPrototypes
# Uses k-prototypes when the dataset lists continuous columns and falls back
# to k-modes on the whole frame when d.cont_cols is empty.
for d in datasets:
    print(f"Calculating for {d.name}...")
    df = d.df.copy(deep=True)

    n_clusters = len(np.unique(d.y))
    if d.cont_cols:
        cat_cols_indices = [df.columns.get_loc(col) for col in d.cat_cols]
        cont_cols_indices = [df.columns.get_loc(col) for col in d.cont_cols]
        # Settings shared by the regular attempt and the fallback below.
        proto_kwargs = dict(
            n_init=1,
            n_clusters=n_clusters,
            verbose=None,
            n_jobs=-1,
            random_state=0,
            max_iter=300,
        )
        try:
            clust = KPrototypes(init="Huang", **proto_kwargs).fit(df.values, categorical=cat_cols_indices)
        except ValueError:
            # The initialization can go wrong for big k values, so we
            # initialize ourselves: the first n_clusters rows serve as the
            # starting centroids (continuous part, then categorical part).
            init_centroids = [
                df.values[:, cont_cols_indices][:n_clusters],
                df.values[:, cat_cols_indices][:n_clusters],
            ]
            clust = KPrototypes(init=init_centroids, **proto_kwargs).fit(df.values, categorical=cat_cols_indices)
    else:
        clust = KModes(
            init="Huang",
            n_init=1,
            n_clusters=n_clusters,
            verbose=None,
            n_jobs=-1,
            random_state=0,
        ).fit(df.values)

    nmis[d.name]["k-prototypes"] = normalized_mutual_info_score(d.y, clust.labels_)
    accuracies[d.name]["k-prototypes"] = cluster_accuracy(d.y, clust.labels_)

Calculating for Abalone...
Calculating for Auction Verification...
Calculating for Bank Marketing...
Calculating for Breast Cancer...
Calculating for Census Income...
Calculating for Credit Approval...
Calculating for Heart Disease...
Calculating for Soybean Disease...


In [7]:
# Gower Distance + Agglomerative Clustering
# A pairwise distance matrix is computed per dataset and handed to
# average-linkage agglomerative clustering as a precomputed metric.
for d in datasets:
    print(f"Calculating for {d.name}...")
    df = d.df.copy(deep=True)

    # NOTE(review): every column is cast to float before the Gower call, so
    # the categorical columns arrive in numeric form - confirm that
    # gower_matrix_duped still treats them as categorical.
    distance_matrix = gower_matrix_duped(df.astype(float))
    clust = AgglomerativeClustering(
        n_clusters=d.n_targets,
        metric="precomputed",
        linkage="average",
    ).fit_predict(distance_matrix)

    nmis[d.name]["Gower distance"] = normalized_mutual_info_score(d.y, clust)
    accuracies[d.name]["Gower distance"] = cluster_accuracy(d.y, clust)

    # Drop the per-dataset matrix and labels before the next iteration.
    del distance_matrix, clust
    gc.collect()

Calculating for Abalone...


100%|██████████| 4177/4177 [00:00<00:00, 4920.31it/s]


Calculating for Auction Verification...


100%|██████████| 2043/2043 [00:00<00:00, 9395.54it/s]


Calculating for Bank Marketing...


100%|██████████| 5000/5000 [00:01<00:00, 3469.95it/s]


Calculating for Breast Cancer...


100%|██████████| 683/683 [00:00<00:00, 17804.40it/s]


Calculating for Census Income...


100%|██████████| 5000/5000 [00:01<00:00, 3460.61it/s]


Calculating for Credit Approval...


100%|██████████| 653/653 [00:00<00:00, 16614.98it/s]


Calculating for Heart Disease...


100%|██████████| 299/299 [00:00<00:00, 22067.13it/s]

Calculating for Soybean Disease...



100%|██████████| 562/562 [00:00<00:00, 13283.96it/s]


In [8]:
# NMI scores: one row per dataset, one column per clustering method.
nmi_table = pd.DataFrame(list(nmis.values()), index=list(nmis.keys()))
nmi_table.round(4)

Unnamed: 0,Naive k-means,k-means one-hot,k-prototypes,Gower distance
Abalone,0.171795,0.173982,0.171639,0.161416
Auction Verification,0.016172,0.007087,0.007667,0.00617
Bank Marketing,0.019781,0.02606,0.019522,0.001334
Breast Cancer,0.746818,0.73631,0.59248,0.553707
Census Income,0.108029,0.184979,0.141737,0.004259
Credit Approval,0.313076,0.171038,0.116579,0.003465
Heart Disease,0.204577,0.164486,0.189264,0.140792
Soybean Disease,0.672229,0.710164,0.567635,0.669526


In [9]:
# Clustering accuracies: one row per dataset, one column per clustering method.
accuracy_table = pd.DataFrame(list(accuracies.values()), index=list(accuracies.keys()))
accuracy_table.round(4)

Unnamed: 0,Naive k-means,k-means one-hot,k-prototypes,Gower distance
Abalone,0.135265,0.131434,0.134307,0.195356
Auction Verification,0.664709,0.576114,0.580519,0.800783
Bank Marketing,0.7796,0.7866,0.7872,0.8842
Breast Cancer,0.960469,0.95022,0.915081,0.900439
Census Income,0.6082,0.6976,0.6256,0.7684
Credit Approval,0.808576,0.705972,0.666156,0.548239
Heart Disease,0.334448,0.32107,0.424749,0.565217
Soybean Disease,0.576512,0.599644,0.47153,0.501779
