In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score
from sklearn.preprocessing import StandardScaler
from kmodes.kprototypes import KPrototypes
from kmodes.kmodes import KModes
from sklearn.cluster import AgglomerativeClustering
from scipy.optimize import linear_sum_assignment
import gc

from load_datasets import load_all_datasets
from duped_modules.kprototypes_duped import KPrototypesDuped
from duped_modules.gower_duped import gower_matrix as gower_matrix_duped

In [2]:
def cluster_accuracy(y_true, y_pred):
    """Return the clustering accuracy under the best one-to-one label matching.

    Cluster ids are arbitrary, so every predicted cluster is matched to at most
    one true class such that the number of agreeing samples is maximised (a
    linear assignment problem on the contingency table). The accuracy is the
    matched count divided by the number of samples.

    Labels may be any values ``np.unique`` can sort (ints that do not start at
    0, non-contiguous ints, strings, ...); they are re-indexed internally.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth class labels.
    y_pred : array-like of shape (n_samples,)
        Cluster labels assigned by the algorithm.

    Returns
    -------
    float
        Fraction of samples that agree under the optimal matching, in [0, 1].

    Raises
    ------
    ValueError
        If the inputs are empty or have different lengths.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape[0] != y_pred.shape[0]:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {y_true.shape[0]} and {y_pred.shape[0]}"
        )
    if y_true.size == 0:
        raise ValueError("cluster_accuracy requires at least one sample")

    # Re-index both labelings to 0..n-1 so arbitrary label values can be used
    # as row/column positions of the contingency table.
    true_labels, true_idx = np.unique(y_true, return_inverse=True)
    pred_labels, pred_idx = np.unique(y_pred, return_inverse=True)

    # Square table so the assignment stays one-to-one even when the number of
    # clusters differs from the number of classes (extra rows/columns are 0).
    k = max(true_labels.size, pred_labels.size)
    contingency = np.zeros((k, k))
    # np.add.at accumulates correctly for repeated (row, col) pairs.
    np.add.at(contingency, (true_idx, pred_idx), 1)

    row_ind, col_ind = linear_sum_assignment(contingency, maximize=True)
    return contingency[row_ind, col_ind].sum() / y_pred.size

In [3]:
# Load the benchmark datasets; max_rows=5000 is forwarded to the loader.
datasets = load_all_datasets(max_rows=5000)

# Result containers: one (initially empty) dict per dataset name. The
# clustering cells below add one entry per method to each of them.
accuracies = dict((d.name, {}) for d in datasets)
nmis = dict((d.name, {}) for d in datasets)

In [4]:
# Naive KMeans: categorical columns are fed to k-means as if they were numeric.
for d in datasets:
    print(f"Calculating for {d.name}...")

    # Work on a private copy so the dataset's own frame is left untouched.
    features = d.df.copy(deep=True)
    # NOTE(review): only the categorical columns are standardised here; the
    # continuous columns keep their raw scale. Confirm this is intended.
    scaled_cats = StandardScaler().fit_transform(features[d.cat_cols])
    features[d.cat_cols] = scaled_cats

    # One cluster per distinct ground-truth label.
    n_clusters = len(np.unique(d.y))
    model = KMeans(n_clusters=n_clusters, init="random", n_init=1, random_state=0)
    model.fit(features.values)

    labels = model.labels_
    nmis[d.name]["Naive k-means"] = normalized_mutual_info_score(d.y, labels)
    accuracies[d.name]["Naive k-means"] = cluster_accuracy(d.y, labels)

Calculating for Abalone Age...
Calculating for Auction Verification...
Calculating for Bank Marketing...
Calculating for Breast Cancer...
Calculating for Census Income...
Calculating for Credit Approval...
Calculating for Heart Disease...
Calculating for Soybean Disease...


In [5]:
# KMeans with One-Hot Encoding
def encode_feature(df, feature_to_encode):
    """Return a copy of ``df`` with one column replaced by one-hot columns.

    The values of ``feature_to_encode`` are converted to strings and expanded
    into float indicator columns named ``<feature_to_encode>_<value>``, which
    are appended after the existing columns. The encoded column itself is
    removed. ``df`` is not modified.
    """
    as_text = df[feature_to_encode].astype(str)
    indicators = pd.get_dummies(as_text, dtype=float, prefix=feature_to_encode)
    return pd.concat([df, indicators], axis=1).drop(columns=feature_to_encode)

for d in datasets:
    print(f"Calculating for {d.name}...")

    # Expand each categorical column into indicator columns, one at a time,
    # starting from a private copy of the dataset's frame.
    encoded = d.df.copy(deep=True)
    for cat_col in d.cat_cols:
        encoded = encode_feature(encoded, cat_col)

    # One cluster per distinct ground-truth label.
    n_clusters = len(np.unique(d.y))
    model = KMeans(n_clusters=n_clusters, init="random", n_init=1, random_state=0)
    model.fit(encoded.values)

    labels = model.labels_
    nmis[d.name]["k-means one-hot"] = normalized_mutual_info_score(d.y, labels)
    accuracies[d.name]["k-means one-hot"] = cluster_accuracy(d.y, labels)

Calculating for Abalone Age...
Calculating for Auction Verification...
Calculating for Bank Marketing...
Calculating for Breast Cancer...
Calculating for Census Income...
Calculating for Credit Approval...
Calculating for Heart Disease...
Calculating for Soybean Disease...


In [6]:
# KPrototypes (k-modes when a dataset has no continuous columns)
# NOTE(review): the saved output of this cell stops at the first dataset with
# "ValueError: Missing values detected in numerical columns.", so no
# "k-prototypes" entries were recorded in that run. Root cause not visible
# from this notebook - investigate before relying on this method's results.
for d in datasets:
    print(f"Calculating for {d.name}...")
    df = d.df.copy(deep=True)

    # One cluster per distinct ground-truth label.
    n_clusters = len(np.unique(d.y))

    if d.cont_cols:
        # k-prototypes is told which column positions hold categorical data.
        cat_positions = [df.columns.get_loc(col) for col in d.cat_cols]
        estimator = KPrototypesDuped(
            init="Huang", n_init=1, n_clusters=n_clusters, verbose=None, n_jobs=-1, random_state=0
        )
        clust = estimator.fit(df.values, categorical=cat_positions)
    else:
        # No continuous columns: plain k-modes on all columns.
        estimator = KModes(
            init="Huang", n_init=1, n_clusters=n_clusters, verbose=None, n_jobs=-1, random_state=0
        )
        clust = estimator.fit(df.values)

    nmis[d.name]["k-prototypes"] = normalized_mutual_info_score(d.y, clust.labels_)
    accuracies[d.name]["k-prototypes"] = cluster_accuracy(d.y, clust.labels_)

Calculating for Abalone Age...


ValueError: Missing values detected in numerical columns.

In [7]:
# Gower Distance + Agglomerative Clustering
for d in datasets:
    print(f"Calculating for {d.name}...")
    df = d.df.copy(deep=True)

    # NOTE(review): every column is cast to float before the distance is
    # computed; confirm gower_matrix_duped still treats d.cat_cols as
    # categorical after this cast.
    distance_matrix = gower_matrix_duped(df.astype(float))

    # Average-linkage clustering on the precomputed pairwise distances, one
    # cluster per distinct ground-truth label. The cache directory is relative
    # to the working directory: the previous "/.cache" pointed at the
    # filesystem root, which is normally not writable and is not portable.
    clust = AgglomerativeClustering(
        n_clusters=len(np.unique(d.y)),
        metric="precomputed",
        linkage="average",
        memory=".cache",
    ).fit_predict(distance_matrix)

    nmis[d.name]["Gower distance"] = normalized_mutual_info_score(d.y, clust)
    accuracies[d.name]["Gower distance"] = cluster_accuracy(d.y, clust)

    # The pairwise matrix is O(n^2); release it before the next dataset.
    del distance_matrix, clust
    gc.collect()

Calculating for Abalone Age...


  0%|          | 0/4177 [00:00<?, ?it/s]

100%|██████████| 4177/4177 [00:00<00:00, 5746.04it/s]


Calculating for Auction Verification...


100%|██████████| 2043/2043 [00:00<00:00, 10860.47it/s]


Calculating for Bank Marketing...


100%|██████████| 5000/5000 [00:01<00:00, 4002.63it/s]


Calculating for Breast Cancer...


100%|██████████| 683/683 [00:00<00:00, 17978.94it/s]


Calculating for Census Income...


100%|██████████| 5000/5000 [00:01<00:00, 3545.87it/s]


Calculating for Credit Approval...


100%|██████████| 653/653 [00:00<00:00, 18600.97it/s]


Calculating for Heart Disease...


100%|██████████| 299/299 [00:00<00:00, 29834.59it/s]

Calculating for Soybean Disease...



100%|██████████| 562/562 [00:00<00:00, 14545.22it/s]


In [8]:
# NMI results: one row per dataset, one column per clustering method.
pd.DataFrame(list(nmis.values()), index=list(nmis.keys()))

Unnamed: 0,Naive k-means,k-means one-hot,Gower distance
Abalone Age,0.17049,0.169787,0.161416
Auction Verification,0.016172,0.007087,0.00617
Bank Marketing,0.00511,0.023589,0.00113
Breast Cancer,0.746818,0.726304,0.537113
Census Income,0.090032,0.186274,0.003018
Credit Approval,0.313076,0.148259,0.003465
Heart Disease,0.20208,0.218775,0.140792
Soybean Disease,0.643389,0.653183,0.669526


In [9]:
# Accuracy results: one row per dataset, one column per clustering method.
pd.DataFrame(list(accuracies.values()), index=list(accuracies.keys()))

Unnamed: 0,Naive k-means,k-means one-hot,Gower distance
Abalone Age,0.133828,0.137659,0.195356
Auction Verification,0.664709,0.576114,0.800783
Bank Marketing,0.753,0.8064,0.8936
Breast Cancer,0.960469,0.947291,0.894583
Census Income,0.6016,0.6844,0.7564
Credit Approval,0.808576,0.689127,0.548239
Heart Disease,0.35786,0.344482,0.565217
Soybean Disease,0.533808,0.494662,0.501779
