In [1]:
import gower
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from scipy.optimize import linear_sum_assignment

In [2]:
og_df = pd.read_csv("datasets/heart_disease_uci.csv")
og_df.drop(columns=["id", "dataset"], inplace=True)
og_df = og_df.drop(og_df[og_df["num"] == 0].sample(frac=0.3).index)
og_df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,63,Male,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,67,Male,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,67,Male,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,37,Male,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
6,62,Female,asymptomatic,140.0,268.0,False,lv hypertrophy,160.0,False,3.6,downsloping,2.0,normal,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,54,Female,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,62,Male,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,55,Male,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2
918,58,Male,asymptomatic,,385.0,True,lv hypertrophy,,,,,,,0


In [3]:
categorial_columns = ["sex", "cp", "fbs", "restecg", "exang", "slope", "thal"]
cont_columns = ["age", "trestbps", "chol", "thalch", "oldpeak", "ca"]

In [4]:
df = og_df.copy()
df.drop(columns="num", inplace=True)
df[categorial_columns] = df[categorial_columns].apply(LabelEncoder().fit_transform)
df[categorial_columns] = MinMaxScaler().fit_transform(df[categorial_columns])

df[cont_columns] = MinMaxScaler().fit_transform(df[cont_columns])
df = df.fillna(df.mean())
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
0,0.714286,1.0,1.000000,0.725000,0.386401,0.5,0.000000,0.633803,0.0,0.556818,0.000000,0.000000,0.000000
1,0.795918,1.0,0.000000,0.800000,0.474295,0.0,0.000000,0.338028,0.5,0.465909,0.333333,1.000000,0.333333
2,0.795918,1.0,0.000000,0.600000,0.379768,0.0,0.000000,0.485915,0.5,0.590909,0.333333,0.666667,0.666667
3,0.183673,1.0,0.666667,0.650000,0.414594,0.0,0.333333,0.894366,0.0,0.693182,0.000000,0.000000,0.333333
6,0.693878,0.0,0.000000,0.700000,0.444444,0.0,0.000000,0.704225,0.0,0.704545,0.000000,0.666667,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,0.530612,0.0,0.000000,0.635000,0.552239,0.5,0.666667,0.661972,0.0,0.295455,1.000000,0.254088,1.000000
916,0.693878,1.0,1.000000,0.661976,0.230514,0.0,0.666667,0.535258,1.0,0.404359,1.000000,0.254088,1.000000
917,0.551020,1.0,0.000000,0.610000,0.369818,0.5,0.666667,0.281690,0.0,0.295455,1.000000,0.254088,0.000000
918,0.612245,1.0,0.000000,0.661976,0.638474,0.5,0.000000,0.535258,1.0,0.404359,1.000000,0.254088,1.000000


In [5]:
class HeartDiseaseDataset(Dataset):
    def __init__(self, df):
        self.x = torch.tensor(df.values, dtype=torch.float)

    def __getitem__(self, idx):
        return self.x[idx]

    def __len__(self):
        return self.x.shape[0]

In [6]:
dataset = HeartDiseaseDataset(df)
dataloader = DataLoader(dataset, batch_size=100, shuffle=True)
len(dataset)

797

In [7]:
class MixedTypeClusteringModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(13, 10),
            torch.nn.Sigmoid(),
            torch.nn.Linear(10, 7),
            torch.nn.Sigmoid(),
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(7, 10),
            torch.nn.Sigmoid(),
            torch.nn.Linear(10, 13),
            torch.nn.Sigmoid()
        )


    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


epochs = 100
lr = 0.001

model = MixedTypeClusteringModel()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

for epoch in range(epochs):
    model.train()
    loss = 0

    for x in dataloader:
        optimizer.zero_grad()
        outputs = model(x)
        train_loss = criterion(outputs,  x)
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()

    loss = loss / len(dataloader)
    print("epoch: {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch: 1/100, loss = 0.110827
epoch: 2/100, loss = 0.106579
epoch: 3/100, loss = 0.102622
epoch: 4/100, loss = 0.098990
epoch: 5/100, loss = 0.095648
epoch: 6/100, loss = 0.092632
epoch: 7/100, loss = 0.089865
epoch: 8/100, loss = 0.087325
epoch: 9/100, loss = 0.085004
epoch: 10/100, loss = 0.082926
epoch: 11/100, loss = 0.081002
epoch: 12/100, loss = 0.079316
epoch: 13/100, loss = 0.077754
epoch: 14/100, loss = 0.076350
epoch: 15/100, loss = 0.075137
epoch: 16/100, loss = 0.074010
epoch: 17/100, loss = 0.073058
epoch: 18/100, loss = 0.072218
epoch: 19/100, loss = 0.071497
epoch: 20/100, loss = 0.070845
epoch: 21/100, loss = 0.070297
epoch: 22/100, loss = 0.069852
epoch: 23/100, loss = 0.069463
epoch: 24/100, loss = 0.069098
epoch: 25/100, loss = 0.068842
epoch: 26/100, loss = 0.068552
epoch: 27/100, loss = 0.068358
epoch: 28/100, loss = 0.068203
epoch: 29/100, loss = 0.068050
epoch: 30/100, loss = 0.067912
epoch: 31/100, loss = 0.067856
epoch: 32/100, loss = 0.067731
epoch: 33/100, lo

In [8]:
features = model.encoder(torch.tensor(df.values, dtype=torch.float)).detach().numpy()
features

array([[0.7384497 , 0.80147445, 0.7270733 , ..., 0.57350755, 0.5622038 ,
        0.5905046 ],
       [0.7620603 , 0.8037559 , 0.8195917 , ..., 0.50657797, 0.6793048 ,
        0.5390647 ],
       [0.7617321 , 0.80132955, 0.8048098 , ..., 0.52465683, 0.6497073 ,
        0.5562443 ],
       ...,
       [0.7229362 , 0.7695522 , 0.6895884 , ..., 0.5734773 , 0.51163906,
        0.62139714],
       [0.6831347 , 0.7255636 , 0.7642904 , ..., 0.46241686, 0.61892873,
        0.49990717],
       [0.7491404 , 0.78529394, 0.7193781 , ..., 0.5869002 , 0.5157275 ,
        0.62380904]], dtype=float32)

In [9]:
def cluster_accuracy(y_pred, y_true):
    # We need to map the labels to our cluster labels
    # This is a linear assignment problem on a bipartite graph
    k = max(len(np.unique(y_pred)), len(np.unique(y_pred)))
    cost_matrix = np.zeros((k, k))
    for i in range(y_pred.size):
        cost_matrix[y_pred[i], y_true[i]] += 1
    row_ind, col_ind = linear_sum_assignment(cost_matrix.max() - cost_matrix)
    return cost_matrix[row_ind, col_ind].sum() / y_pred.size

In [10]:
autoencoder_kmeans = KMeans(n_clusters=5, n_init="auto", random_state=0).fit(features)
autoencoder_acc = cluster_accuracy(autoencoder_kmeans.labels_, og_df["num"].to_numpy())
autoencoder_acc

0.30614805520702637

In [11]:
def encode_feature(df, feature_to_encode):
    dummies = pd.get_dummies(df[[feature_to_encode]])
    result_df = pd.concat([df, dummies], axis=1)
    result_df.drop(columns=feature_to_encode, inplace=True)
    return result_df

In [12]:
df_one_hot = og_df.copy()
df_one_hot.drop(columns="num", inplace=True)

for col in categorial_columns:
    df_one_hot = encode_feature(df_one_hot, col)
df_one_hot.loc[:, ~df_one_hot.columns.isin(cont_columns)] = df_one_hot.loc[:, ~df_one_hot.columns.isin(cont_columns)].apply(LabelEncoder().fit_transform)
df_one_hot = df_one_hot.fillna(df_one_hot.mean())
df_one_hot

Unnamed: 0,age,trestbps,chol,thalch,oldpeak,ca,sex_Female,sex_Male,cp_asymptomatic,cp_atypical angina,...,restecg_normal,restecg_st-t abnormality,exang_False,exang_True,slope_downsloping,slope_flat,slope_upsloping,thal_fixed defect,thal_normal,thal_reversable defect
0,63,145.000000,233.0,150.000000,2.300000,0.000000,0,1,0,0,...,0,0,1,0,1,0,0,1,0,0
1,67,160.000000,286.0,108.000000,1.500000,3.000000,0,1,1,0,...,0,0,0,1,0,1,0,0,1,0
2,67,120.000000,229.0,129.000000,2.600000,2.000000,0,1,1,0,...,0,0,0,1,0,1,0,0,0,1
3,37,130.000000,250.0,187.000000,3.500000,0.000000,0,1,0,0,...,1,0,1,0,1,0,0,0,1,0
6,62,140.000000,268.0,160.000000,3.600000,2.000000,1,0,1,0,...,0,0,1,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,54,127.000000,333.0,154.000000,0.000000,0.762264,1,0,1,0,...,0,1,1,0,0,0,0,0,0,0
916,62,132.395161,139.0,136.006684,0.958356,0.762264,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
917,55,122.000000,223.0,100.000000,0.000000,0.762264,0,1,1,0,...,0,1,1,0,0,0,0,1,0,0
918,58,132.395161,385.0,136.006684,0.958356,0.762264,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
classic_one_hot_kmeans = KMeans(n_clusters=5, n_init="auto", random_state=0).fit(df_one_hot)
classic_one_hot_acc = cluster_accuracy(classic_one_hot_kmeans.labels_, og_df["num"].to_numpy())
classic_one_hot_acc

0.3199498117942284

In [14]:
one_hot_dataset = HeartDiseaseDataset(df_one_hot)
one_hot_dataloader = DataLoader(one_hot_dataset, batch_size=100, shuffle=True)
len(one_hot_dataset)

797

In [15]:
class OneHotMixedTypeClusteringModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(25, 18),
            torch.nn.Sigmoid(),
            torch.nn.Linear(18, 12),
            torch.nn.Sigmoid(),
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(12, 18),
            torch.nn.Sigmoid(),
            torch.nn.Linear(18, 25),
            torch.nn.Sigmoid()
        )


    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


epochs = 100
lr = 0.001

one_hot_model = OneHotMixedTypeClusteringModel()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

for epoch in range(epochs):
    model.train()
    loss = 0

    for x in one_hot_dataloader:
        optimizer.zero_grad()
        outputs = one_hot_model(x)
        train_loss = criterion(outputs,  x)
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()

    loss = loss / len(dataloader)
    print("epoch: {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch: 1/100, loss = 3606.196747
epoch: 2/100, loss = 3605.447144
epoch: 3/100, loss = 3606.473877
epoch: 4/100, loss = 3604.447327
epoch: 5/100, loss = 3606.070312
epoch: 6/100, loss = 3605.413422
epoch: 7/100, loss = 3605.102844
epoch: 8/100, loss = 3605.680267
epoch: 9/100, loss = 3605.875061
epoch: 10/100, loss = 3606.041290
epoch: 11/100, loss = 3605.817169
epoch: 12/100, loss = 3606.568115
epoch: 13/100, loss = 3606.928345
epoch: 14/100, loss = 3604.754028
epoch: 15/100, loss = 3606.672485
epoch: 16/100, loss = 3604.506104
epoch: 17/100, loss = 3605.466583
epoch: 18/100, loss = 3605.334656
epoch: 19/100, loss = 3606.126770
epoch: 20/100, loss = 3606.189545
epoch: 21/100, loss = 3606.709290
epoch: 22/100, loss = 3605.392303
epoch: 23/100, loss = 3605.308258
epoch: 24/100, loss = 3605.677368
epoch: 25/100, loss = 3605.926208
epoch: 26/100, loss = 3605.187103
epoch: 27/100, loss = 3606.428406
epoch: 28/100, loss = 3605.001465
epoch: 29/100, loss = 3605.046570
epoch: 30/100, loss = 3

In [16]:
one_hot_features = one_hot_model.encoder(torch.tensor(df_one_hot.values, dtype=torch.float)).detach().numpy()
one_hot_autoencoder_kmeans = KMeans(n_clusters=5, n_init="auto", random_state=0).fit(one_hot_features)
one_hot_autoencoder_acc = cluster_accuracy(one_hot_autoencoder_kmeans.labels_, og_df["num"].to_numpy())
one_hot_autoencoder_acc

0.3462986198243413

In [17]:
no_nan_df = og_df.interpolate()
distance_matrix = gower.gower_matrix(no_nan_df)
gower_agglo = AgglomerativeClustering(n_clusters=5, metric="precomputed", linkage="average").fit_predict(distance_matrix)
gower_agglo

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 3, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [18]:
gower_agglo_acc = cluster_accuracy(gower_agglo, og_df["num"].to_numpy())
gower_agglo_acc

0.36888331242158096

In [19]:
print(f"Naively choose most frequent label: {og_df['num'].value_counts().max()/og_df['num'].count()}")
print(f"Classic Kmeans with OneHot: {classic_one_hot_acc}")
print(f"Gower Distance Agglomerative: {gower_agglo_acc}")
print(f"Autoencoder Kmeans: {autoencoder_acc}")
print(f"Autoencoder Kmeans and OneHot: {one_hot_autoencoder_acc}")

Naively choose most frequent label: 0.3613550815558344
Classic Kmeans with OneHot: 0.3199498117942284
Gower Distance Agglomerative: 0.36888331242158096
Autoencoder Kmeans: 0.30614805520702637
Autoencoder Kmeans and OneHot: 0.3462986198243413
