In [155]:
import gower
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from scipy.optimize import linear_sum_assignment

In [156]:
og_df = pd.read_csv("datasets/heart_disease_uci.csv")
og_df.drop(columns=["id", "dataset"], inplace=True)
og_df = og_df.drop(og_df[og_df["num"] == 0].sample(frac=0.3).index)
og_df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,63,Male,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,67,Male,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,67,Male,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
4,41,Female,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
5,56,Male,atypical angina,120.0,236.0,False,normal,178.0,False,0.8,upsloping,0.0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
914,46,Male,asymptomatic,134.0,310.0,False,normal,126.0,False,0.0,,,normal,2
915,54,Female,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,62,Male,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,55,Male,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2


In [157]:
categorial_columns = ["sex", "cp", "fbs", "restecg", "exang", "slope", "thal"]
cont_columns = ["age", "trestbps", "chol", "thalch", "oldpeak", "ca"]

In [158]:
df = og_df.copy()
df.drop(columns="num", inplace=True)
df[categorial_columns] = df[categorial_columns].apply(LabelEncoder().fit_transform)
df[categorial_columns] = MinMaxScaler().fit_transform(df[categorial_columns])

df[cont_columns] = MinMaxScaler().fit_transform(df[cont_columns])
df = df.fillna(df.mean())
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
0,0.714286,1.0,1.000000,0.725000,0.386401,0.5,0.000000,0.633803,0.0,0.556818,0.000000,0.000000,0.000000
1,0.795918,1.0,0.000000,0.800000,0.474295,0.0,0.000000,0.338028,0.5,0.465909,0.333333,1.000000,0.333333
2,0.795918,1.0,0.000000,0.600000,0.379768,0.0,0.000000,0.485915,0.5,0.590909,0.333333,0.666667,0.666667
4,0.265306,0.0,0.333333,0.650000,0.338308,0.0,0.000000,0.788732,0.0,0.454545,0.666667,0.000000,0.333333
5,0.571429,1.0,0.333333,0.600000,0.391376,0.0,0.333333,0.830986,0.0,0.386364,0.666667,0.000000,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...
914,0.367347,1.0,0.000000,0.670000,0.514096,0.0,0.333333,0.464789,0.0,0.295455,1.000000,0.246488,0.333333
915,0.530612,0.0,0.000000,0.635000,0.552239,0.5,0.666667,0.661972,0.0,0.295455,1.000000,0.246488,1.000000
916,0.693878,1.0,1.000000,0.664569,0.230514,0.0,0.666667,0.537106,1.0,0.404223,1.000000,0.246488,1.000000
917,0.551020,1.0,0.000000,0.610000,0.369818,0.5,0.666667,0.281690,0.0,0.295455,1.000000,0.246488,0.000000


In [159]:
class HeartDiseaseDataset(Dataset):
    def __init__(self, df):
        self.x = torch.tensor(df.values, dtype=torch.float)

    def __getitem__(self, idx):
        return self.x[idx]

    def __len__(self):
        return self.x.shape[0]

In [160]:
dataset = HeartDiseaseDataset(df)
dataloader = DataLoader(dataset, batch_size=100, shuffle=True)
len(dataset)

797

In [161]:
class MixedTypeClusteringModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(13, 10),
            torch.nn.Sigmoid(),
            torch.nn.Linear(10, 7),
            torch.nn.Sigmoid(),
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(7, 10),
            torch.nn.Sigmoid(),
            torch.nn.Linear(10, 13),
            torch.nn.Sigmoid()
        )


    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


epochs = 100
lr = 0.001

model = MixedTypeClusteringModel()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

for epoch in range(epochs):
    model.train()
    loss = 0

    for x in dataloader:
        optimizer.zero_grad()
        outputs = model(x)
        train_loss = criterion(outputs,  x)
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()

    loss = loss / len(dataloader)
    print("epoch: {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch: 1/100, loss = 0.108678
epoch: 2/100, loss = 0.104393
epoch: 3/100, loss = 0.100424
epoch: 4/100, loss = 0.096694
epoch: 5/100, loss = 0.093261
epoch: 6/100, loss = 0.090059
epoch: 7/100, loss = 0.087123
epoch: 8/100, loss = 0.084437
epoch: 9/100, loss = 0.082014
epoch: 10/100, loss = 0.079788
epoch: 11/100, loss = 0.077838
epoch: 12/100, loss = 0.076077
epoch: 13/100, loss = 0.074555
epoch: 14/100, loss = 0.073228
epoch: 15/100, loss = 0.072070
epoch: 16/100, loss = 0.071086
epoch: 17/100, loss = 0.070259
epoch: 18/100, loss = 0.069599
epoch: 19/100, loss = 0.069036
epoch: 20/100, loss = 0.068545
epoch: 21/100, loss = 0.068188
epoch: 22/100, loss = 0.067868
epoch: 23/100, loss = 0.067636
epoch: 24/100, loss = 0.067406
epoch: 25/100, loss = 0.067268
epoch: 26/100, loss = 0.067144
epoch: 27/100, loss = 0.067031
epoch: 28/100, loss = 0.066947
epoch: 29/100, loss = 0.066861
epoch: 30/100, loss = 0.066809
epoch: 31/100, loss = 0.066753
epoch: 32/100, loss = 0.066739
epoch: 33/100, lo

In [162]:
features = model.encoder(torch.tensor(df.values, dtype=torch.float)).detach().numpy()
features

array([[0.38873935, 0.568156  , 0.6324271 , ..., 0.45219344, 0.648023  ,
        0.55143017],
       [0.26643524, 0.712885  , 0.76003903, ..., 0.54008555, 0.7550563 ,
        0.4266476 ],
       [0.25017765, 0.74500644, 0.78310627, ..., 0.58863246, 0.7777908 ,
        0.39654446],
       ...,
       [0.2134335 , 0.76809704, 0.68582743, ..., 0.6598072 , 0.722394  ,
        0.24376276],
       [0.25164175, 0.71169984, 0.6915861 , ..., 0.6443373 , 0.75804615,
        0.36761266],
       [0.19714141, 0.8151962 , 0.7924184 , ..., 0.7200219 , 0.80788034,
        0.28709126]], dtype=float32)

In [163]:
def cluster_accuracy(y_pred, y_true):
    # We need to map the labels to our cluster labels
    # This is a linear assignment problem on a bipartite graph
    k = max(len(np.unique(y_pred)), len(np.unique(y_pred)))
    cost_matrix = np.zeros((k, k))
    for i in range(y_pred.size):
        cost_matrix[y_pred[i], y_true[i]] += 1
    row_ind, col_ind = linear_sum_assignment(cost_matrix.max() - cost_matrix)
    return cost_matrix[row_ind, col_ind].sum() / y_pred.size

In [164]:
autoencoder_kmeans = KMeans(n_clusters=5, n_init="auto", random_state=0).fit(features)
autoencoder_acc = cluster_accuracy(autoencoder_kmeans.labels_, og_df["num"].to_numpy())
autoencoder_acc

0.2797992471769134

In [165]:
def encode_feature(df, feature_to_encode):
    dummies = pd.get_dummies(df[[feature_to_encode]])
    result_df = pd.concat([df, dummies], axis=1)
    result_df.drop(columns=feature_to_encode, inplace=True)
    return result_df

In [166]:
df_one_hot = og_df.copy()
df_one_hot.drop(columns="num", inplace=True)

for col in categorial_columns:
    df_one_hot = encode_feature(df_one_hot, col)
df_one_hot.loc[:, ~df_one_hot.columns.isin(cont_columns)] = df_one_hot.loc[:, ~df_one_hot.columns.isin(cont_columns)].apply(LabelEncoder().fit_transform)
df_one_hot = df_one_hot.fillna(df_one_hot.mean())
df_one_hot

Unnamed: 0,age,trestbps,chol,thalch,oldpeak,ca,sex_Female,sex_Male,cp_asymptomatic,cp_atypical angina,...,restecg_normal,restecg_st-t abnormality,exang_False,exang_True,slope_downsloping,slope_flat,slope_upsloping,thal_fixed defect,thal_normal,thal_reversable defect
0,63,145.000000,233.0,150.000000,2.300000,0.000000,0,1,0,0,...,0,0,1,0,1,0,0,1,0,0
1,67,160.000000,286.0,108.000000,1.500000,3.000000,0,1,1,0,...,0,0,0,1,0,1,0,0,1,0
2,67,120.000000,229.0,129.000000,2.600000,2.000000,0,1,1,0,...,0,0,0,1,0,1,0,0,0,1
4,41,130.000000,204.0,172.000000,1.400000,0.000000,1,0,0,1,...,0,0,1,0,0,0,1,0,1,0
5,56,120.000000,236.0,178.000000,0.800000,0.000000,0,1,0,1,...,1,0,1,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
914,46,134.000000,310.0,126.000000,0.000000,0.739464,0,1,1,0,...,1,0,1,0,0,0,0,0,1,0
915,54,127.000000,333.0,154.000000,0.000000,0.739464,1,0,1,0,...,0,1,1,0,0,0,0,0,0,0
916,62,132.913863,139.0,136.269076,0.957162,0.739464,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
917,55,122.000000,223.0,100.000000,0.000000,0.739464,0,1,1,0,...,0,1,1,0,0,0,0,1,0,0


In [167]:
classic_one_hot_kmeans = KMeans(n_clusters=5, n_init="auto", random_state=0).fit(df_one_hot)
classic_one_hot_acc = cluster_accuracy(classic_one_hot_kmeans.labels_, og_df["num"].to_numpy())
classic_one_hot_acc

0.31493099121706397

In [168]:
one_hot_dataset = HeartDiseaseDataset(df_one_hot)
one_hot_dataloader = DataLoader(one_hot_dataset, batch_size=100, shuffle=True)
len(one_hot_dataset)

797

In [169]:
class OneHotMixedTypeClusteringModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(25, 18),
            torch.nn.Sigmoid(),
            torch.nn.Linear(18, 12),
            torch.nn.Sigmoid(),
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(12, 18),
            torch.nn.Sigmoid(),
            torch.nn.Linear(18, 25),
            torch.nn.Sigmoid()
        )


    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


epochs = 100
lr = 0.001

one_hot_model = OneHotMixedTypeClusteringModel()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

for epoch in range(epochs):
    model.train()
    loss = 0

    for x in one_hot_dataloader:
        optimizer.zero_grad()
        outputs = one_hot_model(x)
        train_loss = criterion(outputs,  x)
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()

    loss = loss / len(dataloader)
    print("epoch: {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch: 1/100, loss = 3616.381348
epoch: 2/100, loss = 3615.391052
epoch: 3/100, loss = 3615.434570
epoch: 4/100, loss = 3616.502411
epoch: 5/100, loss = 3616.082031
epoch: 6/100, loss = 3615.604370
epoch: 7/100, loss = 3617.363647
epoch: 8/100, loss = 3614.930115
epoch: 9/100, loss = 3616.608032
epoch: 10/100, loss = 3615.602844
epoch: 11/100, loss = 3615.035126
epoch: 12/100, loss = 3616.309204
epoch: 13/100, loss = 3615.775665
epoch: 14/100, loss = 3616.218903
epoch: 15/100, loss = 3616.567780
epoch: 16/100, loss = 3616.501007
epoch: 17/100, loss = 3615.367218
epoch: 18/100, loss = 3616.323242
epoch: 19/100, loss = 3616.732941
epoch: 20/100, loss = 3616.566101
epoch: 21/100, loss = 3616.731323
epoch: 22/100, loss = 3615.920532
epoch: 23/100, loss = 3615.620972
epoch: 24/100, loss = 3615.344147
epoch: 25/100, loss = 3616.259521
epoch: 26/100, loss = 3615.202057
epoch: 27/100, loss = 3615.840149
epoch: 28/100, loss = 3615.977661
epoch: 29/100, loss = 3615.614960
epoch: 30/100, loss = 3

In [170]:
one_hot_features = one_hot_model.encoder(torch.tensor(df_one_hot.values, dtype=torch.float)).detach().numpy()
one_hot_autoencoder_kmeans = KMeans(n_clusters=5, n_init="auto", random_state=0).fit(one_hot_features)
one_hot_autoencoder_acc = cluster_accuracy(one_hot_autoencoder_kmeans.labels_, og_df["num"].to_numpy())
one_hot_autoencoder_acc

0.3851944792973651

In [171]:
no_nan_df = og_df.interpolate()
distance_matrix = gower.gower_matrix(no_nan_df)
gower_agglo = AgglomerativeClustering(n_clusters=5, metric="precomputed", linkage="average").fit_predict(distance_matrix)
gower_agglo

array([2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0,
       0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [172]:
gower_agglo_acc = cluster_accuracy(gower_agglo, og_df["num"].to_numpy())
gower_agglo_acc

0.38017565872020076

In [173]:
print(f"Naively choose most frequent label: {og_df['num'].value_counts().max()/og_df['num'].count()}")
print(f"Classic Kmeans with OneHot: {classic_one_hot_acc}")
print(f"Gower Distance Agglomerative: {gower_agglo_acc}")
print(f"Autoencoder Kmeans: {autoencoder_acc}")
print(f"Autoencoder Kmeans and OneHot: {one_hot_autoencoder_acc}")

Naively choose most frequent label: 0.3613550815558344
Classic Kmeans with OneHot: 0.31493099121706397
Gower Distance Agglomerative: 0.38017565872020076
Autoencoder Kmeans: 0.2797992471769134
Autoencoder Kmeans and OneHot: 0.3851944792973651
