In [720]:
import gower
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from scipy.optimize import linear_sum_assignment

In [721]:
og_df = pd.read_csv("datasets/heart_disease_uci.csv")
og_df.drop(columns=["id", "dataset"], inplace=True)
og_df = og_df.drop(og_df[og_df["num"] == 0].sample(frac=0.3).index)
og_df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,63,Male,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,67,Male,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,67,Male,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
4,41,Female,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
6,62,Female,asymptomatic,140.0,268.0,False,lv hypertrophy,160.0,False,3.6,downsloping,2.0,normal,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
914,46,Male,asymptomatic,134.0,310.0,False,normal,126.0,False,0.0,,,normal,2
915,54,Female,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,62,Male,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,55,Male,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2


In [722]:
categorial_columns = ["sex", "cp", "fbs", "restecg", "exang", "slope", "thal"]
cont_columns = ["age", "trestbps", "chol", "thalch", "oldpeak", "ca"]

In [723]:
df = og_df.copy()
df.drop(columns="num", inplace=True)
df[categorial_columns] = df[categorial_columns].apply(LabelEncoder().fit_transform)
df[categorial_columns] = MinMaxScaler().fit_transform(df[categorial_columns])

df[cont_columns] = MinMaxScaler().fit_transform(df[cont_columns])
df = df.fillna(df.mean())
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
0,0.714286,1.0,1.000000,0.725000,0.386401,0.5,0.000000,0.633803,0.0,0.556818,0.000000,0.000000,0.000000
1,0.795918,1.0,0.000000,0.800000,0.474295,0.0,0.000000,0.338028,0.5,0.465909,0.333333,1.000000,0.333333
2,0.795918,1.0,0.000000,0.600000,0.379768,0.0,0.000000,0.485915,0.5,0.590909,0.333333,0.666667,0.666667
4,0.265306,0.0,0.333333,0.650000,0.338308,0.0,0.000000,0.788732,0.0,0.454545,0.666667,0.000000,0.333333
6,0.693878,0.0,0.000000,0.700000,0.444444,0.0,0.000000,0.704225,0.0,0.704545,0.000000,0.666667,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...
914,0.367347,1.0,0.000000,0.670000,0.514096,0.0,0.333333,0.464789,0.0,0.295455,1.000000,0.245547,0.333333
915,0.530612,0.0,0.000000,0.635000,0.552239,0.5,0.666667,0.661972,0.0,0.295455,1.000000,0.245547,1.000000
916,0.693878,1.0,1.000000,0.660705,0.230514,0.0,0.666667,0.536471,1.0,0.402432,1.000000,0.245547,1.000000
917,0.551020,1.0,0.000000,0.610000,0.369818,0.5,0.666667,0.281690,0.0,0.295455,1.000000,0.245547,0.000000


In [724]:
class HeartDiseaseDataset(Dataset):
    def __init__(self, df):
        self.x = torch.tensor(df.values, dtype=torch.float)

    def __getitem__(self, idx):
        return self.x[idx]

    def __len__(self):
        return self.x.shape[0]

In [725]:
dataset = HeartDiseaseDataset(df)
dataloader = DataLoader(dataset, batch_size=100, shuffle=True)
len(dataset)

797

In [726]:
class MixedTypeClusteringModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(13, 10),
            torch.nn.Sigmoid(),
            torch.nn.Linear(10, 7),
            torch.nn.Sigmoid(),
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(7, 10),
            torch.nn.Sigmoid(),
            torch.nn.Linear(10, 13),
            torch.nn.Sigmoid()
        )


    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


epochs = 20
lr = 0.001

model = MixedTypeClusteringModel()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

for epoch in range(epochs):
    model.train()
    loss = 0

    for x in dataloader:
        optimizer.zero_grad()
        outputs = model(x)
        train_loss = criterion(outputs,  x)
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()

    loss = loss / len(dataloader)
    print("epoch: {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch: 1/20, loss = 0.134797
epoch: 2/20, loss = 0.129072
epoch: 3/20, loss = 0.123741
epoch: 4/20, loss = 0.118789
epoch: 5/20, loss = 0.114179
epoch: 6/20, loss = 0.109854
epoch: 7/20, loss = 0.105825
epoch: 8/20, loss = 0.102066
epoch: 9/20, loss = 0.098521
epoch: 10/20, loss = 0.095231
epoch: 11/20, loss = 0.092142
epoch: 12/20, loss = 0.089303
epoch: 13/20, loss = 0.086644
epoch: 14/20, loss = 0.084204
epoch: 15/20, loss = 0.081990
epoch: 16/20, loss = 0.079979
epoch: 17/20, loss = 0.078186
epoch: 18/20, loss = 0.076542
epoch: 19/20, loss = 0.075132
epoch: 20/20, loss = 0.073863


In [727]:
features = model.encoder(torch.tensor(df.values, dtype=torch.float)).detach().numpy()
features

array([[0.33563286, 0.54785407, 0.63185495, ..., 0.45946935, 0.45043293,
        0.7542313 ],
       [0.34144118, 0.54714125, 0.6273945 , ..., 0.4533577 , 0.4464733 ,
        0.76069224],
       [0.34052342, 0.54303515, 0.625451  , ..., 0.4527891 , 0.4455814 ,
        0.7630433 ],
       ...,
       [0.3248014 , 0.54966336, 0.6213418 , ..., 0.44565696, 0.44498637,
        0.7669306 ],
       [0.33349502, 0.5554864 , 0.6264569 , ..., 0.45651126, 0.44726452,
        0.76197034],
       [0.33331686, 0.55033565, 0.62223995, ..., 0.45474336, 0.4432952 ,
        0.765264  ]], dtype=float32)

In [728]:
def cluster_accuracy(y_pred, y_true):
    # We need to map the labels to our cluster labels
    # This is a linear assignment problem on a bipartite graph
    k = max(len(np.unique(y_pred)), len(np.unique(y_pred)))
    cost_matrix = np.zeros((k, k))
    for i in range(y_pred.size):
        cost_matrix[y_pred[i], y_true[i]] += 1
    row_ind, col_ind = linear_sum_assignment(cost_matrix.max() - cost_matrix)
    return cost_matrix[row_ind, col_ind].sum() / y_pred.size

In [729]:
autoencoder_kmeans = KMeans(n_clusters=5, n_init="auto", random_state=0).fit(features)
autoencoder_acc = cluster_accuracy(autoencoder_kmeans.labels_, og_df["num"].to_numpy())
autoencoder_acc

0.30363864491844417

In [730]:
def encode_feature(df, feature_to_encode):
    dummies = pd.get_dummies(df[[feature_to_encode]])
    result_df = pd.concat([df, dummies], axis=1)
    result_df.drop(columns=feature_to_encode, inplace=True)
    return result_df

In [731]:
df_one_hot = og_df.copy()
df_one_hot.drop(columns="num", inplace=True)

for col in categorial_columns:
    df_one_hot = encode_feature(df_one_hot, col)
df_one_hot.loc[:, ~df_one_hot.columns.isin(cont_columns)] = df_one_hot.loc[:, ~df_one_hot.columns.isin(cont_columns)].apply(LabelEncoder().fit_transform)
df_one_hot = df_one_hot.fillna(df_one_hot.mean())
df_one_hot

Unnamed: 0,age,trestbps,chol,thalch,oldpeak,ca,sex_Female,sex_Male,cp_asymptomatic,cp_atypical angina,...,restecg_normal,restecg_st-t abnormality,exang_False,exang_True,slope_downsloping,slope_flat,slope_upsloping,thal_fixed defect,thal_normal,thal_reversable defect
0,63,145.00000,233.0,150.000000,2.300000,0.000000,0,1,0,0,...,0,0,1,0,1,0,0,1,0,0
1,67,160.00000,286.0,108.000000,1.500000,3.000000,0,1,1,0,...,0,0,0,1,0,1,0,0,1,0
2,67,120.00000,229.0,129.000000,2.600000,2.000000,0,1,1,0,...,0,0,0,1,0,1,0,0,0,1
4,41,130.00000,204.0,172.000000,1.400000,0.000000,1,0,0,1,...,0,0,1,0,0,0,1,0,1,0
6,62,140.00000,268.0,160.000000,3.600000,2.000000,1,0,1,0,...,0,0,1,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
914,46,134.00000,310.0,126.000000,0.000000,0.736641,0,1,1,0,...,1,0,1,0,0,0,0,0,1,0
915,54,127.00000,333.0,154.000000,0.000000,0.736641,1,0,1,0,...,0,1,1,0,0,0,0,0,0,0
916,62,132.14094,139.0,136.178905,0.941398,0.736641,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
917,55,122.00000,223.0,100.000000,0.000000,0.736641,0,1,1,0,...,0,1,1,0,0,0,0,1,0,0


In [732]:
classic_one_hot_kmeans = KMeans(n_clusters=5, n_init="auto", random_state=0).fit(df_one_hot)
classic_one_hot_acc = cluster_accuracy(classic_one_hot_kmeans.labels_, og_df["num"].to_numpy())
classic_one_hot_acc

0.33751568381430364

In [733]:
one_hot_dataset = HeartDiseaseDataset(df_one_hot)
one_hot_dataloader = DataLoader(one_hot_dataset, batch_size=100, shuffle=True)
len(one_hot_dataset)

797

In [734]:
class OneHotMixedTypeClusteringModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(25, 18),
            torch.nn.Sigmoid(),
            torch.nn.Linear(18, 12),
            torch.nn.Sigmoid(),
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(12, 18),
            torch.nn.Sigmoid(),
            torch.nn.Linear(18, 25),
            torch.nn.Sigmoid()
        )


    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


epochs = 20
lr = 0.001

one_hot_model = OneHotMixedTypeClusteringModel()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

for epoch in range(epochs):
    model.train()
    loss = 0

    for x in one_hot_dataloader:
        optimizer.zero_grad()
        outputs = one_hot_model(x)
        train_loss = criterion(outputs,  x)
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()

    loss = loss / len(dataloader)
    print("epoch: {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch: 1/20, loss = 3609.641235
epoch: 2/20, loss = 3609.099915
epoch: 3/20, loss = 3609.670715
epoch: 4/20, loss = 3608.340118
epoch: 5/20, loss = 3609.050629
epoch: 6/20, loss = 3608.688751
epoch: 7/20, loss = 3609.491394
epoch: 8/20, loss = 3608.751831
epoch: 9/20, loss = 3610.072144
epoch: 10/20, loss = 3608.554749
epoch: 11/20, loss = 3609.849182
epoch: 12/20, loss = 3609.257904
epoch: 13/20, loss = 3609.341522
epoch: 14/20, loss = 3609.083649
epoch: 15/20, loss = 3609.468567
epoch: 16/20, loss = 3609.425476
epoch: 17/20, loss = 3609.324738
epoch: 18/20, loss = 3609.362030
epoch: 19/20, loss = 3609.700500
epoch: 20/20, loss = 3609.622559


In [735]:
one_hot_features = one_hot_model.encoder(torch.tensor(df_one_hot.values, dtype=torch.float)).detach().numpy()
one_hot_autoencoder_kmeans = KMeans(n_clusters=5, n_init="auto", random_state=0).fit(one_hot_features)
one_hot_autoencoder_acc = cluster_accuracy(one_hot_autoencoder_kmeans.labels_, og_df["num"].to_numpy())
one_hot_autoencoder_acc

0.3739021329987453

In [736]:
no_nan_df = og_df.interpolate()
distance_matrix = gower.gower_matrix(no_nan_df)
gower_agglo = AgglomerativeClustering(n_clusters=5, metric="precomputed", linkage="average").fit_predict(distance_matrix)
gower_agglo

array([0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 3,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 3, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [737]:
gower_agglo_acc = cluster_accuracy(gower_agglo, og_df["num"].to_numpy())
gower_agglo_acc

0.38268506900878296

In [738]:
print(f"Naively choose most frequent label: {og_df['num'].value_counts().max()/og_df['num'].count()}")
print(f"Classic Kmeans with OneHot: {classic_one_hot_acc}")
print(f"Gower Distance Agglomerative: {gower_agglo_acc}")
print(f"Autoencoder Kmeans: {autoencoder_acc}")
print(f"Autoencoder Kmeans and OneHot: {one_hot_autoencoder_acc}")

Naively choose most frequent label: 0.3613550815558344
Classic Kmeans with OneHot: 0.33751568381430364
Gower Distance Agglomerative: 0.38268506900878296
Autoencoder Kmeans: 0.30363864491844417
Autoencoder Kmeans and OneHot: 0.3739021329987453
