In [1]:
import gower
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from scipy.optimize import linear_sum_assignment

In [2]:
og_df = pd.read_csv("datasets/heart_disease_uci.csv")
og_df.drop(columns=["id", "dataset"], inplace=True)
og_df = og_df.drop(og_df[og_df["num"] == 0].sample(frac=0.3).index)
og_df # this df still has "num" -> the target

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
1,67,Male,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,67,Male,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
4,41,Female,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
6,62,Female,asymptomatic,140.0,268.0,False,lv hypertrophy,160.0,False,3.6,downsloping,2.0,normal,3
8,63,Male,asymptomatic,130.0,254.0,False,lv hypertrophy,147.0,False,1.4,flat,1.0,reversable defect,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,54,Female,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,62,Male,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,55,Male,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2
918,58,Male,asymptomatic,,385.0,True,lv hypertrophy,,,,,,,0


In [3]:
categorial_columns = ["sex", "cp", "fbs", "restecg", "exang", "slope", "thal"]
cont_columns = ["age", "trestbps", "chol", "thalch", "oldpeak", "ca"]

In [4]:
df_min_max = og_df.copy()
df_min_max.drop(columns="num", inplace=True)
df_min_max[categorial_columns] = df_min_max[categorial_columns].apply(LabelEncoder().fit_transform)
df_min_max[categorial_columns] = MinMaxScaler().fit_transform(df_min_max[categorial_columns])

df_min_max[cont_columns] = MinMaxScaler().fit_transform(df_min_max[cont_columns])
df_min_max = df_min_max.fillna(df_min_max.mean())
df_min_max

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
1,0.795918,1.0,0.000000,0.800000,0.474295,0.0,0.000000,0.338028,0.5,0.465909,0.333333,1.000000,0.333333
2,0.795918,1.0,0.000000,0.600000,0.379768,0.0,0.000000,0.485915,0.5,0.590909,0.333333,0.666667,0.666667
4,0.265306,0.0,0.333333,0.650000,0.338308,0.0,0.000000,0.788732,0.0,0.454545,0.666667,0.000000,0.333333
6,0.693878,0.0,0.000000,0.700000,0.444444,0.0,0.000000,0.704225,0.0,0.704545,0.000000,0.666667,0.333333
8,0.714286,1.0,0.000000,0.650000,0.421227,0.0,0.000000,0.612676,0.0,0.454545,0.333333,0.333333,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,0.530612,0.0,0.000000,0.635000,0.552239,0.5,0.666667,0.661972,0.0,0.295455,1.000000,0.259705,1.000000
916,0.693878,1.0,1.000000,0.662547,0.230514,0.0,0.666667,0.537019,1.0,0.400641,1.000000,0.259705,1.000000
917,0.551020,1.0,0.000000,0.610000,0.369818,0.5,0.666667,0.281690,0.0,0.295455,1.000000,0.259705,0.000000
918,0.612245,1.0,0.000000,0.662547,0.638474,0.5,0.000000,0.537019,1.0,0.400641,1.000000,0.259705,1.000000


In [5]:
class HeartDiseaseDataset(Dataset):
    def __init__(self, df):
        self.cat = torch.tensor(df[categorial_columns].values, dtype=torch.float)
        self.cont = torch.tensor(df[cont_columns].values, dtype=torch.float)

    def __getitem__(self, idx):
        return self.cat[idx], self.cont[idx]

    def __len__(self):
        return self.cat.shape[0]

In [6]:
dataset = HeartDiseaseDataset(df_min_max)
dataloader = DataLoader(dataset, batch_size=100, shuffle=True)
len(dataset)

797

In [7]:
class AllToCategorialAE(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(len(df_min_max.columns), 9),
            torch.nn.BatchNorm1d(9),
            torch.nn.ReLU(),
            torch.nn.Linear(9, 4),
            torch.nn.BatchNorm1d(4),
            torch.nn.Sigmoid(),
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(4, len(categorial_columns)),
            torch.nn.BatchNorm1d(len(categorial_columns)),
            torch.nn.Sigmoid()
        )


    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


epochs = 100
lr = 0.001

all_to_cat_model = AllToCategorialAE()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(all_to_cat_model.parameters(), lr=lr)

for epoch in range(epochs):
    all_to_cat_model.train()
    loss = 0

    for cat, cont in dataloader:
        optimizer.zero_grad()
        outputs = all_to_cat_model(torch.cat((cat, cont), 1))
        train_loss = criterion(outputs,  cat)
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()

    loss = loss / len(dataloader)
    print("epoch: {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch: 1/100, loss = 0.179693
epoch: 2/100, loss = 0.168194
epoch: 3/100, loss = 0.158236
epoch: 4/100, loss = 0.149866
epoch: 5/100, loss = 0.142515
epoch: 6/100, loss = 0.135928
epoch: 7/100, loss = 0.129806
epoch: 8/100, loss = 0.124608
epoch: 9/100, loss = 0.120505
epoch: 10/100, loss = 0.116494
epoch: 11/100, loss = 0.113239
epoch: 12/100, loss = 0.109927
epoch: 13/100, loss = 0.107870
epoch: 14/100, loss = 0.105018
epoch: 15/100, loss = 0.102948
epoch: 16/100, loss = 0.101318
epoch: 17/100, loss = 0.099815
epoch: 18/100, loss = 0.097617
epoch: 19/100, loss = 0.095997
epoch: 20/100, loss = 0.094674
epoch: 21/100, loss = 0.093409
epoch: 22/100, loss = 0.092065
epoch: 23/100, loss = 0.090290
epoch: 24/100, loss = 0.089864
epoch: 25/100, loss = 0.088100
epoch: 26/100, loss = 0.086997
epoch: 27/100, loss = 0.086163
epoch: 28/100, loss = 0.084693
epoch: 29/100, loss = 0.083503
epoch: 30/100, loss = 0.082367
epoch: 31/100, loss = 0.081174
epoch: 32/100, loss = 0.080297
epoch: 33/100, lo

In [8]:
cat_features = all_to_cat_model.encoder(torch.tensor(df_min_max.values, dtype=torch.float)).detach().numpy()
features = np.concatenate((cat_features, df_min_max[cont_columns].values), 1)
features

array([[0.73297334, 0.30357751, 0.66529679, ..., 0.33802817, 0.46590909,
        1.        ],
       [0.6026926 , 0.27165443, 0.70617265, ..., 0.48591549, 0.59090909,
        0.66666667],
       [0.72208458, 0.30373308, 0.2358558 , ..., 0.78873239, 0.45454545,
        0.        ],
       ...,
       [0.57018661, 0.77334017, 0.78965718, ..., 0.28169014, 0.29545455,
        0.25970549],
       [0.4054088 , 0.54068351, 0.71624714, ..., 0.53701916, 0.40064142,
        0.25970549],
       [0.44339585, 0.23012984, 0.61549181, ..., 0.23239437, 0.29545455,
        0.25970549]])

In [9]:
def cluster_accuracy(y_pred, y_true):
    # We need to map the labels to our cluster labels
    # This is a linear assignment problem on a bipartite graph
    k = max(len(np.unique(y_pred)), len(np.unique(y_pred)))
    cost_matrix = np.zeros((k, k))
    for i in range(y_pred.size):
        cost_matrix[y_pred[i], y_true[i]] += 1
    row_ind, col_ind = linear_sum_assignment(cost_matrix.max() - cost_matrix)
    return cost_matrix[row_ind, col_ind].sum() / y_pred.size

In [10]:
kmeans = KMeans(n_clusters=5, n_init="auto", random_state=0).fit(features)
all_to_cat_acc = cluster_accuracy(kmeans.labels_, og_df["num"].to_numpy())
all_to_cat_acc

0.40401505646173147

In [11]:
class OnlyCategorialAE(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(len(categorial_columns), 4),
            torch.nn.BatchNorm1d(4),
            torch.nn.Sigmoid(),
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(4, len(categorial_columns)),
            torch.nn.BatchNorm1d(len(categorial_columns)),
            torch.nn.Sigmoid()
        )


    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


epochs = 100
lr = 0.001

cat_to_cat_model = OnlyCategorialAE()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(cat_to_cat_model.parameters(), lr=lr)

for epoch in range(epochs):
    cat_to_cat_model.train()
    loss = 0

    for cat, cont in dataloader:
        optimizer.zero_grad()
        outputs = cat_to_cat_model(cat)
        train_loss = criterion(outputs,  cat)
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()

    loss = loss / len(dataloader)
    print("epoch: {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch: 1/100, loss = 0.200821
epoch: 2/100, loss = 0.193351
epoch: 3/100, loss = 0.185992
epoch: 4/100, loss = 0.178450
epoch: 5/100, loss = 0.170593
epoch: 6/100, loss = 0.163130
epoch: 7/100, loss = 0.155509
epoch: 8/100, loss = 0.148344
epoch: 9/100, loss = 0.141795
epoch: 10/100, loss = 0.136214
epoch: 11/100, loss = 0.131714
epoch: 12/100, loss = 0.126966
epoch: 13/100, loss = 0.123176
epoch: 14/100, loss = 0.119354
epoch: 15/100, loss = 0.116115
epoch: 16/100, loss = 0.113346
epoch: 17/100, loss = 0.110176
epoch: 18/100, loss = 0.107350
epoch: 19/100, loss = 0.104243
epoch: 20/100, loss = 0.101859
epoch: 21/100, loss = 0.099119
epoch: 22/100, loss = 0.096463
epoch: 23/100, loss = 0.094632
epoch: 24/100, loss = 0.091887
epoch: 25/100, loss = 0.089452
epoch: 26/100, loss = 0.087276
epoch: 27/100, loss = 0.085254
epoch: 28/100, loss = 0.082768
epoch: 29/100, loss = 0.081009
epoch: 30/100, loss = 0.079760
epoch: 31/100, loss = 0.077687
epoch: 32/100, loss = 0.076930
epoch: 33/100, lo

In [12]:
cat_features = cat_to_cat_model.encoder(torch.tensor(df_min_max[categorial_columns].values, dtype=torch.float)).detach().numpy()
features = np.concatenate((cat_features, df_min_max[cont_columns].values), 1)
kmeans = KMeans(n_clusters=5, n_init="auto", random_state=0).fit(features)
cat_to_cat_acc = cluster_accuracy(kmeans.labels_, og_df["num"].to_numpy())
cat_to_cat_acc

0.3713927227101631

In [13]:
print(f"All Cols to Categorial Cols AE: {all_to_cat_acc}")
print(f"Categorial Cols to Categorial Cols AE: {cat_to_cat_acc}")

All Cols to Categorial Cols AE: 0.40401505646173147
Categorial Cols to Categorial Cols AE: 0.3713927227101631


In [14]:
df_no_min_max = og_df.copy()
df_no_min_max.drop(columns="num", inplace=True)
df_no_min_max[categorial_columns] = df_no_min_max[categorial_columns].apply(LabelEncoder().fit_transform)
df_no_min_max = df_no_min_max.fillna(df_no_min_max.mean())
df_no_min_max

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
1,67,1,0,160.000000,286.0,0,0,108.00000,1,1.500000,1,3.000000,1
2,67,1,0,120.000000,229.0,0,0,129.00000,1,2.600000,1,2.000000,2
4,41,0,1,130.000000,204.0,0,0,172.00000,0,1.400000,2,0.000000,1
6,62,0,0,140.000000,268.0,0,0,160.00000,0,3.600000,0,2.000000,1
8,63,1,0,130.000000,254.0,0,0,147.00000,0,1.400000,1,1.000000,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,54,0,0,127.000000,333.0,1,2,154.00000,0,0.000000,3,0.779116,3
916,62,1,3,132.509459,139.0,0,2,136.25672,2,0.925645,3,0.779116,3
917,55,1,0,122.000000,223.0,1,2,100.00000,0,0.000000,3,0.779116,0
918,58,1,0,132.509459,385.0,1,0,136.25672,2,0.925645,3,0.779116,3


In [15]:
no_min_max_dataset = HeartDiseaseDataset(df_no_min_max)
no_min_max_dataloader = DataLoader(no_min_max_dataset, batch_size=100, shuffle=True)
len(dataset)

797

In [16]:
epochs = 100
lr = 0.001

no_min_max_cat_to_cat_model = OnlyCategorialAE()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(no_min_max_cat_to_cat_model.parameters(), lr=lr)

for epoch in range(epochs):
    no_min_max_cat_to_cat_model.train()
    loss = 0

    for cat, cont in no_min_max_dataloader:
        optimizer.zero_grad()
        outputs = no_min_max_cat_to_cat_model(cat)
        train_loss = criterion(outputs,  cat)
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()

    loss = loss / len(dataloader)
    print("epoch: {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch: 1/100, loss = 1.367521
epoch: 2/100, loss = 1.347952
epoch: 3/100, loss = 1.330685
epoch: 4/100, loss = 1.312321
epoch: 5/100, loss = 1.294633
epoch: 6/100, loss = 1.277788
epoch: 7/100, loss = 1.259671
epoch: 8/100, loss = 1.245125
epoch: 9/100, loss = 1.229399
epoch: 10/100, loss = 1.216415
epoch: 11/100, loss = 1.202995
epoch: 12/100, loss = 1.192759
epoch: 13/100, loss = 1.183159
epoch: 14/100, loss = 1.174424
epoch: 15/100, loss = 1.166045
epoch: 16/100, loss = 1.158894
epoch: 17/100, loss = 1.153262
epoch: 18/100, loss = 1.146329
epoch: 19/100, loss = 1.139419
epoch: 20/100, loss = 1.133875
epoch: 21/100, loss = 1.127841
epoch: 22/100, loss = 1.123741
epoch: 23/100, loss = 1.117929
epoch: 24/100, loss = 1.112150
epoch: 25/100, loss = 1.107021
epoch: 26/100, loss = 1.102660
epoch: 27/100, loss = 1.097601
epoch: 28/100, loss = 1.094280
epoch: 29/100, loss = 1.089013
epoch: 30/100, loss = 1.084149
epoch: 31/100, loss = 1.080871
epoch: 32/100, loss = 1.075743
epoch: 33/100, lo

In [17]:
cat_features = no_min_max_cat_to_cat_model.encoder(torch.tensor(df_no_min_max[categorial_columns].values, dtype=torch.float)).detach().numpy()
features = np.concatenate((cat_features, df_no_min_max[cont_columns].values), 1)
features

array([[3.55217725e-01, 2.63153046e-01, 1.58569843e-01, ...,
        1.08000000e+02, 1.50000000e+00, 3.00000000e+00],
       [3.61713827e-01, 2.23063543e-01, 1.80463716e-01, ...,
        1.29000000e+02, 2.60000000e+00, 2.00000000e+00],
       [2.10512206e-01, 6.13534153e-01, 4.82439518e-01, ...,
        1.72000000e+02, 1.40000000e+00, 0.00000000e+00],
       ...,
       [7.55723298e-01, 8.07533562e-01, 5.72365463e-01, ...,
        1.00000000e+02, 0.00000000e+00, 7.79116466e-01],
       [2.56014436e-01, 2.29596570e-01, 1.89873055e-01, ...,
        1.36256720e+02, 9.25644505e-01, 7.79116466e-01],
       [1.91442490e-01, 4.55426991e-01, 4.54688102e-01, ...,
        9.30000000e+01, 0.00000000e+00, 7.79116466e-01]])

In [18]:
kmeans = KMeans(n_clusters=5, n_init="auto", random_state=0).fit(features)
no_min_max_acc = cluster_accuracy(kmeans.labels_, og_df["num"].to_numpy())
no_min_max_acc

0.3751568381430364