In [74]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from scipy.optimize import linear_sum_assignment

In [75]:
og_df = pd.read_csv("datasets/heart_disease_uci.csv")
og_df.drop(columns=["id", "dataset"], inplace=True)
og_df = og_df.drop(og_df[og_df["num"] == 0].sample(frac=0.3).index)
og_df # this df still has "num" -> the target

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,63,Male,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,67,Male,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,67,Male,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,37,Male,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
6,62,Female,asymptomatic,140.0,268.0,False,lv hypertrophy,160.0,False,3.6,downsloping,2.0,normal,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,54,Female,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,62,Male,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,55,Male,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2
918,58,Male,asymptomatic,,385.0,True,lv hypertrophy,,,,,,,0


In [76]:
categorial_columns = ["sex", "cp", "fbs", "restecg", "exang", "slope", "thal"]
cont_columns = ["age", "trestbps", "chol", "thalch", "oldpeak", "ca"]

In [77]:
df_min_max = og_df.copy()
df_min_max.drop(columns="num", inplace=True)
df_min_max[categorial_columns] = df_min_max[categorial_columns].apply(LabelEncoder().fit_transform)
df_min_max[categorial_columns] = MinMaxScaler().fit_transform(df_min_max[categorial_columns])

df_min_max[cont_columns] = MinMaxScaler().fit_transform(df_min_max[cont_columns])
df_min_max = df_min_max.fillna(df_min_max.mean())
df_min_max

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
0,0.714286,1.0,1.000000,0.725000,0.386401,0.5,0.000000,0.666667,0.0,0.556818,0.000000,0.000000,0.000000
1,0.795918,1.0,0.000000,0.800000,0.474295,0.0,0.000000,0.355556,0.5,0.465909,0.333333,1.000000,0.333333
2,0.795918,1.0,0.000000,0.600000,0.379768,0.0,0.000000,0.511111,0.5,0.590909,0.333333,0.666667,0.666667
3,0.183673,1.0,0.666667,0.650000,0.414594,0.0,0.333333,0.940741,0.0,0.693182,0.000000,0.000000,0.333333
6,0.693878,0.0,0.000000,0.700000,0.444444,0.0,0.000000,0.740741,0.0,0.704545,0.000000,0.666667,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,0.530612,0.0,0.000000,0.635000,0.552239,0.5,0.666667,0.696296,0.0,0.295455,1.000000,0.252564,1.000000
916,0.693878,1.0,1.000000,0.661433,0.230514,0.0,0.666667,0.561664,1.0,0.403732,1.000000,0.252564,1.000000
917,0.551020,1.0,0.000000,0.610000,0.369818,0.5,0.666667,0.296296,0.0,0.295455,1.000000,0.252564,0.000000
918,0.612245,1.0,0.000000,0.661433,0.638474,0.5,0.000000,0.561664,1.0,0.403732,1.000000,0.252564,1.000000


In [78]:
class HeartDiseaseDataset(Dataset):
    def __init__(self, df):
        self.cat = torch.tensor(df[categorial_columns].values, dtype=torch.float)
        self.cont = torch.tensor(df[cont_columns].values, dtype=torch.float)

    def __getitem__(self, idx):
        return self.cat[idx], self.cont[idx]

    def __len__(self):
        return self.cat.shape[0]

In [79]:
dataset = HeartDiseaseDataset(df_min_max)
dataloader = DataLoader(dataset, batch_size=100, shuffle=True)
len(dataset)

797

In [80]:
class AllToCategorialAE(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(len(df_min_max.columns), 9),
            torch.nn.BatchNorm1d(9),
            torch.nn.ReLU(),
            torch.nn.Linear(9, 4),
            torch.nn.BatchNorm1d(4),
            torch.nn.Sigmoid(),
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(4, len(categorial_columns)),
            torch.nn.BatchNorm1d(len(categorial_columns)),
            torch.nn.Sigmoid()
        )


    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


epochs = 100
lr = 0.001

all_to_cat_model = AllToCategorialAE()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(all_to_cat_model.parameters(), lr=lr)

for epoch in range(epochs):
    all_to_cat_model.train()
    loss = 0

    for cat, cont in dataloader:
        optimizer.zero_grad()
        outputs = all_to_cat_model(torch.cat((cat, cont), 1))
        train_loss = criterion(outputs,  cat)
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()

    loss = loss / len(dataloader)
    print("epoch: {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch: 1/100, loss = 0.196394
epoch: 2/100, loss = 0.185074
epoch: 3/100, loss = 0.173884
epoch: 4/100, loss = 0.163533
epoch: 5/100, loss = 0.154022
epoch: 6/100, loss = 0.145754
epoch: 7/100, loss = 0.138484
epoch: 8/100, loss = 0.132338
epoch: 9/100, loss = 0.126797
epoch: 10/100, loss = 0.122130
epoch: 11/100, loss = 0.118086
epoch: 12/100, loss = 0.114530
epoch: 13/100, loss = 0.110691
epoch: 14/100, loss = 0.108210
epoch: 15/100, loss = 0.105490
epoch: 16/100, loss = 0.102652
epoch: 17/100, loss = 0.100232
epoch: 18/100, loss = 0.098008
epoch: 19/100, loss = 0.096104
epoch: 20/100, loss = 0.094351
epoch: 21/100, loss = 0.092399
epoch: 22/100, loss = 0.090680
epoch: 23/100, loss = 0.089651
epoch: 24/100, loss = 0.087711
epoch: 25/100, loss = 0.086844
epoch: 26/100, loss = 0.084831
epoch: 27/100, loss = 0.083860
epoch: 28/100, loss = 0.082296
epoch: 29/100, loss = 0.081470
epoch: 30/100, loss = 0.080155
epoch: 31/100, loss = 0.079210
epoch: 32/100, loss = 0.077699
epoch: 33/100, lo

In [81]:
cat_features = all_to_cat_model.encoder(torch.tensor(df_min_max.values, dtype=torch.float)).detach().numpy()
features = np.concatenate((cat_features, df_min_max[cont_columns].values), 1)
features

array([[0.57319617, 0.39101484, 0.72134006, ..., 0.66666667, 0.55681818,
        0.        ],
       [0.22031595, 0.31446555, 0.36246598, ..., 0.35555556, 0.46590909,
        1.        ],
       [0.17055316, 0.21436307, 0.33376518, ..., 0.51111111, 0.59090909,
        0.66666667],
       ...,
       [0.54392433, 0.7908566 , 0.53421348, ..., 0.2962963 , 0.29545455,
        0.2525641 ],
       [0.24430327, 0.51297486, 0.58800972, ..., 0.56166394, 0.40373157,
        0.2525641 ],
       [0.20519525, 0.17199433, 0.44757509, ..., 0.24444444, 0.29545455,
        0.2525641 ]])

In [82]:
def cluster_accuracy(y_pred, y_true):
    # We need to map the labels to our cluster labels
    # This is a linear assignment problem on a bipartite graph
    k = max(len(np.unique(y_pred)), len(np.unique(y_pred)))
    cost_matrix = np.zeros((k, k))
    for i in range(y_pred.size):
        cost_matrix[y_pred[i], y_true[i]] += 1
    row_ind, col_ind = linear_sum_assignment(cost_matrix.max() - cost_matrix)
    return cost_matrix[row_ind, col_ind].sum() / y_pred.size

In [83]:
kmeans = KMeans(n_clusters=5, n_init="auto", random_state=0).fit(features)
all_to_cat_acc = cluster_accuracy(kmeans.labels_, og_df["num"].to_numpy())
all_to_cat_acc

0.3186951066499373

In [84]:
class OnlyCategorialAE(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(len(categorial_columns), 4),
            torch.nn.BatchNorm1d(4),
            torch.nn.Sigmoid(),
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(4, len(categorial_columns)),
            torch.nn.BatchNorm1d(len(categorial_columns)),
            torch.nn.Sigmoid()
        )


    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


epochs = 100
lr = 0.001

cat_to_cat_model = OnlyCategorialAE()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(cat_to_cat_model.parameters(), lr=lr)

for epoch in range(epochs):
    cat_to_cat_model.train()
    loss = 0

    for cat, cont in dataloader:
        optimizer.zero_grad()
        outputs = cat_to_cat_model(cat)
        train_loss = criterion(outputs,  cat)
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()

    loss = loss / len(dataloader)
    print("epoch: {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch: 1/100, loss = 0.203300
epoch: 2/100, loss = 0.197388
epoch: 3/100, loss = 0.191051
epoch: 4/100, loss = 0.185135
epoch: 5/100, loss = 0.179607
epoch: 6/100, loss = 0.174116
epoch: 7/100, loss = 0.168146
epoch: 8/100, loss = 0.162484
epoch: 9/100, loss = 0.156702
epoch: 10/100, loss = 0.150947
epoch: 11/100, loss = 0.145555
epoch: 12/100, loss = 0.139701
epoch: 13/100, loss = 0.134431
epoch: 14/100, loss = 0.129597
epoch: 15/100, loss = 0.125407
epoch: 16/100, loss = 0.120979
epoch: 17/100, loss = 0.117098
epoch: 18/100, loss = 0.113676
epoch: 19/100, loss = 0.110161
epoch: 20/100, loss = 0.106272
epoch: 21/100, loss = 0.103127
epoch: 22/100, loss = 0.101138
epoch: 23/100, loss = 0.098119
epoch: 24/100, loss = 0.095774
epoch: 25/100, loss = 0.093919
epoch: 26/100, loss = 0.091951
epoch: 27/100, loss = 0.090379
epoch: 28/100, loss = 0.088500
epoch: 29/100, loss = 0.087609
epoch: 30/100, loss = 0.086223
epoch: 31/100, loss = 0.084470
epoch: 32/100, loss = 0.083409
epoch: 33/100, lo

In [85]:
cat_features = cat_to_cat_model.encoder(torch.tensor(df_min_max[categorial_columns].values, dtype=torch.float)).detach().numpy()
features = np.concatenate((cat_features, df_min_max[cont_columns].values), 1)
kmeans = KMeans(n_clusters=5, n_init="auto", random_state=0).fit(features)
cat_to_cat_acc = cluster_accuracy(kmeans.labels_, og_df["num"].to_numpy())
cat_to_cat_acc

0.3851944792973651

In [86]:
df_no_min_max = og_df.copy()
df_no_min_max.drop(columns="num", inplace=True)
df_no_min_max[categorial_columns] = df_no_min_max[categorial_columns].apply(LabelEncoder().fit_transform)
df_no_min_max = df_no_min_max.fillna(df_no_min_max.mean())
df_no_min_max

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
0,63,1,3,145.000000,233.0,1,0,150.000000,0,2.300000,0,0.000000,0
1,67,1,0,160.000000,286.0,0,0,108.000000,1,1.500000,1,3.000000,1
2,67,1,0,120.000000,229.0,0,0,129.000000,1,2.600000,1,2.000000,2
3,37,1,2,130.000000,250.0,0,1,187.000000,0,3.500000,0,0.000000,1
6,62,0,0,140.000000,268.0,0,0,160.000000,0,3.600000,0,2.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,54,0,0,127.000000,333.0,1,2,154.000000,0,0.000000,3,0.757692,3
916,62,1,3,132.286676,139.0,0,2,135.824632,2,0.952838,3,0.757692,3
917,55,1,0,122.000000,223.0,1,2,100.000000,0,0.000000,3,0.757692,0
918,58,1,0,132.286676,385.0,1,0,135.824632,2,0.952838,3,0.757692,3


In [87]:
no_min_max_dataset = HeartDiseaseDataset(df_no_min_max)
no_min_max_dataloader = DataLoader(no_min_max_dataset, batch_size=100, shuffle=True)
len(dataset)

797

In [88]:
epochs = 100
lr = 0.001

no_min_max_cat_to_cat_model = OnlyCategorialAE()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(no_min_max_cat_to_cat_model.parameters(), lr=lr)

for epoch in range(epochs):
    no_min_max_cat_to_cat_model.train()
    loss = 0

    for cat, cont in no_min_max_dataloader:
        optimizer.zero_grad()
        outputs = no_min_max_cat_to_cat_model(cat)
        train_loss = criterion(outputs,  cat)
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()

    loss = loss / len(dataloader)
    print("epoch: {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch: 1/100, loss = 1.282252
epoch: 2/100, loss = 1.266738
epoch: 3/100, loss = 1.250640
epoch: 4/100, loss = 1.235710
epoch: 5/100, loss = 1.221517
epoch: 6/100, loss = 1.208145
epoch: 7/100, loss = 1.195378
epoch: 8/100, loss = 1.183441
epoch: 9/100, loss = 1.171225
epoch: 10/100, loss = 1.160448
epoch: 11/100, loss = 1.150394
epoch: 12/100, loss = 1.139788
epoch: 13/100, loss = 1.129945
epoch: 14/100, loss = 1.119556
epoch: 15/100, loss = 1.110785
epoch: 16/100, loss = 1.100845
epoch: 17/100, loss = 1.092131
epoch: 18/100, loss = 1.083793
epoch: 19/100, loss = 1.075787
epoch: 20/100, loss = 1.068824
epoch: 21/100, loss = 1.062732
epoch: 22/100, loss = 1.057706
epoch: 23/100, loss = 1.052522
epoch: 24/100, loss = 1.047738
epoch: 25/100, loss = 1.044066
epoch: 26/100, loss = 1.041002
epoch: 27/100, loss = 1.037856
epoch: 28/100, loss = 1.034254
epoch: 29/100, loss = 1.030684
epoch: 30/100, loss = 1.028039
epoch: 31/100, loss = 1.025700
epoch: 32/100, loss = 1.023235
epoch: 33/100, lo

In [89]:
cat_features = no_min_max_cat_to_cat_model.encoder(torch.tensor(df_no_min_max[categorial_columns].values, dtype=torch.float)).detach().numpy()
features = np.concatenate((cat_features, df_min_max[cont_columns].values), 1)
features

array([[0.52008939, 0.63564682, 0.14617272, ..., 0.66666667, 0.55681818,
        0.        ],
       [0.57387429, 0.77381641, 0.53025556, ..., 0.35555556, 0.46590909,
        1.        ],
       [0.69572443, 0.71157026, 0.63199574, ..., 0.51111111, 0.59090909,
        0.66666667],
       ...,
       [0.0336555 , 0.34917748, 0.39261562, ..., 0.2962963 , 0.29545455,
        0.2525641 ],
       [0.47608197, 0.41255972, 0.82813102, ..., 0.56166394, 0.40373157,
        0.2525641 ],
       [0.43759966, 0.31887686, 0.75474447, ..., 0.24444444, 0.29545455,
        0.2525641 ]])

In [90]:
kmeans = KMeans(n_clusters=5, n_init="auto", random_state=0).fit(features)
no_min_max_acc = cluster_accuracy(kmeans.labels_, og_df["num"].to_numpy())
no_min_max_acc

0.39021329987452946

In [91]:
print(f"All Cols to Categorial Cols AE: {all_to_cat_acc}")
print(f"Categorial Cols to Categorial Cols AE: {cat_to_cat_acc}")
print(f"Categorial Cols to Categorial Cols AE, No MinMax on Categorial Cols: {no_min_max_acc}")

All Cols to Categorial Cols AE: 0.3186951066499373
Categorial Cols to Categorial Cols AE: 0.3851944792973651
Categorial Cols to Categorial Cols AE, No MinMax Norm: 0.39021329987452946
