In [56]:
import gower
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from scipy.optimize import linear_sum_assignment

In [57]:
og_df = pd.read_csv("datasets/heart_disease_uci.csv")
og_df.drop(columns=["id", "dataset"], inplace=True)
og_df = og_df.drop(og_df[og_df["num"] == 0].sample(frac=0.3).index)
og_df # this df still has "num" -> the target

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,63,Male,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,67,Male,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,67,Male,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,37,Male,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,41,Female,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,62,Male,asymptomatic,158.0,170.0,False,st-t abnormality,138.0,True,0.0,,,,1
914,46,Male,asymptomatic,134.0,310.0,False,normal,126.0,False,0.0,,,normal,2
915,54,Female,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
917,55,Male,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2


In [58]:
categorial_columns = ["sex", "cp", "fbs", "restecg", "exang", "slope", "thal"]
cont_columns = ["age", "trestbps", "chol", "thalch", "oldpeak", "ca"]

In [59]:
df_min_max = og_df.copy()
df_min_max.drop(columns="num", inplace=True)
df_min_max[categorial_columns] = df_min_max[categorial_columns].apply(LabelEncoder().fit_transform)
df_min_max[categorial_columns] = MinMaxScaler().fit_transform(df_min_max[categorial_columns])

df_min_max[cont_columns] = MinMaxScaler().fit_transform(df_min_max[cont_columns])
df_min_max = df_min_max.fillna(df_min_max.mean())
df_min_max

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
0,0.714286,1.0,1.000000,0.725,0.386401,0.5,0.000000,0.633803,0.0,0.556818,0.000000,0.000000,0.000000
1,0.795918,1.0,0.000000,0.800,0.474295,0.0,0.000000,0.338028,0.5,0.465909,0.333333,1.000000,0.333333
2,0.795918,1.0,0.000000,0.600,0.379768,0.0,0.000000,0.485915,0.5,0.590909,0.333333,0.666667,0.666667
3,0.183673,1.0,0.666667,0.650,0.414594,0.0,0.333333,0.894366,0.0,0.693182,0.000000,0.000000,0.333333
4,0.265306,0.0,0.333333,0.650,0.338308,0.0,0.000000,0.788732,0.0,0.454545,0.666667,0.000000,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,0.693878,1.0,0.000000,0.790,0.281924,0.0,0.666667,0.549296,0.5,0.295455,1.000000,0.257692,1.000000
914,0.367347,1.0,0.000000,0.670,0.514096,0.0,0.333333,0.464789,0.0,0.295455,1.000000,0.257692,0.333333
915,0.530612,0.0,0.000000,0.635,0.552239,0.5,0.666667,0.661972,0.0,0.295455,1.000000,0.257692,1.000000
917,0.551020,1.0,0.000000,0.610,0.369818,0.5,0.666667,0.281690,0.0,0.295455,1.000000,0.257692,0.000000


In [60]:
class HeartDiseaseDataset(Dataset):
    def __init__(self, df):
        self.cat = torch.tensor(df[categorial_columns].values, dtype=torch.float)
        self.cont = torch.tensor(df[cont_columns].values, dtype=torch.float)

    def __getitem__(self, idx):
        return self.cat[idx], self.cont[idx]

    def __len__(self):
        return self.cat.shape[0]

In [61]:
dataset = HeartDiseaseDataset(df_min_max)
dataloader = DataLoader(dataset, batch_size=100, shuffle=True)
len(dataset)

797

In [62]:
class AllToCategorialAE(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(len(df_min_max.columns), 9),
            torch.nn.BatchNorm1d(9),
            torch.nn.ReLU(),
            torch.nn.Linear(9, 4),
            torch.nn.BatchNorm1d(4),
            torch.nn.Sigmoid(),
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(4, len(categorial_columns)),
            torch.nn.BatchNorm1d(len(categorial_columns)),
            torch.nn.Sigmoid()
        )


    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


epochs = 100
lr = 0.001

all_to_cat_model = AllToCategorialAE()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(all_to_cat_model.parameters(), lr=lr)

for epoch in range(epochs):
    all_to_cat_model.train()
    loss = 0

    for cat, cont in dataloader:
        optimizer.zero_grad()
        outputs = all_to_cat_model(torch.cat((cat, cont), 1))
        train_loss = criterion(outputs,  cat)
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()

    loss = loss / len(dataloader)
    print("epoch: {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch: 1/100, loss = 0.201640
epoch: 2/100, loss = 0.190897
epoch: 3/100, loss = 0.181094
epoch: 4/100, loss = 0.172507
epoch: 5/100, loss = 0.163989
epoch: 6/100, loss = 0.155754
epoch: 7/100, loss = 0.148432
epoch: 8/100, loss = 0.142346
epoch: 9/100, loss = 0.135756
epoch: 10/100, loss = 0.130184
epoch: 11/100, loss = 0.124784
epoch: 12/100, loss = 0.120340
epoch: 13/100, loss = 0.116690
epoch: 14/100, loss = 0.112920
epoch: 15/100, loss = 0.109960
epoch: 16/100, loss = 0.107231
epoch: 17/100, loss = 0.104424
epoch: 18/100, loss = 0.101859
epoch: 19/100, loss = 0.099820
epoch: 20/100, loss = 0.097713
epoch: 21/100, loss = 0.095375
epoch: 22/100, loss = 0.093883
epoch: 23/100, loss = 0.092106
epoch: 24/100, loss = 0.090128
epoch: 25/100, loss = 0.087936
epoch: 26/100, loss = 0.086172
epoch: 27/100, loss = 0.084873
epoch: 28/100, loss = 0.084188
epoch: 29/100, loss = 0.081829
epoch: 30/100, loss = 0.080613
epoch: 31/100, loss = 0.078955
epoch: 32/100, loss = 0.078263
epoch: 33/100, lo

In [63]:
cat_features = all_to_cat_model.encoder(torch.tensor(df_min_max.values, dtype=torch.float)).detach().numpy()
features = np.concatenate((cat_features, df_min_max[cont_columns].values), 1)
features

array([[0.836604  , 0.69939512, 0.72495413, ..., 0.63380282, 0.55681818,
        0.        ],
       [0.54750472, 0.33808276, 0.83925742, ..., 0.33802817, 0.46590909,
        1.        ],
       [0.55098021, 0.40891263, 0.86557841, ..., 0.48591549, 0.59090909,
        0.66666667],
       ...,
       [0.44544202, 0.20854697, 0.35869312, ..., 0.66197183, 0.29545455,
        0.25769231],
       [0.47435492, 0.44441375, 0.27250209, ..., 0.28169014, 0.29545455,
        0.25769231],
       [0.54499245, 0.4557021 , 0.86200279, ..., 0.23239437, 0.29545455,
        0.25769231]])

In [64]:
def cluster_accuracy(y_pred, y_true):
    # We need to map the labels to our cluster labels
    # This is a linear assignment problem on a bipartite graph
    k = max(len(np.unique(y_pred)), len(np.unique(y_pred)))
    cost_matrix = np.zeros((k, k))
    for i in range(y_pred.size):
        cost_matrix[y_pred[i], y_true[i]] += 1
    row_ind, col_ind = linear_sum_assignment(cost_matrix.max() - cost_matrix)
    return cost_matrix[row_ind, col_ind].sum() / y_pred.size

In [65]:
kmeans = KMeans(n_clusters=5, n_init="auto", random_state=0).fit(features)
all_to_cat_acc = cluster_accuracy(kmeans.labels_, og_df["num"].to_numpy())
all_to_cat_acc

0.36888331242158096

In [66]:
class OnlyCategorialAE(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(len(categorial_columns), 4),
            torch.nn.BatchNorm1d(4),
            torch.nn.Sigmoid(),
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(4, len(categorial_columns)),
            torch.nn.BatchNorm1d(len(categorial_columns)),
            torch.nn.Sigmoid()
        )


    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


epochs = 100
lr = 0.001

cat_to_cat_model = OnlyCategorialAE()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(cat_to_cat_model.parameters(), lr=lr)

for epoch in range(epochs):
    cat_to_cat_model.train()
    loss = 0

    for cat, cont in dataloader:
        optimizer.zero_grad()
        outputs = cat_to_cat_model(cat)
        train_loss = criterion(outputs,  cat)
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()

    loss = loss / len(dataloader)
    print("epoch: {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch: 1/100, loss = 0.207631
epoch: 2/100, loss = 0.199944
epoch: 3/100, loss = 0.192041
epoch: 4/100, loss = 0.184285
epoch: 5/100, loss = 0.176508
epoch: 6/100, loss = 0.169518
epoch: 7/100, loss = 0.162206
epoch: 8/100, loss = 0.155303
epoch: 9/100, loss = 0.148290
epoch: 10/100, loss = 0.141614
epoch: 11/100, loss = 0.135480
epoch: 12/100, loss = 0.129494
epoch: 13/100, loss = 0.123732
epoch: 14/100, loss = 0.118748
epoch: 15/100, loss = 0.114669
epoch: 16/100, loss = 0.111119
epoch: 17/100, loss = 0.107583
epoch: 18/100, loss = 0.104848
epoch: 19/100, loss = 0.102321
epoch: 20/100, loss = 0.100119
epoch: 21/100, loss = 0.098025
epoch: 22/100, loss = 0.096052
epoch: 23/100, loss = 0.094700
epoch: 24/100, loss = 0.093286
epoch: 25/100, loss = 0.091086
epoch: 26/100, loss = 0.089813
epoch: 27/100, loss = 0.088314
epoch: 28/100, loss = 0.087217
epoch: 29/100, loss = 0.085788
epoch: 30/100, loss = 0.084733
epoch: 31/100, loss = 0.083909
epoch: 32/100, loss = 0.082332
epoch: 33/100, lo

In [67]:
cat_features = cat_to_cat_model.encoder(torch.tensor(df_min_max[categorial_columns].values, dtype=torch.float)).detach().numpy()
features = np.concatenate((cat_features, df_min_max[cont_columns].values), 1)
kmeans = KMeans(n_clusters=5, n_init="auto", random_state=0).fit(features)
cat_to_cat_acc = cluster_accuracy(kmeans.labels_, og_df["num"].to_numpy())
cat_to_cat_acc

0.3864491844416562

In [68]:
df_no_min_max = og_df.copy()
df_no_min_max.drop(columns="num", inplace=True)
df_no_min_max[categorial_columns] = df_no_min_max[categorial_columns].apply(LabelEncoder().fit_transform)
df_no_min_max = df_no_min_max.fillna(df_no_min_max.mean())
df_no_min_max

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
0,63,1,3,145.0,233.0,1,0,150.0,0,2.3,0,0.000000,0
1,67,1,0,160.0,286.0,0,0,108.0,1,1.5,1,3.000000,1
2,67,1,0,120.0,229.0,0,0,129.0,1,2.6,1,2.000000,2
3,37,1,2,130.0,250.0,0,1,187.0,0,3.5,0,0.000000,1
4,41,0,1,130.0,204.0,0,0,172.0,0,1.4,2,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,62,1,0,158.0,170.0,0,2,138.0,1,0.0,3,0.773077,3
914,46,1,0,134.0,310.0,0,1,126.0,0,0.0,3,0.773077,1
915,54,0,0,127.0,333.0,1,2,154.0,0,0.0,3,0.773077,3
917,55,1,0,122.0,223.0,1,2,100.0,0,0.0,3,0.773077,0


In [69]:
no_min_max_dataset = HeartDiseaseDataset(df_no_min_max)
no_min_max_dataloader = DataLoader(no_min_max_dataset, batch_size=100, shuffle=True)
len(dataset)

797

In [70]:
epochs = 100
lr = 0.001

no_min_max_cat_to_cat_model = OnlyCategorialAE()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(no_min_max_cat_to_cat_model.parameters(), lr=lr)

for epoch in range(epochs):
    no_min_max_cat_to_cat_model.train()
    loss = 0

    for cat, cont in no_min_max_dataloader:
        optimizer.zero_grad()
        outputs = no_min_max_cat_to_cat_model(cat)
        train_loss = criterion(outputs,  cat)
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()

    loss = loss / len(dataloader)
    print("epoch: {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch: 1/100, loss = 1.393570
epoch: 2/100, loss = 1.378754
epoch: 3/100, loss = 1.360934
epoch: 4/100, loss = 1.344735
epoch: 5/100, loss = 1.327510
epoch: 6/100, loss = 1.308347
epoch: 7/100, loss = 1.290211
epoch: 8/100, loss = 1.271844
epoch: 9/100, loss = 1.255405
epoch: 10/100, loss = 1.238767
epoch: 11/100, loss = 1.220706
epoch: 12/100, loss = 1.205071
epoch: 13/100, loss = 1.191720
epoch: 14/100, loss = 1.176893
epoch: 15/100, loss = 1.162826
epoch: 16/100, loss = 1.150756
epoch: 17/100, loss = 1.139157
epoch: 18/100, loss = 1.127496
epoch: 19/100, loss = 1.116440
epoch: 20/100, loss = 1.106147
epoch: 21/100, loss = 1.096981
epoch: 22/100, loss = 1.086606
epoch: 23/100, loss = 1.079098
epoch: 24/100, loss = 1.070676
epoch: 25/100, loss = 1.063154
epoch: 26/100, loss = 1.057193
epoch: 27/100, loss = 1.051342
epoch: 28/100, loss = 1.046333
epoch: 29/100, loss = 1.040838
epoch: 30/100, loss = 1.036871
epoch: 31/100, loss = 1.032345
epoch: 32/100, loss = 1.029247
epoch: 33/100, lo

In [71]:
cat_features = no_min_max_cat_to_cat_model.encoder(torch.tensor(df_no_min_max[categorial_columns].values, dtype=torch.float)).detach().numpy()
features = np.concatenate((cat_features, df_min_max[cont_columns].values), 1)
features

array([[0.32349589, 0.07337263, 0.63071227, ..., 0.63380282, 0.55681818,
        0.        ],
       [0.36416116, 0.55075395, 0.5547595 , ..., 0.33802817, 0.46590909,
        1.        ],
       [0.26801246, 0.59528887, 0.38193089, ..., 0.48591549, 0.59090909,
        0.66666667],
       ...,
       [0.89896572, 0.63097548, 0.24113217, ..., 0.66197183, 0.29545455,
        0.25769231],
       [0.96516722, 0.56497169, 0.66133946, ..., 0.28169014, 0.29545455,
        0.25769231],
       [0.42713028, 0.33807147, 0.29711917, ..., 0.23239437, 0.29545455,
        0.25769231]])

In [72]:
kmeans = KMeans(n_clusters=5, n_init="auto", random_state=0).fit(features)
no_min_max_acc = cluster_accuracy(kmeans.labels_, og_df["num"].to_numpy())
no_min_max_acc

0.39648682559598497

In [73]:
print(f"All Cols to Categorial Cols AE: {all_to_cat_acc}")
print(f"Categorial Cols to Categorial Cols AE: {cat_to_cat_acc}")
print(f"Categorial Cols to Categorial Cols AE, No MinMax Norm: {no_min_max_acc}")

All Cols to Categorial Cols AE: 0.36888331242158096
Categorial Cols to Categorial Cols AE: 0.3864491844416562
Categorial Cols to Categorial Cols AE, No MinMax Norm: 0.39648682559598497
