In [1]:
import gower
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from scipy.optimize import linear_sum_assignment

In [2]:
og_df = pd.read_csv("datasets/heart_disease_uci.csv")
og_df.drop(columns=["id", "dataset"], inplace=True)
og_df = og_df.drop(og_df[og_df["num"] == 0].sample(frac=0.3).index)
og_df # this df still has "num" -> the target

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
1,67,Male,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,67,Male,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,37,Male,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,41,Female,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
5,56,Male,atypical angina,120.0,236.0,False,normal,178.0,False,0.8,upsloping,0.0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
914,46,Male,asymptomatic,134.0,310.0,False,normal,126.0,False,0.0,,,normal,2
915,54,Female,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,62,Male,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,55,Male,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2


In [3]:
categorial_columns = ["sex", "cp", "fbs", "restecg", "exang", "slope", "thal"]
cont_columns = ["age", "trestbps", "chol", "thalch", "oldpeak", "ca"]

In [4]:
df_min_max = og_df.copy()
df_min_max.drop(columns="num", inplace=True)
df_min_max[categorial_columns] = df_min_max[categorial_columns].apply(LabelEncoder().fit_transform)
df_min_max[categorial_columns] = MinMaxScaler().fit_transform(df_min_max[categorial_columns])

df_min_max[cont_columns] = MinMaxScaler().fit_transform(df_min_max[cont_columns])
df_min_max = df_min_max.fillna(df_min_max.mean())
df_min_max

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
1,0.795918,1.0,0.000000,0.800000,0.474295,0.0,0.000000,0.338028,0.5,0.465909,0.333333,1.000000,0.333333
2,0.795918,1.0,0.000000,0.600000,0.379768,0.0,0.000000,0.485915,0.5,0.590909,0.333333,0.666667,0.666667
3,0.183673,1.0,0.666667,0.650000,0.414594,0.0,0.333333,0.894366,0.0,0.693182,0.000000,0.000000,0.333333
4,0.265306,0.0,0.333333,0.650000,0.338308,0.0,0.000000,0.788732,0.0,0.454545,0.666667,0.000000,0.333333
5,0.571429,1.0,0.333333,0.600000,0.391376,0.0,0.333333,0.830986,0.0,0.386364,0.666667,0.000000,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...
914,0.367347,1.0,0.000000,0.670000,0.514096,0.0,0.333333,0.464789,0.0,0.295455,1.000000,0.247831,0.333333
915,0.530612,0.0,0.000000,0.635000,0.552239,0.5,0.666667,0.661972,0.0,0.295455,1.000000,0.247831,1.000000
916,0.693878,1.0,1.000000,0.661499,0.230514,0.0,0.666667,0.533786,1.0,0.405095,1.000000,0.247831,1.000000
917,0.551020,1.0,0.000000,0.610000,0.369818,0.5,0.666667,0.281690,0.0,0.295455,1.000000,0.247831,0.000000


In [5]:
class HeartDiseaseDataset(Dataset):
    def __init__(self, df):
        self.cat = torch.tensor(df[categorial_columns].values, dtype=torch.float)
        self.cont = torch.tensor(df[cont_columns].values, dtype=torch.float)

    def __getitem__(self, idx):
        return self.cat[idx], self.cont[idx]

    def __len__(self):
        return self.cat.shape[0]

In [6]:
dataset = HeartDiseaseDataset(df_min_max)
dataloader = DataLoader(dataset, batch_size=100, shuffle=True)
len(dataset)

797

In [7]:
class AllToCategorialAE(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(len(df_min_max.columns), 9),
            torch.nn.BatchNorm1d(9),
            torch.nn.ReLU(),
            torch.nn.Linear(9, 4),
            torch.nn.BatchNorm1d(4),
            torch.nn.Sigmoid(),
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(4, len(categorial_columns)),
            torch.nn.BatchNorm1d(len(categorial_columns)),
            torch.nn.Sigmoid()
        )


    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


epochs = 100
lr = 0.001

all_to_cat_model = AllToCategorialAE()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(all_to_cat_model.parameters(), lr=lr)

for epoch in range(epochs):
    all_to_cat_model.train()
    loss = 0

    for cat, cont in dataloader:
        optimizer.zero_grad()
        outputs = all_to_cat_model(torch.cat((cat, cont), 1))
        train_loss = criterion(outputs,  cat)
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()

    loss = loss / len(dataloader)
    print("epoch: {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch: 1/100, loss = 0.207867
epoch: 2/100, loss = 0.196106
epoch: 3/100, loss = 0.185192
epoch: 4/100, loss = 0.174876
epoch: 5/100, loss = 0.165151
epoch: 6/100, loss = 0.156160
epoch: 7/100, loss = 0.148030
epoch: 8/100, loss = 0.140446
epoch: 9/100, loss = 0.133694
epoch: 10/100, loss = 0.127757
epoch: 11/100, loss = 0.122640
epoch: 12/100, loss = 0.118013
epoch: 13/100, loss = 0.114996
epoch: 14/100, loss = 0.111006
epoch: 15/100, loss = 0.108755
epoch: 16/100, loss = 0.106082
epoch: 17/100, loss = 0.103761
epoch: 18/100, loss = 0.102000
epoch: 19/100, loss = 0.099423
epoch: 20/100, loss = 0.097690
epoch: 21/100, loss = 0.096030
epoch: 22/100, loss = 0.094571
epoch: 23/100, loss = 0.093022
epoch: 24/100, loss = 0.091855
epoch: 25/100, loss = 0.090509
epoch: 26/100, loss = 0.089028
epoch: 27/100, loss = 0.087954
epoch: 28/100, loss = 0.086970
epoch: 29/100, loss = 0.085777
epoch: 30/100, loss = 0.085383
epoch: 31/100, loss = 0.083622
epoch: 32/100, loss = 0.082818
epoch: 33/100, lo

In [8]:
cat_features = all_to_cat_model.encoder(torch.tensor(df_min_max.values, dtype=torch.float)).detach().numpy()
features = np.concatenate((cat_features, df_min_max[cont_columns].values), 1)
features

array([[0.35617447, 0.13337889, 0.29803568, ..., 0.33802817, 0.46590909,
        1.        ],
       [0.38131493, 0.25230283, 0.17939673, ..., 0.48591549, 0.59090909,
        0.66666667],
       [0.45592242, 0.64279503, 0.60783374, ..., 0.8943662 , 0.69318182,
        0.        ],
       ...,
       [0.30887136, 0.34223068, 0.719006  , ..., 0.53378594, 0.40509457,
        0.24783147],
       [0.72626001, 0.41355437, 0.7283901 , ..., 0.28169014, 0.29545455,
        0.24783147],
       [0.30457729, 0.27561513, 0.25782847, ..., 0.23239437, 0.29545455,
        0.24783147]])

In [9]:
def cluster_accuracy(y_pred, y_true):
    # We need to map the labels to our cluster labels
    # This is a linear assignment problem on a bipartite graph
    k = max(len(np.unique(y_pred)), len(np.unique(y_pred)))
    cost_matrix = np.zeros((k, k))
    for i in range(y_pred.size):
        cost_matrix[y_pred[i], y_true[i]] += 1
    row_ind, col_ind = linear_sum_assignment(cost_matrix.max() - cost_matrix)
    return cost_matrix[row_ind, col_ind].sum() / y_pred.size

In [10]:
kmeans = KMeans(n_clusters=5, n_init="auto", random_state=0).fit(features)
all_to_cat_acc = cluster_accuracy(kmeans.labels_, og_df["num"].to_numpy())
all_to_cat_acc

0.32622333751568383

In [11]:
class OnlyCategorialAE(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(len(categorial_columns), 4),
            torch.nn.BatchNorm1d(4),
            torch.nn.Sigmoid(),
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(4, len(categorial_columns)),
            torch.nn.BatchNorm1d(len(categorial_columns)),
            torch.nn.Sigmoid()
        )


    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


epochs = 100
lr = 0.001

cat_to_cat_model = OnlyCategorialAE()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(cat_to_cat_model.parameters(), lr=lr)

for epoch in range(epochs):
    cat_to_cat_model.train()
    loss = 0

    for cat, cont in dataloader:
        optimizer.zero_grad()
        outputs = cat_to_cat_model(cat)
        train_loss = criterion(outputs,  cat)
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()

    loss = loss / len(dataloader)
    print("epoch: {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch: 1/100, loss = 0.181782
epoch: 2/100, loss = 0.176070
epoch: 3/100, loss = 0.170642
epoch: 4/100, loss = 0.165406
epoch: 5/100, loss = 0.160215
epoch: 6/100, loss = 0.154988
epoch: 7/100, loss = 0.150458
epoch: 8/100, loss = 0.145371
epoch: 9/100, loss = 0.140710
epoch: 10/100, loss = 0.135677
epoch: 11/100, loss = 0.131461
epoch: 12/100, loss = 0.126702
epoch: 13/100, loss = 0.122717
epoch: 14/100, loss = 0.118969
epoch: 15/100, loss = 0.115589
epoch: 16/100, loss = 0.112219
epoch: 17/100, loss = 0.109572
epoch: 18/100, loss = 0.107365
epoch: 19/100, loss = 0.104841
epoch: 20/100, loss = 0.101919
epoch: 21/100, loss = 0.099854
epoch: 22/100, loss = 0.097579
epoch: 23/100, loss = 0.095773
epoch: 24/100, loss = 0.093674
epoch: 25/100, loss = 0.092164
epoch: 26/100, loss = 0.089863
epoch: 27/100, loss = 0.088528
epoch: 28/100, loss = 0.086744
epoch: 29/100, loss = 0.085380
epoch: 30/100, loss = 0.084247
epoch: 31/100, loss = 0.082936
epoch: 32/100, loss = 0.081474
epoch: 33/100, lo

In [12]:
cat_features = cat_to_cat_model.encoder(torch.tensor(df_min_max[categorial_columns].values, dtype=torch.float)).detach().numpy()
features = np.concatenate((cat_features, df_min_max[cont_columns].values), 1)
kmeans = KMeans(n_clusters=5, n_init="auto", random_state=0).fit(features)
cat_to_cat_acc = cluster_accuracy(kmeans.labels_, og_df["num"].to_numpy())
cat_to_cat_acc

0.3601003764115433

In [13]:
print(f"All Cols to Categorial Cols AE: {all_to_cat_acc}")
print(f"Categorial Cols to Categorial Cols AE: {cat_to_cat_acc}")

All Cols to Categorial Cols AE: 0.32622333751568383
Categorial Cols to Categorial Cols AE: 0.3601003764115433


In [14]:
df_no_min_max = og_df.copy()
df_no_min_max.drop(columns="num", inplace=True)
df_no_min_max[categorial_columns] = df_no_min_max[categorial_columns].apply(LabelEncoder().fit_transform)
df_no_min_max = df_no_min_max.fillna(df_no_min_max.mean())
df_no_min_max

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
1,67,1,0,160.000000,286.0,0,0,108.000000,1,1.500000,1,3.000000,1
2,67,1,0,120.000000,229.0,0,0,129.000000,1,2.600000,1,2.000000,2
3,37,1,2,130.000000,250.0,0,1,187.000000,0,3.500000,0,0.000000,1
4,41,0,1,130.000000,204.0,0,0,172.000000,0,1.400000,2,0.000000,1
5,56,1,1,120.000000,236.0,0,1,178.000000,0,0.800000,2,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
914,46,1,0,134.000000,310.0,0,1,126.000000,0,0.000000,3,0.743494,1
915,54,0,0,127.000000,333.0,1,2,154.000000,0,0.000000,3,0.743494,3
916,62,1,3,132.299866,139.0,0,2,135.797603,2,0.964832,3,0.743494,3
917,55,1,0,122.000000,223.0,1,2,100.000000,0,0.000000,3,0.743494,0


In [15]:
no_min_max_dataset = HeartDiseaseDataset(df_no_min_max)
no_min_max_dataloader = DataLoader(no_min_max_dataset, batch_size=100, shuffle=True)
len(dataset)

797

In [16]:
epochs = 100
lr = 0.001

no_min_max_cat_to_cat_model = OnlyCategorialAE()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(no_min_max_cat_to_cat_model.parameters(), lr=lr)

for epoch in range(epochs):
    no_min_max_cat_to_cat_model.train()
    loss = 0

    for cat, cont in no_min_max_dataloader:
        optimizer.zero_grad()
        outputs = no_min_max_cat_to_cat_model(cat)
        train_loss = criterion(outputs,  cat)
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()

    loss = loss / len(dataloader)
    print("epoch: {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch: 1/100, loss = 1.323721
epoch: 2/100, loss = 1.313531
epoch: 3/100, loss = 1.302672
epoch: 4/100, loss = 1.291262
epoch: 5/100, loss = 1.278423
epoch: 6/100, loss = 1.266150
epoch: 7/100, loss = 1.251593
epoch: 8/100, loss = 1.235791
epoch: 9/100, loss = 1.218073
epoch: 10/100, loss = 1.199752
epoch: 11/100, loss = 1.183890
epoch: 12/100, loss = 1.170153
epoch: 13/100, loss = 1.160314
epoch: 14/100, loss = 1.150356
epoch: 15/100, loss = 1.141636
epoch: 16/100, loss = 1.132371
epoch: 17/100, loss = 1.123561
epoch: 18/100, loss = 1.114459
epoch: 19/100, loss = 1.105261
epoch: 20/100, loss = 1.098103
epoch: 21/100, loss = 1.091650
epoch: 22/100, loss = 1.085226
epoch: 23/100, loss = 1.080605
epoch: 24/100, loss = 1.074548
epoch: 25/100, loss = 1.069233
epoch: 26/100, loss = 1.063841
epoch: 27/100, loss = 1.059591
epoch: 28/100, loss = 1.054830
epoch: 29/100, loss = 1.049923
epoch: 30/100, loss = 1.044210
epoch: 31/100, loss = 1.039806
epoch: 32/100, loss = 1.035593
epoch: 33/100, lo

In [17]:
cat_features = no_min_max_cat_to_cat_model.encoder(torch.tensor(df_no_min_max[categorial_columns].values, dtype=torch.float)).detach().numpy()
features = np.concatenate((cat_features, df_no_min_max[cont_columns].values), 1)
features

array([[  0.57635373,   0.238298  ,   0.58382416, ..., 108.        ,
          1.5       ,   3.        ],
       [  0.58219784,   0.32884163,   0.72077841, ..., 129.        ,
          2.6       ,   2.        ],
       [  0.27503547,   0.27279717,   0.39024404, ..., 187.        ,
          3.5       ,   0.        ],
       ...,
       [  0.54734182,   0.7800532 ,   0.50838709, ..., 135.7976032 ,
          0.96483221,   0.74349442],
       [  0.635939  ,   0.79851639,   0.28082404, ..., 100.        ,
          0.        ,   0.74349442],
       [  0.46205655,   0.63187206,   0.62006289, ...,  93.        ,
          0.        ,   0.74349442]])

In [18]:
kmeans = KMeans(n_clusters=5, n_init="auto", random_state=0).fit(features)
no_min_max_acc = cluster_accuracy(kmeans.labels_, og_df["num"].to_numpy())
no_min_max_acc

0.3462986198243413