In [198]:
import gower
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from scipy.optimize import linear_sum_assignment

In [199]:
og_df = pd.read_csv("datasets/heart_disease_uci.csv")
og_df.drop(columns=["id", "dataset"], inplace=True)
og_df = og_df.drop(og_df[og_df["num"] == 0].sample(frac=0.3).index)
og_df # this df still has "num" -> the target

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,63,Male,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,67,Male,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,67,Male,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,37,Male,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
5,56,Male,atypical angina,120.0,236.0,False,normal,178.0,False,0.8,upsloping,0.0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
914,46,Male,asymptomatic,134.0,310.0,False,normal,126.0,False,0.0,,,normal,2
915,54,Female,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
917,55,Male,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2
918,58,Male,asymptomatic,,385.0,True,lv hypertrophy,,,,,,,0


In [200]:
categorial_columns = ["sex", "cp", "fbs", "restecg", "exang", "slope", "thal"]
cont_columns = ["age", "trestbps", "chol", "thalch", "oldpeak", "ca"]

In [201]:
df = og_df.copy()
df.drop(columns="num", inplace=True)
df[categorial_columns] = df[categorial_columns].apply(LabelEncoder().fit_transform)
df[categorial_columns] = MinMaxScaler().fit_transform(df[categorial_columns])

df[cont_columns] = MinMaxScaler().fit_transform(df[cont_columns])
df = df.fillna(df.mean())
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
0,0.714286,1.0,1.000000,0.725000,0.386401,0.5,0.000000,0.633803,0.0,0.556818,0.000000,0.000000,0.000000
1,0.795918,1.0,0.000000,0.800000,0.474295,0.0,0.000000,0.338028,0.5,0.465909,0.333333,1.000000,0.333333
2,0.795918,1.0,0.000000,0.600000,0.379768,0.0,0.000000,0.485915,0.5,0.590909,0.333333,0.666667,0.666667
3,0.183673,1.0,0.666667,0.650000,0.414594,0.0,0.333333,0.894366,0.0,0.693182,0.000000,0.000000,0.333333
5,0.571429,1.0,0.333333,0.600000,0.391376,0.0,0.333333,0.830986,0.0,0.386364,0.666667,0.000000,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...
914,0.367347,1.0,0.000000,0.670000,0.514096,0.0,0.333333,0.464789,0.0,0.295455,1.000000,0.248737,0.333333
915,0.530612,0.0,0.000000,0.635000,0.552239,0.5,0.666667,0.661972,0.0,0.295455,1.000000,0.248737,1.000000
917,0.551020,1.0,0.000000,0.610000,0.369818,0.5,0.666667,0.281690,0.0,0.295455,1.000000,0.248737,0.000000
918,0.612245,1.0,0.000000,0.662949,0.638474,0.5,0.000000,0.537061,1.0,0.401989,1.000000,0.248737,1.000000


In [202]:
class HeartDiseaseDataset(Dataset):
    def __init__(self, df):
        self.cat = torch.tensor(df[categorial_columns].values, dtype=torch.float)
        self.cont = torch.tensor(df[cont_columns].values, dtype=torch.float)

    def __getitem__(self, idx):
        return self.cat[idx], self.cont[idx]

    def __len__(self):
        return self.cat.shape[0]

In [203]:
dataset = HeartDiseaseDataset(df)
dataloader = DataLoader(dataset, batch_size=100, shuffle=True)
len(dataset)

797

In [204]:
class AllToCategorialAE(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(len(df.columns), 9),
            torch.nn.BatchNorm1d(9),
            torch.nn.ReLU(),
            torch.nn.Linear(9, 4),
            torch.nn.BatchNorm1d(4),
            torch.nn.Sigmoid(),
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(4, len(categorial_columns)),
            torch.nn.BatchNorm1d(len(categorial_columns)),
            torch.nn.Sigmoid()
        )


    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


epochs = 100
lr = 0.001

all_to_cat_model = AllToCategorialAE()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(all_to_cat_model.parameters(), lr=lr)

for epoch in range(epochs):
    all_to_cat_model.train()
    loss = 0

    for cat, cont in dataloader:
        optimizer.zero_grad()
        outputs = all_to_cat_model(torch.cat((cat, cont), 1))
        train_loss = criterion(outputs,  cat)
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()

    loss = loss / len(dataloader)
    print("epoch: {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch: 1/100, loss = 0.186713
epoch: 2/100, loss = 0.176970
epoch: 3/100, loss = 0.167061
epoch: 4/100, loss = 0.158809
epoch: 5/100, loss = 0.151612
epoch: 6/100, loss = 0.143855
epoch: 7/100, loss = 0.137120
epoch: 8/100, loss = 0.131828
epoch: 9/100, loss = 0.126819
epoch: 10/100, loss = 0.122186
epoch: 11/100, loss = 0.118201
epoch: 12/100, loss = 0.114584
epoch: 13/100, loss = 0.111063
epoch: 14/100, loss = 0.108626
epoch: 15/100, loss = 0.105471
epoch: 16/100, loss = 0.102871
epoch: 17/100, loss = 0.100204
epoch: 18/100, loss = 0.098075
epoch: 19/100, loss = 0.096051
epoch: 20/100, loss = 0.094764
epoch: 21/100, loss = 0.092822
epoch: 22/100, loss = 0.090675
epoch: 23/100, loss = 0.089213
epoch: 24/100, loss = 0.087834
epoch: 25/100, loss = 0.086339
epoch: 26/100, loss = 0.085115
epoch: 27/100, loss = 0.083893
epoch: 28/100, loss = 0.082686
epoch: 29/100, loss = 0.081733
epoch: 30/100, loss = 0.080771
epoch: 31/100, loss = 0.079553
epoch: 32/100, loss = 0.079164
epoch: 33/100, lo

In [205]:
cat_features = all_to_cat_model.encoder(torch.tensor(df.values, dtype=torch.float)).detach().numpy()
features = np.concatenate((cat_features, df[cont_columns].values), 1)
features

array([[0.29795331, 0.49451137, 0.45338643, ..., 0.63380282, 0.55681818,
        0.        ],
       [0.75777566, 0.22989401, 0.55975157, ..., 0.33802817, 0.46590909,
        1.        ],
       [0.73622459, 0.21030919, 0.56186223, ..., 0.48591549, 0.59090909,
        0.66666667],
       ...,
       [0.66906512, 0.52333385, 0.42955545, ..., 0.28169014, 0.29545455,
        0.24873737],
       [0.36839759, 0.74466068, 0.21922338, ..., 0.53706103, 0.40198864,
        0.24873737],
       [0.65290588, 0.33600339, 0.52168715, ..., 0.23239437, 0.29545455,
        0.24873737]])

In [206]:
def cluster_accuracy(y_pred, y_true):
    # We need to map the labels to our cluster labels
    # This is a linear assignment problem on a bipartite graph
    k = max(len(np.unique(y_pred)), len(np.unique(y_pred)))
    cost_matrix = np.zeros((k, k))
    for i in range(y_pred.size):
        cost_matrix[y_pred[i], y_true[i]] += 1
    row_ind, col_ind = linear_sum_assignment(cost_matrix.max() - cost_matrix)
    return cost_matrix[row_ind, col_ind].sum() / y_pred.size

In [207]:
kmeans = KMeans(n_clusters=5, n_init="auto", random_state=0).fit(features)
all_to_cat_acc = cluster_accuracy(kmeans.labels_, og_df["num"].to_numpy())
all_to_cat_acc

0.36511919698870765

In [208]:
class OnlyCategorialAE(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(len(categorial_columns), 4),
            torch.nn.BatchNorm1d(4),
            torch.nn.Sigmoid(),
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(4, len(categorial_columns)),
            torch.nn.BatchNorm1d(len(categorial_columns)),
            torch.nn.Sigmoid()
        )


    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


epochs = 100
lr = 0.001

cat_to_cat_model = OnlyCategorialAE()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(cat_to_cat_model.parameters(), lr=lr)

for epoch in range(epochs):
    cat_to_cat_model.train()
    loss = 0

    for cat, cont in dataloader:
        optimizer.zero_grad()
        outputs = cat_to_cat_model(cat)
        train_loss = criterion(outputs,  cat)
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()

    loss = loss / len(dataloader)
    print("epoch: {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch: 1/100, loss = 0.182464
epoch: 2/100, loss = 0.175826
epoch: 3/100, loss = 0.169045
epoch: 4/100, loss = 0.163160
epoch: 5/100, loss = 0.157447
epoch: 6/100, loss = 0.152628
epoch: 7/100, loss = 0.147763
epoch: 8/100, loss = 0.143402
epoch: 9/100, loss = 0.139230
epoch: 10/100, loss = 0.135305
epoch: 11/100, loss = 0.131502
epoch: 12/100, loss = 0.128396
epoch: 13/100, loss = 0.125053
epoch: 14/100, loss = 0.121428
epoch: 15/100, loss = 0.118494
epoch: 16/100, loss = 0.115835
epoch: 17/100, loss = 0.113186
epoch: 18/100, loss = 0.110622
epoch: 19/100, loss = 0.107821
epoch: 20/100, loss = 0.105647
epoch: 21/100, loss = 0.103144
epoch: 22/100, loss = 0.101193
epoch: 23/100, loss = 0.099079
epoch: 24/100, loss = 0.096911
epoch: 25/100, loss = 0.094654
epoch: 26/100, loss = 0.093247
epoch: 27/100, loss = 0.091050
epoch: 28/100, loss = 0.089972
epoch: 29/100, loss = 0.088488
epoch: 30/100, loss = 0.087027
epoch: 31/100, loss = 0.085404
epoch: 32/100, loss = 0.084369
epoch: 33/100, lo

In [209]:
cat_features = cat_to_cat_model.encoder(torch.tensor(df[categorial_columns].values, dtype=torch.float)).detach().numpy()
features = np.concatenate((cat_features, df[cont_columns].values), 1)
kmeans = KMeans(n_clusters=5, n_init="auto", random_state=0).fit(features)
cat_to_cat_acc = cluster_accuracy(kmeans.labels_, og_df["num"].to_numpy())
cat_to_cat_acc

0.38143036386449186

In [210]:
print(f"All Cols to Categorial Cols AE: {all_to_cat_acc}")
print(f"Categorial Cols to Categorial Cols AE: {cat_to_cat_acc}")

All Cols to Categorial Cols AE: 0.36511919698870765
Categorial Cols to Categorial Cols AE: 0.38143036386449186
