In [105]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
from scipy.optimize import linear_sum_assignment

In [106]:
def cluster_accuracy(y_pred, y_true):
    # We need to map the labels to our cluster labels
    # This is a linear assignment problem on a bipartite graph
    k = max(len(np.unique(y_pred)), len(np.unique(y_pred)))
    cost_matrix = np.zeros((k, k))
    for i in range(y_pred.size):
        cost_matrix[y_pred[i], y_true[i]] += 1
    row_ind, col_ind = linear_sum_assignment(cost_matrix.max() - cost_matrix)
    return cost_matrix[row_ind, col_ind].sum() / y_pred.size

In [107]:
og_df = pd.read_csv("datasets/heart_disease_uci.csv")
og_df.drop(columns=["id", "dataset"], inplace=True)
og_df = og_df.drop(og_df[og_df["num"] == 0].sample(frac=0.3).index)
og_df # this df still has "num" -> the target

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
1,67,Male,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,67,Male,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,37,Male,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
5,56,Male,atypical angina,120.0,236.0,False,normal,178.0,False,0.8,upsloping,0.0,normal,0
6,62,Female,asymptomatic,140.0,268.0,False,lv hypertrophy,160.0,False,3.6,downsloping,2.0,normal,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,54,Female,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,62,Male,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,55,Male,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2
918,58,Male,asymptomatic,,385.0,True,lv hypertrophy,,,,,,,0


In [108]:
categorial_columns = ["sex", "cp", "fbs", "restecg", "exang", "slope", "thal"]
cont_columns = ["age", "trestbps", "chol", "thalch", "oldpeak", "ca"]

In [109]:
df_min_max = og_df.copy()
df_min_max.drop(columns="num", inplace=True)
df_min_max[categorial_columns] = df_min_max[categorial_columns].apply(LabelEncoder().fit_transform)
df_min_max[categorial_columns] = MinMaxScaler().fit_transform(df_min_max[categorial_columns])
df_min_max[cont_columns] = MinMaxScaler().fit_transform(df_min_max[cont_columns])
df_min_max = df_min_max.fillna(df_min_max.mean())
df_min_max

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
1,0.795918,1.0,0.000000,0.800000,0.474295,0.0,0.000000,0.338028,0.5,0.465909,0.333333,1.000000,0.333333
2,0.795918,1.0,0.000000,0.600000,0.379768,0.0,0.000000,0.485915,0.5,0.590909,0.333333,0.666667,0.666667
3,0.183673,1.0,0.666667,0.650000,0.414594,0.0,0.333333,0.894366,0.0,0.693182,0.000000,0.000000,0.333333
5,0.571429,1.0,0.333333,0.600000,0.391376,0.0,0.333333,0.830986,0.0,0.386364,0.666667,0.000000,0.333333
6,0.693878,0.0,0.000000,0.700000,0.444444,0.0,0.000000,0.704225,0.0,0.704545,0.000000,0.666667,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,0.530612,0.0,0.000000,0.635000,0.552239,0.5,0.666667,0.661972,0.0,0.295455,1.000000,0.244613,1.000000
916,0.693878,1.0,1.000000,0.661537,0.230514,0.0,0.666667,0.534600,1.0,0.402460,1.000000,0.244613,1.000000
917,0.551020,1.0,0.000000,0.610000,0.369818,0.5,0.666667,0.281690,0.0,0.295455,1.000000,0.244613,0.000000
918,0.612245,1.0,0.000000,0.661537,0.638474,0.5,0.000000,0.534600,1.0,0.402460,1.000000,0.244613,1.000000


In [110]:
df_no_min_max = og_df.copy()
df_no_min_max.drop(columns="num", inplace=True)
df_no_min_max[categorial_columns] = df_no_min_max[categorial_columns].apply(LabelEncoder().fit_transform)
df_no_min_max = df_no_min_max.fillna(df_no_min_max.mean())
df_no_min_max

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
1,67,1,0,160.000000,286.0,0,0,108.000000,1,1.500000,1,3.00000,1
2,67,1,0,120.000000,229.0,0,0,129.000000,1,2.600000,1,2.00000,2
3,37,1,2,130.000000,250.0,0,1,187.000000,0,3.500000,0,0.00000,1
5,56,1,1,120.000000,236.0,0,1,178.000000,0,0.800000,2,0.00000,1
6,62,0,0,140.000000,268.0,0,0,160.000000,0,3.600000,0,2.00000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,54,0,0,127.000000,333.0,1,2,154.000000,0,0.000000,3,0.73384,3
916,62,1,3,132.307383,139.0,0,2,135.913218,2,0.941644,3,0.73384,3
917,55,1,0,122.000000,223.0,1,2,100.000000,0,0.000000,3,0.73384,0
918,58,1,0,132.307383,385.0,1,0,135.913218,2,0.941644,3,0.73384,3


In [111]:
class HeartDiseaseDataset(Dataset):
    def __init__(self, df):
        self.cat = torch.tensor(df[categorial_columns].values, dtype=torch.float)
        self.cont = torch.tensor(df[cont_columns].values, dtype=torch.float)

    def __getitem__(self, idx):
        return self.cat[idx], self.cont[idx]

    def __len__(self):
        return self.cat.shape[0]

In [112]:
dataset = HeartDiseaseDataset(df_no_min_max)
dataloader = DataLoader(dataset, batch_size=50, shuffle=True)
len(dataset)

797

In [113]:
embedding_sizes = [(df_no_min_max[col].nunique(), min(50, max(2, (df_no_min_max[col].nunique()+1) // 2))) for col in df_no_min_max[categorial_columns]]
embedding_sizes

[(2, 2), (4, 2), (3, 2), (4, 2), (3, 2), (4, 2), (4, 2)]

In [114]:
class AttentionModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(num, dim) for num, dim in embedding_sizes])
        n_emb = sum(e.embedding_dim for e in self.embeddings)
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(n_emb, 10),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(10),
            torch.nn.Linear(10, 4),
            torch.nn.Sigmoid(),
            torch.nn.BatchNorm1d(4),
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(4, 10),
            torch.nn.ReLU(),
            torch.nn.Linear(10, n_emb),
            torch.nn.Sigmoid()
        )

    def _embed(self, x_cat):
        x_cat = x_cat.to(torch.long)
        x = [e(x_cat[:, i]) for i, e in enumerate(self.embeddings)]
        x = torch.cat(x, 1)
        self.last_target = x.clone().detach()
        return x

    def encode(self, x_cat):
        embedded = self._embed(x_cat)
        x = F.scaled_dot_product_attention(embedded, embedded, embedded)
        encoded = self.encoder(x)
        return encoded

    def forward(self, x_cat):
        encoded = self.encode(x_cat)
        decoded = self.decoder(encoded)
        return decoded


epochs = 100
lr = 0.001

model = AttentionModel()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

for epoch in range(epochs):
    model.train()
    loss = 0

    for x_cat, x_cont in dataloader:
        optimizer.zero_grad()
        outputs = model(x_cat)
        train_loss = criterion(outputs, model.last_target)
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()

    loss = loss / len(dataloader)
    print("epoch: {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch: 1/100, loss = 1.412345
epoch: 2/100, loss = 1.383591
epoch: 3/100, loss = 1.354771
epoch: 4/100, loss = 1.322858
epoch: 5/100, loss = 1.290178
epoch: 6/100, loss = 1.248358
epoch: 7/100, loss = 1.206301
epoch: 8/100, loss = 1.161156
epoch: 9/100, loss = 1.113331
epoch: 10/100, loss = 1.068228
epoch: 11/100, loss = 1.019751
epoch: 12/100, loss = 0.981947
epoch: 13/100, loss = 0.943967
epoch: 14/100, loss = 0.916913
epoch: 15/100, loss = 0.892518
epoch: 16/100, loss = 0.875607
epoch: 17/100, loss = 0.863432
epoch: 18/100, loss = 0.853090
epoch: 19/100, loss = 0.841149
epoch: 20/100, loss = 0.831749
epoch: 21/100, loss = 0.827493
epoch: 22/100, loss = 0.821125
epoch: 23/100, loss = 0.817037
epoch: 24/100, loss = 0.812074
epoch: 25/100, loss = 0.809025
epoch: 26/100, loss = 0.805565
epoch: 27/100, loss = 0.803348
epoch: 28/100, loss = 0.800373
epoch: 29/100, loss = 0.795954
epoch: 30/100, loss = 0.793020
epoch: 31/100, loss = 0.792297
epoch: 32/100, loss = 0.789918
epoch: 33/100, lo

In [115]:
cat_features = model.encode(torch.tensor(df_no_min_max[categorial_columns].values, dtype=torch.float)).detach().numpy()
features = np.concatenate((cat_features, df_min_max[cont_columns].values), 1)
features

array([[-1.9597683 ,  3.35041714, -1.38300228, ...,  0.33802817,
         0.46590909,  1.        ],
       [ 0.07113981,  1.15076637, -2.48257351, ...,  0.48591549,
         0.59090909,  0.66666667],
       [-1.62013996,  2.52418423,  3.25058198, ...,  0.8943662 ,
         0.69318182,  0.        ],
       ...,
       [-1.1900239 ,  2.65318871,  1.204849  , ...,  0.28169014,
         0.29545455,  0.24461343],
       [ 1.48048615, -0.55837202, -0.92211556, ...,  0.53460012,
         0.40245957,  0.24461343],
       [ 0.62465882,  0.70401144, -1.90707278, ...,  0.23239437,
         0.29545455,  0.24461343]])

In [116]:
kmeans = KMeans(n_clusters=5, n_init="auto", random_state=0).fit(features)

acc = cluster_accuracy(kmeans.labels_, og_df["num"].to_numpy())
acc

0.39146800501882056

In [117]:
nmi = normalized_mutual_info_score(og_df["num"].to_numpy(), kmeans.labels_)
nmi

0.08879818589706295