In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
from scipy.optimize import linear_sum_assignment

In [2]:
def cluster_accuracy(y_pred, y_true):
    # We need to map the labels to our cluster labels
    # This is a linear assignment problem on a bipartite graph
    k = max(len(np.unique(y_pred)), len(np.unique(y_pred)))
    cost_matrix = np.zeros((k, k))
    for i in range(y_pred.size):
        cost_matrix[y_pred[i], y_true[i]] += 1
    row_ind, col_ind = linear_sum_assignment(cost_matrix.max() - cost_matrix)
    return cost_matrix[row_ind, col_ind].sum() / y_pred.size

In [3]:
og_df = pd.read_csv("datasets/heart_disease_uci.csv")
og_df.drop(columns=["id", "dataset"], inplace=True)
og_df = og_df.drop(og_df[og_df["num"] == 0].sample(frac=0.3).index)
og_df # this df still has "num" -> the target

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
1,67,Male,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,67,Male,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
6,62,Female,asymptomatic,140.0,268.0,False,lv hypertrophy,160.0,False,3.6,downsloping,2.0,normal,3
8,63,Male,asymptomatic,130.0,254.0,False,lv hypertrophy,147.0,False,1.4,flat,1.0,reversable defect,2
9,53,Male,asymptomatic,140.0,203.0,True,lv hypertrophy,155.0,True,3.1,downsloping,0.0,reversable defect,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,54,Female,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,62,Male,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,55,Male,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2
918,58,Male,asymptomatic,,385.0,True,lv hypertrophy,,,,,,,0


In [4]:
categorial_columns = ["sex", "cp", "fbs", "restecg", "exang", "slope", "thal", "ca"]
cont_columns = ["age", "trestbps", "chol", "thalch", "oldpeak"]

In [5]:
df_min_max = og_df.copy()
df_min_max.drop(columns="num", inplace=True)
df_min_max[categorial_columns] = df_min_max[categorial_columns].apply(LabelEncoder().fit_transform)
df_min_max[categorial_columns] = MinMaxScaler().fit_transform(df_min_max[categorial_columns])
df_min_max[cont_columns] = MinMaxScaler().fit_transform(df_min_max[cont_columns])
df_min_max = df_min_max.fillna(df_min_max.mean())
df_min_max

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
1,0.795918,1.0,0.000000,0.800000,0.474295,0.0,0.000000,0.355556,0.5,0.465909,0.333333,0.75,0.333333
2,0.795918,1.0,0.000000,0.600000,0.379768,0.0,0.000000,0.511111,0.5,0.590909,0.333333,0.50,0.666667
6,0.693878,0.0,0.000000,0.700000,0.444444,0.0,0.000000,0.740741,0.0,0.704545,0.000000,0.50,0.333333
8,0.714286,1.0,0.000000,0.650000,0.421227,0.0,0.000000,0.644444,0.0,0.454545,0.333333,0.25,0.666667
9,0.510204,1.0,0.000000,0.700000,0.336650,0.5,0.000000,0.703704,0.5,0.647727,0.000000,0.00,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,0.530612,0.0,0.000000,0.635000,0.552239,0.5,0.666667,0.696296,0.0,0.295455,1.000000,1.00,1.000000
916,0.693878,1.0,1.000000,0.663919,0.230514,0.0,0.666667,0.559070,1.0,0.402362,1.000000,1.00,1.000000
917,0.551020,1.0,0.000000,0.610000,0.369818,0.5,0.666667,0.296296,0.0,0.295455,1.000000,1.00,0.000000
918,0.612245,1.0,0.000000,0.663919,0.638474,0.5,0.000000,0.559070,1.0,0.402362,1.000000,1.00,1.000000


In [6]:
df_no_min_max = og_df.copy()
df_no_min_max.drop(columns="num", inplace=True)
df_no_min_max[categorial_columns] = df_no_min_max[categorial_columns].apply(LabelEncoder().fit_transform)
df_no_min_max = df_no_min_max.fillna(df_no_min_max.mean())
df_no_min_max

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
1,67,1,0,160.000000,286.0,0,0,108.000000,1,1.500000,1,3,1
2,67,1,0,120.000000,229.0,0,0,129.000000,1,2.600000,1,2,2
6,62,0,0,140.000000,268.0,0,0,160.000000,0,3.600000,0,2,1
8,63,1,0,130.000000,254.0,0,0,147.000000,0,1.400000,1,1,2
9,53,1,0,140.000000,203.0,1,0,155.000000,1,3.100000,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,54,0,0,127.000000,333.0,1,2,154.000000,0,0.000000,3,4,3
916,62,1,3,132.783784,139.0,0,2,135.474462,2,0.940786,3,4,3
917,55,1,0,122.000000,223.0,1,2,100.000000,0,0.000000,3,4,0
918,58,1,0,132.783784,385.0,1,0,135.474462,2,0.940786,3,4,3


In [7]:
# Use MinMax scaled continous cols but unscaled categorial cols
df = pd.concat((df_no_min_max[categorial_columns], df_min_max[cont_columns]), axis=1)
df

Unnamed: 0,sex,cp,fbs,restecg,exang,slope,thal,ca,age,trestbps,chol,thalch,oldpeak
1,1,0,0,0,1,1,1,3,0.795918,0.800000,0.474295,0.355556,0.465909
2,1,0,0,0,1,1,2,2,0.795918,0.600000,0.379768,0.511111,0.590909
6,0,0,0,0,0,0,1,2,0.693878,0.700000,0.444444,0.740741,0.704545
8,1,0,0,0,0,1,2,1,0.714286,0.650000,0.421227,0.644444,0.454545
9,1,0,1,0,1,0,2,0,0.510204,0.700000,0.336650,0.703704,0.647727
...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,0,0,1,2,0,3,3,4,0.530612,0.635000,0.552239,0.696296,0.295455
916,1,3,0,2,2,3,3,4,0.693878,0.663919,0.230514,0.559070,0.402362
917,1,0,1,2,0,3,0,4,0.551020,0.610000,0.369818,0.296296,0.295455
918,1,0,1,0,2,3,3,4,0.612245,0.663919,0.638474,0.559070,0.402362


In [8]:
class HeartDiseaseDataset(Dataset):
    def __init__(self, df):
        self.cat = torch.tensor(df[categorial_columns].values, dtype=torch.float)
        self.cont = torch.tensor(df[cont_columns].values, dtype=torch.float)

    def __getitem__(self, idx):
        return self.cat[idx], self.cont[idx]

    def __len__(self):
        return self.cat.shape[0]

In [9]:
dataset = HeartDiseaseDataset(df)
dataloader = DataLoader(dataset, batch_size=50, shuffle=True)
len(dataset)

797

In [10]:
embedding_sizes = [(df_no_min_max[col].nunique(), min(50, max(2, (df_no_min_max[col].nunique()+1) // 2))) for col in df_no_min_max[categorial_columns]]
embedding_sizes

[(2, 2), (4, 2), (3, 2), (4, 2), (3, 2), (4, 2), (4, 2), (5, 3)]

In [11]:
class AttentionModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(num, dim) for num, dim in embedding_sizes])
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(22, 12),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(12),
            torch.nn.Linear(12, 4),
            torch.nn.Sigmoid(),
            torch.nn.BatchNorm1d(4),
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(4, 12),
            torch.nn.ReLU(),
            torch.nn.Linear(12, 22),
            torch.nn.Sigmoid()
        )


    def encode(self, x_cat, x_cont):
        x_cat = x_cat.to(torch.long)
        embedded = torch.cat([e(x_cat[:, i]) for i, e in enumerate(self.embeddings)], 1)
        self.last_target = embedded.clone().detach()

        qkv = torch.cat((embedded, x_cont), 1)
        x = F.scaled_dot_product_attention(qkv, qkv, qkv)
        encoded = self.encoder(x)
        return encoded

    def forward(self, x_cat, x_cont):
        encoded = self.encode(x_cat, x_cont)
        decoded = self.decoder(encoded)
        return decoded


epochs = 100
lr = 0.001

model = AttentionModel()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

for epoch in range(epochs):
    model.train()
    loss = 0

    for x_cat, x_cont in dataloader:
        optimizer.zero_grad()
        outputs = model(x_cat, x_cont)
        train_loss = criterion(outputs, torch.cat((model.last_target, x_cont), 1))
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()

    loss = loss / len(dataloader)
    print("epoch: {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch: 1/100, loss = 0.797030
epoch: 2/100, loss = 0.777495
epoch: 3/100, loss = 0.760863
epoch: 4/100, loss = 0.742814
epoch: 5/100, loss = 0.721972
epoch: 6/100, loss = 0.698894
epoch: 7/100, loss = 0.673418
epoch: 8/100, loss = 0.644657
epoch: 9/100, loss = 0.614785
epoch: 10/100, loss = 0.584583
epoch: 11/100, loss = 0.554904
epoch: 12/100, loss = 0.531048
epoch: 13/100, loss = 0.510345
epoch: 14/100, loss = 0.491613
epoch: 15/100, loss = 0.477768
epoch: 16/100, loss = 0.466784
epoch: 17/100, loss = 0.457828
epoch: 18/100, loss = 0.451883
epoch: 19/100, loss = 0.446945
epoch: 20/100, loss = 0.441357
epoch: 21/100, loss = 0.436575
epoch: 22/100, loss = 0.432909
epoch: 23/100, loss = 0.430889
epoch: 24/100, loss = 0.426495
epoch: 25/100, loss = 0.425599
epoch: 26/100, loss = 0.424437
epoch: 27/100, loss = 0.420311
epoch: 28/100, loss = 0.418937
epoch: 29/100, loss = 0.415989
epoch: 30/100, loss = 0.415078
epoch: 31/100, loss = 0.413930
epoch: 32/100, loss = 0.411772
epoch: 33/100, lo

In [12]:
cat = torch.tensor(df[categorial_columns].values, dtype=torch.float)
cont = torch.tensor(df[cont_columns].values, dtype=torch.float)
features = model.encode(cat, cont).detach().numpy()
features

array([[-1.5637932 , -1.6812193 ,  1.6927414 , -0.03613377],
       [ 0.72901726, -3.309219  , -0.14864254, -0.93930626],
       [-0.49878883, -1.54004   ,  1.6418772 , -0.6683006 ],
       ...,
       [ 1.7394829 , -1.1579192 ,  0.393013  ,  0.8029375 ],
       [ 2.7350292 , -1.4279191 , -1.4736547 ,  0.46297073],
       [ 1.7872543 , -1.4044383 , -2.410636  , -1.3359184 ]],
      dtype=float32)

In [13]:
kmeans = KMeans(n_clusters=5, n_init="auto", random_state=0).fit(features)
cluster_accuracy(kmeans.labels_, og_df["num"].to_numpy())

0.35382685069008785

In [14]:
normalized_mutual_info_score(og_df["num"].to_numpy(), kmeans.labels_)

0.1002064205143976