In [121]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
from scipy.optimize import linear_sum_assignment

In [122]:
def cluster_accuracy(y_pred, y_true):
    # We need to map the labels to our cluster labels
    # This is a linear assignment problem on a bipartite graph
    k = max(len(np.unique(y_pred)), len(np.unique(y_pred)))
    cost_matrix = np.zeros((k, k))
    for i in range(y_pred.size):
        cost_matrix[y_pred[i], y_true[i]] += 1
    row_ind, col_ind = linear_sum_assignment(cost_matrix.max() - cost_matrix)
    return cost_matrix[row_ind, col_ind].sum() / y_pred.size

In [123]:
og_df = pd.read_csv("datasets/heart_disease_uci.csv")
og_df.drop(columns=["id", "dataset"], inplace=True)
og_df = og_df.drop(og_df[og_df["num"] == 0].sample(frac=0.3).index)
og_df # this df still has "num" -> the target

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,63,Male,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,67,Male,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,67,Male,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,37,Male,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,41,Female,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,54,Female,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,62,Male,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,55,Male,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2
918,58,Male,asymptomatic,,385.0,True,lv hypertrophy,,,,,,,0


In [124]:
categorial_columns = ["sex", "cp", "fbs", "restecg", "exang", "slope", "thal", "ca"]
cont_columns = ["age", "trestbps", "chol", "thalch", "oldpeak"]

In [125]:
df_min_max = og_df.copy()
df_min_max.drop(columns="num", inplace=True)
df_min_max[categorial_columns] = df_min_max[categorial_columns].apply(LabelEncoder().fit_transform)
df_min_max[categorial_columns] = MinMaxScaler().fit_transform(df_min_max[categorial_columns])
df_min_max[cont_columns] = MinMaxScaler().fit_transform(df_min_max[cont_columns])
df_min_max = df_min_max.fillna(df_min_max.mean())
df_min_max

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
0,0.714286,1.0,1.000000,0.725000,0.386401,0.5,0.000000,0.633803,0.0,0.556818,0.000000,0.00,0.000000
1,0.795918,1.0,0.000000,0.800000,0.474295,0.0,0.000000,0.338028,0.5,0.465909,0.333333,0.75,0.333333
2,0.795918,1.0,0.000000,0.600000,0.379768,0.0,0.000000,0.485915,0.5,0.590909,0.333333,0.50,0.666667
3,0.183673,1.0,0.666667,0.650000,0.414594,0.0,0.333333,0.894366,0.0,0.693182,0.000000,0.00,0.333333
4,0.265306,0.0,0.333333,0.650000,0.338308,0.0,0.000000,0.788732,0.0,0.454545,0.666667,0.00,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,0.530612,0.0,0.000000,0.635000,0.552239,0.5,0.666667,0.661972,0.0,0.295455,1.000000,1.00,1.000000
916,0.693878,1.0,1.000000,0.661676,0.230514,0.0,0.666667,0.533552,1.0,0.403892,1.000000,1.00,1.000000
917,0.551020,1.0,0.000000,0.610000,0.369818,0.5,0.666667,0.281690,0.0,0.295455,1.000000,1.00,0.000000
918,0.612245,1.0,0.000000,0.661676,0.638474,0.5,0.000000,0.533552,1.0,0.403892,1.000000,1.00,1.000000


In [126]:
df_no_min_max = og_df.copy()
df_no_min_max.drop(columns="num", inplace=True)
df_no_min_max[categorial_columns] = df_no_min_max[categorial_columns].apply(LabelEncoder().fit_transform)
df_no_min_max = df_no_min_max.fillna(df_no_min_max.mean())
df_no_min_max

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
0,63,1,3,145.000000,233.0,1,0,150.000000,0,2.300000,0,0,0
1,67,1,0,160.000000,286.0,0,0,108.000000,1,1.500000,1,3,1
2,67,1,0,120.000000,229.0,0,0,129.000000,1,2.600000,1,2,2
3,37,1,2,130.000000,250.0,0,1,187.000000,0,3.500000,0,0,1
4,41,0,1,130.000000,204.0,0,0,172.000000,0,1.400000,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,54,0,0,127.000000,333.0,1,2,154.000000,0,0.000000,3,4,3
916,62,1,3,132.335128,139.0,0,2,135.764391,2,0.954251,3,4,3
917,55,1,0,122.000000,223.0,1,2,100.000000,0,0.000000,3,4,0
918,58,1,0,132.335128,385.0,1,0,135.764391,2,0.954251,3,4,3


In [127]:
# Use MinMax scaled continous cols but unscaled categorial cols
df = pd.concat((df_no_min_max[categorial_columns], df_min_max[cont_columns]), axis=1)
df

Unnamed: 0,sex,cp,fbs,restecg,exang,slope,thal,ca,age,trestbps,chol,thalch,oldpeak
0,1,3,1,0,0,0,0,0,0.714286,0.725000,0.386401,0.633803,0.556818
1,1,0,0,0,1,1,1,3,0.795918,0.800000,0.474295,0.338028,0.465909
2,1,0,0,0,1,1,2,2,0.795918,0.600000,0.379768,0.485915,0.590909
3,1,2,0,1,0,0,1,0,0.183673,0.650000,0.414594,0.894366,0.693182
4,0,1,0,0,0,2,1,0,0.265306,0.650000,0.338308,0.788732,0.454545
...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,0,0,1,2,0,3,3,4,0.530612,0.635000,0.552239,0.661972,0.295455
916,1,3,0,2,2,3,3,4,0.693878,0.661676,0.230514,0.533552,0.403892
917,1,0,1,2,0,3,0,4,0.551020,0.610000,0.369818,0.281690,0.295455
918,1,0,1,0,2,3,3,4,0.612245,0.661676,0.638474,0.533552,0.403892


In [128]:
class HeartDiseaseDataset(Dataset):
    def __init__(self, df):
        self.cat = torch.tensor(df[categorial_columns].values, dtype=torch.float)
        self.cont = torch.tensor(df[cont_columns].values, dtype=torch.float)

    def __getitem__(self, idx):
        return self.cat[idx], self.cont[idx]

    def __len__(self):
        return self.cat.shape[0]

In [129]:
dataset = HeartDiseaseDataset(df)
dataloader = DataLoader(dataset, batch_size=50, shuffle=True)
len(dataset)

797

In [130]:
embedding_sizes = [(df_no_min_max[col].nunique(), min(50, max(2, (df_no_min_max[col].nunique()+1) // 2))) for col in df_no_min_max[categorial_columns]]
embedding_sizes

[(2, 2), (4, 2), (3, 2), (4, 2), (3, 2), (4, 2), (4, 2), (5, 3)]

In [131]:
class AttentionModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(num, dim) for num, dim in embedding_sizes])
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(17, 13),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(13),
            torch.nn.Linear(13, 7),
            torch.nn.Sigmoid(),
            torch.nn.BatchNorm1d(7),
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(12, 15),
            torch.nn.ReLU(),
            torch.nn.Linear(15, 17),
            torch.nn.Sigmoid()
        )


    def encode(self, x_cat, x_cont):
        x_cat = x_cat.to(torch.long)
        embedded = torch.cat([e(x_cat[:, i]) for i, e in enumerate(self.embeddings)], 1)
        self.last_target = embedded.clone().detach()
        encoded = self.encoder(embedded)
        qkv = torch.cat((encoded, x_cont), 1)
        attended = F.scaled_dot_product_attention(qkv, qkv, qkv)
        return attended

    def forward(self, x_cat, x_cont):
        encoded = self.encode(x_cat, x_cont)
        decoded = self.decoder(encoded)
        return decoded


epochs = 100
lr = 0.001

model = AttentionModel()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

for epoch in range(epochs):
    model.train()
    loss = 0

    for x_cat, x_cont in dataloader:
        optimizer.zero_grad()
        outputs = model(x_cat, x_cont)
        train_loss = criterion(outputs, model.last_target)
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()

    loss = loss / len(dataloader)
    print("epoch: {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch: 1/100, loss = 1.244895
epoch: 2/100, loss = 1.208785
epoch: 3/100, loss = 1.163235
epoch: 4/100, loss = 1.107661
epoch: 5/100, loss = 1.044306
epoch: 6/100, loss = 0.984977
epoch: 7/100, loss = 0.943631
epoch: 8/100, loss = 0.914114
epoch: 9/100, loss = 0.894711
epoch: 10/100, loss = 0.873258
epoch: 11/100, loss = 0.854263
epoch: 12/100, loss = 0.842046
epoch: 13/100, loss = 0.834557
epoch: 14/100, loss = 0.827315
epoch: 15/100, loss = 0.823836
epoch: 16/100, loss = 0.822818
epoch: 17/100, loss = 0.819478
epoch: 18/100, loss = 0.817586
epoch: 19/100, loss = 0.815475
epoch: 20/100, loss = 0.813610
epoch: 21/100, loss = 0.813092
epoch: 22/100, loss = 0.811319
epoch: 23/100, loss = 0.811663
epoch: 24/100, loss = 0.809612
epoch: 25/100, loss = 0.808814
epoch: 26/100, loss = 0.807276
epoch: 27/100, loss = 0.808826
epoch: 28/100, loss = 0.808677
epoch: 29/100, loss = 0.806402
epoch: 30/100, loss = 0.806051
epoch: 31/100, loss = 0.803689
epoch: 32/100, loss = 0.804051
epoch: 33/100, lo

In [132]:
cat = torch.tensor(df[categorial_columns].values, dtype=torch.float)
cont = torch.tensor(df[cont_columns].values, dtype=torch.float)
features = model.encode(cat, cont).detach().numpy()
features

array([[ 0.94592524,  2.2407458 , -1.9737897 , ...,  0.35746652,
         0.63890636,  0.39759836],
       [-0.5025546 , -1.5099381 , -0.36331964, ...,  0.34632847,
         0.46937245,  0.42731136],
       [ 3.4261572 ,  2.6313338 ,  0.7149805 , ...,  0.41614088,
         0.54408765,  0.47544032],
       ...,
       [-0.68555933, -0.52582765, -0.15837629, ...,  0.33871174,
         0.48562133,  0.4164183 ],
       [-0.14246292, -0.3875218 ,  0.01788907, ...,  0.3348309 ,
         0.48530722,  0.4238472 ],
       [-0.40023556, -1.5661058 ,  2.5532029 , ...,  0.3530306 ,
         0.59099793,  0.34766573]], dtype=float32)

In [133]:
kmeans = KMeans(n_clusters=5, n_init="auto", random_state=0).fit(features)
cluster_accuracy(kmeans.labels_, og_df["num"].to_numpy())

0.4077791718946048

In [134]:
normalized_mutual_info_score(og_df["num"].to_numpy(), kmeans.labels_)

0.10499779868016192