In [102]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from scipy.optimize import linear_sum_assignment

In [103]:
def cluster_accuracy(y_pred, y_true):
    # We need to map the labels to our cluster labels
    # This is a linear assignment problem on a bipartite graph
    k = max(len(np.unique(y_pred)), len(np.unique(y_pred)))
    cost_matrix = np.zeros((k, k))
    for i in range(y_pred.size):
        cost_matrix[y_pred[i], y_true[i]] += 1
    row_ind, col_ind = linear_sum_assignment(cost_matrix.max() - cost_matrix)
    return cost_matrix[row_ind, col_ind].sum() / y_pred.size

In [104]:
og_df = pd.read_csv("datasets/heart_disease_uci.csv")
og_df.drop(columns=["id", "dataset"], inplace=True)
og_df = og_df.drop(og_df[og_df["num"] == 0].sample(frac=0.3).index)
og_df # this df still has "num" -> the target

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,63,Male,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,67,Male,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,67,Male,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
5,56,Male,atypical angina,120.0,236.0,False,normal,178.0,False,0.8,upsloping,0.0,normal,0
6,62,Female,asymptomatic,140.0,268.0,False,lv hypertrophy,160.0,False,3.6,downsloping,2.0,normal,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,62,Male,asymptomatic,158.0,170.0,False,st-t abnormality,138.0,True,0.0,,,,1
914,46,Male,asymptomatic,134.0,310.0,False,normal,126.0,False,0.0,,,normal,2
915,54,Female,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
917,55,Male,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2


In [105]:
categorial_columns = ["sex", "cp", "fbs", "restecg", "exang", "slope", "thal"]
cont_columns = ["age", "trestbps", "chol", "thalch", "oldpeak", "ca"]

In [106]:
df_min_max = og_df.copy()
df_min_max.drop(columns="num", inplace=True)
df_min_max[categorial_columns] = df_min_max[categorial_columns].apply(LabelEncoder().fit_transform)
df_min_max[categorial_columns] = MinMaxScaler().fit_transform(df_min_max[categorial_columns])
df_min_max[cont_columns] = MinMaxScaler().fit_transform(df_min_max[cont_columns])
df_min_max = df_min_max.fillna(df_min_max.mean())
df_min_max

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
0,0.714286,1.0,1.000000,0.725,0.386401,0.5,0.000000,0.633803,0.0,0.556818,0.000000,0.000000,0.000000
1,0.795918,1.0,0.000000,0.800,0.474295,0.0,0.000000,0.338028,0.5,0.465909,0.333333,1.000000,0.333333
2,0.795918,1.0,0.000000,0.600,0.379768,0.0,0.000000,0.485915,0.5,0.590909,0.333333,0.666667,0.666667
5,0.571429,1.0,0.333333,0.600,0.391376,0.0,0.333333,0.830986,0.0,0.386364,0.666667,0.000000,0.333333
6,0.693878,0.0,0.000000,0.700,0.444444,0.0,0.000000,0.704225,0.0,0.704545,0.000000,0.666667,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,0.693878,1.0,0.000000,0.790,0.281924,0.0,0.666667,0.549296,0.5,0.295455,1.000000,0.244186,1.000000
914,0.367347,1.0,0.000000,0.670,0.514096,0.0,0.333333,0.464789,0.0,0.295455,1.000000,0.244186,0.333333
915,0.530612,0.0,0.000000,0.635,0.552239,0.5,0.666667,0.661972,0.0,0.295455,1.000000,0.244186,1.000000
917,0.551020,1.0,0.000000,0.610,0.369818,0.5,0.666667,0.281690,0.0,0.295455,1.000000,0.244186,0.000000


In [107]:
df_no_min_max = og_df.copy()
df_no_min_max.drop(columns="num", inplace=True)
df_no_min_max[categorial_columns] = df_no_min_max[categorial_columns].apply(LabelEncoder().fit_transform)
df_no_min_max = df_no_min_max.fillna(df_no_min_max.mean())
df_no_min_max

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
0,63,1,3,145.0,233.0,1,0,150.0,0,2.3,0,0.000000,0
1,67,1,0,160.0,286.0,0,0,108.0,1,1.5,1,3.000000,1
2,67,1,0,120.0,229.0,0,0,129.0,1,2.6,1,2.000000,2
5,56,1,1,120.0,236.0,0,1,178.0,0,0.8,2,0.000000,1
6,62,0,0,140.0,268.0,0,0,160.0,0,3.6,0,2.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,62,1,0,158.0,170.0,0,2,138.0,1,0.0,3,0.732558,3
914,46,1,0,134.0,310.0,0,1,126.0,0,0.0,3,0.732558,1
915,54,0,0,127.0,333.0,1,2,154.0,0,0.0,3,0.732558,3
917,55,1,0,122.0,223.0,1,2,100.0,0,0.0,3,0.732558,0


In [108]:
class HeartDiseaseDataset(Dataset):
    def __init__(self, df):
        self.cat = torch.tensor(df[categorial_columns].values, dtype=torch.float)
        self.cont = torch.tensor(df[cont_columns].values, dtype=torch.float)

    def __getitem__(self, idx):
        return self.cat[idx], self.cont[idx]

    def __len__(self):
        return self.cat.shape[0]

In [109]:
dataset = HeartDiseaseDataset(df_min_max)
dataloader = DataLoader(dataset, batch_size=100, shuffle=True)
len(dataset)

797

In [110]:
embedding_sizes = [(df_no_min_max[col].nunique(), min(50, max(2, (df_no_min_max[col].nunique()+1) // 2))) for col in df_no_min_max[categorial_columns]]
embedding_sizes

[(2, 2), (4, 2), (3, 2), (4, 2), (3, 2), (4, 2), (4, 2)]

In [111]:
class AttentionModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(num, dim) for num, dim in embedding_sizes])
        n_emb = sum(e.embedding_dim for e in self.embeddings)
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(n_emb, 9),
            torch.nn.BatchNorm1d(9),
            torch.nn.ReLU(),
            torch.nn.Linear(9, 5),
            torch.nn.BatchNorm1d(5),
            torch.nn.Sigmoid(),
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(5, 9),
            torch.nn.BatchNorm1d(9),
            torch.nn.ReLU(),
            torch.nn.Linear(9, n_emb),
            torch.nn.BatchNorm1d(n_emb),
            torch.nn.Sigmoid()
        )

    def embed(self, x_cat):
        x_cat = x_cat.to(torch.long)
        x = [e(x_cat[:, i]) for i, e in enumerate(self.embeddings)]
        x = torch.cat(x, 1)
        self.last_target = x.clone().detach()
        return x


    def forward(self, x_cat):
        embedded = self.embed(x_cat)
        encoded = self.encoder(embedded)
        decoded = self.decoder(encoded)
        return decoded


epochs = 100
lr = 0.001

model = AttentionModel()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

for epoch in range(epochs):
    model.train()
    loss = 0

    for x_cat, x_cont in dataloader:
        optimizer.zero_grad()
        outputs = model(x_cat)
        train_loss = criterion(outputs, model.last_target)
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()

    loss = loss / len(dataloader)
    print("epoch: {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch: 1/100, loss = 1.703290
epoch: 2/100, loss = 1.685603
epoch: 3/100, loss = 1.666752
epoch: 4/100, loss = 1.648962
epoch: 5/100, loss = 1.629099
epoch: 6/100, loss = 1.614855
epoch: 7/100, loss = 1.602848
epoch: 8/100, loss = 1.593688
epoch: 9/100, loss = 1.581864
epoch: 10/100, loss = 1.569663
epoch: 11/100, loss = 1.559205
epoch: 12/100, loss = 1.547536
epoch: 13/100, loss = 1.540773
epoch: 14/100, loss = 1.532961
epoch: 15/100, loss = 1.526786
epoch: 16/100, loss = 1.519524
epoch: 17/100, loss = 1.513885
epoch: 18/100, loss = 1.508924
epoch: 19/100, loss = 1.509694
epoch: 20/100, loss = 1.499191
epoch: 21/100, loss = 1.497921
epoch: 22/100, loss = 1.492094
epoch: 23/100, loss = 1.487126
epoch: 24/100, loss = 1.484832
epoch: 25/100, loss = 1.478184
epoch: 26/100, loss = 1.478160
epoch: 27/100, loss = 1.469068
epoch: 28/100, loss = 1.469476
epoch: 29/100, loss = 1.464637
epoch: 30/100, loss = 1.458991
epoch: 31/100, loss = 1.457959
epoch: 32/100, loss = 1.457733
epoch: 33/100, lo

In [112]:
cat_features = model.encoder(model.embed(torch.tensor(df_no_min_max[categorial_columns].values, dtype=torch.float))).detach().numpy()
features = np.concatenate((cat_features, df_min_max[cont_columns].values), 1)
features

array([[0.2570039 , 0.05466382, 0.57112163, ..., 0.63380282, 0.55681818,
        0.        ],
       [0.59897333, 0.51240951, 0.69509774, ..., 0.33802817, 0.46590909,
        1.        ],
       [0.5788058 , 0.47310993, 0.69082898, ..., 0.48591549, 0.59090909,
        0.66666667],
       ...,
       [0.2246467 , 0.55523825, 0.37610343, ..., 0.66197183, 0.29545455,
        0.24418605],
       [0.26854882, 0.30749008, 0.4088552 , ..., 0.28169014, 0.29545455,
        0.24418605],
       [0.51229054, 0.46442613, 0.57307768, ..., 0.23239437, 0.29545455,
        0.24418605]])

In [113]:
kmeans = KMeans(n_clusters=5, n_init="auto", random_state=0).fit(features)
accuracy = cluster_accuracy(kmeans.labels_, og_df["num"].to_numpy())
accuracy

0.4065244667503137