In [46]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from scipy.optimize import linear_sum_assignment

In [47]:
def cluster_accuracy(y_pred, y_true):
    # We need to map the labels to our cluster labels
    # This is a linear assignment problem on a bipartite graph
    k = max(len(np.unique(y_pred)), len(np.unique(y_pred)))
    cost_matrix = np.zeros((k, k))
    for i in range(y_pred.size):
        cost_matrix[y_pred[i], y_true[i]] += 1
    row_ind, col_ind = linear_sum_assignment(cost_matrix.max() - cost_matrix)
    return cost_matrix[row_ind, col_ind].sum() / y_pred.size

In [48]:
og_df = pd.read_csv("datasets/heart_disease_uci.csv")
og_df.drop(columns=["id", "dataset"], inplace=True)
og_df = og_df.drop(og_df[og_df["num"] == 0].sample(frac=0.3).index)
og_df # this df still has "num" -> the target

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,63,Male,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,67,Male,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,67,Male,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,37,Male,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
5,56,Male,atypical angina,120.0,236.0,False,normal,178.0,False,0.8,upsloping,0.0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,62,Male,asymptomatic,158.0,170.0,False,st-t abnormality,138.0,True,0.0,,,,1
914,46,Male,asymptomatic,134.0,310.0,False,normal,126.0,False,0.0,,,normal,2
915,54,Female,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
917,55,Male,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2


In [49]:
categorial_columns = ["sex", "cp", "fbs", "restecg", "exang", "slope", "thal"]
cont_columns = ["age", "trestbps", "chol", "thalch", "oldpeak", "ca"]

In [50]:
df_min_max = og_df.copy()
df_min_max.drop(columns="num", inplace=True)
df_min_max[categorial_columns] = df_min_max[categorial_columns].apply(LabelEncoder().fit_transform)
df_min_max[categorial_columns] = MinMaxScaler().fit_transform(df_min_max[categorial_columns])
df_min_max[cont_columns] = MinMaxScaler().fit_transform(df_min_max[cont_columns])
df_min_max = df_min_max.fillna(df_min_max.mean())
df_min_max

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
0,0.714286,1.0,1.000000,0.725,0.386401,0.5,0.000000,0.633803,0.0,0.556818,0.000000,0.000000,0.000000
1,0.795918,1.0,0.000000,0.800,0.474295,0.0,0.000000,0.338028,0.5,0.465909,0.333333,1.000000,0.333333
2,0.795918,1.0,0.000000,0.600,0.379768,0.0,0.000000,0.485915,0.5,0.590909,0.333333,0.666667,0.666667
3,0.183673,1.0,0.666667,0.650,0.414594,0.0,0.333333,0.894366,0.0,0.693182,0.000000,0.000000,0.333333
5,0.571429,1.0,0.333333,0.600,0.391376,0.0,0.333333,0.830986,0.0,0.386364,0.666667,0.000000,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,0.693878,1.0,0.000000,0.790,0.281924,0.0,0.666667,0.549296,0.5,0.295455,1.000000,0.248092,1.000000
914,0.367347,1.0,0.000000,0.670,0.514096,0.0,0.333333,0.464789,0.0,0.295455,1.000000,0.248092,0.333333
915,0.530612,0.0,0.000000,0.635,0.552239,0.5,0.666667,0.661972,0.0,0.295455,1.000000,0.248092,1.000000
917,0.551020,1.0,0.000000,0.610,0.369818,0.5,0.666667,0.281690,0.0,0.295455,1.000000,0.248092,0.000000


In [51]:
df_no_min_max = og_df.copy()
df_no_min_max.drop(columns="num", inplace=True)
df_no_min_max[categorial_columns] = df_no_min_max[categorial_columns].apply(LabelEncoder().fit_transform)
df_no_min_max = df_no_min_max.fillna(df_no_min_max.mean())
df_no_min_max

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
0,63,1,3,145.0,233.0,1,0,150.0,0,2.3,0,0.000000,0
1,67,1,0,160.0,286.0,0,0,108.0,1,1.5,1,3.000000,1
2,67,1,0,120.0,229.0,0,0,129.0,1,2.6,1,2.000000,2
3,37,1,2,130.0,250.0,0,1,187.0,0,3.5,0,0.000000,1
5,56,1,1,120.0,236.0,0,1,178.0,0,0.8,2,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,62,1,0,158.0,170.0,0,2,138.0,1,0.0,3,0.744275,3
914,46,1,0,134.0,310.0,0,1,126.0,0,0.0,3,0.744275,1
915,54,0,0,127.0,333.0,1,2,154.0,0,0.0,3,0.744275,3
917,55,1,0,122.0,223.0,1,2,100.0,0,0.0,3,0.744275,0


In [52]:
class HeartDiseaseDataset(Dataset):
    def __init__(self, df):
        self.cat = torch.tensor(df[categorial_columns].values, dtype=torch.float)
        self.cont = torch.tensor(df[cont_columns].values, dtype=torch.float)

    def __getitem__(self, idx):
        return self.cat[idx], self.cont[idx]

    def __len__(self):
        return self.cat.shape[0]

In [53]:
dataset = HeartDiseaseDataset(df_min_max)
dataloader = DataLoader(dataset, batch_size=100, shuffle=True)
len(dataset)

797

In [54]:
embedding_sizes = [(df_no_min_max[col].nunique(), min(50, max(2, (df_no_min_max[col].nunique()+1) // 2))) for col in df_no_min_max[categorial_columns]]
embedding_sizes

[(2, 2), (4, 2), (3, 2), (4, 2), (3, 2), (4, 2), (4, 2)]

In [55]:
class EmbeddingModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(num, dim) for num, dim in embedding_sizes])
        n_emb = sum(e.embedding_dim for e in self.embeddings)
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(n_emb, 10),
            torch.nn.BatchNorm1d(10),
            torch.nn.ReLU(),
            torch.nn.Linear(10, 5),
            torch.nn.BatchNorm1d(5),
            torch.nn.Sigmoid(),
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(5, 10),
            torch.nn.BatchNorm1d(10),
            torch.nn.ReLU(),
            torch.nn.Linear(10, n_emb),
            torch.nn.BatchNorm1d(n_emb),
            torch.nn.Sigmoid()
        )

    def embed(self, x_cat):
        x_cat = x_cat.to(torch.long)
        x = [e(x_cat[:, i]) for i, e in enumerate(self.embeddings)]
        x = torch.cat(x, 1)
        self.last_target = x.clone().detach()
        return x


    def forward(self, x_cat):
        embedded = self.embed(x_cat)
        encoded = self.encoder(embedded)
        decoded = self.decoder(encoded)
        return decoded


epochs = 100
lr = 0.001

model = EmbeddingModel()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

for epoch in range(epochs):
    model.train()
    loss = 0

    for x_cat, x_cont in dataloader:
        optimizer.zero_grad()
        outputs = model(x_cat)
        train_loss = criterion(outputs, model.last_target)
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()

    loss = loss / len(dataloader)
    print("epoch: {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch: 1/100, loss = 0.795293
epoch: 2/100, loss = 0.770563
epoch: 3/100, loss = 0.750318
epoch: 4/100, loss = 0.736437
epoch: 5/100, loss = 0.727786
epoch: 6/100, loss = 0.718838
epoch: 7/100, loss = 0.709963
epoch: 8/100, loss = 0.699951
epoch: 9/100, loss = 0.691608
epoch: 10/100, loss = 0.683843
epoch: 11/100, loss = 0.677747
epoch: 12/100, loss = 0.671137
epoch: 13/100, loss = 0.665922
epoch: 14/100, loss = 0.660093
epoch: 15/100, loss = 0.652843
epoch: 16/100, loss = 0.647953
epoch: 17/100, loss = 0.644660
epoch: 18/100, loss = 0.640733
epoch: 19/100, loss = 0.635961
epoch: 20/100, loss = 0.631048
epoch: 21/100, loss = 0.628197
epoch: 22/100, loss = 0.624059
epoch: 23/100, loss = 0.619593
epoch: 24/100, loss = 0.616334
epoch: 25/100, loss = 0.612968
epoch: 26/100, loss = 0.608987
epoch: 27/100, loss = 0.605439
epoch: 28/100, loss = 0.601901
epoch: 29/100, loss = 0.597742
epoch: 30/100, loss = 0.594111
epoch: 31/100, loss = 0.590337
epoch: 32/100, loss = 0.585969
epoch: 33/100, lo

In [56]:
cat_features = model.encoder(model.embed(torch.tensor(df_no_min_max[categorial_columns].values, dtype=torch.float))).detach().numpy()
emb_features = np.concatenate((cat_features, df_min_max[cont_columns].values), 1)
emb_features

array([[0.48061433, 0.80400318, 0.25544319, ..., 0.63380282, 0.55681818,
        0.        ],
       [0.70400512, 0.58711112, 0.48211822, ..., 0.33802817, 0.46590909,
        1.        ],
       [0.72711879, 0.4867419 , 0.61616045, ..., 0.48591549, 0.59090909,
        0.66666667],
       ...,
       [0.67904365, 0.68620634, 0.15941194, ..., 0.66197183, 0.29545455,
        0.2480916 ],
       [0.69987154, 0.57911009, 0.14512829, ..., 0.28169014, 0.29545455,
        0.2480916 ],
       [0.54435116, 0.78236496, 0.59793109, ..., 0.23239437, 0.29545455,
        0.2480916 ]])

In [57]:
class NoEmbeddingModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(len(categorial_columns), 4),
            torch.nn.BatchNorm1d(4),
            torch.nn.Sigmoid(),
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(4, len(categorial_columns)),
            torch.nn.BatchNorm1d(len(categorial_columns)),
            torch.nn.Sigmoid()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


epochs = 100
lr = 0.001

no_emb_model = NoEmbeddingModel()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(no_emb_model.parameters(), lr=lr)

for epoch in range(epochs):
    no_emb_model.train()
    loss = 0

    for x_cat, x_cont in dataloader:
        optimizer.zero_grad()
        outputs = no_emb_model(x_cat)
        train_loss = criterion(outputs,  x_cat)
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()

    loss = loss / len(dataloader)
    print("epoch: {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch: 1/100, loss = 0.197067
epoch: 2/100, loss = 0.191220
epoch: 3/100, loss = 0.185232
epoch: 4/100, loss = 0.179241
epoch: 5/100, loss = 0.173658
epoch: 6/100, loss = 0.167935
epoch: 7/100, loss = 0.162461
epoch: 8/100, loss = 0.157426
epoch: 9/100, loss = 0.151830
epoch: 10/100, loss = 0.147336
epoch: 11/100, loss = 0.142164
epoch: 12/100, loss = 0.137716
epoch: 13/100, loss = 0.133678
epoch: 14/100, loss = 0.129417
epoch: 15/100, loss = 0.126173
epoch: 16/100, loss = 0.122769
epoch: 17/100, loss = 0.119017
epoch: 18/100, loss = 0.115702
epoch: 19/100, loss = 0.113489
epoch: 20/100, loss = 0.110744
epoch: 21/100, loss = 0.108096
epoch: 22/100, loss = 0.105553
epoch: 23/100, loss = 0.103768
epoch: 24/100, loss = 0.101399
epoch: 25/100, loss = 0.099643
epoch: 26/100, loss = 0.097829
epoch: 27/100, loss = 0.096570
epoch: 28/100, loss = 0.094603
epoch: 29/100, loss = 0.093493
epoch: 30/100, loss = 0.091935
epoch: 31/100, loss = 0.090519
epoch: 32/100, loss = 0.089148
epoch: 33/100, lo

In [58]:
cat_features = no_emb_model.encoder(torch.tensor(df_no_min_max[categorial_columns].values, dtype=torch.float)).detach().numpy()
no_emb_features = np.concatenate((cat_features, df_min_max[cont_columns].values), 1)
no_emb_features

array([[0.94438982, 0.96247274, 0.27103686, ..., 0.63380282, 0.55681818,
        0.        ],
       [0.33041218, 0.67214167, 0.27885282, ..., 0.33802817, 0.46590909,
        1.        ],
       [0.32576355, 0.61795926, 0.36475679, ..., 0.48591549, 0.59090909,
        0.66666667],
       ...,
       [0.33899528, 0.10623961, 0.73014563, ..., 0.66197183, 0.29545455,
        0.2480916 ],
       [0.29100227, 0.58482736, 0.41327247, ..., 0.28169014, 0.29545455,
        0.2480916 ],
       [0.51337862, 0.62326276, 0.62623727, ..., 0.23239437, 0.29545455,
        0.2480916 ]])

In [59]:
emb_kmeans = KMeans(n_clusters=5, n_init="auto", random_state=0).fit(emb_features)
no_emb_kmeans = KMeans(n_clusters=5, n_init="auto", random_state=0).fit(no_emb_features)

emb_acc = cluster_accuracy(emb_kmeans.labels_, og_df["num"].to_numpy())
no_emb_acc = cluster_accuracy(no_emb_kmeans.labels_, og_df["num"].to_numpy())

print(f"Catergorial Embeddings Accuracy: {emb_acc}")
print(f"No Embeddings Accuracy: {no_emb_acc}")

Catergorial Embeddings Accuracy: 0.3174404015056462
No Embeddings Accuracy: 0.33500627352572143


In [60]:
emb_nmi = normalized_mutual_info_score(og_df["num"].to_numpy(), emb_kmeans.labels_)
no_emb_nmi = normalized_mutual_info_score(og_df["num"].to_numpy(), no_emb_kmeans.labels_)

print(f"Embeddings NMI: {emb_nmi}")
print(f"No Embeddings NMI: {no_emb_nmi}")

Embeddings NMI: 0.07183427997191377
No Embeddings NMI: 0.1447807605488128
