In [106]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from scipy.optimize import linear_sum_assignment

In [107]:
def cluster_accuracy(y_pred, y_true):
    # We need to map the labels to our cluster labels
    # This is a linear assignment problem on a bipartite graph
    k = max(len(np.unique(y_pred)), len(np.unique(y_pred)))
    cost_matrix = np.zeros((k, k))
    for i in range(y_pred.size):
        cost_matrix[y_pred[i], y_true[i]] += 1
    row_ind, col_ind = linear_sum_assignment(cost_matrix.max() - cost_matrix)
    return cost_matrix[row_ind, col_ind].sum() / y_pred.size

In [108]:
og_df = pd.read_csv("datasets/heart_disease_uci.csv")
og_df.drop(columns=["id", "dataset"], inplace=True)
og_df = og_df.drop(og_df[og_df["num"] == 0].sample(frac=0.3).index)
og_df # this df still has "num" -> the target

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,63,Male,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,67,Male,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,67,Male,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
5,56,Male,atypical angina,120.0,236.0,False,normal,178.0,False,0.8,upsloping,0.0,normal,0
6,62,Female,asymptomatic,140.0,268.0,False,lv hypertrophy,160.0,False,3.6,downsloping,2.0,normal,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,62,Male,asymptomatic,158.0,170.0,False,st-t abnormality,138.0,True,0.0,,,,1
914,46,Male,asymptomatic,134.0,310.0,False,normal,126.0,False,0.0,,,normal,2
915,54,Female,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
917,55,Male,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2


In [109]:
categorial_columns = ["sex", "cp", "fbs", "restecg", "exang", "slope", "thal"]
cont_columns = ["age", "trestbps", "chol", "thalch", "oldpeak", "ca"]

In [110]:
df_min_max = og_df.copy()
df_min_max.drop(columns="num", inplace=True)
df_min_max[categorial_columns] = df_min_max[categorial_columns].apply(LabelEncoder().fit_transform)
df_min_max[categorial_columns] = MinMaxScaler().fit_transform(df_min_max[categorial_columns])
df_min_max[cont_columns] = MinMaxScaler().fit_transform(df_min_max[cont_columns])
df_min_max = df_min_max.fillna(df_min_max.mean())
df_min_max

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
0,0.714286,1.0,1.000000,0.725,0.386401,0.5,0.000000,0.633803,0.0,0.556818,0.000000,0.000000,0.000000
1,0.795918,1.0,0.000000,0.800,0.474295,0.0,0.000000,0.338028,0.5,0.465909,0.333333,1.000000,0.333333
2,0.795918,1.0,0.000000,0.600,0.379768,0.0,0.000000,0.485915,0.5,0.590909,0.333333,0.666667,0.666667
5,0.571429,1.0,0.333333,0.600,0.391376,0.0,0.333333,0.830986,0.0,0.386364,0.666667,0.000000,0.333333
6,0.693878,0.0,0.000000,0.700,0.444444,0.0,0.000000,0.704225,0.0,0.704545,0.000000,0.666667,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,0.693878,1.0,0.000000,0.790,0.281924,0.0,0.666667,0.549296,0.5,0.295455,1.000000,0.247765,1.000000
914,0.367347,1.0,0.000000,0.670,0.514096,0.0,0.333333,0.464789,0.0,0.295455,1.000000,0.247765,0.333333
915,0.530612,0.0,0.000000,0.635,0.552239,0.5,0.666667,0.661972,0.0,0.295455,1.000000,0.247765,1.000000
917,0.551020,1.0,0.000000,0.610,0.369818,0.5,0.666667,0.281690,0.0,0.295455,1.000000,0.247765,0.000000


In [111]:
df_no_min_max = og_df.copy()
df_no_min_max.drop(columns="num", inplace=True)
df_no_min_max[categorial_columns] = df_no_min_max[categorial_columns].apply(LabelEncoder().fit_transform)
df_no_min_max = df_no_min_max.fillna(df_no_min_max.mean())
df_no_min_max

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
0,63,1,3,145.0,233.0,1,0,150.0,0,2.3,0,0.000000,0
1,67,1,0,160.0,286.0,0,0,108.0,1,1.5,1,3.000000,1
2,67,1,0,120.0,229.0,0,0,129.0,1,2.6,1,2.000000,2
5,56,1,1,120.0,236.0,0,1,178.0,0,0.8,2,0.000000,1
6,62,0,0,140.0,268.0,0,0,160.0,0,3.6,0,2.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,62,1,0,158.0,170.0,0,2,138.0,1,0.0,3,0.743295,3
914,46,1,0,134.0,310.0,0,1,126.0,0,0.0,3,0.743295,1
915,54,0,0,127.0,333.0,1,2,154.0,0,0.0,3,0.743295,3
917,55,1,0,122.0,223.0,1,2,100.0,0,0.0,3,0.743295,0


In [112]:
class HeartDiseaseDataset(Dataset):
    def __init__(self, df):
        self.cat = torch.tensor(df[categorial_columns].values, dtype=torch.float)
        self.cont = torch.tensor(df[cont_columns].values, dtype=torch.float)

    def __getitem__(self, idx):
        return self.cat[idx], self.cont[idx]

    def __len__(self):
        return self.cat.shape[0]

In [113]:
dataset = HeartDiseaseDataset(df_min_max)
dataloader = DataLoader(dataset, batch_size=100, shuffle=True)
len(dataset)

797

In [114]:
embedding_sizes = [(df_no_min_max[col].nunique(), min(50, max(2, (df_no_min_max[col].nunique()+1) // 2))) for col in df_no_min_max[categorial_columns]]
embedding_sizes

[(2, 2), (4, 2), (3, 2), (4, 2), (3, 2), (4, 2), (4, 2)]

In [115]:
class EmbeddingModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(num, dim) for num, dim in embedding_sizes])
        n_emb = sum(e.embedding_dim for e in self.embeddings)
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(n_emb, 10),
            torch.nn.BatchNorm1d(10),
            torch.nn.ReLU(),
            torch.nn.Linear(10, 5),
            torch.nn.BatchNorm1d(5),
            torch.nn.Sigmoid(),
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(5, 10),
            torch.nn.BatchNorm1d(10),
            torch.nn.ReLU(),
            torch.nn.Linear(10, n_emb),
            torch.nn.BatchNorm1d(n_emb),
            torch.nn.Sigmoid()
        )

    def embed(self, x_cat):
        x_cat = x_cat.to(torch.long)
        x = [e(x_cat[:, i]) for i, e in enumerate(self.embeddings)]
        x = torch.cat(x, 1)
        self.last_target = x.clone().detach()
        return x


    def forward(self, x_cat):
        embedded = self.embed(x_cat)
        encoded = self.encoder(embedded)
        decoded = self.decoder(encoded)
        return decoded


epochs = 100
lr = 0.001

model = EmbeddingModel()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

for epoch in range(epochs):
    model.train()
    loss = 0

    for x_cat, x_cont in dataloader:
        optimizer.zero_grad()
        outputs = model(x_cat)
        train_loss = criterion(outputs, model.last_target)
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()

    loss = loss / len(dataloader)
    print("epoch: {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch: 1/100, loss = 1.457249
epoch: 2/100, loss = 1.428930
epoch: 3/100, loss = 1.406558
epoch: 4/100, loss = 1.386416
epoch: 5/100, loss = 1.369587
epoch: 6/100, loss = 1.352072
epoch: 7/100, loss = 1.335533
epoch: 8/100, loss = 1.323635
epoch: 9/100, loss = 1.310142
epoch: 10/100, loss = 1.297719
epoch: 11/100, loss = 1.284739
epoch: 12/100, loss = 1.276516
epoch: 13/100, loss = 1.269326
epoch: 14/100, loss = 1.260961
epoch: 15/100, loss = 1.254827
epoch: 16/100, loss = 1.246248
epoch: 17/100, loss = 1.239866
epoch: 18/100, loss = 1.232297
epoch: 19/100, loss = 1.226155
epoch: 20/100, loss = 1.220984
epoch: 21/100, loss = 1.214649
epoch: 22/100, loss = 1.207650
epoch: 23/100, loss = 1.201822
epoch: 24/100, loss = 1.194959
epoch: 25/100, loss = 1.191230
epoch: 26/100, loss = 1.184407
epoch: 27/100, loss = 1.179370
epoch: 28/100, loss = 1.175357
epoch: 29/100, loss = 1.168760
epoch: 30/100, loss = 1.165686
epoch: 31/100, loss = 1.161779
epoch: 32/100, loss = 1.154682
epoch: 33/100, lo

In [116]:
cat_features = model.encoder(model.embed(torch.tensor(df_no_min_max[categorial_columns].values, dtype=torch.float))).detach().numpy()
emb_features = np.concatenate((cat_features, df_min_max[cont_columns].values), 1)
emb_features

array([[0.3175799 , 0.47763401, 0.27391624, ..., 0.63380282, 0.55681818,
        0.        ],
       [0.62006414, 0.49546278, 0.65352452, ..., 0.33802817, 0.46590909,
        1.        ],
       [0.34096557, 0.57605141, 0.44675148, ..., 0.48591549, 0.59090909,
        0.66666667],
       ...,
       [0.07604529, 0.93895268, 0.36377221, ..., 0.66197183, 0.29545455,
        0.24776501],
       [0.25997174, 0.55062395, 0.51040393, ..., 0.28169014, 0.29545455,
        0.24776501],
       [0.53415155, 0.34326372, 0.21084449, ..., 0.23239437, 0.29545455,
        0.24776501]])

In [117]:
class NoEmbeddingModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(len(categorial_columns), 4),
            torch.nn.BatchNorm1d(4),
            torch.nn.Sigmoid(),
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(4, len(categorial_columns)),
            torch.nn.BatchNorm1d(len(categorial_columns)),
            torch.nn.Sigmoid()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


epochs = 100
lr = 0.001

no_emb_model = NoEmbeddingModel()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(no_emb_model.parameters(), lr=lr)

for epoch in range(epochs):
    no_emb_model.train()
    loss = 0

    for x_cat, x_cont in dataloader:
        optimizer.zero_grad()
        outputs = no_emb_model(x_cat)
        train_loss = criterion(outputs,  x_cat)
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()

    loss = loss / len(dataloader)
    print("epoch: {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch: 1/100, loss = 0.210855
epoch: 2/100, loss = 0.203398
epoch: 3/100, loss = 0.195978
epoch: 4/100, loss = 0.187806
epoch: 5/100, loss = 0.180817
epoch: 6/100, loss = 0.173674
epoch: 7/100, loss = 0.167455
epoch: 8/100, loss = 0.160880
epoch: 9/100, loss = 0.154685
epoch: 10/100, loss = 0.148660
epoch: 11/100, loss = 0.142613
epoch: 12/100, loss = 0.137596
epoch: 13/100, loss = 0.131756
epoch: 14/100, loss = 0.126135
epoch: 15/100, loss = 0.121402
epoch: 16/100, loss = 0.117029
epoch: 17/100, loss = 0.112778
epoch: 18/100, loss = 0.109128
epoch: 19/100, loss = 0.105581
epoch: 20/100, loss = 0.102827
epoch: 21/100, loss = 0.100434
epoch: 22/100, loss = 0.097961
epoch: 23/100, loss = 0.096113
epoch: 24/100, loss = 0.093927
epoch: 25/100, loss = 0.091757
epoch: 26/100, loss = 0.090151
epoch: 27/100, loss = 0.088443
epoch: 28/100, loss = 0.086686
epoch: 29/100, loss = 0.084977
epoch: 30/100, loss = 0.083755
epoch: 31/100, loss = 0.082002
epoch: 32/100, loss = 0.080793
epoch: 33/100, lo

In [118]:
cat_features = no_emb_model.encoder(torch.tensor(df_no_min_max[categorial_columns].values, dtype=torch.float)).detach().numpy()
no_emb_features = np.concatenate((cat_features, df_min_max[cont_columns].values), 1)
no_emb_features

array([[0.63290638, 0.58673674, 0.0487307 , ..., 0.63380282, 0.55681818,
        0.        ],
       [0.80592299, 0.29343107, 0.30010173, ..., 0.33802817, 0.46590909,
        1.        ],
       [0.74926722, 0.24845929, 0.34675065, ..., 0.48591549, 0.59090909,
        0.66666667],
       ...,
       [0.4066658 , 0.60128987, 0.79905474, ..., 0.66197183, 0.29545455,
        0.24776501],
       [0.63790715, 0.59232634, 0.63531697, ..., 0.28169014, 0.29545455,
        0.24776501],
       [0.30170631, 0.50384915, 0.60135478, ..., 0.23239437, 0.29545455,
        0.24776501]])

In [119]:
emb_kmeans = KMeans(n_clusters=5, n_init="auto", random_state=0).fit(emb_features)
no_emb_kmeans = KMeans(n_clusters=5, n_init="auto", random_state=0).fit(no_emb_features)

emb_acc = cluster_accuracy(emb_kmeans.labels_, og_df["num"].to_numpy())
no_emb_acc = cluster_accuracy(no_emb_kmeans.labels_, og_df["num"].to_numpy())

print(f"Catergorial Embeddings Accuracy: {emb_acc}")
print(f"No Embeddings Accuracy: {no_emb_acc}")

Catergorial Embeddings Accuracy: 0.30363864491844417
No Embeddings Accuracy: 0.3613550815558344


In [120]:
emb_nmi = normalized_mutual_info_score(og_df["num"].to_numpy(), emb_kmeans.labels_)
no_emb_nmi = normalized_mutual_info_score(og_df["num"].to_numpy(), no_emb_kmeans.labels_)

print(f"Embeddings NMI: {emb_nmi}")
print(f"No Embeddings NMI: {no_emb_nmi}")

Embeddings NMI: 0.08307825924411819
No Embeddings NMI: 0.16834914289030875
