In [76]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import normalized_mutual_info_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
from scipy.optimize import linear_sum_assignment
from gower_duped import gower_matrix as gower_matrix_duped

In [77]:
def cluster_accuracy(y_pred, y_true):
    # We need to map the labels to our cluster labels
    # This is a linear assignment problem on a bipartite graph
    k = max(len(np.unique(y_pred)), len(np.unique(y_pred)))
    cost_matrix = np.zeros((k, k))
    for i in range(y_pred.size):
        cost_matrix[y_pred[i], y_true[i]] += 1
    row_ind, col_ind = linear_sum_assignment(cost_matrix.max() - cost_matrix)
    return cost_matrix[row_ind, col_ind].sum() / y_pred.size

In [78]:
og_df = pd.read_csv("datasets/census_income.csv")
og_df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
48838,64,?,321403,HS-grad,9,Widowed,?,Other-relative,Black,Male,0,0,40,United-States,<=50K.
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


In [79]:
og_df.loc[(og_df["class"] == " <=50K.") | (og_df["class"] == " <=50K"), "class"] = 0
og_df.loc[(og_df["class"] == " >50K.") | (og_df["class"] == " >50K"), "class"] = 1
# Probability of most common class
og_df["class"].value_counts().max()/og_df["class"].count()

0.7607182343065395

In [80]:
cat_cols = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]
cont_cols = ["age", "fnlwgt", "education-num", "capital-gain", "capital-loss", "hours-per-week"]

In [81]:
df = og_df.copy()
df.drop(columns="class", inplace=True)
df[cat_cols] = df[cat_cols].apply(LabelEncoder().fit_transform)
df[cont_cols] = MinMaxScaler().fit_transform(df[cont_cols])
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,0.301370,7,0.044131,9,0.800000,4,1,1,4,1,0.021740,0.0,0.397959,39
1,0.452055,6,0.048052,9,0.800000,2,4,0,4,1,0.000000,0.0,0.122449,39
2,0.287671,4,0.137581,11,0.533333,0,6,1,4,1,0.000000,0.0,0.397959,39
3,0.493151,4,0.150486,1,0.400000,2,6,0,2,1,0.000000,0.0,0.397959,39
4,0.150685,4,0.220635,9,0.800000,2,10,5,2,0,0.000000,0.0,0.397959,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,0.301370,4,0.137428,9,0.800000,0,10,1,4,0,0.000000,0.0,0.357143,39
48838,0.643836,0,0.209130,11,0.533333,6,0,2,2,1,0.000000,0.0,0.397959,39
48839,0.287671,4,0.245379,9,0.800000,2,10,0,4,1,0.000000,0.0,0.500000,39
48840,0.369863,4,0.048444,9,0.800000,0,1,3,1,1,0.054551,0.0,0.397959,39


In [82]:
def encode_feature(df, feature_to_encode):
    dummies = pd.get_dummies(df[[feature_to_encode]], dtype=float)
    result_df = pd.concat([df, dummies], axis=1)
    result_df.drop(columns=feature_to_encode, inplace=True)
    return result_df

df_one_hot = og_df.copy()
df_one_hot.drop(columns="class", inplace=True)
df_one_hot[cont_cols] = MinMaxScaler().fit_transform(df_one_hot[cont_cols])
for col in cat_cols:
    df_one_hot = encode_feature(df_one_hot, col)
df_one_hot

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,0.301370,0.044131,0.800000,0.021740,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.452055,0.048052,0.800000,0.000000,0.0,0.122449,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.287671,0.137581,0.533333,0.000000,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.493151,0.150486,0.400000,0.000000,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.150685,0.220635,0.800000,0.000000,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,0.301370,0.137428,0.800000,0.000000,0.0,0.357143,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48838,0.643836,0.209130,0.533333,0.000000,0.0,0.397959,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48839,0.287671,0.245379,0.800000,0.000000,0.0,0.500000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48840,0.369863,0.048444,0.800000,0.054551,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [83]:
kmeans = KMeans(n_clusters=2, n_init="auto", random_state=0).fit(df_one_hot)
kmeans_acc = cluster_accuracy(kmeans.labels_, og_df["class"].to_numpy())
kmeans_acc

0.7165758977928832

In [84]:
kmeans_nmi = normalized_mutual_info_score(og_df["class"].to_numpy(), kmeans.labels_)
kmeans_nmi

0.1325918200579342

In [85]:
# no_target_df = og_df.drop(columns="class")
# distance_matrix = gower_matrix_duped(no_target_df)
# distance_matrix

In [86]:
# gower_agglo = AgglomerativeClustering(n_clusters=2, metric="precomputed", linkage="single").fit_predict(distance_matrix)
# gower_agglo_acc = cluster_accuracy(gower_agglo, og_df["class"].to_numpy())
# gower_agglo_acc
# linkage=average: 0.7602882764833545
# linkage=single: 0.760697760124483

In [87]:
# gower_agglo_nmi = normalized_mutual_info_score(og_df["class"].to_numpy(), gower_agglo)
# gower_agglo_nmi

In [88]:
embedding_sizes = [(df[col].nunique(), min(50, max(2, (df[col].nunique()+1) // 2))) for col in df[cat_cols]]
embedding_sizes

[(9, 5), (16, 8), (7, 4), (15, 8), (6, 3), (5, 3), (2, 2), (42, 21)]

In [89]:
class CensusIncomeDataset(Dataset):
    def __init__(self, df):
        self.cat = torch.tensor(df[cat_cols].values, dtype=torch.float)
        self.cont = torch.tensor(df[cont_cols].values, dtype=torch.float)

    def __getitem__(self, idx):
        return self.cat[idx], self.cont[idx]

    def __len__(self):
        return self.cat.shape[0]
    
dataset = CensusIncomeDataset(df)
dataloader = DataLoader(dataset, batch_size=512, shuffle=True)
len(dataset)

48842

In [90]:
class AttentionModelDecoderOnlyCat(nn.Module):
    def __init__(self):
        super().__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(num, dim) for num, dim in embedding_sizes])
        n_emb = sum(e.embedding_dim for e in self.embeddings)
        in_dim = n_emb + len(cont_cols)
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(in_dim, 32),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(32),
            torch.nn.Linear(32, 16),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(16),
            torch.nn.Linear(16, 8),
            torch.nn.Sigmoid(),
            torch.nn.BatchNorm1d(8),
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(8, 16),
            torch.nn.ReLU(),
            torch.nn.Linear(16, 32),
            torch.nn.ReLU(),
            torch.nn.Linear(32, n_emb),
            torch.nn.Sigmoid()
        )


    def encode(self, x_cat, x_cont):
        x_cat = x_cat.to(torch.long)
        embedded = torch.cat([e(x_cat[:, i]) for i, e in enumerate(self.embeddings)], 1)
        self.last_target = embedded.clone().detach()

        qkv = torch.cat((embedded, x_cont), 1)
        x = F.scaled_dot_product_attention(qkv, qkv, qkv)
        encoded = self.encoder(x)
        return encoded

    def forward(self, x_cat, x_cont):
        encoded = self.encode(x_cat, x_cont)
        decoded = self.decoder(encoded)
        return decoded


epochs = 100
lr = 0.001

model = AttentionModelDecoderOnlyCat()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

for epoch in range(epochs):
    model.train()
    loss = 0

    for x_cat, x_cont in dataloader:
        optimizer.zero_grad()
        outputs = model(x_cat, x_cont)
        train_loss = criterion(outputs, model.last_target)
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()

    loss = loss / len(dataloader)
    print("epoch: {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch: 1/100, loss = 1.011161
epoch: 2/100, loss = 0.749975
epoch: 3/100, loss = 0.723298
epoch: 4/100, loss = 0.701258
epoch: 5/100, loss = 0.683748
epoch: 6/100, loss = 0.673417
epoch: 7/100, loss = 0.659925
epoch: 8/100, loss = 0.646812
epoch: 9/100, loss = 0.636555
epoch: 10/100, loss = 0.627363
epoch: 11/100, loss = 0.622448
epoch: 12/100, loss = 0.618037
epoch: 13/100, loss = 0.614590
epoch: 14/100, loss = 0.611696
epoch: 15/100, loss = 0.608681
epoch: 16/100, loss = 0.605609
epoch: 17/100, loss = 0.602997
epoch: 18/100, loss = 0.601463
epoch: 19/100, loss = 0.599820
epoch: 20/100, loss = 0.598456
epoch: 21/100, loss = 0.597155
epoch: 22/100, loss = 0.596325
epoch: 23/100, loss = 0.595582
epoch: 24/100, loss = 0.595352
epoch: 25/100, loss = 0.595104
epoch: 26/100, loss = 0.594951
epoch: 27/100, loss = 0.595158
epoch: 28/100, loss = 0.596212
epoch: 29/100, loss = 0.597184
epoch: 30/100, loss = 0.598257
epoch: 31/100, loss = 0.600083
epoch: 32/100, loss = 0.603421
epoch: 33/100, lo

In [91]:
cat = torch.tensor(df[cat_cols].values, dtype=torch.float)
cont = torch.tensor(df[cont_cols].values, dtype=torch.float)
cat_features = model.encode(cat, cont).detach().numpy()
features = np.concatenate((cat_features, df[cont_cols].values), 1)
features

array([[-1.27100515, -1.65021658,  1.78275776, ...,  0.02174022,
         0.        ,  0.39795918],
       [-3.20273423, -1.71422434, -0.46493387, ...,  0.        ,
         0.        ,  0.12244898],
       [ 0.45016956, -1.44210911, -3.24438381, ...,  0.        ,
         0.        ,  0.39795918],
       ...,
       [-1.98420954, -1.61188459,  1.4049902 , ...,  0.        ,
         0.        ,  0.5       ],
       [-2.11292887, -1.72118902,  0.78204441, ...,  0.05455055,
         0.        ,  0.39795918],
       [-2.08278608, -1.822721  ,  1.02361584, ...,  0.        ,
         0.        ,  0.60204082]])

In [92]:
kmeans = KMeans(n_clusters=2, n_init="auto", random_state=0).fit(features)
deep_acc = cluster_accuracy(kmeans.labels_, og_df["class"].to_numpy())
deep_acc

0.5062650997092666

In [93]:
deep_nmi = normalized_mutual_info_score(og_df["class"].to_numpy(), kmeans.labels_)
deep_nmi

0.021787362747908486

In [94]:
class AttentionModelDecoderAllCols(nn.Module):
    def __init__(self):
        super().__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(num, dim) for num, dim in embedding_sizes])
        n_emb = sum(e.embedding_dim for e in self.embeddings)
        in_dim = n_emb + len(cont_cols)
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(in_dim, 32),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(32),
            torch.nn.Linear(32, 16),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(16),
            torch.nn.Linear(16, 8),
            torch.nn.Sigmoid(),
            torch.nn.BatchNorm1d(8),
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(8, 16),
            torch.nn.ReLU(),
            torch.nn.Linear(16, 32),
            torch.nn.ReLU(),
            torch.nn.Linear(32, in_dim),
            torch.nn.Sigmoid()
        )


    def encode(self, x_cat, x_cont):
        x_cat = x_cat.to(torch.long)
        embedded = torch.cat([e(x_cat[:, i]) for i, e in enumerate(self.embeddings)], 1)
        self.last_target = embedded.clone().detach()

        qkv = torch.cat((embedded, x_cont), 1)
        x = F.scaled_dot_product_attention(qkv, qkv, qkv)
        encoded = self.encoder(x)
        return encoded

    def forward(self, x_cat, x_cont):
        encoded = self.encode(x_cat, x_cont)
        decoded = self.decoder(encoded)
        return decoded


epochs = 100
lr = 0.001

all_cols_model = AttentionModelDecoderAllCols()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(all_cols_model.parameters(), lr=lr)

for epoch in range(epochs):
    all_cols_model.train()
    loss = 0

    for x_cat, x_cont in dataloader:
        optimizer.zero_grad()
        outputs = all_cols_model(x_cat, x_cont)
        train_loss = criterion(outputs, torch.cat((all_cols_model.last_target, x_cont), 1))
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()

    loss = loss / len(dataloader)
    print("epoch: {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch: 1/100, loss = 0.802192
epoch: 2/100, loss = 0.587430
epoch: 3/100, loss = 0.563110
epoch: 4/100, loss = 0.540228
epoch: 5/100, loss = 0.524651
epoch: 6/100, loss = 0.511783
epoch: 7/100, loss = 0.503997
epoch: 8/100, loss = 0.497053
epoch: 9/100, loss = 0.490834
epoch: 10/100, loss = 0.484748
epoch: 11/100, loss = 0.480123
epoch: 12/100, loss = 0.476143
epoch: 13/100, loss = 0.473645
epoch: 14/100, loss = 0.471910
epoch: 15/100, loss = 0.470136
epoch: 16/100, loss = 0.468560
epoch: 17/100, loss = 0.467343
epoch: 18/100, loss = 0.466572
epoch: 19/100, loss = 0.464882
epoch: 20/100, loss = 0.463597
epoch: 21/100, loss = 0.462804
epoch: 22/100, loss = 0.462486
epoch: 23/100, loss = 0.461753
epoch: 24/100, loss = 0.461230
epoch: 25/100, loss = 0.460672
epoch: 26/100, loss = 0.460670
epoch: 27/100, loss = 0.460256
epoch: 28/100, loss = 0.459365
epoch: 29/100, loss = 0.459115
epoch: 30/100, loss = 0.458987
epoch: 31/100, loss = 0.458754
epoch: 32/100, loss = 0.458768
epoch: 33/100, lo

In [95]:
cat = torch.tensor(df[cat_cols].values, dtype=torch.float)
cont = torch.tensor(df[cont_cols].values, dtype=torch.float)
decoder_all_cols_features = all_cols_model.encode(cat, cont).detach().numpy()
decoder_all_cols_features

array([[ 0.07509518, -0.32844543, -0.1906023 , ..., -0.56984806,
         2.4054174 ,  1.4794273 ],
       [-1.3279209 , -3.7389007 ,  2.0956469 , ...,  2.3532438 ,
        -1.2456269 ,  0.368495  ],
       [ 1.2748337 , -0.6890826 , -0.4127841 , ..., -2.7919455 ,
         2.7442555 ,  0.33372307],
       ...,
       [-2.2810316 , -2.057921  , -0.6197653 , ...,  2.0288439 ,
         1.324235  , -2.621729  ],
       [-0.36414146, -0.73335123, -0.01869631, ..., -0.18686485,
         2.5865703 , -0.45962954],
       [-0.87611675, -0.8533535 ,  3.9578562 , ...,  1.2284737 ,
        -0.6673651 , -3.7042866 ]], dtype=float32)

In [96]:
all_cols_kmeans = KMeans(n_clusters=2, n_init="auto", random_state=0).fit(decoder_all_cols_features)
all_cols_acc = cluster_accuracy(all_cols_kmeans.labels_, og_df["class"].to_numpy())
all_cols_acc

0.6772859424266

In [97]:
all_cols_nmi = normalized_mutual_info_score(og_df["class"].to_numpy(), all_cols_kmeans.labels_)
all_cols_nmi

0.1469560146376703

In [98]:
pd.DataFrame([[kmeans_acc, kmeans_nmi], [0.760697760124483, 0.000023], [deep_acc, deep_nmi], [all_cols_acc, all_cols_nmi]], index=["KMeans", "Gower + Agglomerative", "Deep Attention KMeans, only Cat Cols reconstructed in Decoder", "Deep Attention KMeans, all Cols reconstructed in Decoder"], columns=["Accuracy", "NMI"])

Unnamed: 0,Accuracy,NMI
KMeans,0.716576,0.132592
Gower + Agglomerative,0.760698,2.3e-05
"Deep Attention KMeans, only Cat Cols reconstructed in Decoder",0.506265,0.021787
"Deep Attention KMeans, all Cols reconstructed in Decoder",0.677286,0.146956
