In [42]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import normalized_mutual_info_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
from scipy.optimize import linear_sum_assignment
from gower_duped import gower_matrix as gower_matrix_duped

In [43]:
def cluster_accuracy(y_pred, y_true):
    # We need to map the labels to our cluster labels
    # This is a linear assignment problem on a bipartite graph
    k = max(len(np.unique(y_pred)), len(np.unique(y_pred)))
    cost_matrix = np.zeros((k, k))
    for i in range(y_pred.size):
        cost_matrix[y_pred[i], y_true[i]] += 1
    row_ind, col_ind = linear_sum_assignment(cost_matrix.max() - cost_matrix)
    return cost_matrix[row_ind, col_ind].sum() / y_pred.size

In [44]:
og_df = pd.read_csv("datasets/census_income.csv")
og_df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
48838,64,?,321403,HS-grad,9,Widowed,?,Other-relative,Black,Male,0,0,40,United-States,<=50K.
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


In [45]:
og_df.loc[(og_df["class"] == " <=50K.") | (og_df["class"] == " <=50K"), "class"] = 0
og_df.loc[(og_df["class"] == " >50K.") | (og_df["class"] == " >50K"), "class"] = 1
# Probability of most common class
og_df["class"].value_counts().max()/og_df["class"].count()

0.7607182343065395

In [46]:
cat_cols = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]
cont_cols = ["age", "fnlwgt", "education-num", "capital-gain", "capital-loss", "hours-per-week"]

In [47]:
df = og_df.copy()
df.drop(columns="class", inplace=True)
df[cat_cols] = df[cat_cols].apply(LabelEncoder().fit_transform)
df[cont_cols] = MinMaxScaler().fit_transform(df[cont_cols])
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,0.301370,7,0.044131,9,0.800000,4,1,1,4,1,0.021740,0.0,0.397959,39
1,0.452055,6,0.048052,9,0.800000,2,4,0,4,1,0.000000,0.0,0.122449,39
2,0.287671,4,0.137581,11,0.533333,0,6,1,4,1,0.000000,0.0,0.397959,39
3,0.493151,4,0.150486,1,0.400000,2,6,0,2,1,0.000000,0.0,0.397959,39
4,0.150685,4,0.220635,9,0.800000,2,10,5,2,0,0.000000,0.0,0.397959,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,0.301370,4,0.137428,9,0.800000,0,10,1,4,0,0.000000,0.0,0.357143,39
48838,0.643836,0,0.209130,11,0.533333,6,0,2,2,1,0.000000,0.0,0.397959,39
48839,0.287671,4,0.245379,9,0.800000,2,10,0,4,1,0.000000,0.0,0.500000,39
48840,0.369863,4,0.048444,9,0.800000,0,1,3,1,1,0.054551,0.0,0.397959,39


In [48]:
def encode_feature(df, feature_to_encode):
    dummies = pd.get_dummies(df[[feature_to_encode]], dtype=float)
    result_df = pd.concat([df, dummies], axis=1)
    result_df.drop(columns=feature_to_encode, inplace=True)
    return result_df

df_one_hot = og_df.copy()
df_one_hot.drop(columns="class", inplace=True)
df_one_hot[cont_cols] = MinMaxScaler().fit_transform(df_one_hot[cont_cols])
for col in cat_cols:
    df_one_hot = encode_feature(df_one_hot, col)
df_one_hot

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,0.301370,0.044131,0.800000,0.021740,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.452055,0.048052,0.800000,0.000000,0.0,0.122449,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.287671,0.137581,0.533333,0.000000,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.493151,0.150486,0.400000,0.000000,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.150685,0.220635,0.800000,0.000000,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,0.301370,0.137428,0.800000,0.000000,0.0,0.357143,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48838,0.643836,0.209130,0.533333,0.000000,0.0,0.397959,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48839,0.287671,0.245379,0.800000,0.000000,0.0,0.500000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48840,0.369863,0.048444,0.800000,0.054551,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [49]:
kmeans = KMeans(n_clusters=2, n_init="auto", random_state=0).fit(df_one_hot)
kmeans_acc = cluster_accuracy(kmeans.labels_, og_df["class"].to_numpy())
kmeans_acc

0.7165758977928832

In [50]:
kmeans_nmi = normalized_mutual_info_score(og_df["class"].to_numpy(), kmeans.labels_)
kmeans_nmi

0.1325918200579342

In [51]:
no_target_df = og_df.drop(columns="class")
distance_matrix = gower_matrix_duped(no_target_df)
distance_matrix

100%|██████████| 48842/48842 [18:37<00:00, 43.69it/s]  


array([[0.        , 0.3179897 , 0.31396827, ..., 0.30990908, 0.2932583 ,
        0.31081456],
       [0.3179897 , 0.        , 0.41400644, ..., 0.19566154, 0.38661748,
        0.12513845],
       [0.31396827, 0.41400644, 0.        , ..., 0.31975034, 0.32089615,
        0.3953219 ],
       ...,
       [0.30990908, 0.19566154, 0.31975034, ..., 0.        , 0.31683698,
        0.16239977],
       [0.2932583 , 0.38661748, 0.32089615, ..., 0.31683698, 0.        ,
        0.38917103],
       [0.31081456, 0.12513845, 0.3953219 , ..., 0.16239977, 0.38917103,
        0.        ]], dtype=float32)

In [52]:
gower_agglo = AgglomerativeClustering(n_clusters=2, metric="precomputed", linkage="average").fit_predict(distance_matrix)
gower_agglo_acc = cluster_accuracy(gower_agglo, og_df["class"].to_numpy())
gower_agglo_acc
# linkage=average: 0.7602882764833545
# linkage=single: 0.760697760124483

0.7602882764833545

In [53]:
gower_agglo_nmi = normalized_mutual_info_score(og_df["class"].to_numpy(), gower_agglo)
gower_agglo_nmi

2.3468560869329926e-05

In [54]:
embedding_sizes = [(df[col].nunique(), min(50, max(2, (df[col].nunique()+1) // 2))) for col in df[cat_cols]]
embedding_sizes

[(9, 5), (16, 8), (7, 4), (15, 8), (6, 3), (5, 3), (2, 2), (42, 21)]

In [55]:
class CensusIncomeDataset(Dataset):
    def __init__(self, df):
        self.cat = torch.tensor(df[cat_cols].values, dtype=torch.float)
        self.cont = torch.tensor(df[cont_cols].values, dtype=torch.float)

    def __getitem__(self, idx):
        return self.cat[idx], self.cont[idx]

    def __len__(self):
        return self.cat.shape[0]
    
dataset = CensusIncomeDataset(df)
dataloader = DataLoader(dataset, batch_size=512, shuffle=True)
len(dataset)

48842

In [56]:
class AttentionModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(num, dim) for num, dim in embedding_sizes])
        n_emb = sum(e.embedding_dim for e in self.embeddings)
        in_dim = n_emb + len(cont_cols)
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(in_dim, 32),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(32),
            torch.nn.Linear(32, 16),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(16),
            torch.nn.Linear(16, 8),
            torch.nn.Sigmoid(),
            torch.nn.BatchNorm1d(8),
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(8, 16),
            torch.nn.ReLU(),
            torch.nn.Linear(16, 32),
            torch.nn.ReLU(),
            torch.nn.Linear(32, in_dim),
            torch.nn.Sigmoid()
        )


    def encode(self, x_cat, x_cont):
        x_cat = x_cat.to(torch.long)
        embedded = torch.cat([e(x_cat[:, i]) for i, e in enumerate(self.embeddings)], 1)
        self.last_target = embedded.clone().detach()

        qkv = torch.cat((embedded, x_cont), 1)
        x = F.scaled_dot_product_attention(qkv, qkv, qkv)
        encoded = self.encoder(x)
        return encoded

    def forward(self, x_cat, x_cont):
        encoded = self.encode(x_cat, x_cont)
        decoded = self.decoder(encoded)
        return decoded


epochs = 100
lr = 0.001

model = AttentionModel()

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

for epoch in range(epochs):
    model.train()
    loss = 0

    for x_cat, x_cont in dataloader:
        optimizer.zero_grad()
        outputs = model(x_cat, x_cont)
        train_loss = criterion(outputs, torch.cat((model.last_target, x_cont), 1))
        train_loss.backward()
        optimizer.step()
        loss += train_loss.item()

    loss = loss / len(dataloader)
    print("epoch: {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

epoch: 1/100, loss = 1.081454
epoch: 2/100, loss = 0.810576
epoch: 3/100, loss = 0.782044
epoch: 4/100, loss = 0.759625
epoch: 5/100, loss = 0.736315
epoch: 6/100, loss = 0.717795
epoch: 7/100, loss = 0.697768
epoch: 8/100, loss = 0.680314
epoch: 9/100, loss = 0.666005
epoch: 10/100, loss = 0.653759
epoch: 11/100, loss = 0.644495
epoch: 12/100, loss = 0.635795
epoch: 13/100, loss = 0.626431
epoch: 14/100, loss = 0.619718
epoch: 15/100, loss = 0.612891
epoch: 16/100, loss = 0.605771
epoch: 17/100, loss = 0.599854
epoch: 18/100, loss = 0.593850
epoch: 19/100, loss = 0.588277
epoch: 20/100, loss = 0.582965
epoch: 21/100, loss = 0.578347
epoch: 22/100, loss = 0.574253
epoch: 23/100, loss = 0.570368
epoch: 24/100, loss = 0.567287
epoch: 25/100, loss = 0.565001
epoch: 26/100, loss = 0.562682
epoch: 27/100, loss = 0.560616
epoch: 28/100, loss = 0.558432
epoch: 29/100, loss = 0.557205
epoch: 30/100, loss = 0.556095
epoch: 31/100, loss = 0.555376
epoch: 32/100, loss = 0.552496
epoch: 33/100, lo

In [57]:
cat = torch.tensor(df[cat_cols].values, dtype=torch.float)
cont = torch.tensor(df[cont_cols].values, dtype=torch.float)
features = model.encode(cat, cont).detach().numpy()
features

array([[-0.7731142 ,  2.2105112 ,  2.4065523 , ..., -0.95765257,
         2.0670462 ,  1.998209  ],
       [-2.1812267 ,  2.2791386 ,  1.3138523 , ...,  1.6385336 ,
        -1.7250853 ,  0.4066143 ],
       [-0.47664738, -0.61287594, -1.8925271 , ...,  1.7260189 ,
        -1.6755476 , -2.3692465 ],
       ...,
       [-2.214448  ,  1.5419874 ,  0.33006716, ...,  2.3086662 ,
        -2.3054028 ,  0.78027916],
       [-1.505003  ,  2.1899023 ,  1.892911  , ...,  2.6764307 ,
        -0.8353691 ,  1.4537773 ],
       [-2.2850761 ,  2.9549255 ,  1.9880047 , ...,  0.5408449 ,
        -1.1282539 , -0.2725029 ]], dtype=float32)

In [58]:
kmeans = KMeans(n_clusters=2, n_init="auto", random_state=0).fit(features)
deep_acc = cluster_accuracy(kmeans.labels_, og_df["class"].to_numpy())
deep_acc

0.6074689816141845

In [59]:
deep_nmi = normalized_mutual_info_score(og_df["class"].to_numpy(), kmeans.labels_)
deep_nmi

0.12904984832427263

In [66]:
pd.DataFrame([[kmeans_acc, kmeans_nmi], [gower_agglo_acc, gower_agglo_nmi], [deep_acc, deep_nmi]], index=["KMeans", "Gower + Agglomerative", "Deep Attention KMeans"], columns=["Accuracy", "NMI"])

Unnamed: 0,Accuracy,NMI
KMeans,0.716576,0.132592
Gower + Agglomerative,0.760288,2.3e-05
Deep Attention KMeans,0.607469,0.12905
