In [1]:
import os
from copy import deepcopy
import pandas as pd
import numpy as np
import math
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score
from sklearn.preprocessing import StandardScaler
from scipy.optimize import linear_sum_assignment
import torch
import torch.nn as nn

from autoencoders.all_to_all_autoencoder import AllToAllAutoencoder
from autoencoders.transformer_autoencoder import TransformerAutoencoder
from load_datasets import load_all_datasets

In [2]:
device = "cpu"
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
torch.manual_seed(0)
torch.use_deterministic_algorithms(True)
np.random.seed(0)

In [3]:
def cluster_accuracy(labels_true, labels_pred):
    # We need to map the labels to our cluster labels
    # This is a linear assignment problem on a bipartite graph
    k = max(len(np.unique(labels_true)), len(np.unique(labels_pred)))
    cost_matrix = np.zeros((k, k))
    for i in range(labels_true.shape[0]):
        cost_matrix[labels_true[i], labels_pred[i]] += 1
    inverted_cost_matrix = cost_matrix.max() - cost_matrix
    row_ind, col_ind = linear_sum_assignment(inverted_cost_matrix)
    return cost_matrix[row_ind, col_ind].sum() / labels_pred.size

In [5]:
def build_block(layers: list, activation_fn: torch.nn.Module = nn.LeakyReLU, output_fn: torch.nn.Module = nn.LeakyReLU, 
                bias: bool = True, batch_norm: bool = False, dropout: float = None):
    block_list = []
    for i in range(len(layers) - 1):
        block_list.append(torch.nn.Linear(layers[i], layers[i + 1], bias=bias))
        if batch_norm:
            block_list.append(torch.nn.BatchNorm1d(layers[i + 1]))
        if dropout is not None:
            block_list.append(torch.nn.Dropout(dropout))
        if activation_fn is not None:
            if (i != len(layers) - 2):
                block_list.append(activation_fn())
            else:
                if output_fn is not None:
                    block_list.append(output_fn())
    return torch.nn.Sequential(*block_list)

def build_autoencoder(input_dim: int, output_dim: int, layer_per_block: int, activation_fn: torch.nn.Module = nn.LeakyReLU, 
                      output_fn: torch.nn.Module = nn.LeakyReLU, bias: bool = True, batch_norm: bool = False, dropout: float = None):
    hidden_dim = max(1, min(round(input_dim/4), round(output_dim/4)))

    encoder_layer_list = list(range(input_dim, hidden_dim - 1, min(-1, -round((input_dim - hidden_dim) / layer_per_block))))
    encoder_layer_list[-1] = hidden_dim
    encoder = build_block(encoder_layer_list, activation_fn, output_fn, bias, batch_norm, dropout)

    decoder_layer_list = list(range(hidden_dim, output_dim + 1, max(1, round((output_dim - hidden_dim) / layer_per_block))))
    decoder_layer_list[-1] = output_dim
    decoder = build_block(decoder_layer_list, activation_fn, output_fn, bias, batch_norm, dropout)

    return encoder, decoder

In [6]:
datasets = load_all_datasets(max_rows=5000)

accuracies = {d.name: {} for d in datasets}
nmis = {d.name: {} for d in datasets}

for d in datasets:
    print(f"{d.name}: Input dim: {d.input_dim}; Cat dim: {d.cat_dim}; Cont dim: {d.cont_dim}")

Abalone: Input dim: 9; Cat dim: 2; Cont dim: 7
Auction Verification: Input dim: 13; Cat dim: 12; Cont dim: 1
Bank Marketing: Input dim: 22; Cat dim: 17; Cont dim: 5
Breast Cancer: Input dim: 45; Cat dim: 45; Cont dim: 0
Census Income: Input dim: 58; Cat dim: 52; Cont dim: 6
Credit Approval: Input dim: 28; Cat dim: 22; Cont dim: 6
Heart Disease: Input dim: 17; Cat dim: 11; Cont dim: 6
Soybean Disease: Input dim: 55; Cat dim: 55; Cont dim: 0


In [10]:
# All Columns to All Columns Autoencoder
for d in datasets:
    print(f"Calculating for {d.name}...")
    encoder, decoder = build_autoencoder(d.input_dim, d.input_dim, max(1, round(math.log2(d.input_dim))))
    print(encoder, decoder)

    ae = AllToAllAutoencoder(encoder, decoder, d.input_dim, d.cat_dim, d.embedding_sizes, device=device)
    ae.fit(d.dataloader, n_epochs=100, lr=0.001)

    cat = torch.tensor(d.df[d.cat_cols].values, dtype=torch.int).detach().to(device)
    cont = torch.tensor(d.df[d.cont_cols].values, dtype=torch.float).detach().to(device)
    features = ae.encode(cat, cont).detach().cpu().numpy()
    kmeans = KMeans(n_clusters=d.n_targets, init="random", n_init=1, max_iter=300, random_state=0, algorithm="lloyd").fit(features)

    nmis[d.name]["All cols AE"] = normalized_mutual_info_score(d.y, kmeans.labels_)
    accuracies[d.name]["All cols AE"] = cluster_accuracy(d.y, kmeans.labels_)

Calculating for Abalone...
Sequential(
  (0): Linear(in_features=9, out_features=7, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=7, out_features=5, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=5, out_features=2, bias=True)
  (5): LeakyReLU(negative_slope=0.01)
) Sequential(
  (0): Linear(in_features=2, out_features=4, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=4, out_features=6, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=6, out_features=9, bias=True)
  (5): LeakyReLU(negative_slope=0.01)
)
Epoch 25/100 - Batch Reconstruction loss: 0.465266
Epoch 50/100 - Batch Reconstruction loss: 0.333456
Epoch 75/100 - Batch Reconstruction loss: 0.309214
Epoch 100/100 - Batch Reconstruction loss: 0.268465
Calculating for Auction Verification...
Sequential(
  (0): Linear(in_features=13, out_features=11, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=11

In [11]:
# All Columns to All Columns Autoencoder with Attention
for d in datasets:
    print(f"Calculating for {d.name}...")
    encoder, decoder = build_autoencoder(d.input_dim, d.input_dim, max(1, round(math.log2(d.input_dim))))
    print(encoder, decoder)

    ae = AllToAllAutoencoder(encoder, decoder, d.input_dim, d.cat_dim, d.embedding_sizes, attention=True, device=device)
    ae.fit(d.dataloader, n_epochs=100, lr=0.001)

    cat = torch.tensor(d.df[d.cat_cols].values, dtype=torch.int).detach().to(device)
    cont = torch.tensor(d.df[d.cont_cols].values, dtype=torch.float).detach().to(device)
    features = ae.encode(cat, cont).detach().cpu().numpy()
    kmeans = KMeans(n_clusters=d.n_targets, init="random", n_init=1, max_iter=300, random_state=0, algorithm="lloyd").fit(features)

    nmis[d.name]["AE with Attention"] = normalized_mutual_info_score(d.y, kmeans.labels_)
    accuracies[d.name]["AE with Attention"] = cluster_accuracy(d.y, kmeans.labels_)

Calculating for Abalone...
Sequential(
  (0): Linear(in_features=9, out_features=7, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=7, out_features=5, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=5, out_features=2, bias=True)
  (5): LeakyReLU(negative_slope=0.01)
) Sequential(
  (0): Linear(in_features=2, out_features=4, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=4, out_features=6, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=6, out_features=9, bias=True)
  (5): LeakyReLU(negative_slope=0.01)
)


Epoch 25/100 - Batch Reconstruction loss: 0.339005
Epoch 50/100 - Batch Reconstruction loss: 0.272260
Epoch 75/100 - Batch Reconstruction loss: 0.265192
Epoch 100/100 - Batch Reconstruction loss: 0.263849
Calculating for Auction Verification...
Sequential(
  (0): Linear(in_features=13, out_features=11, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=11, out_features=9, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=9, out_features=7, bias=True)
  (5): LeakyReLU(negative_slope=0.01)
  (6): Linear(in_features=7, out_features=5, bias=True)
  (7): LeakyReLU(negative_slope=0.01)
  (8): Linear(in_features=5, out_features=3, bias=True)
  (9): LeakyReLU(negative_slope=0.01)
) Sequential(
  (0): Linear(in_features=3, out_features=5, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=5, out_features=7, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=7, out_features=9, bias=True)
  (5): Leaky

In [7]:
# Transformer
for d in datasets:
    print(f"Calculating for {d.name}...")
    encoder, decoder = build_autoencoder(d.input_dim, d.input_dim, max(1, round(math.log2(d.input_dim))))
    print(encoder, decoder)

    ae = TransformerAutoencoder(encoder, decoder, d.input_dim, d.cat_dim, d.embedding_sizes, depth=8, device=device)
    ae.fit(d.dataloader, n_epochs=100, lr=0.001)

    cat = torch.tensor(d.df[d.cat_cols].values, dtype=torch.int).detach().to(device)
    cont = torch.tensor(d.df[d.cont_cols].values, dtype=torch.float).detach().to(device)
    features = ae.encode(cat, cont).detach().cpu().numpy()
    kmeans = KMeans(n_clusters=d.n_targets, init="random", n_init=1, max_iter=300, random_state=0, algorithm="lloyd").fit(features)

    nmis[d.name]["Transformer n=8"] = normalized_mutual_info_score(d.y, kmeans.labels_)
    accuracies[d.name]["Transformer n=8"] = cluster_accuracy(d.y, kmeans.labels_)

Calculating for Abalone...
Sequential(
  (0): Linear(in_features=9, out_features=7, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=7, out_features=5, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=5, out_features=2, bias=True)
  (5): LeakyReLU(negative_slope=0.01)
) Sequential(
  (0): Linear(in_features=2, out_features=4, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=4, out_features=6, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=6, out_features=9, bias=True)
  (5): LeakyReLU(negative_slope=0.01)
)
Epoch 25/100 - Batch Reconstruction loss: 0.142252
Epoch 50/100 - Batch Reconstruction loss: 0.138866
Epoch 75/100 - Batch Reconstruction loss: 0.131882
Epoch 100/100 - Batch Reconstruction loss: 0.119488
Calculating for Auction Verification...
Sequential(
  (0): Linear(in_features=13, out_features=11, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=11

In [12]:
pd.DataFrame(nmis.values(), index=nmis.keys()).round(4)

Unnamed: 0,Transformer n=8,All cols AE,AE with Attention
Abalone,0.1687,0.1154,0.1575
Auction Verification,0.0025,0.0082,0.0
Bank Marketing,0.0076,0.0198,0.0008
Breast Cancer,0.5486,0.6629,0.6388
Census Income,0.0749,0.1208,0.0187
Credit Approval,0.1498,0.0225,0.0001
Heart Disease,0.0958,0.0998,0.1876
Soybean Disease,0.492,0.4519,0.3713


In [13]:
pd.DataFrame(accuracies.values(), index=accuracies.keys()).round(4)

Unnamed: 0,Transformer n=8,All cols AE,AE with Attention
Abalone,0.1245,0.136,0.1211
Auction Verification,0.5507,0.8023,0.6495
Bank Marketing,0.5994,0.7254,0.6778
Breast Cancer,0.8887,0.9268,0.9297
Census Income,0.7352,0.6754,0.6134
Credit Approval,0.7259,0.5957,0.513
Heart Disease,0.3244,0.3545,0.398
Soybean Disease,0.3648,0.3808,0.258
