In [1]:
import os
import pandas as pd
import numpy as np
import math
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score
import torch

from utils import *
from autoencoders.all_to_all_autoencoder import AllToAllAutoencoder
from autoencoders.transformer_autoencoder import TransformerAutoencoder
from autoencoders.attention_autoencoder import AttentionAutoencoder
from load_datasets import load_all_datasets

In [2]:
# --- Run configuration -------------------------------------------------------
# Single seed shared by the torch and numpy global generators below.
SEED = 0

# PyTorch's deterministic mode needs this cuBLAS workspace setting on CUDA;
# it is set before anything else in this cell touches CUDA.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

# Prefer the GPU, but fall back to the CPU so the notebook still runs
# end to end on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

torch.manual_seed(SEED)
torch.use_deterministic_algorithms(True)
np.random.seed(SEED)

# Every benchmark dataset, loaded with max_rows=5000.
datasets = load_all_datasets(max_rows=5000)

# Result tables filled by the experiment cells further down:
#   dataset name -> {model label -> score}
accuracies = {d.name: {} for d in datasets}
nmis = {d.name: {} for d in datasets}

In [3]:
# All Columns to All Columns Autoencoder
#
# For each dataset: train an AllToAllAutoencoder, encode the whole table,
# cluster the encoded rows with k-means (k = d.n_targets) and score the
# cluster labels against d.y (NMI and cluster accuracy).
for d in datasets:
    print(f"Calculating for {d.name}...")
    # Network width: sum of the second entry of every embedding_sizes pair
    # (presumably the per-column embedding width - confirm in load_datasets)
    # plus one input per continuous column. The loop variable has its own
    # name so it does not shadow the dataset `d`.
    input_dim = sum(emb_width for _, emb_width in d.embedding_sizes) + len(d.cont_cols)
    # The third argument is derived from log2 of the width, floored at 1;
    # its exact meaning is defined by build_autoencoder in utils.
    encoder, decoder = build_autoencoder(input_dim, input_dim, max(1, round(math.log2(input_dim))))
    print(encoder, decoder)

    ae = AllToAllAutoencoder(encoder, decoder, d.embedding_sizes, device=device)
    ae.fit(d.dataloader, n_epochs=100, lr=0.001)

    # Tensors built from numpy data carry no autograd history, so they need
    # no detach before being moved to the device.
    cat = torch.tensor(d.df[d.cat_cols].values, dtype=torch.int).to(device)
    cont = torch.tensor(d.df[d.cont_cols].values, dtype=torch.float).to(device)
    # Encoding is inference only: do not record an autograd graph for the
    # full table. The trailing detach is kept as a cheap guard.
    with torch.no_grad():
        features = ae.encode(cat, cont).detach().cpu().numpy()
    kmeans = KMeans(n_clusters=d.n_targets, init="random", max_iter=300, n_init=1, random_state=0, algorithm="lloyd").fit(features)

    nmis[d.name]["All cols AE"] = normalized_mutual_info_score(d.y, kmeans.labels_)
    accuracies[d.name]["All cols AE"] = cluster_accuracy(d.y, kmeans.labels_)

Calculating for Abalone...
Sequential(
  (0): Linear(in_features=9, out_features=7, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=7, out_features=5, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=5, out_features=2, bias=True)
  (5): LeakyReLU(negative_slope=0.01)
) Sequential(
  (0): Linear(in_features=2, out_features=5, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=5, out_features=7, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=7, out_features=9, bias=True)
  (5): LeakyReLU(negative_slope=0.01)
)
Epoch 25/100 - Batch Reconstruction loss: 0.414975
Epoch 50/100 - Batch Reconstruction loss: 0.438344
Epoch 75/100 - Batch Reconstruction loss: 0.359090
Epoch 100/100 - Batch Reconstruction loss: 0.117859
Calculating for Auction Verification...
Sequential(
  (0): Linear(in_features=13, out_features=11, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=11

In [4]:
# All Columns to All Columns Autoencoder with Attention
#
# For each dataset: train an AttentionAutoencoder, encode the whole table,
# cluster the encoded rows with k-means (k = d.n_targets) and score the
# cluster labels against d.y (NMI and cluster accuracy).

# Value passed to the model as emb_dim and also used to size the network
# input; one name keeps the two uses from drifting apart.
emb_dim = 32

for d in datasets:
    print(f"Calculating for {d.name}...")
    # One input per continuous column plus emb_dim inputs per categorical
    # column.
    input_dim = len(d.cont_cols) + len(d.cat_cols) * emb_dim
    # The third argument is derived from log2 of the width, floored at 1;
    # its exact meaning is defined by build_autoencoder in utils.
    encoder, decoder = build_autoencoder(input_dim, input_dim, max(1, round(math.log2(input_dim))))
    print(encoder, decoder)

    ae = AttentionAutoencoder(encoder, decoder, d.embedding_sizes, emb_dim=emb_dim, device=device)
    ae.fit(d.dataloader, n_epochs=100, lr=0.001)

    # Tensors built from numpy data carry no autograd history, so they need
    # no detach before being moved to the device.
    cat = torch.tensor(d.df[d.cat_cols].values, dtype=torch.int).to(device)
    cont = torch.tensor(d.df[d.cont_cols].values, dtype=torch.float).to(device)
    # Encoding is inference only: do not record an autograd graph for the
    # full table. The trailing detach is kept as a cheap guard.
    with torch.no_grad():
        features = ae.encode(cat, cont).detach().cpu().numpy()
    kmeans = KMeans(n_clusters=d.n_targets, init="random", n_init=1, max_iter=300, random_state=0, algorithm="lloyd").fit(features)

    nmis[d.name]["AE with Attention"] = normalized_mutual_info_score(d.y, kmeans.labels_)
    accuracies[d.name]["AE with Attention"] = cluster_accuracy(d.y, kmeans.labels_)

Calculating for Abalone...
Sequential(
  (0): Linear(in_features=39, out_features=33, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=33, out_features=27, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=27, out_features=21, bias=True)
  (5): LeakyReLU(negative_slope=0.01)
  (6): Linear(in_features=21, out_features=10, bias=True)
  (7): LeakyReLU(negative_slope=0.01)
) Sequential(
  (0): Linear(in_features=10, out_features=21, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=21, out_features=27, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=27, out_features=33, bias=True)
  (5): LeakyReLU(negative_slope=0.01)
  (6): Linear(in_features=33, out_features=39, bias=True)
  (7): LeakyReLU(negative_slope=0.01)
)
Epoch 25/100 - Batch Reconstruction loss: 0.051670
Epoch 50/100 - Batch Reconstruction loss: 0.016812
Epoch 75/100 - Batch Reconstruction loss: 0.016067
Epoch 100/100 - Batch Re

In [5]:
# Transformer
#
# For each dataset: train a TransformerAutoencoder (depth=6), encode the
# whole table, cluster the encoded rows with k-means (k = d.n_targets) and
# score the cluster labels against d.y (NMI and cluster accuracy).

# Value passed to the model as emb_dim and also used to size the network
# input; one name keeps the two uses from drifting apart.
emb_dim = 32

for d in datasets:
    print(f"Calculating for {d.name}...")
    # One input per continuous column plus emb_dim inputs per categorical
    # column.
    input_dim = len(d.cont_cols) + len(d.cat_cols) * emb_dim
    # The third argument is derived from log2 of the width, floored at 1;
    # its exact meaning is defined by build_autoencoder in utils.
    encoder, decoder = build_autoencoder(input_dim, input_dim, max(1, round(math.log2(input_dim))))
    print(encoder, decoder)

    ae = TransformerAutoencoder(encoder, decoder, d.embedding_sizes, emb_dim=emb_dim, depth=6, device=device)
    ae.fit(d.dataloader, n_epochs=100, lr=0.001)

    # Tensors built from numpy data carry no autograd history, so they need
    # no detach before being moved to the device.
    cat = torch.tensor(d.df[d.cat_cols].values, dtype=torch.int).to(device)
    cont = torch.tensor(d.df[d.cont_cols].values, dtype=torch.float).to(device)
    # Encoding is inference only: do not record an autograd graph for the
    # full table. The trailing detach is kept as a cheap guard.
    with torch.no_grad():
        features = ae.encode(cat, cont).detach().cpu().numpy()
    kmeans = KMeans(n_clusters=d.n_targets, init="random", n_init=1, max_iter=300, random_state=0, algorithm="lloyd").fit(features)

    nmis[d.name]["Transformer n=6"] = normalized_mutual_info_score(d.y, kmeans.labels_)
    accuracies[d.name]["Transformer n=6"] = cluster_accuracy(d.y, kmeans.labels_)

Calculating for Abalone...
Sequential(
  (0): Linear(in_features=39, out_features=33, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=33, out_features=27, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=27, out_features=21, bias=True)
  (5): LeakyReLU(negative_slope=0.01)
  (6): Linear(in_features=21, out_features=10, bias=True)
  (7): LeakyReLU(negative_slope=0.01)
) Sequential(
  (0): Linear(in_features=10, out_features=21, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=21, out_features=27, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=27, out_features=33, bias=True)
  (5): LeakyReLU(negative_slope=0.01)
  (6): Linear(in_features=33, out_features=39, bias=True)
  (7): LeakyReLU(negative_slope=0.01)
)
Epoch 25/100 - Batch Reconstruction loss: 0.065678
Epoch 50/100 - Batch Reconstruction loss: 0.060802
Epoch 75/100 - Batch Reconstruction loss: 0.061031
Epoch 100/100 - Batch Re

In [6]:
# NMI summary: one row per dataset, one column per model label, shown to
# four decimals.
nmi_rows = list(nmis.values())
nmi_index = list(nmis.keys())
pd.DataFrame(nmi_rows, index=nmi_index).round(4)

Unnamed: 0,All cols AE,AE with Attention,Transformer n=6
Abalone,0.1603,0.1681,0.1612
Auction Verification,0.1234,0.0109,0.0045
Bank Marketing,0.004,0.0394,0.0022
Breast Cancer,0.5521,0.1906,0.5569
Census Income,0.1475,0.0237,0.0056
Credit Approval,0.0123,0.1833,0.3061
Heart Disease,0.155,0.1244,0.1863
Soybean Disease,0.5656,0.2514,0.3966


In [7]:
# Cluster-accuracy summary: one row per dataset, one column per model label,
# shown to four decimals.
accuracy_rows = list(accuracies.values())
accuracy_index = list(accuracies.keys())
pd.DataFrame(accuracy_rows, index=accuracy_index).round(4)

Unnamed: 0,All cols AE,AE with Attention,Transformer n=6
Abalone,0.1503,0.1575,0.1345
Auction Verification,0.8331,0.6422,0.6393
Bank Marketing,0.7416,0.7044,0.5636
Breast Cancer,0.9136,0.7277,0.8799
Census Income,0.674,0.7146,0.6812
Credit Approval,0.536,0.7335,0.7887
Heart Disease,0.4448,0.3411,0.4515
Soybean Disease,0.4751,0.2473,0.3114
