In [1]:
import copy
import math
import os

import numpy as np
import pandas as pd
import torch
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score

from utils import *
from autoencoders.ft_transformer_autoencoder import FTTransformerAutoencoder
from load_datasets import load_all_datasets
from duped_modules.dcn_duped import DCNDuped
from duped_modules.dec_duped import DECDuped, IDECDuped


In [2]:
# Reproducibility / device configuration for the whole notebook.
# cuBLAS needs this workspace setting for torch.use_deterministic_algorithms(True)
# to work on CUDA; it is set first so it is in place before any CUDA work starts.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
# Prefer the GPU, but fall back to the CPU so the notebook still runs on machines
# without CUDA instead of failing on the first `.to(device)`.
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.manual_seed(0)
torch.use_deterministic_algorithms(True)
np.random.seed(0)

# NOTE(review): `max_rows=5000` presumably caps the rows loaded per dataset — confirm in load_datasets.
datasets = load_all_datasets(max_rows=5000)
# Result tables filled by the experiment cells: {dataset name: {method label: score}}.
accuracies = {d.name: {} for d in datasets}
nmis = {d.name: {} for d in datasets}

In [3]:
# FT Transformer
# Baseline: for every dataset, train an FT-Transformer autoencoder, encode the full
# dataset and cluster the encodings with k-means; store NMI and clustering accuracy.
for d in datasets:
    print(f"Calculating for {d.name}...")
    # One 32-wide slot per column (32 matches the emb_dim passed to the autoencoder below).
    input_dim = (len(d.cont_cols) + len(d.cat_cols)) * 32
    # NOTE(review): the third argument looks like the number of layers (~log2 of the width),
    # judging by the printed architecture — confirm against utils.build_autoencoder.
    encoder, decoder = build_autoencoder(input_dim, input_dim, max(1, round(math.log2(input_dim))))
    print(encoder, decoder)

    ae = FTTransformerAutoencoder(encoder, decoder, len(d.cont_cols), d.embedding_sizes, emb_dim=32, depth=6, device=device)
    ae.fit(d.dataloader, n_epochs=100, lr=0.001)

    # Inference only: disable autograd so encoding the whole dataset does not build
    # (and hold in memory) a computation graph that is thrown away immediately.
    # NOTE(review): if FTTransformerAutoencoder uses dropout and fit() leaves it in
    # training mode, an explicit eval() call may be needed here — confirm.
    with torch.no_grad():
        cat = torch.tensor(d.df[d.cat_cols].values, dtype=torch.int).to(device)
        cont = torch.tensor(d.df[d.cont_cols].values, dtype=torch.float).to(device)
        features = ae.encode(cat, cont).detach().cpu().numpy()
    kmeans = KMeans(n_clusters=d.n_targets, init="random", n_init=1, max_iter=300, random_state=0, algorithm="lloyd").fit(features)

    nmis[d.name]["FT Transformer n=6"] = normalized_mutual_info_score(d.y, kmeans.labels_)
    accuracies[d.name]["FT Transformer n=6"] = cluster_accuracy(d.y, kmeans.labels_)

Calculating for Abalone...
Sequential(
  (0): Linear(in_features=256, out_features=232, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=232, out_features=208, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=208, out_features=184, bias=True)
  (5): LeakyReLU(negative_slope=0.01)
  (6): Linear(in_features=184, out_features=160, bias=True)
  (7): LeakyReLU(negative_slope=0.01)
  (8): Linear(in_features=160, out_features=136, bias=True)
  (9): LeakyReLU(negative_slope=0.01)
  (10): Linear(in_features=136, out_features=112, bias=True)
  (11): LeakyReLU(negative_slope=0.01)
  (12): Linear(in_features=112, out_features=88, bias=True)
  (13): LeakyReLU(negative_slope=0.01)
  (14): Linear(in_features=88, out_features=64, bias=True)
  (15): LeakyReLU(negative_slope=0.01)
) Sequential(
  (0): Linear(in_features=64, out_features=88, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=88, out_features=112, bias=True)
  (3)

In [4]:
# NMI scores: one row per dataset, one column per method label, rounded to 4 decimals.
pd.DataFrame(
    data=list(nmis.values()),
    index=list(nmis.keys()),
).round(decimals=4)

Unnamed: 0,FT Transformer n=6
Abalone,0.175
Auction Verification,0.0065
Bank Marketing,0.004
Breast Cancer,0.4371
Census Income,0.1423
Credit Approval,0.0897
Heart Disease,0.1717
Soybean Disease,0.2755


In [5]:
# Clustering accuracies: one row per dataset, one column per method label, rounded to 4 decimals.
pd.DataFrame(
    data=list(accuracies.values()),
    index=list(accuracies.keys()),
).round(decimals=4)

Unnamed: 0,FT Transformer n=6
Abalone,0.1609
Auction Verification,0.629
Bank Marketing,0.8796
Breast Cancer,0.8184
Census Income,0.7246
Credit Approval,0.6141
Heart Disease,0.3813
Soybean Disease,0.242


In [3]:
# Deep Clustering with FT-Transformer
# For every dataset: pretrain one FT-Transformer autoencoder, then run DCN, DEC and IDEC
# on top of it and store NMI and clustering accuracy for each method.
for d in datasets:
    print(f"Calculating for {d.name}...")
    # One 32-wide slot per column (32 matches the emb_dim passed to the autoencoder below).
    input_dim = (len(d.cont_cols) + len(d.cat_cols)) * 32
    encoder, decoder = build_autoencoder(input_dim, input_dim, max(1, round(math.log2(input_dim))))
    print(encoder, decoder)
    ae = FTTransformerAutoencoder(encoder, decoder, len(d.cont_cols), d.embedding_sizes, emb_dim=32, depth=6, device=device)
    ae.fit(d.dataloader, n_epochs=100, lr=0.001)

    # Each method gets its own copy of the pretrained autoencoder. The *Duped classes are
    # not visible from this notebook; if they fine-tune the autoencoder in place, sharing
    # one instance would make DEC start from DCN's weights and IDEC from DEC's, so the
    # three columns would not be comparable. Copying is a no-op for the results if the
    # classes already copy internally.
    dcn = DCNDuped(n_clusters=d.n_targets, autoencoder=copy.deepcopy(ae), random_state=np.random.RandomState(0))
    dcn.fit(d.dataloader)
    nmis[d.name]["FT-Transformer DCN"] = normalized_mutual_info_score(d.y, dcn.labels_)
    accuracies[d.name]["FT-Transformer DCN"] = cluster_accuracy(d.y, dcn.labels_)

    dec = DECDuped(n_clusters=d.n_targets, autoencoder=copy.deepcopy(ae), random_state=np.random.RandomState(0))
    dec.fit(d.dataloader)
    nmis[d.name]["FT-Transformer DEC"] = normalized_mutual_info_score(d.y, dec.labels_)
    accuracies[d.name]["FT-Transformer DEC"] = cluster_accuracy(d.y, dec.labels_)

    idec = IDECDuped(n_clusters=d.n_targets, autoencoder=copy.deepcopy(ae), random_state=np.random.RandomState(0))
    idec.fit(d.dataloader)
    nmis[d.name]["FT-Transformer IDEC"] = normalized_mutual_info_score(d.y, idec.labels_)
    accuracies[d.name]["FT-Transformer IDEC"] = cluster_accuracy(d.y, idec.labels_)

Calculating for Abalone...
Sequential(
  (0): Linear(in_features=256, out_features=232, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=232, out_features=208, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=208, out_features=184, bias=True)
  (5): LeakyReLU(negative_slope=0.01)
  (6): Linear(in_features=184, out_features=160, bias=True)
  (7): LeakyReLU(negative_slope=0.01)
  (8): Linear(in_features=160, out_features=136, bias=True)
  (9): LeakyReLU(negative_slope=0.01)
  (10): Linear(in_features=136, out_features=112, bias=True)
  (11): LeakyReLU(negative_slope=0.01)
  (12): Linear(in_features=112, out_features=88, bias=True)
  (13): LeakyReLU(negative_slope=0.01)
  (14): Linear(in_features=88, out_features=64, bias=True)
  (15): LeakyReLU(negative_slope=0.01)
) Sequential(
  (0): Linear(in_features=64, out_features=88, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=88, out_features=112, bias=True)
  (3)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Calculating for Auction Verification...
Sequential(
  (0): Linear(in_features=224, out_features=203, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=203, out_features=182, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=182, out_features=161, bias=True)
  (5): LeakyReLU(negative_slope=0.01)
  (6): Linear(in_features=161, out_features=140, bias=True)
  (7): LeakyReLU(negative_slope=0.01)
  (8): Linear(in_features=140, out_features=119, bias=True)
  (9): LeakyReLU(negative_slope=0.01)
  (10): Linear(in_features=119, out_features=98, bias=True)
  (11): LeakyReLU(negative_slope=0.01)
  (12): Linear(in_features=98, out_features=77, bias=True)
  (13): LeakyReLU(negative_slope=0.01)
  (14): Linear(in_features=77, out_features=56, bias=True)
  (15): LeakyReLU(negative_slope=0.01)
) Sequential(
  (0): Linear(in_features=56, out_features=77, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=77, out_features=98, bias=T

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Calculating for Bank Marketing...
Sequential(
  (0): Linear(in_features=416, out_features=381, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=381, out_features=346, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=346, out_features=311, bias=True)
  (5): LeakyReLU(negative_slope=0.01)
  (6): Linear(in_features=311, out_features=276, bias=True)
  (7): LeakyReLU(negative_slope=0.01)
  (8): Linear(in_features=276, out_features=241, bias=True)
  (9): LeakyReLU(negative_slope=0.01)
  (10): Linear(in_features=241, out_features=206, bias=True)
  (11): LeakyReLU(negative_slope=0.01)
  (12): Linear(in_features=206, out_features=171, bias=True)
  (13): LeakyReLU(negative_slope=0.01)
  (14): Linear(in_features=171, out_features=104, bias=True)
  (15): LeakyReLU(negative_slope=0.01)
) Sequential(
  (0): Linear(in_features=104, out_features=171, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=171, out_features=206, bia

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Calculating for Breast Cancer...
Sequential(
  (0): Linear(in_features=288, out_features=261, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=261, out_features=234, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=234, out_features=207, bias=True)
  (5): LeakyReLU(negative_slope=0.01)
  (6): Linear(in_features=207, out_features=180, bias=True)
  (7): LeakyReLU(negative_slope=0.01)
  (8): Linear(in_features=180, out_features=153, bias=True)
  (9): LeakyReLU(negative_slope=0.01)
  (10): Linear(in_features=153, out_features=126, bias=True)
  (11): LeakyReLU(negative_slope=0.01)
  (12): Linear(in_features=126, out_features=99, bias=True)
  (13): LeakyReLU(negative_slope=0.01)
  (14): Linear(in_features=99, out_features=72, bias=True)
  (15): LeakyReLU(negative_slope=0.01)
) Sequential(
  (0): Linear(in_features=72, out_features=99, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=99, out_features=126, bias=True)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Calculating for Census Income...
Sequential(
  (0): Linear(in_features=448, out_features=411, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=411, out_features=374, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=374, out_features=337, bias=True)
  (5): LeakyReLU(negative_slope=0.01)
  (6): Linear(in_features=337, out_features=300, bias=True)
  (7): LeakyReLU(negative_slope=0.01)
  (8): Linear(in_features=300, out_features=263, bias=True)
  (9): LeakyReLU(negative_slope=0.01)
  (10): Linear(in_features=263, out_features=226, bias=True)
  (11): LeakyReLU(negative_slope=0.01)
  (12): Linear(in_features=226, out_features=189, bias=True)
  (13): LeakyReLU(negative_slope=0.01)
  (14): Linear(in_features=189, out_features=152, bias=True)
  (15): LeakyReLU(negative_slope=0.01)
  (16): Linear(in_features=152, out_features=112, bias=True)
  (17): LeakyReLU(negative_slope=0.01)
) Sequential(
  (0): Linear(in_features=112, out_features=152, bi

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Calculating for Credit Approval...
Sequential(
  (0): Linear(in_features=480, out_features=440, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=440, out_features=400, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=400, out_features=360, bias=True)
  (5): LeakyReLU(negative_slope=0.01)
  (6): Linear(in_features=360, out_features=320, bias=True)
  (7): LeakyReLU(negative_slope=0.01)
  (8): Linear(in_features=320, out_features=280, bias=True)
  (9): LeakyReLU(negative_slope=0.01)
  (10): Linear(in_features=280, out_features=240, bias=True)
  (11): LeakyReLU(negative_slope=0.01)
  (12): Linear(in_features=240, out_features=200, bias=True)
  (13): LeakyReLU(negative_slope=0.01)
  (14): Linear(in_features=200, out_features=160, bias=True)
  (15): LeakyReLU(negative_slope=0.01)
  (16): Linear(in_features=160, out_features=120, bias=True)
  (17): LeakyReLU(negative_slope=0.01)
) Sequential(
  (0): Linear(in_features=120, out_features=160, 

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Calculating for Heart Disease...
Sequential(
  (0): Linear(in_features=416, out_features=381, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=381, out_features=346, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=346, out_features=311, bias=True)
  (5): LeakyReLU(negative_slope=0.01)
  (6): Linear(in_features=311, out_features=276, bias=True)
  (7): LeakyReLU(negative_slope=0.01)
  (8): Linear(in_features=276, out_features=241, bias=True)
  (9): LeakyReLU(negative_slope=0.01)
  (10): Linear(in_features=241, out_features=206, bias=True)
  (11): LeakyReLU(negative_slope=0.01)
  (12): Linear(in_features=206, out_features=171, bias=True)
  (13): LeakyReLU(negative_slope=0.01)
  (14): Linear(in_features=171, out_features=104, bias=True)
  (15): LeakyReLU(negative_slope=0.01)
) Sequential(
  (0): Linear(in_features=104, out_features=171, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=171, out_features=206, bias

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Calculating for Soybean Disease...
Sequential(
  (0): Linear(in_features=1120, out_features=1036, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=1036, out_features=952, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=952, out_features=868, bias=True)
  (5): LeakyReLU(negative_slope=0.01)
  (6): Linear(in_features=868, out_features=784, bias=True)
  (7): LeakyReLU(negative_slope=0.01)
  (8): Linear(in_features=784, out_features=700, bias=True)
  (9): LeakyReLU(negative_slope=0.01)
  (10): Linear(in_features=700, out_features=616, bias=True)
  (11): LeakyReLU(negative_slope=0.01)
  (12): Linear(in_features=616, out_features=532, bias=True)
  (13): LeakyReLU(negative_slope=0.01)
  (14): Linear(in_features=532, out_features=448, bias=True)
  (15): LeakyReLU(negative_slope=0.01)
  (16): Linear(in_features=448, out_features=364, bias=True)
  (17): LeakyReLU(negative_slope=0.01)
  (18): Linear(in_features=364, out_features=280, bias=True)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [4]:
# NMI scores: one row per dataset, one column per method label, rounded to 4 decimals.
pd.DataFrame(
    data=list(nmis.values()),
    index=list(nmis.keys()),
).round(decimals=4)

Unnamed: 0,FT-Transformer DCN,FT-Transformer DEC,FT-Transformer IDEC
Abalone,0.1704,0.152,0.166
Auction Verification,0.0,0.0,0.0
Bank Marketing,0.0006,0.0015,0.0012
Breast Cancer,0.2062,0.2062,0.2062
Census Income,0.1333,0.1333,0.1333
Credit Approval,0.0725,0.0725,0.0725
Heart Disease,0.2473,0.2434,0.2434
Soybean Disease,0.2605,0.2605,0.2684


In [5]:
# Clustering accuracies: one row per dataset, one column per method label, rounded to 4 decimals.
pd.DataFrame(
    data=list(accuracies.values()),
    index=list(accuracies.keys()),
).round(decimals=4)

Unnamed: 0,FT-Transformer DCN,FT-Transformer DEC,FT-Transformer IDEC
Abalone,0.2107,0.1858,0.1678
Auction Verification,0.7445,0.7445,0.7445
Bank Marketing,0.8172,0.7936,0.795
Breast Cancer,0.735,0.735,0.735
Census Income,0.7236,0.7236,0.7236
Credit Approval,0.5697,0.5697,0.5697
Heart Disease,0.5585,0.5652,0.5652
Soybean Disease,0.2278,0.2278,0.2295
