In [1]:
import os
import pandas as pd
import numpy as np
import math
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn

from utils import *
from autoencoders.basic_autoencoder import BasicAutoencoder
from autoencoders.all_to_all_autoencoder import AllToAllAutoencoder
from autoencoders.all_to_cat_autoencoder import AllToCatAutoencoder
from autoencoders.cat_to_cat_autoencoder import CatToCatAutoencoder
from load_datasets import load_all_datasets

In [2]:
# --- Run configuration ---------------------------------------------------
# All models in this notebook are created with this device string.
device = "cpu"

# --- Reproducibility -----------------------------------------------------
# Seed NumPy and PyTorch and ask PyTorch for deterministic kernels.
# CUBLAS_WORKSPACE_CONFIG is the setting PyTorch documents as required for
# deterministic cuBLAS behaviour on CUDA; it is set here alongside the
# deterministic-algorithms switch.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
np.random.seed(0)
torch.manual_seed(0)
torch.use_deterministic_algorithms(True)

# --- Data ----------------------------------------------------------------
# max_rows=5000 is forwarded to the loader (presumably a per-dataset row cap;
# confirm in load_datasets.load_all_datasets).
datasets = load_all_datasets(max_rows=5000)

# --- Result containers ---------------------------------------------------
# One empty dict per dataset name; the experiment cells below add one entry
# per autoencoder variant, and the final cells render them as tables.
nmis = dict((dataset.name, {}) for dataset in datasets)
accuracies = dict((dataset.name, {}) for dataset in datasets)

In [3]:
# Basic autoencoder with no categorical embeddings
for d in datasets:
    print(f"Calculating for {d.name}...")
    # One network input per column, categorical and continuous alike.
    input_dim = len(d.cat_cols) + len(d.cont_cols)
    # The third argument grows with log2 of the input width and is floored at 1;
    # its exact meaning is defined by build_autoencoder.
    encoder, decoder = build_autoencoder(input_dim, input_dim, max(1, round(math.log2(input_dim))))
    print(encoder, decoder)

    ae = BasicAutoencoder(encoder, decoder, d.embedding_sizes, device=device)
    ae.fit(d.dataloader, n_epochs=100, lr=0.001)

    # Encode the whole dataset in a single pass. torch.tensor() already yields
    # tensors that do not require grad, and no_grad() avoids recording an
    # autograd graph for what is pure inference.
    cat = torch.tensor(d.df[d.cat_cols].values, dtype=torch.int).to(device)
    cont = torch.tensor(d.df[d.cont_cols].values, dtype=torch.float).to(device)
    with torch.no_grad():
        features = ae.encode(cat, cont).detach().cpu().numpy()

    # Single k-means run with a fixed seed; the number of clusters comes from d.n_targets.
    kmeans = KMeans(n_clusters=d.n_targets, init="random", max_iter=300, n_init=1, random_state=0, algorithm="lloyd").fit(features)

    # Score the cluster assignment against d.y and store it under this variant's label.
    nmis[d.name]["No Embeddings AE"] = normalized_mutual_info_score(d.y, kmeans.labels_)
    accuracies[d.name]["No Embeddings AE"] = cluster_accuracy(d.y, kmeans.labels_)

Calculating for Abalone...
Sequential(
  (0): Linear(in_features=8, out_features=6, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=6, out_features=4, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=4, out_features=2, bias=True)
  (5): LeakyReLU(negative_slope=0.01)
) Sequential(
  (0): Linear(in_features=2, out_features=4, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=4, out_features=6, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=6, out_features=8, bias=True)
  (5): LeakyReLU(negative_slope=0.01)
)
Epoch 25/100 - Batch Reconstruction loss: 0.157973
Epoch 50/100 - Batch Reconstruction loss: 0.157587
Epoch 75/100 - Batch Reconstruction loss: 0.157757
Epoch 100/100 - Batch Reconstruction loss: 0.156816
Calculating for Auction Verification...
Sequential(
  (0): Linear(in_features=7, out_features=5, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=5, o

In [4]:
# All columns to all columns autoencoder
for d in datasets:
    print(f"Calculating for {d.name}...")
    # Sum the second element of every (_, size) pair in d.embedding_sizes and add
    # one unit per continuous column. The generator variable is deliberately not
    # called `d`, so it no longer shadows the dataset loop variable.
    input_dim = sum(emb_size for _, emb_size in d.embedding_sizes) + len(d.cont_cols)
    # The third argument grows with log2 of the input width and is floored at 1;
    # its exact meaning is defined by build_autoencoder.
    encoder, decoder = build_autoencoder(input_dim, input_dim, max(1, round(math.log2(input_dim))))
    print(encoder, decoder)

    ae = AllToAllAutoencoder(encoder, decoder, d.embedding_sizes, device=device)
    ae.fit(d.dataloader, n_epochs=100, lr=0.001)

    # Encode the whole dataset in a single pass. torch.tensor() already yields
    # tensors that do not require grad, and no_grad() avoids recording an
    # autograd graph for what is pure inference.
    cat = torch.tensor(d.df[d.cat_cols].values, dtype=torch.int).to(device)
    cont = torch.tensor(d.df[d.cont_cols].values, dtype=torch.float).to(device)
    with torch.no_grad():
        features = ae.encode(cat, cont).detach().cpu().numpy()

    # Single k-means run with a fixed seed; the number of clusters comes from d.n_targets.
    kmeans = KMeans(n_clusters=d.n_targets, init="random", max_iter=300, n_init=1, random_state=0, algorithm="lloyd").fit(features)

    # Score the cluster assignment against d.y and store it under this variant's label.
    nmis[d.name]["All cols AE"] = normalized_mutual_info_score(d.y, kmeans.labels_)
    accuracies[d.name]["All cols AE"] = cluster_accuracy(d.y, kmeans.labels_)

Calculating for Abalone...
Sequential(
  (0): Linear(in_features=9, out_features=7, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=7, out_features=5, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=5, out_features=2, bias=True)
  (5): LeakyReLU(negative_slope=0.01)
) Sequential(
  (0): Linear(in_features=2, out_features=5, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=5, out_features=7, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=7, out_features=9, bias=True)
  (5): LeakyReLU(negative_slope=0.01)
)
Epoch 25/100 - Batch Reconstruction loss: 0.086662
Epoch 50/100 - Batch Reconstruction loss: 0.085288
Epoch 75/100 - Batch Reconstruction loss: 0.068787
Epoch 100/100 - Batch Reconstruction loss: 0.060902
Calculating for Auction Verification...
Sequential(
  (0): Linear(in_features=13, out_features=11, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=11

In [5]:
# Categorical columns to categorical columns autoencoder
for d in datasets:
    print(f"Calculating for {d.name}...")
    # Sum the second element of every (_, size) pair in d.embedding_sizes. The
    # generator variable is deliberately not called `d`, so it no longer shadows
    # the dataset loop variable.
    # NOTE(review): math.log2 below raises ValueError if this sum is 0 (a dataset
    # without categorical columns) - confirm the loader never yields such a dataset.
    input_dim = sum(emb_size for _, emb_size in d.embedding_sizes)
    # The third argument grows with log2 of the input width and is floored at 1;
    # its exact meaning, and that of output_fn, is defined by build_autoencoder.
    encoder, decoder = build_autoencoder(input_dim, input_dim, max(1, round(math.log2(input_dim))), output_fn=nn.Sigmoid)
    print(encoder, decoder)

    ae = CatToCatAutoencoder(encoder, decoder, d.embedding_sizes, device=device)
    ae.fit(d.dataloader, n_epochs=100, lr=0.001)

    cat = torch.tensor(d.df[d.cat_cols].values, dtype=torch.int).to(device)
    # NOTE(review): unlike the other experiment cells, `cont` stays a NumPy array
    # here and is passed to encode() as such; presumably this model's encode()
    # does not use it - verify against CatToCatAutoencoder.
    cont = d.df[d.cont_cols].values
    # no_grad() avoids recording an autograd graph for what is pure inference.
    with torch.no_grad():
        encoded_cat = ae.encode(cat, cont).detach().cpu().numpy()

    # Standardise the encoded features, then append the continuous columns
    # unchanged to form the clustering input.
    scaled_cat_features = StandardScaler().fit_transform(encoded_cat)
    features = np.concatenate((scaled_cat_features, cont), axis=1)

    # Single k-means run with a fixed seed; the number of clusters comes from d.n_targets.
    kmeans = KMeans(n_clusters=d.n_targets, init="random", max_iter=300, n_init=1, random_state=0, algorithm="lloyd").fit(features)

    # Score the cluster assignment against d.y and store it under this variant's label.
    nmis[d.name]["Cat Cols AE"] = normalized_mutual_info_score(d.y, kmeans.labels_)
    accuracies[d.name]["Cat Cols AE"] = cluster_accuracy(d.y, kmeans.labels_)

Calculating for Abalone...
Sequential(
  (0): Linear(in_features=2, out_features=1, bias=True)
  (1): Sigmoid()
) Sequential(
  (0): Linear(in_features=1, out_features=2, bias=True)
  (1): Sigmoid()
)
Epoch 25/100 - Batch Reconstruction loss: 6.178423
Epoch 50/100 - Batch Reconstruction loss: 7.425200
Epoch 75/100 - Batch Reconstruction loss: 8.298109
Epoch 100/100 - Batch Reconstruction loss: 9.053138
Calculating for Auction Verification...
Sequential(
  (0): Linear(in_features=12, out_features=10, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=10, out_features=8, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=8, out_features=6, bias=True)
  (5): LeakyReLU(negative_slope=0.01)
  (6): Linear(in_features=6, out_features=3, bias=True)
  (7): Sigmoid()
) Sequential(
  (0): Linear(in_features=3, out_features=6, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=6, out_features=8, bias=True)
  (3): LeakyReLU(neg

In [6]:
# All columns to categorical columns autoencoder
for d in datasets:
    print(f"Calculating for {d.name}...")
    # Sum of the second element of every (_, size) pair in d.embedding_sizes.
    # Computed once (the original recomputed it three times) and with a generator
    # variable that does not shadow the dataset loop variable `d`.
    embedding_dim = sum(emb_size for _, emb_size in d.embedding_sizes)
    # The encoder input also gets one unit per continuous column; the decoder
    # output width is the embedding sum only.
    input_dim = embedding_dim + len(d.cont_cols)
    # The third argument grows with log2 of the embedding sum (not of input_dim)
    # and is floored at 1; its exact meaning is defined by build_autoencoder.
    encoder, decoder = build_autoencoder(input_dim, embedding_dim, max(1, round(math.log2(embedding_dim))))
    print(encoder, decoder)

    ae = AllToCatAutoencoder(encoder, decoder, d.embedding_sizes, device=device)
    ae.fit(d.dataloader, n_epochs=100, lr=0.001)

    # torch.tensor() already yields tensors that do not require grad, and
    # no_grad() avoids recording an autograd graph for what is pure inference.
    cat = torch.tensor(d.df[d.cat_cols].values, dtype=torch.int).to(device)
    cont = torch.tensor(d.df[d.cont_cols].values, dtype=torch.float).to(device)
    with torch.no_grad():
        encoded = ae.encode(cat, cont).detach().cpu().numpy()

    # Clustering input: the encoded features followed by the continuous columns.
    features = np.concatenate((encoded, cont.cpu().numpy()), axis=1)

    # Single k-means run with a fixed seed; the number of clusters comes from d.n_targets.
    kmeans = KMeans(n_clusters=d.n_targets, init="random", max_iter=300, n_init=1, random_state=0, algorithm="lloyd").fit(features)

    # Score the cluster assignment against d.y and store it under this variant's label.
    nmis[d.name]["All cols to cat cols AE"] = normalized_mutual_info_score(d.y, kmeans.labels_)
    accuracies[d.name]["All cols to cat cols AE"] = cluster_accuracy(d.y, kmeans.labels_)

Calculating for Abalone...
Sequential(
  (0): Linear(in_features=9, out_features=1, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
) Sequential(
  (0): Linear(in_features=1, out_features=2, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
)
Epoch 25/100 - Batch Reconstruction loss: 1.401657
Epoch 50/100 - Batch Reconstruction loss: 3.445078
Epoch 75/100 - Batch Reconstruction loss: 6.079834
Epoch 100/100 - Batch Reconstruction loss: 9.610638
Calculating for Auction Verification...
Sequential(
  (0): Linear(in_features=13, out_features=11, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=11, out_features=9, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=9, out_features=7, bias=True)
  (5): LeakyReLU(negative_slope=0.01)
  (6): Linear(in_features=7, out_features=5, bias=True)
  (7): LeakyReLU(negative_slope=0.01)
  (8): Linear(in_features=5, out_features=3, bias=True)
  (9): LeakyReLU(negative_slope=0.01)
) Sequential(
  (0): Linear

In [7]:
# NMI scores as a table: one row per dataset name, one column per autoencoder
# variant label stored in `nmis`, rounded to four decimals for display.
nmi_table = pd.DataFrame(list(nmis.values()), index=list(nmis.keys()))
nmi_table.round(4)

Unnamed: 0,No Embeddings AE,All cols AE,Cat Cols AE,All cols to cat cols AE
Abalone,0.1559,0.1695,0.1684,0.1709
Auction Verification,0.0066,0.0012,0.0002,0.0413
Bank Marketing,0.0015,0.0039,0.0197,0.0
Breast Cancer,0.7602,0.7178,0.2878,0.6927
Census Income,0.0003,0.0241,0.1024,0.0042
Credit Approval,0.0031,0.0097,0.0008,0.0028
Heart Disease,0.1389,0.1663,0.1848,0.0947
Soybean Disease,0.4974,0.4836,0.4169,0.5426


In [8]:
# Cluster accuracies as a table: one row per dataset name, one column per
# autoencoder variant label stored in `accuracies`, rounded to four decimals.
accuracy_table = pd.DataFrame(list(accuracies.values()), index=list(accuracies.keys()))
accuracy_table.round(4)

Unnamed: 0,No Embeddings AE,All cols AE,Cat Cols AE,All cols to cat cols AE
Abalone,0.1154,0.1688,0.1384,0.1348
Auction Verification,0.5497,0.5893,0.6667,0.8243
Bank Marketing,0.531,0.742,0.7894,0.7484
Breast Cancer,0.9634,0.9502,0.7013,0.9458
Census Income,0.7328,0.6664,0.5778,0.6216
Credit Approval,0.5069,0.5176,0.513,0.5391
Heart Disease,0.3244,0.3278,0.4013,0.3545
Soybean Disease,0.3986,0.4075,0.3292,0.4413
