In [None]:
import copy
import math
import os

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import normalized_mutual_info_score

from utils import *
from autoencoders.basic_autoencoder import BasicAutoencoder
from autoencoders.all_to_all_autoencoder import AllToAllAutoencoder
from autoencoders.transformer_autoencoder import TransformerAutoencoder
from duped_modules.dcn_duped import DCNDuped
from duped_modules.dec_duped import DECDuped
from duped_modules.dec_duped import IDECDuped
from load_datasets import load_all_datasets

In [None]:
# --- Run configuration -------------------------------------------------------
# Everything runs on the CPU; the cuBLAS workspace setting is exported as well
# so that deterministic algorithms can be enforced below.
device = "cpu"
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

# Fix both random number generators and ask torch for deterministic kernels.
np.random.seed(0)
torch.manual_seed(0)
torch.use_deterministic_algorithms(True)

# --- Data and result containers ----------------------------------------------
datasets = load_all_datasets(max_rows=5000)

# One (initially empty) {method label -> score} dict per dataset name; the
# experiment cells below fill these in.
accuracies = {}
for d in datasets:
    accuracies[d.name] = {}
nmis = {}
for d in datasets:
    nmis[d.name] = {}

# Summarise the dimensions reported by each loaded dataset.
for d in datasets:
    print(f"{d.name}: Input dim: {d.input_dim}; Cat dim: {d.cat_dim}; Cont dim: {d.cont_dim}")

In [None]:
# Deep Clustering with Basic Autoencoder (no Column Embeddings)
#
# For every dataset: pretrain a single BasicAutoencoder, then run DCN, DEC and
# IDEC on it and store NMI and clustering accuracy under "No Col Emb <method>".
for d in datasets:
    print(f"Calculating for {d.name}...")
    # Layer sizes are based on the raw column count (categorical + continuous),
    # while the third argument is derived from log2 of the dataset's input_dim.
    input_dim = len(d.cat_cols) + len(d.cont_cols)
    encoder, decoder = build_autoencoder(input_dim, input_dim, max(1, round(math.log2(d.input_dim))))
    print(encoder, decoder)
    ae = BasicAutoencoder(encoder, decoder, d.input_dim, d.cat_dim, d.embedding_sizes, device=device)
    ae.fit(d.dataloader, n_epochs=100, lr=0.001)

    for algo_name, algo_cls in (("DCN", DCNDuped), ("DEC", DECDuped), ("IDEC", IDECDuped)):
        # Each method receives its own deep copy of the pretrained autoencoder so
        # that all three start from identical weights.
        # NOTE(review): this assumes the *Duped modules may update the autoencoder
        # they are given during fit(); if they already copy it internally the
        # deepcopy is merely redundant - confirm against duped_modules.
        clusterer = algo_cls(
            n_clusters=d.n_targets,
            autoencoder=copy.deepcopy(ae),
            random_state=np.random.RandomState(0),
        )
        clusterer.fit(d.dataloader)

        key = f"No Col Emb {algo_name}"
        nmis[d.name][key] = normalized_mutual_info_score(d.y, clusterer.labels_)
        accuracies[d.name][key] = cluster_accuracy(d.y, clusterer.labels_)

In [None]:
# Deep Clustering with Column Embeddings
#
# For every dataset: pretrain a single AllToAllAutoencoder, then run DCN, DEC
# and IDEC on it and store NMI and clustering accuracy under the plain method
# name ("DCN", "DEC", "IDEC").
for d in datasets:
    print(f"Calculating for {d.name}...")
    # Layer sizes use the dataset's input_dim; the third argument is derived
    # from its log2, floored at 1.
    encoder, decoder = build_autoencoder(d.input_dim, d.input_dim, max(1, round(math.log2(d.input_dim))))
    print(encoder, decoder)
    ae = AllToAllAutoencoder(encoder, decoder, d.input_dim, d.cat_dim, d.embedding_sizes, device=device)
    ae.fit(d.dataloader, n_epochs=100, lr=0.001)

    for algo_name, algo_cls in (("DCN", DCNDuped), ("DEC", DECDuped), ("IDEC", IDECDuped)):
        # Each method receives its own deep copy of the pretrained autoencoder so
        # that all three start from identical weights.
        # NOTE(review): this assumes the *Duped modules may update the autoencoder
        # they are given during fit(); if they already copy it internally the
        # deepcopy is merely redundant - confirm against duped_modules.
        clusterer = algo_cls(
            n_clusters=d.n_targets,
            autoencoder=copy.deepcopy(ae),
            random_state=np.random.RandomState(0),
        )
        clusterer.fit(d.dataloader)

        nmis[d.name][algo_name] = normalized_mutual_info_score(d.y, clusterer.labels_)
        accuracies[d.name][algo_name] = cluster_accuracy(d.y, clusterer.labels_)

In [None]:
# Deep Clustering with Attention
#
# For every dataset: pretrain a single AllToAllAutoencoder with attention=True,
# then run DCN, DEC and IDEC on it and store NMI and clustering accuracy under
# "Attention <method>".
for d in datasets:
    print(f"Calculating for {d.name}...")
    # Layer sizes use the dataset's input_dim; the third argument is derived
    # from its log2, floored at 1.
    encoder, decoder = build_autoencoder(d.input_dim, d.input_dim, max(1, round(math.log2(d.input_dim))))
    print(encoder, decoder)
    ae = AllToAllAutoencoder(encoder, decoder, d.input_dim, d.cat_dim, d.embedding_sizes, attention=True, device=device)
    ae.fit(d.dataloader, n_epochs=100, lr=0.001)

    for algo_name, algo_cls in (("DCN", DCNDuped), ("DEC", DECDuped), ("IDEC", IDECDuped)):
        # Each method receives its own deep copy of the pretrained autoencoder so
        # that all three start from identical weights.
        # NOTE(review): this assumes the *Duped modules may update the autoencoder
        # they are given during fit(); if they already copy it internally the
        # deepcopy is merely redundant - confirm against duped_modules.
        clusterer = algo_cls(
            n_clusters=d.n_targets,
            autoencoder=copy.deepcopy(ae),
            random_state=np.random.RandomState(0),
        )
        clusterer.fit(d.dataloader)

        key = f"Attention {algo_name}"
        nmis[d.name][key] = normalized_mutual_info_score(d.y, clusterer.labels_)
        accuracies[d.name][key] = cluster_accuracy(d.y, clusterer.labels_)

In [None]:
# Deep Clustering with Transformer
#
# For every dataset: pretrain a single TransformerAutoencoder (depth=8), then
# run DCN, DEC and IDEC on it and store NMI and clustering accuracy under
# "Transformer <method>".
for d in datasets:
    print(f"Calculating for {d.name}...")
    # Layer sizes use the dataset's input_dim; the third argument is derived
    # from its log2, floored at 1.
    encoder, decoder = build_autoencoder(d.input_dim, d.input_dim, max(1, round(math.log2(d.input_dim))))
    print(encoder, decoder)
    ae = TransformerAutoencoder(encoder, decoder, d.input_dim, d.cat_dim, d.embedding_sizes, depth=8, device=device)
    ae.fit(d.dataloader, n_epochs=100, lr=0.001)

    for algo_name, algo_cls in (("DCN", DCNDuped), ("DEC", DECDuped), ("IDEC", IDECDuped)):
        # Each method receives its own deep copy of the pretrained autoencoder so
        # that all three start from identical weights.
        # NOTE(review): this assumes the *Duped modules may update the autoencoder
        # they are given during fit(); if they already copy it internally the
        # deepcopy is merely redundant - confirm against duped_modules.
        clusterer = algo_cls(
            n_clusters=d.n_targets,
            autoencoder=copy.deepcopy(ae),
            random_state=np.random.RandomState(0),
        )
        clusterer.fit(d.dataloader)

        key = f"Transformer {algo_name}"
        nmis[d.name][key] = normalized_mutual_info_score(d.y, clusterer.labels_)
        accuracies[d.name][key] = cluster_accuracy(d.y, clusterer.labels_)

In [None]:
# NMI results: one row per dataset, one column per method label, 4 decimals.
pd.DataFrame(data=list(nmis.values()), index=list(nmis.keys())).round(decimals=4)

In [None]:
# Clustering accuracy results: one row per dataset, one column per method label, 4 decimals.
pd.DataFrame(data=list(accuracies.values()), index=list(accuracies.keys())).round(decimals=4)