In [1]:
from copy import deepcopy
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score
from sklearn.preprocessing import StandardScaler
from scipy.optimize import linear_sum_assignment
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn

from duped_modules.dcn_duped import DCNDuped
from duped_modules.dec_duped import DECDuped, IDECDuped
from basic_autoencoder import BasicAutoencoder
from load_datasets import load_all_datasets

In [2]:
# Pin every random number generator used in this notebook to one seed so that
# a fresh "Restart & Run All" reproduces the same numbers.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)
# Ask PyTorch to use deterministic algorithm implementations only.
torch.use_deterministic_algorithms(True)

In [3]:
def cluster_accuracy(y_true, y_pred):
    """Return the clustering accuracy under the best cluster-to-class matching.

    Cluster ids carry no meaning of their own, so predicted clusters are first
    matched one-to-one with ground-truth classes such that the number of
    agreeing samples is maximal. This is a linear assignment problem on the
    class/cluster contingency table and is solved with
    ``scipy.optimize.linear_sum_assignment``.

    Labels may be arbitrary values (non-contiguous, not starting at zero,
    negative): both label sets are re-indexed to ``0..n-1`` before counting,
    so raw label values are never used as array indices.

    Args:
        y_true: 1-D array-like of ground-truth class labels.
        y_pred: 1-D array-like of predicted cluster labels, same length as
            ``y_true``.

    Returns:
        Fraction of samples whose cluster is matched to their true class.

    Raises:
        ValueError: if the inputs are empty or have different lengths.
    """
    # np.asarray also makes the access positional for pandas Series inputs.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.size != y_pred.size:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {y_true.size} and {y_pred.size}"
        )
    if y_pred.size == 0:
        raise ValueError("cluster_accuracy requires at least one sample")

    # Re-index both label sets to 0..n-1 so any label values are valid indices.
    true_classes, true_idx = np.unique(y_true, return_inverse=True)
    pred_classes, pred_idx = np.unique(y_pred, return_inverse=True)

    # Square contingency table: rows are classes, columns are clusters. It is
    # padded with empty rows/columns when the two label sets differ in size.
    k = max(len(true_classes), len(pred_classes))
    cost_matrix = np.zeros((k, k))
    # np.add.at accumulates repeated (row, col) pairs, unlike fancy-index "+=".
    np.add.at(cost_matrix, (true_idx, pred_idx), 1)

    # linear_sum_assignment minimises cost, so flip the counts to maximise
    # the number of agreeing samples.
    inverted_cost_matrix = cost_matrix.max() - cost_matrix
    row_ind, col_ind = linear_sum_assignment(inverted_cost_matrix)
    return cost_matrix[row_ind, col_ind].sum() / y_pred.size

In [4]:
# Load all benchmark datasets, passing a cap of 5000 rows to the loader.
datasets = load_all_datasets(max_rows=5000)

# Result tables keyed by dataset name; each starts as an empty dict
# (presumably filled in by later cells).
accuracies = {}
nmis = {}
for d in datasets:
    accuracies[d.name] = {}
    nmis[d.name] = {}

In [5]:
# Print one summary line per dataset: total, categorical and continuous
# input dimensions.
for d in datasets:
    summary = f"{d.name}: Input dim: {d.input_dim}; Cat dim: {d.cat_dim}; Cont dim: {d.cont_dim}"
    print(summary)

Abalone: Input dim: 9; Cat dim: 2; Cont dim: 7
Auction Verification: Input dim: 13; Cat dim: 12; Cont dim: 1
Bank Marketing: Input dim: 22; Cat dim: 17; Cont dim: 5
Breast Cancer: Input dim: 45; Cat dim: 45; Cont dim: 0
Census Income: Input dim: 58; Cat dim: 52; Cont dim: 6
Credit Approval: Input dim: 28; Cat dim: 22; Cont dim: 6
Heart Disease: Input dim: 17; Cat dim: 11; Cont dim: 6
Soybean Disease: Input dim: 55; Cat dim: 55; Cont dim: 0


In [6]:
# Shorthand for the dataset this cell works on.
abalone = datasets.abalone

# Encoder template: input_dim -> 6 -> 4, LeakyReLU after each linear layer.
abalone_encoder = nn.Sequential(
    nn.Linear(abalone.input_dim, 6),
    nn.LeakyReLU(),
    nn.Linear(6, 4),
    nn.LeakyReLU(),
)
# Decoder template mirrors the encoder: 4 -> 6 -> input_dim.
abalone_decoder = nn.Sequential(
    nn.Linear(4, 6),
    nn.LeakyReLU(),
    nn.Linear(6, abalone.input_dim),
    nn.LeakyReLU(),
)

# The autoencoder receives deep copies, so the two templates above are not the
# module objects handed to it (presumably so they can be reused elsewhere).
ae = BasicAutoencoder(
    deepcopy(abalone_encoder),
    deepcopy(abalone_decoder),
    abalone.embedding_sizes,
)
ae.fit(abalone.dataloader, n_epochs=100, lr=0.001)

# Encode the dataframe's categorical and continuous columns in one pass.
cat = torch.tensor(abalone.df[abalone.cat_cols].values, dtype=torch.int).detach()
cont = torch.tensor(abalone.df[abalone.cont_cols].values, dtype=torch.float).detach()
features = ae.encode(cat, cont).detach().numpy()

# Cluster the encoded features with one cluster per target class, then score
# the cluster labels against the ground truth with NMI.
kmeans = KMeans(
    n_clusters=abalone.n_targets,
    init="random",
    n_init=10,
    max_iter=300,
    random_state=0,
    algorithm="lloyd",
).fit(features)
nmi = normalized_mutual_info_score(abalone.y, kmeans.labels_)
print(nmi)

Epoch 25/100 - Batch Reconstruction loss: 0.124038
Epoch 50/100 - Batch Reconstruction loss: 0.106587
Epoch 75/100 - Batch Reconstruction loss: 0.120869
Epoch 100/100 - Batch Reconstruction loss: 0.171200
0.1585459416556321
