In [1]:
import pathlib
import numpy as np
import pandas as pd

from sklearn.preprocessing import RobustScaler, MinMaxScaler, MaxAbsScaler, PowerTransformer
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.mixture import BayesianGaussianMixture

from tqdm import tqdm

In [4]:
# load: the CSV's `id` column becomes the index, so it is not treated as a feature
data_path = pathlib.Path().joinpath('data', 'data.csv')
data = pd.read_csv(data_path, index_col='id')

# Discarded experiment: one-hot encode the integer columns.
# int as dummies
# int_columns = data.columns[data.dtypes == int]
# dummies = pd.get_dummies(data[int_columns].astype('category'))
# data = pd.concat([data.drop(int_columns, axis=1), dummies], axis=1)

# Discarded experiment: power-transform only the integer columns.
# NOTE: `power_transform` is not imported in this notebook; it would need
# `from sklearn.preprocessing import power_transform` before re-enabling.
# PowerTransform int
# int_columns = data.columns[data.dtypes == int]
# data[int_columns] = power_transform(data[int_columns])

# prepare data: PowerTransformer followed by MaxAbsScaler.
# The fitted pipeline is kept under its own name so it can be inspected or
# reused later instead of being thrown away after `fit_transform`.
scaler = make_pipeline(
    # RobustScaler(),
    PowerTransformer(),
    MaxAbsScaler(),
)
scaled = scaler.fit_transform(data)

In [9]:
# Kaggle 0.23745: KMeans, n_clusters=5
# km = KMeans(n_clusters=5, random_state=17).fit(scaled)
# predicted = km.predict(scaled)

In [5]:
# Kaggle 0.50762: MinMaxScaler
# Kaggle 0.58013: MaxAbsScaler + PowerTransformer
# gmm = GaussianMixture(n_components=7, random_state=17).fit(scaled)
# predicted = gmm.predict(scaled)

In [None]:
def multiple_clustering(X, estimator, *, frac=0.7, iters=3, seed=None):
    """Fit ``estimator`` repeatedly on random feature subsets and pool the runs.

    Every iteration draws a random subset of columns, refits ``estimator`` with
    a fresh ``random_state`` and takes the hard assignment
    ``predict_proba(...).argmax(axis=1)``.  Because component indices are
    arbitrary between fits, the components of each run are relabelled by
    descending cluster size (label 0 = largest cluster) before pooling.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Input matrix; converted with ``np.asarray``.
    estimator : object
        Must provide ``set_params(random_state=...)``, ``fit(X)`` and
        ``predict_proba(X)``.  The number of clusters is read from its
        ``n_components`` attribute (7 is assumed when the attribute is absent,
        which was the previously hard-coded value).
    frac : float, keyword-only
        Fraction of columns used per fit, ``0 < frac <= 1``.
    iters : int, keyword-only
        Number of fits.
    seed : int or None, keyword-only
        Seed of the private random stream used for column sampling and for the
        per-fit ``random_state``.  The global NumPy RNG is left untouched.

    Returns
    -------
    score : ndarray of shape (n_samples, n_clusters)
        Sum over runs of ``proba_sorted @ weights.T`` where
        ``weights[b, c] = 1 / (1 + d(b, c) ** 2)`` and ``d`` is the Euclidean
        distance between the full-feature centroid of cluster ``b`` of the
        first run and that of cluster ``c`` of the current run.
    count : ndarray of shape (n_samples, n_clusters), float
        How many runs assigned each sample to each relabelled cluster.
    probability : ndarray of shape (n_samples, n_clusters)
        Sum over runs of the relabelled ``predict_proba`` output.

    Raises
    ------
    AssertionError
        If ``frac`` is outside ``(0, 1]``.
    ValueError
        If ``predict_proba`` does not return ``n_clusters`` columns.

    Notes
    -----
    Clusters that receive no sample in a run have no centroid; they are given
    zero weight in ``score`` instead of producing NaNs or a shape error.  Ties
    in cluster size are broken by ascending component index.
    """
    assert (frac > 0) and (frac <= 1), "frac must be > 0 and <= 1"
    X = np.asarray(X)
    n_samples, n_features = X.shape
    n_clusters = int(getattr(estimator, 'n_components', 7))
    # Private generator: yields the same stream as np.random.seed(seed) followed
    # by the module-level functions, without reseeding the global state.
    rng = np.random.RandomState(seed)

    row_idx = np.arange(n_samples)
    score = np.zeros((n_samples, n_clusters))
    count = np.zeros((n_samples, n_clusters))
    probability = np.zeros((n_samples, n_clusters))
    n_feats = int(np.ceil(frac * n_features))
    base_mean_vec = None
    base_occupied = None
    for i in tqdm(range(iters), desc='fit', total=iters):       # parallel?
        feats = rng.choice(np.arange(n_features), n_feats, replace=False)
        estimator.set_params(random_state=rng.randint(2 ** 16))
        X_sub = X[:, feats]
        estimator.fit(X_sub)
        proba = estimator.predict_proba(X_sub)
        if proba.shape[1] != n_clusters:
            raise ValueError(
                "predict_proba returned %d columns, expected %d"
                % (proba.shape[1], n_clusters)
            )
        cluster = proba.argmax(axis=1)

        # translate labels: rank components by descending size.  bincount with
        # minlength keeps empty components in the ranking (they sort last), so
        # `order` always has exactly n_clusters entries.
        sizes = np.bincount(cluster, minlength=n_clusters)
        order = np.argsort(-sizes, kind='stable')
        rank = np.empty(n_clusters, dtype=int)
        rank[order] = np.arange(n_clusters)
        lb = rank[cluster]
        count[row_idx, lb] += 1

        # update proba (columns reordered to the relabelled cluster order)
        proba_sorted = proba[:, order]
        probability += proba_sorted

        # calc mean vec on ALL features so runs with different column subsets
        # remain comparable; empty clusters keep a placeholder zero vector.
        occupied = sizes[order] > 0
        mean_vec = np.zeros((n_clusters, n_features))
        for cl in np.flatnonzero(occupied):
            mean_vec[cl] = X[lb == cl].mean(axis=0)
        if i == 0:
            base_mean_vec = mean_vec.copy()
            base_occupied = occupied.copy()

        # distance & weights: rows = clusters of the first run, cols = current run
        diff = base_mean_vec[:, None, :] - mean_vec[None, :, :]
        dist = np.linalg.norm(diff, axis=2)
        weights = 1 / (1 + dist ** 2)
        # A cluster without members has no meaningful centroid: ignore it.
        weights[~base_occupied, :] = 0
        weights[:, ~occupied] = 0

        # addition
        score += proba_sorted @ weights.T

    return score, count, probability

In [None]:
# Kaggle 0.59826
# Bayesian GMM refitted 150 times on all features (frac=1); the estimator is
# built on its own line so its settings are readable and the object remains
# available after the run.
bgm = BayesianGaussianMixture(n_components=7, max_iter=150, n_init=3, tol=0.01)
scr, cnt, prb = multiple_clustering(scaled, bgm, frac=1, iters=150, seed=17)

# Final label = cluster with the highest distance-weighted score.
predicted = scr.argmax(axis=1)
# Alternatives: majority vote over runs, or summed probabilities.
# predicted = cnt.argmax(axis=1)
# predicted = prb.argmax(axis=1)

In [6]:
# Write the predicted cluster labels into the sample submission template,
# save it as `submission.csv` (without the DataFrame index) and display it.
sub_path = pathlib.Path('data') / 'sample_submission.csv'
sub = pd.read_csv(sub_path).assign(Predicted=predicted)
sub.to_csv('submission.csv', index=False)
sub

Unnamed: 0,Id,Predicted
0,0,2
1,1,3
2,2,2
3,3,5
4,4,0
...,...,...
97995,97995,3
97996,97996,6
97997,97997,1
97998,97998,4


In [None]:
#