# In [None]:
from src.config import PROCESSED, ROOT_DIR, PARAM_UMAP, PARAM_CLUSTER, PARAM
import hdbscan
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import pandas as pd
import os

# Dimension reduction and clustering libraries
import umap
import sklearn.cluster as cluster
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score
from src.fig_utils import helper_save


# Module-wide seaborn theme: "white" style with a (10, 8) default figure size.
sns.set(
    style="white",
    rc={"figure.figsize": (10, 8)},
)


def plot_hdb(data, labels, f_save_fig=None):
    """Scatter-plot the first two columns of ``data`` coloured by label.

    Rows whose label is negative (HDBSCAN's marker for noise) are drawn in
    semi-transparent grey; all other rows are coloured by their label using
    the ``'Spectral'`` colormap.

    Parameters
    ----------
    data : array-like
        2-D array of points; only columns 0 and 1 are plotted.
    labels : array-like of int
        One cluster label per row of ``data``.
    f_save_fig : optional
        Forwarded unchanged to ``helper_save``.
    """
    # Coerce so that plain lists work with the boolean-mask indexing below.
    data = np.asarray(data)
    labels = np.asarray(labels)

    clustered = labels >= 0
    noise = ~clustered

    # ``color=`` rather than ``c=``: a 3-tuple passed as ``c`` is treated as
    # per-point values (not one RGB colour) when exactly three points are
    # plotted.
    plt.scatter(data[noise, 0], data[noise, 1],
                color=(0.5, 0.5, 0.5), s=0.1, alpha=0.5)
    plt.scatter(data[clustered, 0], data[clustered, 1],
                c=labels[clustered], s=0.1, cmap='Spectral')
    helper_save(f_save_fig)


def hdb_cluster(data, min_s_ratio, min_clust_ratio, f_save=None):
    """Cluster ``data`` with HDBSCAN using thresholds relative to its size.

    ``min_samples`` and ``min_cluster_size`` are derived by dividing the
    number of rows by the corresponding ratio and truncating to an int.

    Parameters
    ----------
    data : array-like with a ``shape`` attribute
        Observations to cluster, one per row.
    min_s_ratio : number
        ``min_samples`` is ``n_obs / min_s_ratio`` (truncated).
    min_clust_ratio : number
        ``min_cluster_size`` is ``n_obs / min_clust_ratio`` (truncated).
    f_save : path-like, optional
        If given, the label array is pickled to this file.

    Returns
    -------
    The labels produced by ``HDBSCAN.fit_predict``.
    """
    n_obs = data.shape[0]

    # Truncating division can give 0 (or 1) on small inputs. HDBSCAN needs a
    # positive min_samples and a min_cluster_size of at least 2, so clamp.
    min_samples = max(1, int(n_obs / min_s_ratio))
    min_cluster_size = max(2, int(n_obs / min_clust_ratio))

    clusterer = hdbscan.HDBSCAN(min_samples=min_samples,
                                min_cluster_size=min_cluster_size)
    labels = clusterer.fit_predict(data)

    if f_save is not None:
        # Context manager so the file is flushed and closed deterministically.
        with open(f_save, "wb") as fh:
            pickle.dump(labels, fh)
    return labels


def compute_cluster_purity(target, predict):
    """Score how well predicted cluster labels agree with reference labels.

    Parameters
    ----------
    target : array-like
        Reference labels.
    predict : array-like
        Predicted cluster labels, aligned with ``target``.

    Returns
    -------
    tuple
        ``(adjusted_rand_score, adjusted_mutual_info_score)`` of the two
        labelings. (Previously these were computed and then discarded.)
    """
    ari = adjusted_rand_score(target, predict)
    ami = adjusted_mutual_info_score(target, predict)
    return ari, ami


def compute_pca(data, n_components=50):
    """Project ``data`` onto its leading principal components.

    Parameters
    ----------
    data : array-like
        Input passed to ``PCA.fit_transform``.
    n_components : int, optional
        Number of components to keep (default 50, the previous fixed value).

    Returns
    -------
    The transformed data returned by ``PCA.fit_transform``.
    """
    # The HDBSCAN fit that used to run here was removed: its labels were
    # never used or returned, so it only cost time.
    return PCA(n_components=n_components).fit_transform(data)


def run(p):
    """Load an embedding, cluster it, and save the labels and a figure.

    Parameters
    ----------
    p : mapping
        Nested parameters. Keys read here:

        - ``p["umap"]["filenames"]["embedding"]``: pickled embedding to load.
        - ``p["cluster"]["filenames"]["cluster results"]``: where labels are
          pickled.
        - ``p["cluster"]["filenames"]["cluster label figure"]``: passed to
          ``plot_hdb`` as ``f_save_fig``.
        - ``p["cluster"]["params"]``: ``"method"``, ``"min_sample"`` and
          ``"min_cluster_size"``; the latter two are passed to
          ``hdb_cluster`` as its ratio arguments.

    Only ``method == "hdb"`` is handled; any other value does nothing.
    """
    embedding_f = p["umap"]["filenames"]["embedding"]
    cluster_files = p["cluster"]["filenames"]
    f_save = cluster_files["cluster results"]
    f_save_fig = cluster_files["cluster label figure"]

    # Open in binary mode and close the handle afterwards. The previous call
    # passed "rb" to pickle.load instead of open(), so it could never succeed.
    # NOTE: unpickling runs arbitrary code; only load trusted embedding files.
    with open(embedding_f, "rb") as fh:
        data = pickle.load(fh)

    cluster_params = p["cluster"]["params"]
    cluster_type = cluster_params["method"]
    min_sample = cluster_params["min_sample"]
    min_cluster_size = cluster_params["min_cluster_size"]

    if cluster_type == "hdb":
        labels = hdb_cluster(data, min_sample, min_cluster_size,
                             f_save=f_save)
        plot_hdb(data, labels, f_save_fig=f_save_fig)


def main():
    """Placeholder entry point; does nothing and returns None."""
    return None


# Script entry point: invoke run() with the sample parameter file under PARAM.
# NOTE(review): run() indexes its argument like a nested dict
# (p["umap"]["filenames"][...]), but a path string is passed here --
# presumably the YAML file should be parsed into a dict first; confirm.
if __name__ == '__main__':
    run(os.path.join(PARAM, "params_sample.yaml"))
