In [None]:
from pathlib import Path
import numpy as np
import pandas as pd 
from tqdm.notebook import tqdm
import altair as alt
from energyclustering.webapp.resultparser import COBRASResult, ResultInspector
from energyclustering.sampling.preprocessing import preprocess_info_df_for_sampling_classification

# Turn off Altair's row-count limit so charts can be built from large DataFrames.
alt.data_transformers.disable_max_rows()


In [None]:
# Reload imported modules automatically before each cell execution, so edits to
# the project package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Name of the entry under `directory` that belongs to the Wasserstein distance matrix.
WASSER = 'full_distance_matrix_wasserstein'
# NOTE(review): absolute, site-specific paths; the notebook only runs where this share is mounted.
directory = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/distance_matrices/')
# Despite its name, this points at a single .npy file rather than a directory.
fold_directory = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/folds/2022-01-27_folds.npy')
# Fail fast when an expected entry is missing under `directory` (the fold file is not checked here).
assert all((directory/name).exists() for name in [WASSER])

# Prepare the data

In [None]:
# Open the stored COBRAS result, pointing it at the Wasserstein distance matrix entry.
cobras_result = COBRASResult('result_20211124_koen', directory / WASSER)
info_df = cobras_result.info_df
data_df = cobras_result.data_df

# Presumably selects/encodes the info columns used as classifier features -- see the project helper.
subset_info_df = preprocess_info_df_for_sampling_classification(info_df, data_df)
# Plain NumPy view of the prepared info frame.
classification_X = subset_info_df.to_numpy()

In [None]:
# Load the precomputed folds (later handed to CrossValidationSampler) from the .npy file.
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load files from a trusted source.
folds = np.load(fold_directory, allow_pickle = True)
folds

# Collect/make the clusterings

In [None]:
# Registry of candidate clusterings, keyed by a short descriptive name.
clusterings_to_evaluate = {}

## Util

In [None]:
from sklearn_extra.cluster import KMedoids
def cluster_based_on_distance_matrix(distance_matrix, n_clusters = 13, random_state = None): 
    """Cluster items with k-medoids on a precomputed distance matrix.

    Parameters
    ----------
    distance_matrix : array-like, square
        Pairwise distances, handed to ``KMedoids`` with ``metric='precomputed'``.
    n_clusters : int, default 13
        Number of clusters (medoids) to form.
    random_state : int, RandomState instance or None, default None
        Seed forwarded to ``KMedoids``; pass a fixed value for reproducible labels.

    Returns
    -------
    The ``labels_`` attribute of the fitted ``KMedoids`` estimator.
    """
    # n_clusters must be forwarded explicitly; a literal 13 here would silently
    # ignore whatever cluster count the caller asked for.
    clusterer = KMedoids(n_clusters = n_clusters, metric = 'precomputed', random_state = random_state)
    return clusterer.fit(distance_matrix).labels_
    
    

## Wasserstein with expert data

In [None]:
# Take the last clustering stored in the COBRAS result as the expert-guided one.
# NOTE(review): presumably the final clustering of the interactive COBRAS run -- confirm.
clustering_wasser_expert = cobras_result.clusterings[-1]
clusterings_to_evaluate['wasser_expert'] = clustering_wasser_expert

## Wasserstein without expert 

In [None]:
# NOTE(review): reads the private `_distance_matrix` attribute of COBRASResult;
# prefer a public accessor if the class offers one.
wasser_distance_matrix = cobras_result._distance_matrix
# k-medoids on the Wasserstein distances, asking for 13 clusters with a fixed seed (1234).
clustering_wasser_no_expert = cluster_based_on_distance_matrix(wasser_distance_matrix,  13, 1234)
clusterings_to_evaluate['wasser_no_expert'] = clustering_wasser_no_expert

## Euclidean without expert

In [None]:
from sklearn.metrics.pairwise import euclidean_distances
# Row-wise mean imputation: every missing value is replaced by the mean of its own row,
# so the distance computation below sees no NaNs.
imputed_data_df = data_df.apply(lambda row: row.fillna(row.mean()), axis = 1)
# Pairwise Euclidean distances between the imputed rows.
eucl_distance_matrix = euclidean_distances(imputed_data_df.to_numpy())
# k-medoids on the Euclidean distances, asking for 13 clusters with a fixed seed (1345).
clustering_eucl_no_expert = cluster_based_on_distance_matrix(eucl_distance_matrix, 13, 1345)
clusterings_to_evaluate['eucl_no_expert'] = clustering_eucl_no_expert

# Evaluate sampling based on each clustering

In [None]:
from energyclustering.sampling.sampler import Sampler
from energyclustering.sampling.sampler import CrossValidationSampler
from sklearn.tree import DecisionTreeClassifier
from dask.distributed import Client
# Address (host:port) of the Dask scheduler that the evaluation cell connects to.
# NOTE(review): hard-coded, site-specific host.
SCHEDULER = 'pinac38.cs.kuleuven.be:8786'

In [None]:
%%time
# imputed_data_df = 
cv_sampler = CrossValidationSampler( DecisionTreeClassifier(max_depth = 5, ccp_alpha = 0.005), folds, imputed_data_df, subset_info_df)
with Client(SCHEDULER) as client: 
#     client.restart()
    results = []
    for name, clustering in tqdm(clusterings_to_evaluate.items()): 
        results.append(cv_sampler.evaluate_sampling_dask(clustering.astype('int'), client))  

In [None]:
# Stack the per-clustering results into one frame; the outer index level carries
# the clustering name and the inner level is labelled 'fold'.
result_df = pd.concat(
    results,
    keys = clusterings_to_evaluate.keys(),
    names = ['clustering', 'fold'],
)
result_df

In [None]:
# Sum every column over the folds, giving one row per clustering.
# Grouping on the index level directly avoids moving 'fold' into the columns,
# summing the fold labels themselves, and then dropping that column again.
reduced_result_df = result_df.groupby(level = 'clustering').sum()
# axis=1 colours each row on its own scale, i.e. compares the columns within one clustering.
reduced_result_df.style.background_gradient(axis = 1, cmap = 'viridis')

In [None]:
# Unstyled view of the per-clustering totals.
reduced_result_df


In [None]:
# axis=0 colours each column on its own scale, i.e. compares the clusterings within one column.
reduced_result_df.style.background_gradient(axis = 0, cmap = 'viridis')

In [None]:
# First three index labels of data_df, used as a small example subset below.
subset = data_df.index[:3].to_numpy()
subset

In [None]:
# Map every index label of data_df to its integer row position,
# then look up the positions of the subset labels.
index_series = pd.Series(range(0, data_df.shape[0]), index = data_df.index)
index_series.loc[subset]

In [None]:
# Integer row positions of the subset labels within data_df's index.
# `Index.get_loc` accepts a single label only and raises for an array of labels;
# `Index.get_indexer` is the array-valued lookup (it needs a unique index and
# returns -1 for labels that are not present).
data_df.index.get_indexer(subset)