# Analyse the differences between the variations we have so far

In [None]:
from pathlib import Path
from energyclustering.webapp.resultparser import COBRASResult, ResultInspector
from energyclustering.clustering.similarity import *
import pandas as pd
from dask.distributed import Client
import altair as alt
import numpy as np
# Remove Altair's cap on the number of data rows a chart may embed.
alt.data_transformers.disable_max_rows()
# Switch the active Altair renderer to 'png'.
alt.renderers.enable('png')

In [None]:
# Load IPython's autoreload extension; mode 2 reloads imported modules before each cell executes.
%load_ext autoreload
%autoreload 2

In [None]:
# Entry names that get joined onto `directory` below.
WASSER = 'full_distance_matrix_wasserstein'
EUCL = 'baseline_euclidean'
# FULL_SEASON = 'random_profiles_seasonality_v1'

# Root location of the distance-matrix results.
directory = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/distance_matrices/')

# Fail early when the Wasserstein entry is missing (EUCL is not checked here).
assert (directory / WASSER).exists()

# Constraint-based validation

In [None]:
# Build the COBRASResult for 'result_20211124_koen', passing it the Wasserstein entry under `directory`.
# NOTE(review): the role of the second argument is defined by COBRASResult, which is not visible here — confirm.
cobras_result = COBRASResult('result_20211124_koen', directory/WASSER)

### Calculate metrics that are not calculated yet

In [None]:
%%time
METRICS = [DTWDistance(window = 4), MatchingDistanceMeasure(), EuclideanDistance(), WassersteinDistanceMeasure()]
# METRICS = [EuclideanDistance()]
results = []
with Client(local_directory = '/cw/dtailocal/', n_workers=10) as client: 
    for metric in METRICS: 
        result = calculate_distance_between_queries(cobras_result.data_df, cobras_result.query_array, metric, client, n_blocks = 10 )
        results.append(result)


In [None]:
# One ResultInspector per computed measure, keyed by the measure's class name.
result_inspectors = {}
for metric, result in zip(METRICS, results):
    name = type(metric).__name__
    result_inspectors[name] = ResultInspector(cobras_result, result, name)

# Additional inspector built from the stored Wasserstein entry on disk.
result_inspectors['wasserstein'] = ResultInspector.from_path(cobras_result, directory / WASSER)

In [None]:
# Rank correlation reported by every inspector, indexed by inspector name.
correlations = pd.Series(
    {
        label: current.rank_correlation_between_distances_and_queries()
        for label, current in result_inspectors.items()
    },
    dtype='float64',
)
correlations

In [None]:
# Bar chart of the correlation scores; the 'wasserstein' entry is left out of the plot.
(
    alt.Chart(
        correlations.drop('wasserstein').to_frame('score').reset_index()
    )
    .mark_bar()
    .encode(
        x='index',
        y='score',
        color='index',
    )
)

## Histogram of constraint distances
### Our distance metric

In [None]:
# Similarity-metric histogram chart, rendered with height 100.
# NOTE(review): `result` is last bound in the visible cells as the loop variable over `results`;
# this call presumably expects an inspector object — confirm which object is intended.
result.similarity_metric_histogram_chart().properties(height = 100)

### Histogram distance

In [None]:
# Three similarity-metric histograms side by side, titled 'no agg', '1H' and '4H'.
# NOTE(review): wasser_result, wasser1H_result and wasser4H_result are not assigned in the visible cells — confirm where they come from.
(
    wasser_result.similarity_metric_histogram_chart().properties(title='no agg', height=100)
    | wasser1H_result.similarity_metric_histogram_chart().properties(title='1H', height=100)
    | wasser4H_result.similarity_metric_histogram_chart().properties(title='4H', height=100)
)

### Euclidean distance

In [None]:
# Similarity-metric histogram chart for `euc_result`, rendered with height 100.
# NOTE(review): `euc_result` is not assigned in the visible cells — confirm where it comes from.
euc_result.similarity_metric_histogram_chart().properties(height = 100)

## Rank correlation

In [None]:
# Empty table with a single 'correlation' column; later cells add one row per measure.
result_df = pd.DataFrame(columns = ['correlation'])

### Own metric: DTW and matching

In [None]:
# Store the rank correlation of `result` as the 'matching_measure' row.
# NOTE(review): `result` is last bound in the visible cells as a loop variable over `results` — confirm it is the intended object.
corr= result.rank_correlation_between_distances_and_queries()
result_df.loc['matching_measure'] = corr

### Own metric DTW without matching

In [None]:
# Displayed as cell output only; unlike the other measures, the value is not written to result_df.
# NOTE(review): `dtw_one` is not assigned in the visible cells — confirm where it comes from.
dtw_one.rank_correlation_between_distances_and_queries()

### Own metric euclidean with matching

In [None]:
# Displayed as cell output only; unlike the other measures, the value is not written to result_df.
# NOTE(review): `euc_matching` is not assigned in the visible cells — confirm where it comes from.
euc_matching.rank_correlation_between_distances_and_queries()

### Own metric euclidean without matching

In [None]:
# Displayed as cell output only; unlike the other measures, the value is not written to result_df.
# NOTE(review): `euc_one` is not assigned in the visible cells — confirm where it comes from.
euc_one.rank_correlation_between_distances_and_queries()

### Histogram distance

In [None]:
# Store the rank correlation of `wasser_result` as the 'wasserstein_measure' row.
# NOTE(review): `wasser_result` is not assigned in the visible cells — confirm where it comes from.
corr = wasser_result.rank_correlation_between_distances_and_queries()
result_df.loc['wasserstein_measure'] = corr

In [None]:
# Store the rank correlation of `wasser1H_result` as the 'wasserstein_measure_1H' row.
# NOTE(review): `wasser1H_result` is not assigned in the visible cells — confirm where it comes from.
corr = wasser1H_result.rank_correlation_between_distances_and_queries()
result_df.loc['wasserstein_measure_1H'] = corr

In [None]:
# Store the rank correlation of `wasser4H_result` as the 'wasserstein_measure_4H' row.
# NOTE(review): `wasser4H_result` is not assigned in the visible cells — confirm where it comes from.
corr = wasser4H_result.rank_correlation_between_distances_and_queries()
result_df.loc['wasserstein_measure_4H'] = corr

### Plain euclidean

In [None]:
# Store the rank correlation of `euc_result` as the 'euclidean' row.
# NOTE(review): `euc_result` is not assigned in the visible cells — confirm where it comes from.
corr = euc_result.rank_correlation_between_distances_and_queries()
result_df.loc['euclidean'] = corr

In [None]:
# Show the collected correlations as the cell output.
result_df

In [None]:
# Bar chart with one bar per measure; the former index becomes the 'measure' column.
(
    alt.Chart(
        result_df.reset_index().rename(columns={'index': 'measure'}),
        title='Correlation with expert',
        width=200,
        height=200,
    )
    .mark_bar()
    .encode(
        x='measure:N',
        y='correlation:Q',
        color=alt.Color('measure:N', legend=None),
        tooltip='correlation',
    )
)

## Plot all pairs

In [None]:
# result.plot_constraint_pairs_w_distances()

## Plot closest ML pairs

In [None]:
# 'ML' constraint pairs in ascending sort order; the leading 3 is presumably the number of pairs to plot — confirm.
# NOTE(review): 'ML' presumably stands for must-link; `result` is last bound in the visible cells as a loop variable over `results` — confirm.
result.plot_constraint_pairs(3, constraints = 'ML', sort = 'asc')

## Plot farthest ML pairs

In [None]:
# 'ML' constraint pairs in descending sort order; the leading 3 is presumably the number of pairs to plot — confirm.
# NOTE(review): `result` is last bound in the visible cells as a loop variable over `results` — confirm it is the intended object.
result.plot_constraint_pairs(3, constraints = 'ML', sort = 'desc')

## Plot farthest CL pairs

In [None]:
# 'CL' constraint pairs in descending sort order; the leading 2 is presumably the number of pairs to plot — confirm.
# NOTE(review): 'CL' presumably stands for cannot-link; `result` is last bound in the visible cells as a loop variable over `results` — confirm.
result.plot_constraint_pairs(2, constraints = 'CL', sort = 'desc')

## Plot closest CL pairs

In [None]:
# 'CL' constraint pairs in ascending sort order; the leading 7 is presumably the number of pairs to plot — confirm.
# NOTE(review): `result` is last bound in the visible cells as a loop variable over `results` — confirm it is the intended object.
result.plot_constraint_pairs(7, constraints = 'CL', sort = 'asc')