# Overall comparison

### Imports

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# standard imports
import numpy as np 
import pandas as pd
import altair as alt
alt.data_transformers.disable_max_rows()
# alt.renderers.enable('png')
from pathlib import Path

# loading the data 
from energyclustering.sampling.preprocessing import DataPreprocessor

# preprocessing the yearly info 
from energyclustering.sampling.preprocessing.info import YearlyInfoPreprocessor

# executing the evaluation in parallel
from dask.distributed import Client

# sampling models 
from energyclustering.sampling.day_of_year_samplers import RandomBaselineSampler # random day selection baseline
from energyclustering.sampling.day_of_year_samplers import DailySamplerFromClusterSampler # to combine daily and yearly sampler
from energyclustering.sampling.day_of_year_samplers import GenerateSampleDecorator
from energyclustering.sampling.samplers import EnergyvilleDaySelectionBaseline
from energyclustering.sampling.samplers import MetadataSampler
from energyclustering.sampling.samplers import ConsumptionDataSampler


# elbow method to use for clustering 
from energyclustering.clustering.elbow import ElbowMethod
from energyclustering.clustering.preclustering import PreClusteringClusterer # two stage clustering procedure

# clustering algorithms 
from sklearn.cluster import KMeans, MiniBatchKMeans
from kmedoids import KMedoids
from energyclustering.clustering.kmedoids import CustomKMedoids

# cluster metrics 
import energyclustering.clustering.metrics as dist_metrics

# classifiers 
from sklearn.ensemble import RandomForestClassifier 

# evaluation 
from energyclustering.sampling.evaluation import SamplerEvaluator

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before each
# cell execution, so edits to the project code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
# Configure the preprocessing pipeline first, then pull the four resulting tables out of it.
preprocessor = (
    DataPreprocessor()
    .preprocess_info_df('paper')
    .preprocess_weather_df('paper')
    .drop_days_with_nan(True)
    .subsample_days(week_reduction_factor = None)
    # for testing only!
    .subsample_years(1000)
)
daily_data_df, data_df, daily_info_df, weather_df = preprocessor.get_data()

# quick sanity check on the size of the daily data
daily_data_df.shape

In [None]:
# Inspect the column dtypes of the daily info table.
daily_info_df.dtypes

In [None]:
# generate folds for cross validation
# Fixed seed (1) so the fold assignment is the same on every run.
generator = np.random.default_rng(1)
# Work on a copy of the index values: the shuffle below is in place.
shuffled = data_df.index.to_numpy(copy=True)
generator.shuffle(shuffled)
# Split the shuffled index values into 5 folds of (near-)equal size.
folds = np.array_split(shuffled, 5)

## Models to compare

In [None]:
# Shared settings for all samplers.
NB_SAMPLES = 250
CLUSTER_RANGE = list(range(10, 101, 5))

# Registry of the sampling strategies to compare, keyed by model name.
models = dict()

# --- random baseline ---
models['random_baseline'] = RandomBaselineSampler(NB_SAMPLES)

# --- rule based approach ---
# Yearly level: cluster on two metadata columns; daily level: the Energyville baseline.
# NOTE(review): n_clusters = 1 looks like a placeholder that ElbowMethod presumably
# overrides while scanning CLUSTER_RANGE — confirm against ElbowMethod.
metadata_yearly_sampler = MetadataSampler(
    clusterer = ElbowMethod(KMeans(n_clusters = 1, n_init = 10), cluster_range = CLUSTER_RANGE),
    info_preprocessing = YearlyInfoPreprocessor(
        columns_to_use = ['yearly_consumption', 'connection_power'],
        normalized = True,
    ),
)
rule_based_sampler = DailySamplerFromClusterSampler(
    yearly_sampler = metadata_yearly_sampler,
    daily_sampler = EnergyvilleDaySelectionBaseline(),
)
models['rule-based_metadata_clustering'] = GenerateSampleDecorator(
    sampler = rule_based_sampler,
    n_samples = NB_SAMPLES,
)

# --- consumption clustering based approach ---
# Yearly level: k-medoids (fasterpam) on a distance matrix built by
# dist_metrics.euc_distance_matrix_missing, with fillna disabled.
consumption_yearly_sampler = ConsumptionDataSampler(
    classifier = RandomForestClassifier(),
    clusterer = ElbowMethod(
        KMedoids(n_clusters = 1, method = 'fasterpam'),
        metric = dist_metrics.euc_distance_matrix_missing,
        cluster_range = CLUSTER_RANGE,
        nb_repeats = 10,
    ),
    fillna = False,
)
# Daily level: k-means wrapped in the elbow method.
# NOTE(review): unlike the metadata sampler, this KMeans does not pin n_init, and none of
# the estimators set random_state — results may vary between runs; confirm this is intended.
consumption_daily_sampler = ConsumptionDataSampler(
    classifier = RandomForestClassifier(),
    clusterer = ElbowMethod(
        clusterer = KMeans(n_clusters = 1),
        cluster_range = CLUSTER_RANGE,
    ),
)
models['consumption_clustering'] = GenerateSampleDecorator(
    sampler = DailySamplerFromClusterSampler(
        yearly_sampler = consumption_yearly_sampler,
        daily_sampler = consumption_daily_sampler,
    ),
    n_samples = NB_SAMPLES,
)


In [None]:
# Obtain the dask cluster object through the project helper.
# NOTE(review): the machine numbers are hard-coded — presumably lab host ids; confirm
# against dask_util before running elsewhere.
from dask_util import get_dask_cluster
cluster = get_dask_cluster(
    pinac_numbers = [],
    himec_numbers = [8, 1, 2, 3, 4, 5, 6],
)


In [None]:
import logging
import datetime
import dask 

# logging.basicConfig opens the log file immediately and raises FileNotFoundError when
# the parent directory is missing, so create `logs/` first (no-op if it already exists).
Path('logs').mkdir(parents = True, exist_ok = True)

# One DEBUG-level log file per calendar day; filemode 'w' overwrites an earlier run of the same day.
logging.basicConfig(
    format = '%(asctime)s - %(name)s: %(message)s',
    level = logging.DEBUG,
    filename = f'logs/comparison_{datetime.datetime.now().strftime("%d-%m-%Y")}.log',
    filemode = 'w',
)

# Retry settings for dask's communication layer: 5 retries, with delays between 20s and 60s.
dask.config.set({
    'distributed.comm.retry.count': 5,
    'distributed.comm.retry.delay.min': '20s',
    'distributed.comm.retry.delay.max': '60s',
})

In [None]:
%%time 
result_path = Path()/'results'/'comparison_5fold_rerun'
result_path.mkdir(parents = True, exist_ok = True)

energy_scores = []
with Client(cluster) as client:
    evaluator = SamplerEvaluator(folds, daily_data_df, daily_info_df, data_df, client, 400, crossval = True)
    for key, model in list(models.items()):
        energy_score = evaluator.evaluate_and_save(model, result_path/f"{key}.pkl")
        energy_scores.append(energy_score)
    energy_scores = pd.concat(energy_scores, axis = 1, keys = models.keys())

# aggregate energy scores
plot_df = energy_scores.agg(['mean', 'std'], axis = 0)

In [None]:
# Release the dask cluster once the evaluation cell has finished.
cluster.close()

In [None]:
# Show the energy scores, one column per model.
energy_scores

In [None]:
# Mean energy score of every model within each fold (one Series per fold).
means = [energy_scores.loc[fold].mean(axis = 0) for fold in folds]

# models as rows, folds as columns
pd.concat(means, axis = 1)

In [None]:
# Long-format table of the per-fold mean ES, with display names for the methods.
method_labels = {
    "random_baseline" : 'Random sampling',
    "rule-based_metadata_clustering" : "Expert-based",
    "consumption_clustering" : "Data-driven"
}

# After reset_index: level_0 = model key, level_1 = 0-based fold number, ES = mean score.
fold_means_long = pd.concat(means, axis = 1).stack().to_frame('ES').reset_index()
all_folds = fold_means_long.replace(method_labels)

# turn the 0-based fold number into a 1-based label
all_folds = all_folds.assign(level_1 = all_folds['level_1'].apply(lambda y: f"fold {y+1}"))
all_folds;

In [None]:
# big_chart is used below, but the notebook's only import of it sits in a later cell.
# Import it here so a fresh Restart & Run All does not fail with a NameError.
from chart_util import big_chart

# Row order shared by both layers. The labels must match the display names stored in
# all_folds['level_0']; the text layer previously listed labels that never occur in the
# data, which also made its sort disagree with the bar layer on the shared y axis.
method_order = ['Random sampling', 'Expert-based', 'Data-driven']

# Bars: one bar per method showing the fold's mean ES.
chart = alt.Chart(all_folds).mark_bar(size = 17).encode(
    x = 'ES', 
    y = alt.Y('level_0:N', title = None, axis = alt.Axis(domain = False, ticks = False), sort = method_order),
    color = alt.Color('level_0:O', legend = None),
)

# Value labels printed just right of each bar (3 decimals).
text_chart = alt.Chart(all_folds).mark_text(align = 'left', size = 17, dx = 5).encode(
    y = alt.Y("level_0:N", title = None, axis = alt.Axis(domain = False, ticks = False), sort = method_order), 
    x = alt.X('ES', title = 'Mean ES (lower is better)'), 
    text = alt.Text("ES", format = '.3f'),
)

# One facet row per fold label (level_1).
total_chart = (chart + text_chart).facet(row = alt.Row('level_1', title = None, header=alt.Header(labelFontSize=20)))
fold_chart = big_chart(total_chart, fontsize = 17)
fold_chart

In [None]:
# Standard deviation of the per-fold mean ES across the folds, one value per model.
fold_std = pd.concat(means, axis = 1).std(axis = 1)
fold_std

In [None]:
# Mean and standard deviation of the energy scores per model, computed over all rows of energy_scores.
plot_df

In [None]:
# Summary table for the overall comparison chart: one row per method with its mean ES,
# the std over all scores, and a mean ± fold-std band (numeric bounds plus a text label).
method_labels = {
    "random_baseline" : 'Random sampling',
    "rule-based_metadata_clustering" : "Expert-based",
    "consumption_clustering" : "Data-driven"
}

summary = plot_df.T  # rows: models, columns: 'mean' and 'std'
summary = summary.assign(
    min_std = summary['mean'] - fold_std,
    max_std = summary['mean'] + fold_std,
    std_text = summary['mean'].apply(lambda x: f"{x:.3f}") + ' ± ' + fold_std.apply(lambda x: f"{x:.3f}"),
)

alt_df = (
    summary
    .reset_index()
    .replace(method_labels)
    .set_axis(['Method', 'Mean ES', 'Std ES', 'min_std', 'max_std', 'std_text'], axis = 1)
)
alt_df

In [None]:
from chart_util import big_chart

# Display order of the methods. The labels must match the values in alt_df['Method'].
# The sort lists used before named 'Random Baseline', 'Metadata Clustering' and
# 'Consumption Clustering', none of which occur in the data, so the requested order
# was never actually applied.
method_order = ['Random sampling', 'Expert-based', 'Data-driven']

# Bars: mean ES per method over all folds.
chart = alt.Chart(alt_df, height = 80).mark_bar(size = 20).encode(
    y = alt.Y("Method:N", title = None, axis = alt.Axis(domain = False, ticks = False), sort = method_order), 
    x = alt.X('Mean ES', title = 'Mean ES (lower is better)'), 
    color = alt.Color('Method:O', legend = None),
)

# Value labels printed just right of each bar (3 decimals).
text_chart = alt.Chart(alt_df, height = 80).mark_text(align = 'left', size = 14, dx = 5).encode(
    y = alt.Y("Method:N", title = None, axis = alt.Axis(domain = False, ticks = False), sort = method_order), 
    x = alt.X('Mean ES', title = 'Mean ES (lower is better)'), 
    text = alt.Text('Mean ES', format = '.3f'),
)

# Optional error bars spanning mean ± fold std (disabled):
# hline_chart = alt.Chart(alt_df, height = 80).mark_rule(strokeWidth = 2).encode(
#     y = alt.Y('Method', sort = method_order),
#     x = 'min_std', 
#     x2 = 'max_std'
# )
comparison_chart = big_chart(chart + text_chart , fontsize = 15)
comparison_chart

In [None]:
# Combined figure: (a) the overall mean-ES bars stacked vertically (&) above (b) the per-fold facets.
# Uses whichever chart/text_chart are currently bound plus total_chart, so run the cells in order.
big_chart((chart + text_chart).properties(title = "(a) Mean ES over all folds") & total_chart.properties(title = '(b) Mean ES per fold'))

In [None]:
# Long-format frame of the individual energy scores for the box plot,
# with columns index / method / value.
# NOTE(review): this rebinds alt_df, which earlier held the summary table; re-running the
# overall comparison chart cell after this one will fail on the missing 'Method' column.
# NOTE(review): the labels are assigned by column position, so they rely on the column
# order of energy_scores.
boxplot_labels = ['Random', 'Metadata Clustering', 'Consumption Clustering']
alt_df = (
    energy_scores
    .set_axis(boxplot_labels, axis = 1)
    .reset_index(drop = True)
    .stack()
    .to_frame('value')
    .rename_axis(['index', 'method'], axis = 0)
    .reset_index()
)
alt_df

In [None]:
# Box plot of the ES distribution per method; interactive with the y axis left unbound.
chart = (
    alt.Chart(alt_df, width = 1000)
    .mark_boxplot()
    .encode(
        x = alt.X('value', title = 'ES (lower is better)'),
        y = alt.Y("method:N", title = None, axis = alt.Axis(domain = False, ticks = False)),
        color = alt.Color('method:O', legend = None),
    )
    .interactive(bind_y = False)
)
# NOTE(review): the trailing semicolon suppresses the cell output, so the chart is not
# displayed — confirm this is intended.
big_chart(chart, fontsize = 15);