# Overall ablation study 

### Imports

In [None]:
# standard imports
import numpy as np 
import pandas as pd
import altair as alt
from pathlib import Path
import itertools

# loading the data 
from energyclustering.sampling.preprocessing import DataPreprocessor

# preprocessing the yearly info 
from energyclustering.sampling.preprocessing.info import YearlyInfoPreprocessor

# executing the evaluation in parallel
from dask.distributed import Client

# sampling models 
from energyclustering.sampling.day_of_year_samplers import RandomBaselineSampler # random day selection baseline
from energyclustering.sampling.day_of_year_samplers import DailySamplerFromClusterSampler # to combine daily and yearly sampler
from energyclustering.sampling.day_of_year_samplers import GenerateSampleDecorator
from energyclustering.sampling.day_of_year_samplers import EnergyFilterFromRandomYearSampler # to use energyfilter from a random year
from energyclustering.sampling.samplers import RandomSamplerBaseline
from energyclustering.sampling.samplers import EnergyvilleDaySelectionBaseline
from energyclustering.sampling.samplers import MetadataSampler
from energyclustering.sampling.samplers import ConsumptionDataSampler


# elbow method to use for clustering 
from energyclustering.clustering.elbow import ElbowMethod

# clustering algorithms 
from sklearn.cluster import KMeans
from kmedoids import KMedoids

# cluster metrics 
import energyclustering.clustering.metrics as dist_metrics

# classifiers 
from sklearn.ensemble import RandomForestClassifier 

# evaluation 
from energyclustering.sampling.evaluation import SamplerEvaluator

In [None]:
# IPython autoreload in mode 2: re-import all loaded modules before each cell
# is executed, so edits to the imported project code take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

# The data

In [None]:
# Configure the preprocessing pipeline step by step; every step hands back the
# object on which the next step is called (same order as a single fluent chain).
pipeline = DataPreprocessor().preprocess_info_df('paper').preprocess_weather_df('paper')
pipeline = pipeline.drop_days_with_nan(True).subsample_days(week_reduction_factor=None)
# for testing only!
pipeline = pipeline.subsample_years(1000)

# Materialise the four data frames used by the rest of the notebook.
daily_data_df, data_df, daily_info_df, weather_df = pipeline.get_data()

# Displayed as the cell output: quick sanity check on the size of the daily data.
daily_data_df.shape

In [None]:
# Display the columns available in the daily info frame (cell output only).
daily_info_df.columns

In [None]:
# Cross-validation folds: shuffle a copy of the index of data_df with a fixed
# seed (so the split is reproducible) and cut it into 5 near-equal parts.
generator = np.random.default_rng(seed=1)
shuffled = data_df.index.to_numpy(copy=True)  # copy, so the frame's index stays untouched
generator.shuffle(shuffled)  # shuffles in place
folds = np.array_split(shuffled, indices_or_sections=5)

## Models to compare

In [None]:
# Number of samples every model is asked to generate (passed as n_samples).
NB_SAMPLES = 250
# Candidate cluster counts handed to the elbow method: 10, 15, ..., 100.
CLUSTER_RANGE = [*range(10, 101, 5)]

In [None]:
# (yearly sampler name, daily sampler name) pairs that must not be combined
# into a model when the sampler grids are crossed.
combinations_to_skip = {('random', 'consumption_clustering')}

In [None]:
# Yearly samplers, keyed by the name that ends up in the model label ("y=<name>").
# NOTE(review): n_clusters = 1 looks like a placeholder that ElbowMethod replaces
# with values from CLUSTER_RANGE -- confirm against ElbowMethod.
yearly_samplers = {
    # clustering on two normalized metadata columns
    'metadata_clustering': MetadataSampler(
        clusterer=ElbowMethod(
            KMeans(n_clusters=1, n_init=10),
            cluster_range=CLUSTER_RANGE,
        ),
        info_preprocessing=YearlyInfoPreprocessor(
            columns_to_use=['yearly_consumption', 'connection_power'],
            normalized=True,
        ),
    ),
    # k-medoids clustering with the distance matrix from euc_distance_matrix_missing,
    # combined with a random forest classifier
    'consumption_clustering': ConsumptionDataSampler(
        classifier=RandomForestClassifier(),
        clusterer=ElbowMethod(
            KMedoids(n_clusters=1, method='fasterpam'),
            metric=dist_metrics.euc_distance_matrix_missing,
            cluster_range=CLUSTER_RANGE,
            nb_repeats=10,
        ),
        fillna=False,
    ),
}

# The random yearly sampler is deliberately not part of the grid:
# yearly_samplers['random'] = RandomSamplerBaseline(
#     n_samples = None
# )

In [None]:
# Daily samplers, keyed by the name that ends up in the model label ("d=<name>").
# NOTE(review): unlike the KMeans used for the yearly metadata sampler, this one
# does not set n_init -- confirm that relying on the sklearn default is intended.
daily_samplers = {
    'random': RandomSamplerBaseline(n_samples=None),
    'similar_day': EnergyvilleDaySelectionBaseline(),
    'consumption_clustering': ConsumptionDataSampler(
        classifier=RandomForestClassifier(),
        clusterer=ElbowMethod(KMeans(n_clusters=1), cluster_range=CLUSTER_RANGE),
    ),
}

In [None]:
# Cross every yearly sampler with every daily sampler (yearly in the outer loop)
# and label each resulting model "y=<yearly name>, d=<daily name>".
# NOTE(review): one sampler instance is shared by all models built from it --
# verify that the samplers keep no fitted state between evaluations.
models = {}
for y_name, y_sampler in yearly_samplers.items():
    for d_name, d_sampler in daily_samplers.items():
        if (y_name, d_name) in combinations_to_skip:
            continue
        combined_sampler = DailySamplerFromClusterSampler(
            yearly_sampler=y_sampler,
            daily_sampler=d_sampler,
        )
        models[f'y={y_name}, d={d_name}'] = GenerateSampleDecorator(
            sampler=combined_sampler,
            n_samples=NB_SAMPLES,
        )

# Models with a random yearly component are added by hand, under labels that
# follow the same "y=..., d=..." pattern.
models["y=random, d=random"] = RandomBaselineSampler(NB_SAMPLES)
models["y=random, d=similar_day"] = GenerateSampleDecorator(
    EnergyFilterFromRandomYearSampler(),
    n_samples=NB_SAMPLES,
)

In [None]:
from dask_util import get_dask_cluster

# Create the dask cluster from the listed pinac / himec numbers.
cluster = get_dask_cluster(
    pinac_numbers=list(range(31, 41)),  # 31 through 40
    himec_numbers=[8, 1, 2, 3, 4],
)

In [None]:
import datetime
import logging
from pathlib import Path

import dask

# The file handler installed by logging.basicConfig opens the log file in an
# existing directory only; create logs/ first so a fresh checkout does not fail
# with FileNotFoundError.
log_dir = Path('logs')
log_dir.mkdir(parents=True, exist_ok=True)
log_file = log_dir / f'ablation_{datetime.datetime.now().strftime("%d-%m-%Y")}.log'

# One DEBUG-level log file per day; filemode 'w' overwrites an earlier log of the same day.
logging.basicConfig(
    format='%(asctime)s - %(name)s: %(message)s',
    level=logging.DEBUG,
    filename=str(log_file),
    filemode='w',
)

# Retry failed dask communications up to 5 times, waiting between 60s and 100s.
dask.config.set({
    'distributed.comm.retry.count': 5,
    'distributed.comm.retry.delay.min': '60s',
    'distributed.comm.retry.delay.max': '100s',
})

In [None]:
%%time 
result_path = Path()/'results'/'ablation'
result_path.mkdir(parents = True, exist_ok = True)

energy_scores = []
with Client(cluster) as client:
    evaluator = SamplerEvaluator(folds, daily_data_df, daily_info_df, data_df, client, 1000, crossval = True)
    for key, model in models.items():
        energy_score = evaluator.evaluate_and_save(model, result_path/f"{key}.pkl")
        energy_scores.append(energy_score)
    energy_scores = pd.concat(energy_scores, axis = 1, keys = models.keys())

cluster.close()
# aggregate energy scores
plot_df = energy_scores.agg(['mean', 'std'], axis = 0)

In [None]:
# Turn the mean energy score per model into a (daily sampler x yearly sampler) table.
# Model labels look like "y=<yearly>, d=<daily>": the part before the comma minus
# its first two characters is the yearly name, the part after the comma minus its
# first three characters (" d=") is the daily name.
ablation_table = (
    plot_df.T['mean']
    .to_frame('ES')
    .assign(
        year=lambda frame: frame.index.map(lambda label: label.split(',')[0][2:]),
        day=lambda frame: frame.index.map(lambda label: label.split(',')[1][3:]),
    )
    .pivot_table(index='day', columns='year', values='ES')
    # fixed row / column order; combinations without a model show up as NaN
    .reindex(['random', 'similar_day', 'consumption_clustering'], axis=0)
    .reindex(['random', 'metadata_clustering', 'consumption_clustering'], axis=1)
)
ablation_table.round(3)

In [None]:
# Emit the rounded table as LaTeX source. The values are cast to str first,
# presumably so the Styler prints them exactly as rounded -- verify if the
# formatting matters.
latex_source = ablation_table.round(3).astype('str').style.to_latex()
print(latex_source)

In [None]:
# Shut down the dask cluster. The evaluation cell already calls this after a
# successful run; this separate cell presumably exists so the cluster can be
# released by hand, e.g. after an interrupted evaluation.
cluster.close()