# Overall ablation study 

### Imports

In [None]:
# standard imports
import numpy as np 
import pandas as pd
import altair as alt
from pathlib import Path
import itertools

# loading the data 
from energyclustering.sampling.preprocessing import DataPreprocessor

# preprocessing the yearly info 
from energyclustering.sampling.preprocessing.info import YearlyInfoPreprocessor

# executing the experiments in parallel
from dask.distributed import Client

# sampling models 
from energyclustering.sampling.day_of_year_samplers import RandomBaselineSampler # random day selection baseline
from energyclustering.sampling.day_of_year_samplers import DailySamplerFromClusterSampler # to combine daily and yearly sampler
from energyclustering.sampling.day_of_year_samplers import GenerateSampleDecorator
from energyclustering.sampling.day_of_year_samplers import EnergyFilterFromRandomYearSampler # to use energyfilter from a random year
from energyclustering.sampling.samplers import RandomSamplerBaseline
from energyclustering.sampling.samplers import EnergyvilleDaySelectionBaseline
from energyclustering.sampling.samplers import MetadataSampler
from energyclustering.sampling.samplers import ConsumptionDataSampler


# elbow method to use for clustering 
from energyclustering.clustering.elbow import ElbowMethod

# clustering algorithms 
from sklearn.cluster import KMeans
from kmedoids import KMedoids

# cluster metrics 
import energyclustering.clustering.metrics as dist_metrics

# classifiers 
from sklearn.ensemble import RandomForestClassifier 

# evaluation 
from energyclustering.sampling.evaluation import SamplerEvaluator

import seaborn as sn


In [None]:
# IPython magics (not plain Python): load the autoreload extension and switch it
# to mode 2, so edited modules are re-imported before each cell is executed.
%load_ext autoreload
%autoreload 2

# The data

In [None]:
# Build the four data tables through the project's DataPreprocessor pipeline.
# NOTE(review): what each step does is only inferred from its name here;
# confirm against the DataPreprocessor implementation.
daily_data_df, data_df, daily_info_df, weather_df = (
    DataPreprocessor()
    .preprocess_info_df('baseline')
    .preprocess_weather_df('baseline')
    .drop_days_with_nan(True)
    .subsample_days(week_reduction_factor = None)
    # for testing only!
    # NOTE(review): this subsample also determines data_df.index, which the
    # cross-validation folds further down are built from — confirm whether it
    # should still be active when analysing the final results.
    .subsample_years(1000)
    .get_data()
)
# cell output: dimensions of the daily table
daily_data_df.shape

In [None]:
# generate folds for cross validation
# Fixed seed so the same split is produced on every run.
generator = np.random.default_rng(1)
# Work on a copy of the index labels: shuffle() reorders its argument in place.
shuffled = data_df.index.to_numpy(copy=True)
generator.shuffle(shuffled)
# Cut the shuffled labels into 3 parts (array_split tolerates a length that is
# not divisible by 3, so the parts may differ in size).
folds = np.array_split(shuffled, 3)

# Load the results

In [None]:
# Load every pickled result in results/ablation and combine them into one
# table with one column per file, named after the file stem.
# NOTE(review): presumably each pickle holds a pandas Series (it is renamed and
# concatenated as a column) — confirm against the code that writes the results.
# NOTE: unpickling can execute arbitrary code; only load result files that
# were produced by this project.
result_path = Path()/'results/ablation'

# glob() yields files in arbitrary order; sorting keeps the column order of
# energy_scores reproducible across runs and machines.
result_files = sorted(result_path.glob('*.pkl'))
if not result_files:
    # Fail with an explicit message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError(f"no '*.pkl' result files found in {result_path.resolve()}")

series = [pd.read_pickle(file).rename(file.stem) for file in result_files]
energy_scores = pd.concat(series, axis=1)


In [None]:
# cell output: the combined table, one column per loaded result file
energy_scores

In [None]:
# Mean of every column (one per result file) over all rows, shown as a
# single-column table labelled 'average ES'.
(
    energy_scores
    .mean(axis=0)
    .to_frame(name='average ES')
)

In [None]:
# Keep only the columns whose name contains 'd=similar' and relabel each one
# with the part of its name that precedes the first comma.
consumption_columns = [name for name in energy_scores.columns if 'd=similar' in name]
new_column_names = [name.partition(',')[0] for name in consumption_columns]
selected_scores = energy_scores.loc[:, consumption_columns]
plot_df = selected_scores.set_axis(new_column_names, axis=1)
plot_df
                    

# Plot paired plots 

In [None]:
def plot_scatter_comparison(x, y, bins=50, log_scale=True, data=None, limits=(0.1, 100)):
    """Draw a 2D histogram of two score columns with a diagonal reference line.

    Parameters
    ----------
    x, y
        Columns of ``data`` for the horizontal and vertical axis (the notebook
        passes column names).
    bins
        Forwarded to ``seaborn.histplot`` as ``bins``.
    log_scale
        Forwarded to ``seaborn.histplot`` as ``log_scale``.
    data : pandas.DataFrame, optional
        Table to plot from. When omitted, the notebook-level ``energy_scores``
        table is used, which is what the function always used before.
    limits : tuple of (low, high), optional
        Limits applied to both axes. The default matches the previously
        hard-coded range.
    """
    if data is None:
        # Looked up at call time, so the table only has to exist when plotting.
        data = energy_scores
    ax = sn.histplot(data=data, x=x, y=y, bins=bins, log_scale=log_scale)
    # Corner-to-corner line in axes coordinates. Both axes get the same limits
    # below, so it marks x == y as long as both axes use the same scale.
    ax.plot([0, 1], [0, 1], 'orange', transform=ax.transAxes)
    ax.set_xlim(*limits)
    ax.set_ylim(*limits)
    ax.set_aspect('equal', 'box')

In [None]:
# x axis: 'y=consumption_clustering, d=consumption_clustering'; y axis: 'y=random, d=random'
plot_scatter_comparison(
    x='y=consumption_clustering, d=consumption_clustering',
    y='y=random, d=random',
)

In [None]:
# x axis: 'y=metadata_clustering, d=similar_day'; y axis: 'y=random, d=random'
plot_scatter_comparison(
    x='y=metadata_clustering, d=similar_day',
    y='y=random, d=random',
)

In [None]:
# x axis: 'y=consumption_clustering, d=consumption_clustering'; y axis: 'y=metadata_clustering, d=similar_day'
plot_scatter_comparison(
    x='y=consumption_clustering, d=consumption_clustering',
    y='y=metadata_clustering, d=similar_day',
)

In [None]:
# Rank the columns against each other within every row (rank 1 = smallest
# value), average each column's rank over all rows, and list them from the
# lowest to the highest average rank.
(
    energy_scores
    .rank(axis=1)
    .mean(axis=0)
    .to_frame('average rank')
    .sort_values('average rank')
)

# Check the variance between folds

In [None]:
# cell output: the full score table again, as a reference for the per-fold breakdown
energy_scores

In [None]:
# Mean score of every column of energy_scores within each cross-validation
# fold; the result has one row per score column and one column per fold.
# NOTE(review): .loc raises KeyError if a fold label is missing from
# energy_scores — confirm the scores were computed on the same rows as data_df.
mean_es_per_fold = pd.concat(
    [energy_scores.loc[fold].mean(axis=0) for fold in folds],
    axis=1,
    # Labels follow the actual number of folds instead of a hard-coded 3, so
    # they stay in sync if the split is changed.
    keys=[f"fold{i}" for i in range(len(folds))],
)
# cell output: rank of each score column within every fold (rank 1 = smallest mean)
mean_es_per_fold.rank(axis=0)