In [None]:
import numpy as np 

In [None]:
from energyclustering.sampling.preprocessing import DataPreprocessor
from dask.distributed import Client
import pandas as pd
import altair as alt

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Enable IPython's autoreload extension in mode 2: modules are re-imported
# before each cell executes, so edits to imported source files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# The data

In [None]:
# Configure the preprocessing pipeline first, then materialise the four frames.
preprocessing_pipeline = (
    DataPreprocessor()
    .preprocess_info_df('baseline')
    .preprocess_weather_df('baseline')
    .drop_days_with_nan(True)
    # Day subsampling is deliberately disabled for this run; it used to be:
    #     .subsample_days(week_reduction_factor = 5)
    # NOTE: the year subsampling below was flagged "for testing only!" by the author.
    .subsample_years(500)
)
daily_data_df, data_df, daily_info_df, weather_df = preprocessing_pipeline.get_data()

# Quick sanity check on the size of the daily data.
daily_data_df.shape

In [None]:
# Display the ('day_info', 'FeelsLikeC') column of the daily info frame.
feels_like_column = ('day_info', 'FeelsLikeC')
daily_info_df.loc[:, feels_like_column]

# Folds

In [None]:
# Split the index of data_df into (near-)equal folds after a seeded shuffle,
# so the fold assignment is identical on every run.
FOLD_SEED = 1
NB_FOLDS = 3

generator = np.random.default_rng(seed=FOLD_SEED)
# Work on a copy so that shuffling in place never touches the index itself.
shuffled = data_df.index.to_numpy(copy=True)
generator.shuffle(shuffled)
folds = np.array_split(shuffled, NB_FOLDS)

# Test day selection methods using consumption clustering, metadata clustering and energyville baseline
Main idea: fix the way to select years and vary the way to select days.  
Detail: Use consumption clustering with 50 clusters (best performing in previous experiments) to select the years.  
Then use different strategies with different numbers of clusters to select the days and compare the different approaches.  


In [None]:
from energyclustering.sampling.samplers import ConsumptionDataSampler, MetadataSampler, EnergyvilleDaySelectionBaseline, RandomSamplerBaseline
from energyclustering.sampling.day_of_year_samplers import DailySamplerFromClusterSampler, GenerateSampleDecorator
from sklearn.ensemble import RandomForestClassifier 
from sklearn.cluster import KMeans
from pathlib import Path
from energyclustering.sampling.evaluation.evaluation import SamplerEvaluator

In [None]:
NB_OF_YEARLY_CLUSTERS = 50
NB_SAMPLES = 500


def _make_yearly_sampler():
    """Return a new yearly sampler with the configuration shared by every model.

    Each call creates fresh classifier and clusterer objects, so no estimator
    instance is shared between two models.
    """
    return ConsumptionDataSampler(
        classifier = RandomForestClassifier(),
        clusterer = KMeans(NB_OF_YEARLY_CLUSTERS),
        info_preprocessing = None,
    )


def _make_model(daily_sampler):
    """Combine the fixed yearly sampler with the given day-selection strategy.

    Args:
        daily_sampler: sampler used to select the days; this is the only part
            that differs between the models compared in this notebook.

    Returns:
        A GenerateSampleDecorator wrapping a DailySamplerFromClusterSampler,
        configured to generate NB_SAMPLES samples.
    """
    return GenerateSampleDecorator(
        sampler = DailySamplerFromClusterSampler(
            yearly_sampler = _make_yearly_sampler(),
            daily_sampler = daily_sampler,
        ),
        n_samples = NB_SAMPLES,
    )


# Keys follow the pattern '<name>_<nb_clusters>'; the results cell splits on the
# last underscore and parses the suffix as an int, so the baselines (which have
# no cluster count) carry a '_0' suffix.
models = dict()

models['daily_sampling_random_baseline_0'] = _make_model(
    RandomSamplerBaseline(n_samples = 100)
)

models['daily_sampling_EV_baseline_0'] = _make_model(
    EnergyvilleDaySelectionBaseline(allowed_temp_diff = 2.5)
)

# Vary the number of daily clusters for both clustering-based strategies.
for nb_clusters in [10, 20, 30, 50]:
    models[f'daily_sampling_consumption_{nb_clusters}'] = _make_model(
        ConsumptionDataSampler(
            classifier = RandomForestClassifier(),
            clusterer = KMeans(nb_clusters),
            info_preprocessing = None,
        )
    )

    # The original call had a stray `0` after the keyword argument, which made
    # the whole cell a SyntaxError; MetadataSampler only receives the clusterer.
    models[f'daily_sampling_metadata_{nb_clusters}'] = _make_model(
        MetadataSampler(
            clusterer = KMeans(nb_clusters),
        )
    )
                   
                   
    

In [None]:
%%time 
energy_scores = []

result_path = Path()/'results'/'daily_sampling2'
result_path.mkdir(parents = True, exist_ok = True)

with Client(local_directory = '/cw/dtailocal/', n_workers=30, threads_per_worker = 1) as client:
    evaluator = SamplerEvaluator(folds, daily_data_df, daily_info_df, data_df, client, 200, crossval = True)
    for key, model in models.items():
        energy_score = evaluator.evaluate_and_save(model, result_path/f"{key}.pkl")
        energy_scores.append(energy_score)
    energy_scores = pd.concat(energy_scores, axis = 1, keys = models.keys())

# Average the energy scores per column, giving one row per model key.
plot_df = energy_scores.mean(axis = 0).to_frame('mean energy score')


def _split_model_key(model_key):
    """Split '<name>_<nb_clusters>' at its last underscore into (name, nb_clusters)."""
    name, _, nb_clusters = model_key.rpartition('_')
    return name, int(nb_clusters)


# Re-index the scores by (name, nb_clusters), both parsed from the model key.
consumption_plot_df = (
    plot_df
    .assign(
        name = lambda df: df.index.map(lambda key: _split_model_key(key)[0]),
        nb_clusters = lambda df: df.index.map(lambda key: _split_model_key(key)[1]),
    )
    .set_index(['name', 'nb_clusters'])
    .sort_index()
)


consumption_plot_df

## Compared with metadata clustering
Interestingly, consumption clustering seems less sensitive to a clustering that is too fine-grained. 
This is probably because the classifier can no longer distinguish between the different clusters and will simply assign an instance to both of them. 


In [None]:
# Let the y-axis start near the data instead of at zero, so the differences
# between the strategies remain visible.
energy_score_axis = alt.Y('mean energy score:Q', scale = alt.Scale(zero = False))

# One line per strategy name, plotted against the number of clusters.
chart = (
    alt.Chart(consumption_plot_df.reset_index())
    .mark_line()
    .encode(
        x = 'nb_clusters:Q',
        y = energy_score_axis,
        color = 'name:N',
    )
)

# Overlay point markers on the lines to highlight the measured cluster counts.
chart + chart.mark_circle()