In [None]:
import numpy as np 

In [None]:
from energyclustering.sampling.preprocessing import DataPreprocessor
from dask.distributed import Client
import pandas as pd
import altair as alt

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# The data

In [None]:
# Assemble the preprocessing pipeline one step at a time; each step is applied
# to the object returned by the previous step.
preprocessor = DataPreprocessor()
preprocessor = preprocessor.preprocess_info_df('baseline')
preprocessor = preprocessor.preprocess_weather_df('baseline')
preprocessor = preprocessor.drop_days_with_nan(True)
preprocessor = preprocessor.subsample_days(week_reduction_factor=1)
# for testing only! (uncomment to add a year-subsampling step)
# preprocessor = preprocessor.subsample_years(300)

# get_data() hands back the four frames used by the rest of the notebook.
daily_data_df, data_df, daily_info_df, weather_df = preprocessor.get_data()

# Show the dimensions of the daily data as the cell output.
daily_data_df.shape

In [None]:
# Number of rows in data_df (presumably one row per yearly profile — confirm).
nb_years = len(data_df)

# Folds

In [None]:
# Partition the row labels of data_df into disjoint, randomly assigned folds.
FOLD_SEED = 1  # fixed seed so the fold assignment is identical on every run
NB_FOLDS = 3   # number of folds the index is split into

generator = np.random.default_rng(FOLD_SEED)
# Work on a copy of the index values so the in-place shuffle cannot touch data_df.
shuffled = data_df.index.to_numpy(copy=True)
generator.shuffle(shuffled)
# array_split (unlike split) accepts a length that is not a multiple of NB_FOLDS.
folds = np.array_split(shuffled, NB_FOLDS)

# The samplers

In [None]:
from energyclustering.sampling.samplers import MetadataSampler
from energyclustering.sampling.samplers import RandomSamplerBaseline, RandomSampler
from energyclustering.sampling.day_of_year_samplers import RandomDayFromYearSampler, SpecificDayFromYearSampler, SimilarDayFromYearSampler
from sklearn.cluster import KMeans

# Evaluate the samplers

In [None]:
from energyclustering.sampling.evaluation import SamplerEvaluator
from pathlib import Path
import time

In [None]:
# --- Experiment configuration ---------------------------------------------
NB_CLUSTERS = 20
# nb_years * 2/3 / NB_CLUSTERS, truncated to an integer.
# NOTE(review): the 2/3 factor presumably corresponds to using 2 of the 3 folds
# for training — confirm against SamplerEvaluator.
NB_OF_YEARS_TO_SAMPLE = int(nb_years * (2/3)*(1 / NB_CLUSTERS))
NB_OF_DAYS_TO_SAMPLE = 10

# One entry per sampler configuration; the key doubles as the stem of the
# result file written for that configuration.
# NOTE(review): KMeans is built without random_state, so the clustering (and
# therefore the results) can differ between runs — consider seeding it.
models = dict(
    random_day_random_year = RandomSamplerBaseline(
        NB_OF_YEARS_TO_SAMPLE * NB_OF_DAYS_TO_SAMPLE
    ),
    random_day_from_metadata_year = RandomDayFromYearSampler(
        MetadataSampler(KMeans(NB_CLUSTERS)), NB_OF_DAYS_TO_SAMPLE
    ),
    weather_day_from_random_year = SimilarDayFromYearSampler(
        RandomSampler(), NB_OF_DAYS_TO_SAMPLE, weather_df
    ),
    weather_day_from_metadata_year = SimilarDayFromYearSampler(
        MetadataSampler(KMeans(NB_CLUSTERS)), NB_OF_DAYS_TO_SAMPLE, weather_df
    ),
)

# Directory that receives one pickle per evaluated model.
result_path = Path()/'results'/'energyville_full_ablation'
result_path.mkdir(parents = True, exist_ok = True)

# NOTE(review): local_directory is a machine-specific absolute path; adjust it
# when running on another host.
with Client(local_directory = '/cw/dtailocal/', n_workers=20, threads_per_worker = 1) as client:
    # NOTE(review): the meaning of the trailing 200 is not visible in this
    # notebook — check the SamplerEvaluator signature.
    evaluator = SamplerEvaluator(folds, daily_data_df, daily_info_df, data_df, client, 200)
    for key, model in models.items():
        # perf_counter is monotonic, so the measured duration cannot be skewed
        # by system clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        evaluator.evaluate_and_save(model, result_path / f"{key}_v0.pkl")
        print(f"{key}: {time.perf_counter() - start_time} s")
        # Restart the cluster between models (presumably to start each
        # evaluation from clean workers).
        client.restart()


In [None]:

# Gather the result pickles from result_path. Globbing for '*.pkl' skips stray
# entries (sub-directories, checkpoint folders, other files) that read_pickle
# cannot open, and sorting makes the column order deterministic — iterdir()
# yields entries in arbitrary order.
all_files = sorted(result_path.glob('*.pkl'))
# NOTE: unpickling can execute arbitrary code; only load files that were
# produced by a trusted run.
results = pd.concat(
    [pd.read_pickle(path) for path in all_files],
    keys=[path.stem for path in all_files],  # file stem labels each column group
    axis=1,
)
results

In [None]:
# Mean of every results column, aggregated over the rows.
results.mean(axis="index")

In [None]:
# Standard deviation of every results column, aggregated over the rows.
results.std(axis="index")