# Test: look up rows by ID in a stringified `(id, year)` index

In [None]:
import pandas as pd

In [None]:
# Small hand-made Series whose index holds stringified ("id", year) tuples.
# The key for "id1" appears twice, so selecting that ID yields two rows.
clustering = pd.Series(
    data = [1, 2, 3, 4],
    index = pd.Index([
        '("id1", 2012)',
        '("id1", 2012)',
        '("id3", 2012)',
        '("id4", 2012)',
    ]),
)
clustering

In [None]:
# Map every original index label to the ID embedded in it and select the
# rows that belong to 'id1'.
# The ID is taken as the text between the first pair of double quotes after
# the opening parenthesis, rather than by slicing fixed character positions,
# so the lookup no longer depends on the length of the year suffix.
(
    pd.DataFrame({
        'index': clustering.index,
        'ID': clustering.index.str.extract(r'\("([^"]+)"', expand = False),
    })
    .set_index('ID')
    .loc['id1']
)

In [None]:
import numpy as np 
from energyclustering.sampling.preprocessing import DataPreprocessor
from dask.distributed import Client
import pandas as pd
import altair as alt
from energyclustering.sampling.samplers import ConsumptionDataSampler, ConsumptionDataSamplerWithValidation, MetadataSampler, EnergyvilleDaySelectionBaseline, RandomSamplerBaseline
from energyclustering.sampling.day_of_year_samplers import DailySamplerFromClusterSampler,IndividualDailySamplerFromClusterSampler, GenerateSampleDecorator
from sklearn.ensemble import RandomForestClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from pathlib import Path
from energyclustering.sampling.evaluation.evaluation import SamplerEvaluator

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Load IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# The data

In [None]:
# Run the DataPreprocessor pipeline once and keep its result as a tuple
# before unpacking it into the four frames used by the rest of the notebook.
preprocessed_frames = (
    DataPreprocessor()
    .preprocess_info_df('baseline')
    .preprocess_weather_df('baseline')
    .drop_days_with_nan(True)
    # no subsampling this time
    .subsample_days(week_reduction_factor = None)
    # for testing only!
    # NOTE(review): remove or raise the 1000 before a full run - confirm what
    # the argument of subsample_years counts.
    .subsample_years(1000)
    .get_data()
)
daily_data_df, data_df, daily_info_df, weather_df = preprocessed_frames
daily_data_df.shape

# Folds

In [None]:
# Shuffle a private copy of data_df's index with a fixed seed (so the split
# is the same on every run) and cut it into three near-equal folds.
generator = np.random.default_rng(seed = 1)
shuffled = data_df.index.to_numpy().copy()
generator.shuffle(shuffled)
folds = np.array_split(shuffled, indices_or_sections = 3)

In [None]:
NB_OF_YEARLY_CLUSTERS = 4
NB_DAILY_CLUSTERS = 30
NB_SAMPLES = 500

# Settings of the local Dask cluster used for the evaluation.
# NOTE(review): the scratch directory is an absolute, machine-specific path
# and the worker count assumes a large machine - adjust before running
# elsewhere.
DASK_LOCAL_DIRECTORY = '/cw/dtailocal/'
NB_WORKERS = 30


def _build_model(sampler_cls, daily_sampler_cls):
    """Assemble one two-level sampler wrapped in a GenerateSampleDecorator.

    All model configurations in this notebook share the same estimators and
    hyper-parameters; they differ only in the two classes passed in here.

    Parameters
    ----------
    sampler_cls : class
        Class that combines the yearly and the daily sampler. It is called
        with the keyword arguments ``yearly_sampler`` and ``daily_sampler``.
    daily_sampler_cls : class
        Class used for the daily level. It is called with the keyword
        arguments ``classifier``, ``clusterer`` and ``info_preprocessing``.

    Returns
    -------
    GenerateSampleDecorator
        The combined sampler wrapped with ``n_samples = NB_SAMPLES``.

    Notes
    -----
    Every call creates new estimator instances, so no estimator object is
    shared between configurations.
    NOTE(review): the estimators are created without ``random_state``, so
    results are presumably not reproducible between runs - consider seeding.
    """
    yearly_sampler = ConsumptionDataSampler(
        classifier = RandomForestClassifier(),
        clusterer = KMeans(NB_OF_YEARLY_CLUSTERS),
        info_preprocessing = None,
    )
    daily_sampler = daily_sampler_cls(
        classifier = DecisionTreeClassifier(criterion = 'log_loss', max_depth = 4),
        clusterer = KMeans(NB_DAILY_CLUSTERS),
        info_preprocessing = None,
    )
    return GenerateSampleDecorator(
        sampler = sampler_cls(
            yearly_sampler = yearly_sampler,
            daily_sampler = daily_sampler,
        ),
        n_samples = NB_SAMPLES,
    )


# The keys double as result file names and as column keys of energy_scores.
models = dict(
    per_profile = _build_model(
        IndividualDailySamplerFromClusterSampler, ConsumptionDataSampler
    ),
    per_cluster = _build_model(
        DailySamplerFromClusterSampler, ConsumptionDataSampler
    ),
    per_profile_validation = _build_model(
        IndividualDailySamplerFromClusterSampler, ConsumptionDataSamplerWithValidation
    ),
    per_cluster_validation = _build_model(
        DailySamplerFromClusterSampler, ConsumptionDataSamplerWithValidation
    ),
)

result_path = Path()/'results'/'per_profile_PoC'
result_path.mkdir(parents = True, exist_ok = True)

with Client(local_directory = DASK_LOCAL_DIRECTORY, n_workers = NB_WORKERS, threads_per_worker = 1) as client:
    # NOTE(review): the positional 400 is handed straight to SamplerEvaluator;
    # its meaning is not visible in this notebook - confirm and name it.
    evaluator = SamplerEvaluator(folds, daily_data_df, daily_info_df, data_df, client, 400, crossval = False)
    # Evaluate every configuration once; each result is also written to
    # <result_path>/<key>.pkl by evaluate_and_save.
    scores_per_model = {
        key: evaluator.evaluate_and_save(model, result_path/f"{key}.pkl")
        for key, model in models.items()
    }
    # Passing the dict makes its keys the outer column level, in insertion order.
    energy_scores = pd.concat(scores_per_model, axis = 1)

# aggregate energy scores
plot_df = energy_scores.mean(axis = 0).to_frame('mean energy score')
std_df = energy_scores.std(axis = 0).to_frame('std energy score')

In [None]:
# Display the column-wise mean of energy_scores ('mean energy score').
plot_df

In [None]:
# Display the column-wise standard deviation of energy_scores ('std energy score').
std_df