In [None]:
import numpy as np 

In [None]:
from energyclustering.sampling.preprocessing import DataPreprocessor
from dask.distributed import Client
import pandas as pd
import altair as alt

In [None]:
%load_ext autoreload
%autoreload 2

# The data

In [None]:
# Run the DataPreprocessor pipeline and unpack the four frames returned by
# get_data(); daily_data_df, daily_info_df and data_df are passed to the
# evaluator further down in this notebook.
# NOTE(review): what each pipeline step does is defined in
# energyclustering.sampling.preprocessing and cannot be verified from this
# notebook.
daily_data_df, data_df, daily_info_df, weather_df = (
    DataPreprocessor()
    .preprocess_info_df('baseline')
    .preprocess_weather_df('baseline')
    .drop_days_with_nan(True)
    .subsample_days(week_reduction_factor = None)
    # For testing only! Presumably restricts the run to a subsample of 500
    # years to keep it fast -- TODO confirm, and remove for the full experiment.
    .subsample_years(500)
    .get_data()
)
# Display the shape of the daily data as a quick sanity check of the load.
daily_data_df.shape

In [None]:
# Display everything stored under the 'household_info' column label of
# daily_info_df (all rows).
daily_info_df.loc[:, 'household_info']

# Folds

In [None]:
# Fold configuration, named so the split can be tuned in one place.
FOLD_SEED = 1  # seed of the generator that shuffles the index
NB_FOLDS = 3   # number of parts the shuffled index is split into

# Shuffle a *copy* of the index of data_df with a seeded generator: the fold
# assignment is identical on every run and data_df itself is left untouched.
generator = np.random.default_rng(FOLD_SEED)
shuffled = data_df.index.to_numpy(copy=True)
generator.shuffle(shuffled)  # shuffles the copy in place

# np.array_split (unlike np.split) accepts a length that is not a multiple of
# NB_FOLDS, in which case the folds differ in size by at most one element.
folds = np.array_split(shuffled, NB_FOLDS)

## Clusterers

In [None]:
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import euclidean_distances
from numba import jit, float64
from dtaidistance import dtw
import kmedoids
from energyclustering.sampling.samplers import ConsumptionDataSampler, MetadataSampler, EnergyvilleDaySelectionBaseline, RandomSamplerBaseline
from energyclustering.sampling.day_of_year_samplers import DailySamplerFromClusterSampler, GenerateSampleDecorator
from sklearn.ensemble import RandomForestClassifier 
from sklearn.cluster import KMeans
from pathlib import Path
from energyclustering.sampling.evaluation.evaluation import SamplerEvaluator
from energyclustering.clustering.elbow import ElbowMethod

In [None]:
@jit(float64(float64[:], float64[:]), nogil = True, nopython = True)
def euc_dist_missing(a1, a2):
    """Mean squared element-wise difference between two 1-D float64 arrays.

    Positions where the difference is NaN (i.e. where either input is NaN)
    are left out of the mean by ``np.nanmean``. No square root is taken, so
    the value is a mean *squared* difference rather than a Euclidean norm.
    """
    squared_differences = (a1 - a2) ** 2
    return np.nanmean(squared_differences)

def euc_distance_matrix_missing(x):
    """Pairwise distance matrix between the rows of ``x`` using ``euc_dist_missing``.

    NOTE(review): ``sklearn.metrics.pairwise_distances`` validates its input
    for non-finite values by default -- confirm that NaNs can actually reach
    the NaN-aware metric with the installed scikit-learn version.
    """
    return pairwise_distances(x, metric = euc_dist_missing)


def euc_distance_matrix(x):
    """Euclidean distance matrix between the rows of ``x``.

    Missing values are replaced by 0 (``x.fillna(0)``) before the distances
    are computed, so ``x`` must offer a pandas-style ``fillna``.
    """
    return euclidean_distances(x.fillna(0))


def dtw_distance_matrix(x):
    """DTW distance matrix between the rows of ``x`` (warping window of 4).

    ``x`` is converted with ``x.to_numpy()`` and handed to
    ``dtw.distance_matrix_fast`` from dtaidistance.
    """
    return dtw.distance_matrix_fast(x.to_numpy(), window = 4)

class CustomKMedoids:
    """K-medoids clusterer that computes its own distance matrix.

    ``metric`` is a callable that turns the data handed to ``fit`` into the
    matrix on which ``kmedoids.KMedoids`` (method ``'fasterpam'``) is fitted.
    After fitting, the integer cluster labels are available as ``labels_``.
    """

    def __init__(self, nb_clusters, metric, random_state = None):
        # Number of clusters requested from kmedoids.KMedoids.
        self.nb_clusters = nb_clusters
        # Callable: data -> matrix that is passed to KMedoids.fit.
        self.metric = metric
        # Forwarded unchanged to kmedoids.KMedoids.
        self.random_state = random_state
        # Filled in by fit(); None until then.
        self.labels_ = None

    def fit(self, data):
        """Cluster ``data`` and store the resulting labels in ``labels_``.

        Returns ``self`` so that calls can be chained.
        """
        distance_matrix = self.metric(data)
        fitted = kmedoids.KMedoids(
            self.nb_clusters,
            method = 'fasterpam',
            random_state = self.random_state,
        ).fit(distance_matrix)
        # Labels are stored as plain integers.
        self.labels_ = fitted.labels_.astype('int')
        return self
    
    

## Models to test

In [None]:
# Experiment configuration.
NB_YEARLY_CLUSTERS = 40
NB_DAILY_CLUSTERS = 30
NB_SAMPLES = 250
# One seed for every stochastic component configured in this cell, so that a
# re-run of the notebook builds identically seeded clusterers and classifiers.
RANDOM_STATE = 0

models = dict()

# Clusterers used by the yearly sampler. The fixed-size key is derived from
# the constant so the label cannot drift from the cluster count it describes.
# NOTE(review): range(20, 80, 2) stops at 78 whereas the daily range(10, 81, 5)
# includes 80 -- confirm the different upper bounds are intended.
yearly_clustering = {
    f'euclidean_{NB_YEARLY_CLUSTERS}': CustomKMedoids(
        NB_YEARLY_CLUSTERS, euc_distance_matrix_missing, random_state = RANDOM_STATE
    ),
    'euclidean_auto': ElbowMethod(
        kmedoids.KMedoids(1, method = 'fasterpam', random_state = RANDOM_STATE),
        euc_distance_matrix_missing,
        range(20, 80, 2),
    ),
}

# Clusterers used by the daily sampler.
daily_clustering = {
    f'euclidean_{NB_DAILY_CLUSTERS}': CustomKMedoids(
        NB_DAILY_CLUSTERS, euc_distance_matrix, random_state = RANDOM_STATE
    ),
    'euclidean_auto': ElbowMethod(
        kmedoids.KMedoids(1, method = 'fasterpam', random_state = RANDOM_STATE),
        euc_distance_matrix,
        range(10, 81, 5),
    ),
}

# One model per (yearly clusterer, daily clusterer) combination, keyed by a
# label naming both. Every model gets fresh classifiers, but a clusterer
# instance is shared by all models that use it.
for y_name, y_cluster in yearly_clustering.items():
    for d_name, d_cluster in daily_clustering.items():
        yearly_sampler = ConsumptionDataSampler(
            classifier = RandomForestClassifier(random_state = RANDOM_STATE),
            clusterer = y_cluster,
            info_preprocessing = None,
        )
        daily_sampler = ConsumptionDataSampler(
            classifier = RandomForestClassifier(random_state = RANDOM_STATE),
            clusterer = d_cluster,
            info_preprocessing = None,
        )
        models[f'y={y_name}, d={d_name}'] = GenerateSampleDecorator(
            DailySamplerFromClusterSampler(
                yearly_sampler = yearly_sampler,
                daily_sampler = daily_sampler,
            ),
            n_samples = NB_SAMPLES,
        )



## Do the experiment

In [None]:
%%time 
# Settings of the local Dask cluster.
# NOTE(review): the scratch directory is machine-specific -- adjust it when
# running this notebook on another machine.
DASK_LOCAL_DIRECTORY = '/cw/dtailocal/'
NB_WORKERS = 20

# Every model's result is saved as '<model key>.pkl' in this directory.
result_path = Path()/'results'/'daily_sampling'
result_path.mkdir(parents = True, exist_ok = True)

# Per-model scores, collected in the iteration order of `models`. Kept in its
# own list so that `energy_scores` only ever refers to the combined frame.
energy_score_per_model = []

with Client(local_directory = DASK_LOCAL_DIRECTORY, n_workers = NB_WORKERS, threads_per_worker = 1) as client:
    # NOTE(review): the meaning of the positional argument 400 is defined by
    # SamplerEvaluator and is not visible in this notebook.
    evaluator = SamplerEvaluator(folds, daily_data_df, daily_info_df, data_df, client, 400, crossval = False)
    for key, model in models.items():
        energy_score = evaluator.evaluate_and_save(model, result_path/f"{key}.pkl")
        energy_score_per_model.append(energy_score)
    # Put the per-model results side by side, labelled with the model keys.
    energy_scores = pd.concat(energy_score_per_model, axis = 1, keys = models.keys())

# Aggregate the energy scores: mean and standard deviation of every column.
plot_df = energy_scores.agg(['mean', 'std'], axis = 0)


In [None]:
# Transpose the aggregate so that 'mean' and 'std' become the columns and each
# column of energy_scores (labelled by model key) becomes a row.
plot_df.T