In [None]:
import numpy as np 

In [None]:
from energyclustering.sampling.preprocessing import DataPreprocessor
from dask.distributed import Client
import pandas as pd
import altair as alt
from tqdm import tqdm

In [None]:
import warnings
# Suppress every FutureWarning for the remainder of the session so library
# deprecation notices do not clutter the cell outputs.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell execution, so edits to the project package are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

# The data

In [None]:
# Configure the preprocessing pipeline first, then unpack the four frames it produces.
preprocessor = (
    DataPreprocessor()
    .preprocess_info_df('baseline')
    .preprocess_weather_df('baseline')
    .drop_days_with_nan(True)
    # no subsampling of days this time
    .subsample_days(week_reduction_factor=None)
    # for testing only! (restricts the run to a subsample of years)
    .subsample_years(500)
)
daily_data_df, data_df, daily_info_df, weather_df = preprocessor.get_data()

# Show the size of the daily data as the cell output.
daily_data_df.shape

In [None]:
# Display the ('day_info', 'FeelsLikeC') column of the daily info frame.
daily_info_df.loc[:, ('day_info', 'FeelsLikeC')]

## Select number of clusters automatically

In [None]:
from energyclustering.clustering.elbow import ElbowMethod


In [None]:
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import euclidean_distances
from numba import jit, float64
from dtaidistance import dtw
import kmedoids
from energyclustering.sampling.samplers import ConsumptionDataSampler, MetadataSampler, EnergyvilleDaySelectionBaseline, RandomSamplerBaseline
from energyclustering.sampling.day_of_year_samplers import DailySamplerFromClusterSampler, GenerateSampleDecorator
from sklearn.ensemble import RandomForestClassifier 
from sklearn.cluster import KMeans
from pathlib import Path
from energyclustering.sampling.evaluation.evaluation import SamplerEvaluator

In [None]:
@jit(float64(float64[:], float64[:]), nogil=True, nopython=True)
def euc_dist_missing(a1, a2):
    """Return the NaN-ignoring mean of the squared element-wise differences.

    Positions where either input is NaN produce a NaN difference and are
    skipped by ``np.nanmean``. Note that no square root is taken, so the
    result is a mean squared difference rather than a true Euclidean distance.
    """
    squared_diff = (a1 - a2) ** 2
    return np.nanmean(squared_diff)

def euc_distance_matrix_missing(x):
    """Pairwise matrix of ``euc_dist_missing`` between the rows of ``x``."""
    # NOTE(review): sklearn's pairwise_distances validates its input for NaN by
    # default; confirm that rows containing NaN actually reach the callable
    # metric with the installed sklearn version.
    return pairwise_distances(x, metric=euc_dist_missing)


def euc_distance_matrix(x):
    """Pairwise Euclidean distances between the rows of ``x`` after replacing NaN with 0."""
    return euclidean_distances(x.fillna(0))


def dtw_distance_matrix(x):
    """Pairwise DTW distance matrix of the rows of ``x`` using a window of 4."""
    return dtw.distance_matrix_fast(x.to_numpy(), window=4)

class CustomKMedoids:
    """Small wrapper that runs FasterPAM k-medoids on a matrix computed by ``metric``.

    Parameters
    ----------
    nb_clusters
        Number of clusters handed to ``kmedoids.KMedoids``.
    metric
        Callable applied to the data in ``fit``; its result is what gets clustered.
    random_state
        Forwarded unchanged to ``kmedoids.KMedoids`` (default ``None``).

    Attributes
    ----------
    labels_
        ``None`` until ``fit`` has run, afterwards the cluster labels cast to int.
    """

    def __init__(self, nb_clusters, metric, random_state=None):
        self.nb_clusters = nb_clusters
        self.metric = metric
        self.random_state = random_state
        self.labels_ = None

    def fit(self, data):
        """Cluster ``data``, store the labels in ``labels_`` and return ``self``."""
        distance_matrix = self.metric(data)
        clusterer = kmedoids.KMedoids(
            self.nb_clusters,
            method='fasterpam',
            random_state=self.random_state,
        )
        fitted = clusterer.fit(distance_matrix)
        self.labels_ = fitted.labels_.astype('int')
        return self
    
    

## Check the number of clusters selected automatically

In [None]:
# NOTE(review): neither the random forests nor the k-medoids clusterers get a
# random_state here, so results may differ between runs.

# Yearly level: elbow method with the NaN-aware distance matrix over range(20, 80, 2).
yearly_sampler = ConsumptionDataSampler(
    classifier=RandomForestClassifier(),
    clusterer=ElbowMethod(
        kmedoids.KMedoids(1, method='fasterpam'),
        euc_distance_matrix_missing,
        range(20, 80, 2),
    ),
    info_preprocessing=None,
)

# Daily level: elbow method with the zero-filled Euclidean matrix over range(10, 81, 5),
# repeated 10 times with a progress bar.
daily_sampler = ConsumptionDataSampler(
    classifier=RandomForestClassifier(),
    clusterer=ElbowMethod(
        kmedoids.KMedoids(1, method='fasterpam'),
        euc_distance_matrix,
        range(10, 81, 5),
        show_progress=True,
        nb_repeats=10,
    ),
    info_preprocessing=None,
)

sampling_model = DailySamplerFromClusterSampler(
    yearly_sampler=yearly_sampler,
    daily_sampler=daily_sampler,
    show_progress=True,
)
sampling_model.fit(daily_data_df, data_df, daily_info_df)

In [None]:
# Number of clusters stored on the yearly sampler's ElbowMethod clusterer after fitting.
sampling_model.yearly_sampler.clusterer.nb_clusters

In [None]:
# Draw the knee plot of the yearly sampler's ElbowMethod clusterer.
sampling_model.yearly_sampler.clusterer.plot_knee()

In [None]:
# Count how many entries of the yearly clustering fall in each cluster.
instances_per_cluster = (
    sampling_model.yearly_sampler.clustering
    .value_counts()
    .to_frame('#years')
    .rename_axis('cluster_idx', axis=0)
)
instances_per_cluster

In [None]:
# Collect, per yearly cluster, the number of clusters stored on its daily clusterer.
daily_clusters_sizes = [
    (cluster_idx, model.clusterer.nb_clusters)
    for cluster_idx, model in sampling_model.daily_sampler_per_cluster.items()
]
nb_clusters = (
    pd.DataFrame(daily_clusters_sizes, columns=['cluster_idx', 'nb_clusters'])
    .set_index('cluster_idx')
)

# Show cluster sizes next to the number of daily clusters found inside each one.
instances_per_cluster.join(nb_clusters)

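# Look at the elbow visualisation of the biggest cluster

Cluster indices may change between runs, so check `instances_per_cluster` for the index of the biggest cluster instead of relying on a fixed number (this heading previously named cluster 23).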

In [None]:
# Look up the yearly cluster with the most members instead of hard-coding its
# index: no random_state is set when fitting, so the index of the biggest
# cluster may differ from run to run.
biggest_cluster_idx = sampling_model.yearly_sampler.clustering.value_counts().idxmax()
sampling_model.daily_sampler_per_cluster[biggest_cluster_idx].clusterer.plot_knee()

In [None]:
    
# NOTE(review): `daily_df` is not defined in any visible cell of this notebook;
# define it explicitly before this cell so it survives Restart & Run All.
small_clusters = ElbowMethod(
    kmedoids.KMedoids(1, method='fasterpam'),
    euc_distance_matrix,
    range(1, 40, 5),
).fit(daily_df)
small_clusters.plot_knee()

In [None]:
# Same elbow experiment with a wider search: range(1, 100, 5).
# NOTE(review): `daily_df` is not defined in any visible cell of this notebook.
big_clusters = ElbowMethod(
    kmedoids.KMedoids(1, method='fasterpam'),
    euc_distance_matrix,
    range(1, 100, 5),
).fit(daily_df)
big_clusters.plot_knee()

In [None]:
# Same elbow experiment with range(1, 200, 5); overwrites the previous `big_clusters`.
# NOTE(review): `daily_df` is not defined in any visible cell of this notebook.
big_clusters = ElbowMethod(
    kmedoids.KMedoids(1, method='fasterpam'),
    euc_distance_matrix,
    range(1, 200, 5),
).fit(daily_df)
big_clusters.plot_knee()

In [None]:
# Same elbow experiment with range(1, 3000, 25); overwrites the previous `big_clusters`.
# NOTE(review): `daily_df` is not defined in any visible cell of this notebook.
big_clusters = ElbowMethod(
    kmedoids.KMedoids(1, method='fasterpam'),
    euc_distance_matrix,
    range(1, 3000, 25),
).fit(daily_df)
big_clusters.plot_knee()