# Influence of #clusters

### Imports

In [None]:
# standard imports
import numpy as np 
import pandas as pd
import altair as alt
from pathlib import Path
import itertools

# loading the data 
from energyclustering.sampling.preprocessing import DataPreprocessor

# preprocessing the yearly info 
from energyclustering.sampling.preprocessing.info import YearlyInfoPreprocessor

# running the evaluation in parallel
from dask.distributed import Client

# sampling models 
from energyclustering.sampling.day_of_year_samplers import RandomBaselineSampler # random day selection baseline
from energyclustering.sampling.day_of_year_samplers import DailySamplerFromClusterSampler # to combine daily and yearly sampler
from energyclustering.sampling.day_of_year_samplers import GenerateSampleDecorator
from energyclustering.sampling.day_of_year_samplers import EnergyFilterFromRandomYearSampler # to use energyfilter from a random year
from energyclustering.sampling.samplers import RandomSamplerBaseline
from energyclustering.sampling.samplers import EnergyvilleDaySelectionBaseline
from energyclustering.sampling.samplers import MetadataSampler
from energyclustering.sampling.samplers import ConsumptionDataSampler


# elbow method to use for clustering 
from energyclustering.clustering.elbow import ElbowMethod

# clustering algorithms 
from sklearn.cluster import KMeans
from kmedoids import KMedoids

# cluster metrics 
import energyclustering.clustering.metrics as dist_metrics
from energyclustering.clustering.kmedoids import CustomKMedoids

# classifiers 
from sklearn.ensemble import RandomForestClassifier 

# evaluation 
from energyclustering.sampling.evaluation import SamplerEvaluator

In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules are picked up live
%load_ext autoreload
%autoreload 2

# The data

In [None]:
# Configure the preprocessing pipeline first, then unpack the four frames it returns
preprocessing_pipeline = (
    DataPreprocessor()
    .preprocess_info_df('paper')
    .preprocess_weather_df('paper')
    .drop_days_with_nan(True)
    .subsample_days(week_reduction_factor = None)
    # for testing only!
    .subsample_years(1000)
)
daily_data_df, data_df, daily_info_df, weather_df = preprocessing_pipeline.get_data()

# quick sanity check on the size of the daily data
daily_data_df.shape

In [None]:
# Cross-validation folds: shuffle a copy of the index of data_df with a fixed seed
# and cut the shuffled array into N_FOLDS (nearly) equal parts
N_FOLDS = 5
generator = np.random.default_rng(seed = 1)
shuffled = data_df.index.to_numpy(copy = True)
generator.shuffle(shuffled)
folds = np.array_split(shuffled, N_FOLDS)

## Models to compare

In [None]:
# value passed as n_samples to every GenerateSampleDecorator
NB_SAMPLES = 250
# fixed cluster counts that are compared: 1, then 10..100 in steps of 10
CLUSTER_RANGE = [1, *range(10, 101, 10)]
# candidate cluster counts handed to the elbow method: 10..100 in steps of 5
CLUSTER_RANGE_ALGORITHM = [*range(10, 101, 5)]

In [None]:
def elbow_yearly_sampler():
    """Return a new yearly ConsumptionDataSampler whose clusterer is an ElbowMethod around KMedoids.

    The elbow method receives CLUSTER_RANGE_ALGORITHM as cluster_range and is repeated 10 times;
    a new RandomForestClassifier is created on every call.
    """
    return ConsumptionDataSampler(
        classifier = RandomForestClassifier(),
        clusterer = ElbowMethod(
            KMedoids(n_clusters = 1, method = 'fasterpam'),
            metric = dist_metrics.euc_distance_matrix_missing,
            cluster_range = CLUSTER_RANGE_ALGORITHM,
            nb_repeats = 10,
        ),
        fillna = False,
    )


def sample_generator(yearly_sampler, daily_sampler):
    """Combine a yearly and a daily sampler and wrap the pair in a GenerateSampleDecorator (n_samples = NB_SAMPLES)."""
    return GenerateSampleDecorator(
        sampler = DailySamplerFromClusterSampler(
            yearly_sampler = yearly_sampler,
            daily_sampler = daily_sampler,
        ),
        n_samples = NB_SAMPLES,
    )


models = {}
for n_clusters in CLUSTER_RANGE:
    # fixed number of yearly clusters, combined with the Energyville day selection baseline
    models[f'yearly_clusters={n_clusters}'] = sample_generator(
        yearly_sampler = ConsumptionDataSampler(
            classifier = RandomForestClassifier(),
            clusterer = CustomKMedoids(n_clusters = n_clusters, metric = dist_metrics.euc_distance_matrix_missing),
            fillna = False,
        ),
        daily_sampler = EnergyvilleDaySelectionBaseline(),
    )
    # elbow-selected yearly clusters, combined with a fixed number of daily clusters
    models[f'daily_clusters={n_clusters}'] = sample_generator(
        yearly_sampler = elbow_yearly_sampler(),
        daily_sampler = ConsumptionDataSampler(
            classifier = RandomForestClassifier(),
            clusterer = KMeans(n_clusters = n_clusters),
        ),
    )

# reference model: the elbow method picks the number of clusters on both levels
models['consumption_clustering'] = sample_generator(
    yearly_sampler = elbow_yearly_sampler(),
    daily_sampler = ConsumptionDataSampler(
        classifier = RandomForestClassifier(),
        clusterer = ElbowMethod(
            clusterer = KMeans(n_clusters = 1),
            cluster_range = CLUSTER_RANGE_ALGORITHM,
        ),
    ),
)

In [None]:
# Build the dask cluster object that is later handed to `Client(cluster)`.
# NOTE(review): the two number lists presumably select the machines to use — confirm in dask_util.
from dask_util import get_dask_cluster
cluster = get_dask_cluster(
    pinac_numbers = [31, 32, 33, 34, 35, 36, 37, 38, 39, 40],
    himec_numbers = [8,1, 2],
)
# Alternative kept for runs without the remote machines (presumably makes Client start a local cluster — TODO confirm):
# cluster = None

In [None]:
import logging
import datetime
import dask

# The log file is written to ./logs; logging does not create missing directories,
# so make sure the folder exists before basicConfig opens the file.
log_dir = Path('logs')
log_dir.mkdir(parents = True, exist_ok = True)
log_file = log_dir / f'number_of_clusters_{datetime.datetime.now().strftime("%d-%m-%Y")}.log'

# DEBUG-level logging into one file per day; filemode 'w' overwrites an existing file of the same day
logging.basicConfig(
    format = '%(asctime)s - %(name)s: %(message)s',
    level = logging.DEBUG,
    filename = str(log_file),
    filemode = 'w',
)

# retry settings for dask's communication layer
dask.config.set({'distributed.comm.retry.count': 5, 'distributed.comm.retry.delay.min': '60s', 'distributed.comm.retry.delay.max': '100s'})

In [None]:
%%time 
result_path = Path()/'results'/'nb_of_clusters'
result_path.mkdir(parents = True, exist_ok = True)

energy_scores = []
with Client(cluster) as client:
    evaluator = SamplerEvaluator(folds, daily_data_df, daily_info_df, data_df, client, 400, crossval = True)
    for key, model in models.items():
        energy_score = evaluator.evaluate_and_save(model, result_path/f"{key}.pkl")
        energy_scores.append(energy_score)
    energy_scores = pd.concat(energy_scores, axis = 1, keys = models.keys())

cluster.close()
# aggregate energy scores
plot_df = energy_scores.agg(['mean', 'std'], axis = 0)

In [None]:
# mean and standard deviation of the energy scores per model, shown with one row per model
plot_df = energy_scores.agg(
    ['mean', 'std'],
    axis = 0,
)
plot_df.transpose()

In [None]:
# Mean energy score of the 'consumption_clustering' reference model.
# Read it straight from `energy_scores` so the value can be recomputed at any time,
# also after `plot_df` has been reshaped for plotting.
optimal = energy_scores['consumption_clustering'].mean()
optimal

In [None]:
# Build the long-format plotting frame (one row per '<cluster_type>=<nb_clusters>' model).
# It is derived from `energy_scores` instead of from `plot_df` itself, so re-running this cell
# does not fail on an already reshaped frame.
mean_scores = (
    energy_scores
    .agg(['mean', 'std'], axis = 0)
    .drop(columns = ['consumption_clustering'])
    .loc['mean']
)
plot_df = mean_scores.to_frame('ES').assign(
    # model keys look like 'yearly_clusters=30': the part after '=' is the cluster count ...
    nb_clusters = lambda frame: [int(key.split('=')[1]) for key in frame.index],
    # ... and the part before '=' tells which clustering level was varied
    cluster_type = lambda frame: [key.split('=')[0] for key in frame.index],
)
plot_df.set_index(['cluster_type', 'nb_clusters']).sort_index()

### Optimal yearly clusters

In [None]:
# rank the fixed yearly cluster counts by their mean energy score (lowest first)
df = (
    plot_df
    .set_index(['cluster_type', 'nb_clusters'])
    .sort_index()
)
df.loc['yearly_clusters'].sort_values(by = 'ES')

### Optimal daily clusters

In [None]:
# rank the fixed daily cluster counts by their mean energy score (lowest first)
df = (
    plot_df
    .set_index(['cluster_type', 'nb_clusters'])
    .sort_index()
)
df.loc['daily_clusters'].sort_values(by = 'ES')

In [None]:
from chart_util import big_chart

In [None]:
# Line labels are drawn next to the right-most point of each line.
# Anchor them at the largest evaluated cluster count instead of a hard-coded value.
last_nb_clusters = plot_df.nb_clusters.max()
text_df = plot_df[plot_df.nb_clusters == last_nb_clusters].copy()

# Nudge the labels vertically; the offset is looked up per cluster type,
# so it does not depend on the number or the order of the rows in text_df.
label_offsets = {'yearly_clusters': -0.005, 'daily_clusters': +0.005}
text_df['ES'] = text_df['ES'] + text_df['cluster_type'].map(label_offsets).fillna(0)

text_chart = alt.Chart(text_df).mark_text(align = 'left', dx = 10, fontSize = 20).encode(
    x = 'nb_clusters',
    y = 'ES:Q',
    color = 'cluster_type',
    text = 'cluster_type',
)

In [None]:
# show the rows used to position the line labels
text_df

In [None]:
# one line per cluster type: mean energy score as a function of the number of clusters
line_encoding = dict(
    x = alt.X('nb_clusters:Q', title = '#Clusters'),
    y = alt.Y('ES:Q', title = 'average ES (lower is better)', scale = alt.Scale(zero = False)),
    color = 'cluster_type',
)
chart = alt.Chart(plot_df, width = 500).mark_line(size = 3).encode(**line_encoding)

# same chart with the text labels layered on top
big_chart(chart + text_chart)


In [None]:
# the line chart on its own, without the text labels
big_chart(chart)

In [None]:
# mean ES for the models with a fixed number of yearly clusters
yearly_scores = plot_df[plot_df.cluster_type == 'yearly_clusters']
year_plot = alt.Chart(yearly_scores).mark_line().encode(
    x = 'nb_clusters:Q',
    y = alt.Y('ES', title = 'mean ES (lower is better)', scale = alt.Scale(zero = False)),
)

# vertical rule (with a rotated label) at the cluster count marked as selected
selected_nb_clusters = 30
optimal_line = (
    alt.Chart(pd.DataFrame([selected_nb_clusters], columns = ['nb_clusters']))
    .mark_rule()
    .encode(x = 'nb_clusters:Q')
)
text = optimal_line.mark_text(angle = 270, baseline = 'bottom', fontSize = 20).encode(
    text = alt.TextValue('selected #clusters'),
)

year_chart = (year_plot + optimal_line + text).resolve_scale(x = 'shared', y = 'shared')
big_chart(year_chart)

In [None]:
# mean ES for the models with a fixed number of daily clusters
daily_scores = plot_df[plot_df.cluster_type == 'daily_clusters']
day_plot = alt.Chart(daily_scores).mark_line().encode(
    x = 'nb_clusters:Q',
    y = alt.Y('ES', title = 'mean ES (lower is better)', scale = alt.Scale(zero = False)),
)

# horizontal rule (with a label) at the value stored in `optimal`
optimal_line = (
    alt.Chart(pd.DataFrame([optimal], columns = ['ES']))
    .mark_rule()
    .encode(y = 'ES')
)
text = optimal_line.mark_text(baseline = 'bottom', fontSize = 20).encode(
    text = alt.TextValue('adaptive #clusters'),
)

day_chart = (day_plot + optimal_line + text).resolve_scale(x = 'shared', y = 'shared')
big_chart(day_chart)

In [None]:
# yearly and daily charts side by side, sharing both axes
year_panel = year_chart.properties(title = 'yearlong clustering')
day_panel = day_chart.properties(title = 'daylong clustering')
side_by_side = (year_panel | day_panel).resolve_scale(x = 'shared', y = 'shared')
big_chart(side_by_side, fontsize = 18)

In [None]:
# display the raw energy scores of all models
energy_scores

In [None]:
# Release the dask cluster; `cluster` may be None when no explicit cluster was configured
if cluster is not None:
    cluster.close()