In [None]:
import numpy as np 

In [None]:
from energyclustering.sampling.preprocessing import DataPreprocessor
from dask.distributed import Client
import pandas as pd
import altair as alt

In [None]:
# Enable IPython's autoreload extension (mode 2) so edited modules are re-imported before each cell runs.
%load_ext autoreload
%autoreload 2

# The data

In [None]:
# Configure the preprocessing pipeline first, then unpack the four frames it produces.
preprocessor = (
    DataPreprocessor()
    .preprocess_info_df('baseline')
    .preprocess_weather_df('baseline')
    .drop_days_with_nan(True)
    .subsample_days(week_reduction_factor=5)
    # for testing only!
    .subsample_years(500)
)
daily_data_df, data_df, daily_info_df, weather_df = preprocessor.get_data()
daily_data_df.shape

In [None]:
# Display the 'household_info' part of the daily info frame.
daily_info_df.loc[:, 'household_info']

# Folds

In [None]:
# Shuffle the index of data_df with a fixed seed and cut it into three folds.
rng = np.random.default_rng(1)  # fixed seed keeps the folds identical between runs
shuffled_index = data_df.index.to_numpy(copy=True)  # copy, so the frame's own index is not shuffled
rng.shuffle(shuffled_index)
folds = np.array_split(shuffled_index, 3)

# Investigate metadata sampling a bit (not a lot!)
The conclusion is twofold:
- (1) `connection_power` seems to contain a lot less information than `yearly_consumption`: clustering on `yearly_consumption` alone results in a better sampling (in terms of ES).
- (2) A more fine-grained clustering seems to be more beneficial for the sampling.

### Look at the connection_power, yearly_consumption subspace

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans

In [None]:
# Settings of this exploratory clustering. The seed makes the k-means labels
# (and therefore the figure) reproducible between runs.
NB_VIS_CLUSTERS = 20
VIS_SEED = 0

# Household info restricted to the two features under investigation.
info = (
    daily_info_df.loc[:, 'household_info']
    .drop_duplicates()
    .droplevel('date')
    [['connection_power', 'yearly_consumption']]
)

# Min-max scaled copy of the same two features (same index and column names).
normalized_info = pd.DataFrame(
    MinMaxScaler().fit_transform(info),
    index=info.index,
    columns=info.columns,
)

# Cluster the normalized and the raw features with identical, seeded settings.
# Both fits happen before the labels are attached, so the label column can never leak into a fit.
normalized_labels = KMeans(NB_VIS_CLUSTERS, random_state=VIS_SEED).fit(normalized_info).labels_
raw_labels = KMeans(NB_VIS_CLUSTERS, random_state=VIS_SEED).fit(info).labels_
normalized_info = normalized_info.assign(labels=normalized_labels)
info = info.assign(labels=raw_labels)

# Left: scatter of the normalized features coloured by the normalized clustering.
chart = alt.Chart(normalized_info).mark_circle().encode(
    x='connection_power',
    y='yearly_consumption',
    color='labels:N',
).interactive()

# Right: one horizontal rule per household at its yearly consumption, coloured by the
# clustering of the raw features. The two charts use independent colour scales.
raw_chart = alt.Chart(info).mark_rule(strokeWidth=2).encode(
    y='yearly_consumption',
    color='labels:N',
).interactive()

(chart | raw_chart).resolve_scale(color='independent')


## Test different preprocessing of the info data

In [None]:
from energyclustering.sampling.samplers import MetadataSampler, RandomSamplerBaseline
from energyclustering.sampling.day_of_year_samplers import SimilarDayFromYearSampler
from sklearn.cluster import KMeans
from energyclustering.sampling.evaluation import SamplerEvaluator
from pathlib import Path
NB_CLUSTERS = 10  # number of clusters passed to KMeans in the samplers below
NB_DAYS = 10  # second argument of SimilarDayFromYearSampler; presumably the number of similar days considered — TODO confirm

In [None]:
class TwoColumnsNormalized:
    """Info preprocessor that keeps two columns and min-max scales them.

    Only ``connection_power`` and ``yearly_consumption`` are kept. The class
    offers ``fit`` / ``transform`` / ``fit_transform`` and is passed as
    ``info_preprocessing`` to the samplers in this notebook.
    """

    # Columns kept by this preprocessor (single source of truth for fit and transform).
    _COLUMNS = ['connection_power', 'yearly_consumption']

    def __init__(self):
        self.scaler = MinMaxScaler()

    def fit(self, info_df):
        """Fit the scaler on the two selected columns of ``info_df``.

        Returns:
            self, so that calls can be chained.
        """
        self.scaler.fit(info_df.loc[:, self._COLUMNS])
        return self

    def transform(self, info_df):
        """Return the two selected columns, scaled by the fitted scaler.

        The result is a DataFrame with the index of ``info_df`` and the two
        selected column names.
        """
        selected = info_df[self._COLUMNS]
        return pd.DataFrame(
            self.scaler.transform(selected),
            index=selected.index,
            columns=selected.columns,
        )

    def fit_transform(self, info_df):
        """Fit on ``info_df`` and return its transformed version."""
        return self.fit(info_df).transform(info_df)
    
class TwoColumns:
    """Info preprocessor that keeps two columns without scaling them.

    Only ``connection_power`` and ``yearly_consumption`` are kept. The class
    offers ``fit`` / ``transform`` / ``fit_transform`` and is passed as
    ``info_preprocessing`` to the samplers in this notebook.
    """

    def __init__(self):
        # Never fitted or used by this class; kept only so the attribute remains available.
        self.scaler = MinMaxScaler()

    def fit(self, info_df):
        """Nothing to learn; return self so that calls can be chained."""
        return self

    def transform(self, info_df):
        """Return the two selected columns of ``info_df``, unscaled."""
        return info_df[['connection_power', 'yearly_consumption']]

    def fit_transform(self, info_df):
        """Fit (a no-op) and return the transformed ``info_df``."""
        return self.fit(info_df).transform(info_df)
    
class OneColumn:
    """Info preprocessor that keeps only the ``yearly_consumption`` column.

    The class offers ``fit`` / ``transform`` / ``fit_transform`` and is passed
    as ``info_preprocessing`` to the samplers in this notebook.
    """

    def __init__(self):
        pass

    def fit(self, info_df):
        """Nothing to learn; return self so that calls can be chained."""
        return self

    def transform(self, info_df):
        """Return a one-column DataFrame holding ``yearly_consumption``."""
        return info_df[['yearly_consumption']]

    def fit_transform(self, info_df):
        """Fit (a no-op) and return the transformed ``info_df``."""
        return self.fit(info_df).transform(info_df)
    
class AllColumnsNormalized:
    """Info preprocessor that min-max scales every column of the info frame.

    The class offers ``fit`` / ``transform`` / ``fit_transform`` and is passed
    as ``info_preprocessing`` to the samplers in this notebook.
    """

    def __init__(self):
        self.scaler = MinMaxScaler()

    def fit(self, info_df):
        """Fit the scaler on all columns of ``info_df``.

        Returns:
            self, so that calls can be chained.
        """
        self.scaler.fit(info_df)
        return self

    def transform(self, info_df):
        """Return ``info_df`` scaled by the fitted scaler, keeping index and column names."""
        return pd.DataFrame(
            self.scaler.transform(info_df),
            index=info_df.index,
            columns=info_df.columns,
        )

    def fit_transform(self, info_df):
        """Fit on ``info_df`` and return its transformed version."""
        return self.fit(info_df).transform(info_df)
    
    

In [None]:
def _metadata_model(info_preprocessing):
    """Build a day sampler around a k-means MetadataSampler using the given info preprocessing."""
    yearly_sampler = MetadataSampler(
        clusterer=KMeans(NB_CLUSTERS),
        info_preprocessing=info_preprocessing,
    )
    return SimilarDayFromYearSampler(yearly_sampler, NB_DAYS, weather_df)


# The insertion order is kept on purpose: it is the column order of the score table built later.
models = {
    'baseline': _metadata_model(None),
    'random': SimilarDayFromYearSampler(
        RandomSamplerBaseline(n_samples=30),
        NB_DAYS,
        weather_df,
    ),
    'all_columns': _metadata_model(AllColumnsNormalized()),
    'two_columns': _metadata_model(TwoColumnsNormalized()),
    'two_columns_unnormalized': _metadata_model(TwoColumns()),
    'one_column': _metadata_model(OneColumn()),
}

In [None]:
%%time 
# Results of this experiment are written to ./results/temp/<model key>.pkl
result_path = Path('results', 'temp')
result_path.mkdir(parents=True, exist_ok=True)

# NOTE(review): the scratch directory and worker count are specific to the original machine.
with Client(local_directory='/cw/dtailocal/', n_workers=40, threads_per_worker=1) as client:
    evaluator = SamplerEvaluator(folds, daily_data_df, daily_info_df, data_df, client, 200, crossval=True)
    energy_scores = [
        evaluator.evaluate_and_save(model, result_path / f"{key}.pkl")
        for key, model in models.items()
    ]

# One column of energy scores per model, labelled with the model key.
energy_scores = pd.concat(energy_scores, axis=1, keys=models.keys())
energy_scores

In [None]:
# Average the energy scores of each model into a one-column summary table.
mean_scores = energy_scores.mean(axis=0)
plot_df = mean_scores.to_frame('mean energy score')
plot_df

In [None]:
# Keep the row of the 'random' model aside; the list selector keeps it a one-row DataFrame.
# It is reused further down to draw the random baseline in the charts.
random_df = plot_df.loc[['random']]
random_df

## Test different numbers of clusters

In [None]:
# One sampler per (feature set, cluster count); keys have the form '<feature set>_<nb_clusters>'.
# The classes are stored (not instances) so that every model gets its own fresh preprocessor.
preprocessing_by_name = {
    'one_column': OneColumn,
    'two_columns': TwoColumnsNormalized,
    'tall_columns': AllColumnsNormalized,
}

models = {}
for nb_clusters in [1, 2, 3, 5, 10, 15, 20, 30, 50, 75, 100, 150, 200]:
    for name, make_preprocessing in preprocessing_by_name.items():
        yearly_sampler = MetadataSampler(
            clusterer=KMeans(nb_clusters),
            info_preprocessing=make_preprocessing(),
        )
        models[f'{name}_{nb_clusters}'] = SimilarDayFromYearSampler(yearly_sampler, NB_DAYS, weather_df)

In [None]:
%%time 
# Results of this experiment are written to ./results/temp_full/<model key>.pkl
result_path = Path('results', 'temp_full')
result_path.mkdir(parents=True, exist_ok=True)

with Client(local_directory='/cw/dtailocal/', n_workers=40, threads_per_worker=1) as client:
    evaluator = SamplerEvaluator(folds, daily_data_df, daily_info_df, data_df, client, 200, crossval=True)
    energy_scores = [
        evaluator.evaluate_and_save(model, result_path / f"{key}.pkl")
        for key, model in models.items()
    ]
energy_scores = pd.concat(energy_scores, axis=1, keys=models.keys())

# Mean and standard deviation of the energy score, one row per model.
mean_column = energy_scores.mean(axis=0).to_frame('mean energy score')
std_column = energy_scores.std(axis=0).to_frame('std')
plot_df = pd.concat([mean_column, std_column], axis=1)

# Split each key '<name>_<nb_clusters>' at its last underscore and add the band mean ± std.
metadata_plot_df = (
    plot_df.assign(
        nb_clusters=plot_df.index.map(lambda key: int(key.rpartition('_')[2])),
        name=plot_df.index.map(lambda key: key.rpartition('_')[0]),
        min_area=plot_df['mean energy score'] - plot_df['std'],
        max_area=plot_df['mean energy score'] + plot_df['std'],
    )
    .set_index(['name', 'nb_clusters'], drop=True)
    .sort_index()
)

metadata_plot_df

In [None]:
def big_chart(chart, fontsize = 20):
    """Return ``chart`` restyled for presentation.

    Applies, in this order: axis settings (no grid, label and title font size,
    offset 5), the title font size, the legend font sizes and a view without
    border stroke. ``fontsize`` is used for every font size.
    """
    with_axis = chart.configure_axis(
        grid=False,
        labelFontSize=fontsize,
        titleFontSize=fontsize,
        offset=5,
    )
    with_title = with_axis.configure_title(fontSize=fontsize)
    with_legend = with_title.configure_legend(
        titleFontSize=fontsize,
        labelFontSize=fontsize,
    )
    return with_legend.configure_view(strokeWidth=0)

In [None]:
# Replace the internal model names by the names shown in the legend.
legend_names = {'one_column':'consumption', 'two_columns':'consumption+power', 'tall_columns': 'all_features'}
metadata_plot_df = metadata_plot_df.rename(legend_names)

# Mean energy score against the number of clusters, one line per feature set.
x_encoding = alt.X('nb_clusters:Q', title='#clusters')
y_encoding = alt.Y('mean energy score:Q', scale=alt.Scale(zero=False), title='ES')
color_encoding = alt.Color('name', title='Used features')
chart = (
    alt.Chart(metadata_plot_df.reset_index())
    .mark_line()
    .encode(x=x_encoding, y=y_encoding, color=color_encoding)
)

# Horizontal gray rule at the mean energy score of the random baseline.
random_chart = (
    alt.Chart(random_df.reset_index())
    .mark_rule(color='gray', strokeWidth=3)
    .encode(y='mean energy score:Q')
)

# Lines, their data points and the baseline in one layered chart.
c = alt.layer(chart, chart.mark_circle(), random_chart).properties().configure_axis(grid=False)
big_chart(c)

In [None]:
# Same line chart, but with the 'std' column on the y axis instead of the mean energy score.
chart.encode(y = alt.Y('std:Q', scale = alt.Scale(zero = False)))

# Investigate consumption clustering 

In [None]:
from energyclustering.sampling.samplers import ConsumptionDataSampler
from sklearn.ensemble import RandomForestClassifier 

## Check different clusterings

In [None]:
from sklearn_extra.cluster import KMedoids

In [None]:
from energyclustering.clustering.similarity.wasserstein import wasserstein_distance_between_years
from energyclustering.clustering.clusterers import MyKMedoids, PrecomputedClustering, PrecomputedDistanceMetricClustering
from sklearn.metrics.pairwise import pairwise_distances
from numba import jit, float64
from pyclustering.utils.metric import type_metric, distance_metric;
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.cluster.kmeans import kmeans


In [None]:
from energyclustering.webapp.resultparser import COBRASResult
# Location of the Wasserstein distance matrix and of the COBRAS result stored next to it.
WASSER = 'full_distance_matrix_wasserstein'
directory = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/distance_matrices/')

cobras_result = COBRASResult('result_20211124_koen', directory / WASSER)
clustering_df, _ = cobras_result.get_clustering_df()

# Cast the index labels to str and keep only the 'label' column.
clustering_series = clustering_df.set_axis(clustering_df.index.map(str), axis=0).label
clustering_series.head()

In [None]:
# Number of distinct labels in the loaded clustering.
len(clustering_series.unique())

In [None]:
# Compiled with numba for two 1-D float64 arrays, returning a float64 scalar.
@jit(float64(float64[:], float64[:]), nogil = True, nopython = True)
def dist(a1, a2): 
    """Return the mean of the squared element-wise differences, ignoring NaN entries."""
    # nanmean skips positions where the squared difference is NaN, i.e. where either input is NaN.
    return np.nanmean((a1-a2)**2)
# Wrap the function as a user-defined pyclustering distance metric.
custom_metric = distance_metric(type_metric.USER_DEFINED, func = dist)

class CustomKMeans:
    """K-means clusterer built on pyclustering that uses ``custom_metric`` as distance.

    Mimics the part of the scikit-learn clusterer interface used in this
    notebook: ``fit(data)`` returns self and stores the result in ``labels_``.

    Args:
        nb_clusters: number of clusters (and of initial centers) to use.
        random_state: optional seed for the k-means++ initialisation. When
            None (the default) the initializer is called without a seed.
    """

    def __init__(self, nb_clusters, random_state = None):
        self.nb_clusters = nb_clusters
        # Stored so that fit can seed the k-means++ initialisation.
        self.random_state = random_state

    def fit(self, data):
        """Cluster ``data`` and store one integer label per instance in ``labels_``.

        Returns:
            self.
        """
        # Only forward the seed when one was given, so the default call is unchanged.
        init_kwargs = {} if self.random_state is None else {'random_state': self.random_state}
        # initial centers from the K-Means++ method
        initial_centers = kmeans_plusplus_initializer(data, self.nb_clusters, **init_kwargs).initialize()
        # K-Means with the prepared centers and the custom metric
        kmeans_instance = kmeans(data, initial_centers, metric = custom_metric)
        kmeans_instance.process()
        # pyclustering returns one list of instance indices per cluster; flatten it into a label vector
        labels = np.zeros(len(data), dtype = int)
        for cluster_idx, instance_indices in enumerate(kmeans_instance.get_clusters()):
            labels[instance_indices] = cluster_idx
        self.labels_ = labels
        return self
    
    

In [None]:
NB_CLUSTERS = 40


def _consumption_model(clusterer, info_preprocessing=None):
    """Build a day sampler around a ConsumptionDataSampler with a fresh random forest classifier."""
    yearly_sampler = ConsumptionDataSampler(
        classifier=RandomForestClassifier(),
        clusterer=clusterer,
        info_preprocessing=info_preprocessing,
    )
    return SimilarDayFromYearSampler(yearly_sampler, NB_DAYS, weather_df)


# The insertion order is kept on purpose: it is the column order of the score table built later.
models = {
    'expert': _consumption_model(PrecomputedClustering(clustering_series)),
    'euclidean': _consumption_model(KMeans(NB_CLUSTERS)),
    'wasserstein_precomputed': _consumption_model(
        PrecomputedDistanceMetricClustering(NB_CLUSTERS, directory/WASSER/'full_distance_matrix.pkl')
    ),
    'euclidean_two_columns': _consumption_model(KMeans(NB_CLUSTERS), TwoColumns()),
    'euclidean_one_column': _consumption_model(KMeans(NB_CLUSTERS), OneColumn()),
}


In [None]:
%%time 
# Results of this experiment are written to ./results/temp_full/<model key>.pkl
result_path = Path('results', 'temp_full')
result_path.mkdir(parents=True, exist_ok=True)

with Client(local_directory='/cw/dtailocal/', n_workers=40, threads_per_worker=1) as client:
    evaluator = SamplerEvaluator(folds, daily_data_df, daily_info_df, data_df, client, 200, crossval=True)
    energy_scores = [
        evaluator.evaluate_and_save(model, result_path / f"{key}.pkl")
        for key, model in models.items()
    ]
energy_scores = pd.concat(energy_scores, axis=1, keys=models.keys())

# Mean energy score per model; kept under its own name because the 'expert' row is reused later.
mean_scores = energy_scores.mean(axis=0)
plot1_df = mean_scores.to_frame('mean energy score')
plot1_df

## Check different numbers of clusters

In [None]:
from sklearn.calibration import CalibratedClassifierCV

In [None]:
# One consumption-based sampler per clustering approach and cluster count;
# keys have the form '<approach>_<nb_clusters>' and are parsed again after the evaluation.
models = dict()
for nb_clusters in [1, 2, 3, 5, 10, 15,20, 30, 50, 75, 100, 150, 200]:
    models[f'euclidean_{nb_clusters}'] =  SimilarDayFromYearSampler(
            ConsumptionDataSampler(
                classifier = RandomForestClassifier(),
                clusterer = KMeans(nb_clusters),
                info_preprocessing = None
            ), NB_DAYS, weather_df)
    
    # Disabled experiment: same model with a calibrated classifier.
#     models[f'euclidean_calibrated_{nb_clusters}'] =  SimilarDayFromYearSampler(
#             ConsumptionDataSampler(
#                 classifier = CalibratedClassifierCV(RandomForestClassifier()),
#                 clusterer = KMeans(nb_clusters),
#                 info_preprocessing = None
#             ), NB_DAYS, weather_df)
    
    
    models[f'wasserstein_pre_{nb_clusters}'] = SimilarDayFromYearSampler(
            ConsumptionDataSampler(
                classifier = RandomForestClassifier(),
                clusterer =  PrecomputedDistanceMetricClustering(nb_clusters, directory/WASSER/'full_distance_matrix.pkl'),
                info_preprocessing = None
            ), NB_DAYS, weather_df)
    models[f'euclidean_missing_{nb_clusters}'] = SimilarDayFromYearSampler(
            ConsumptionDataSampler(
                classifier = RandomForestClassifier(),
                # Use the loop variable: the global NB_CLUSTERS would give every
                # 'euclidean_missing_*' model the same number of clusters, whatever its key says.
                clusterer = CustomKMeans(nb_clusters),
                info_preprocessing = None,
            ), NB_DAYS, weather_df)

In [None]:
%%time 
# Results of this experiment are written to ./results/temp/<model key>.pkl
result_path = Path('results', 'temp')
result_path.mkdir(parents=True, exist_ok=True)

with Client(local_directory='/cw/dtailocal/', n_workers=40, threads_per_worker=1) as client:
    evaluator = SamplerEvaluator(folds, daily_data_df, daily_info_df, data_df, client, 200, crossval=True)
    energy_scores = [
        evaluator.evaluate_and_save(model, result_path / f"{key}.pkl")
        for key, model in models.items()
    ]
energy_scores = pd.concat(energy_scores, axis=1, keys=models.keys())

# Mean energy score per model.
plot_df = energy_scores.mean(axis=0).to_frame('mean energy score')

# Split each key '<name>_<nb_clusters>' at its last underscore into a two-level index.
consumption_plot_df = (
    plot_df.assign(
        nb_clusters=plot_df.index.map(lambda key: int(key.rpartition('_')[2])),
        name=plot_df.index.map(lambda key: key.rpartition('_')[0]),
    )
    .set_index(['name', 'nb_clusters'], drop=True)
    .sort_index()
)

# Add the score of the 'expert' model from the previous experiment as a single point at 13 clusters.
# NOTE(review): 13 is presumably the number of clusters in the expert clustering — confirm.
expert_score = plot1_df.loc['expert', 'mean energy score']
consumption_plot_df.loc[('expert_COBRAS', 13), 'mean energy score'] = expert_score
consumption_plot_df

In [None]:
# All rows are plotted; the drop of the calibrated variant is disabled:
# .drop(index = ['euclidean_calibrated'])
plot_df = consumption_plot_df

# Mean energy score against the number of clusters, one line per clustering approach.
line_chart = (
    alt.Chart(plot_df.reset_index())
    .mark_line(strokeWidth=3)
    .encode(
        x=alt.X('nb_clusters:Q', title='#clusters'),
        y=alt.Y('mean energy score:Q', scale=alt.Scale(zero=False), title='ES (lower is better)'),
        color='name',
    )
)

# Lines plus their data points, drawn on top of the random-baseline rule.
chart = alt.layer(line_chart, line_chart.mark_circle(size=50))
c = random_chart + chart
big_chart(c)

## Compared with metadata clustering
Interestingly, consumption clustering seems less sensitive to a clustering that is too fine-grained. 
This is probably because the classifier can no longer distinguish the different clusters and will simply assign an instance to each of them. 


In [None]:
# Stack the consumption and metadata results, then leave out the series not shown in the comparison.
# (Disabled restriction of the cluster range: .loc[(slice(None), slice(0,101)),:])
hidden_series = ['all_features', 'consumption', 'euclidean', 'expert_COBRAS']
combined_df = pd.concat([plot_df, metadata_plot_df]).sort_index()
all_plot_df = combined_df.drop(index=hidden_series)
all_plot_df

In [None]:
# Reuse the layered consumption chart with the combined table as its data and a colour encoding without legend.
# NOTE(review): the new colour encoding names no field — confirm the series are still told apart as intended.
chart = chart.properties(data= all_plot_df.reset_index()).encode(color = alt.Color(legend = None))
# Overlay the random-baseline rule and apply the large-font styling.
big_chart(random_chart + chart)

## Check the decision tree and random forest behind this

### using a random forest
We can see that connection power is being used a lot

In [None]:
# Two random-forest based samplers with 50 clusters each: k-means and precomputed Wasserstein distances.
wasserstein_matrix = directory / WASSER / 'full_distance_matrix.pkl'
models = {
    'euclidean': SimilarDayFromYearSampler(
        ConsumptionDataSampler(
            classifier=RandomForestClassifier(),
            clusterer=KMeans(50),
            info_preprocessing=None,
        ),
        NB_DAYS,
        weather_df,
    ),
    'wasserstein': SimilarDayFromYearSampler(
        ConsumptionDataSampler(
            classifier=RandomForestClassifier(),
            clusterer=PrecomputedDistanceMetricClustering(50, wasserstein_matrix),
            info_preprocessing=None,
        ),
        NB_DAYS,
        weather_df,
    ),
}

# Evaluated without cross-validation and without saving the results.
with Client(local_directory='/cw/dtailocal/', n_workers=40, threads_per_worker=1) as client:
    evaluator = SamplerEvaluator(folds, daily_data_df, daily_info_df, data_df, client, 200, crossval=False)
    energy_scores = [evaluator.evaluate(model) for model in models.values()]

energy_scores = pd.concat(energy_scores, axis=1, keys=models.keys())

# Mean energy score per model.
plot_df = energy_scores.mean(axis=0).to_frame('mean energy score')
plot_df

In [None]:
# Household info with all of its columns; the column names label the feature importances below.
info = daily_info_df.loc[:, 'household_info'].drop_duplicates().droplevel('date')

In [None]:
# Build the table in one step (one row per model, one column per info feature) instead of
# growing an empty frame row by row, which leaves the columns without a numeric dtype up front.
# NOTE(review): assumes the classifiers were fitted on all columns of `info`, in that order — confirm.
feature_importances = pd.DataFrame.from_dict(
    {key: model.yearly_sampler.classifier.feature_importances_ for key, model in models.items()},
    orient='index',
    columns=info.columns,
)
feature_importances

### Using a decision tree
This performs worse, but we can inspect the tree to see what happens. 
You can indeed clearly see that `connection_power` is sometimes used, but only in certain cases! 
The most used feature is yearly consumption! 

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
def _shallow_tree():
    """Fresh decision tree limited to depth 10 with at least 10 samples per leaf."""
    return DecisionTreeClassifier(max_depth=10, min_samples_leaf=10)


# Same two clusterings as for the random forest, but with a single inspectable tree as classifier.
wasserstein_matrix = directory / WASSER / 'full_distance_matrix.pkl'
models = {
    'euclidean': SimilarDayFromYearSampler(
        ConsumptionDataSampler(
            classifier=_shallow_tree(),
            clusterer=KMeans(50),
            info_preprocessing=None,
        ),
        NB_DAYS,
        weather_df,
    ),
    'wasserstein': SimilarDayFromYearSampler(
        ConsumptionDataSampler(
            classifier=_shallow_tree(),
            clusterer=PrecomputedDistanceMetricClustering(50, wasserstein_matrix),
            info_preprocessing=None,
        ),
        NB_DAYS,
        weather_df,
    ),
}

# Evaluated without cross-validation and without saving the results.
with Client(local_directory='/cw/dtailocal/', n_workers=40, threads_per_worker=1) as client:
    evaluator = SamplerEvaluator(folds, daily_data_df, daily_info_df, data_df, client, 200, crossval=False)
    energy_scores = [evaluator.evaluate(model) for model in models.values()]

energy_scores = pd.concat(energy_scores, axis=1, keys=models.keys())

# Mean energy score per model.
plot_df = energy_scores.mean(axis=0).to_frame('mean energy score')
plot_df

In [None]:
# Build the table in one step (one row per model, one column per info feature) instead of
# growing an empty frame row by row, which leaves the columns without a numeric dtype up front.
# NOTE(review): assumes the classifiers were fitted on all columns of `info`, in that order — confirm.
feature_importances = pd.DataFrame.from_dict(
    {key: model.yearly_sampler.classifier.feature_importances_ for key, model in models.items()},
    orient='index',
    columns=info.columns,
)
feature_importances

In [None]:
# Select the inline matplotlib backend so figures are rendered in the cell output.
%matplotlib inline

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
# Very wide, high-resolution canvas so that the nodes of the tree stay readable.
plt.figure(figsize = (100, 15), dpi = 300)
tree_classifier = models['wasserstein'].yearly_sampler.classifier
# Pass the feature names as a plain list: some scikit-learn releases validate
# `feature_names` strictly and reject a pandas Index, while a list is always accepted.
annotations = plot_tree(tree_classifier, feature_names = list(feature_importances.columns))

## Just as a sanity check, show that doing deterministic assignments is really not good 

In [None]:
# Identical samplers except for the `deterministic` flag of the ConsumptionDataSampler.
wasserstein_matrix = directory / WASSER / 'full_distance_matrix.pkl'
models = {
    'deterministic': SimilarDayFromYearSampler(
        ConsumptionDataSampler(
            classifier=RandomForestClassifier(),
            clusterer=PrecomputedDistanceMetricClustering(50, wasserstein_matrix),
            info_preprocessing=None,
            deterministic=True,
        ),
        NB_DAYS,
        weather_df,
    ),
    'probabilistic': SimilarDayFromYearSampler(
        ConsumptionDataSampler(
            classifier=RandomForestClassifier(),
            clusterer=PrecomputedDistanceMetricClustering(50, wasserstein_matrix),
            info_preprocessing=None,
        ),
        NB_DAYS,
        weather_df,
    ),
}

# Results of this experiment are written to ./results/temp_full/<model key>.pkl
result_path = Path('results', 'temp_full')
result_path.mkdir(parents=True, exist_ok=True)
with Client(local_directory='/cw/dtailocal/', n_workers=40, threads_per_worker=1) as client:
    evaluator = SamplerEvaluator(folds, daily_data_df, daily_info_df, data_df, client, 200, crossval=True)
    energy_scores = [
        evaluator.evaluate_and_save(model, result_path / f'{key}.pkl')
        for key, model in models.items()
    ]

energy_scores = pd.concat(energy_scores, axis=1, keys=models.keys())

# Mean energy score per model.
plot_df = energy_scores.mean(axis=0).to_frame('mean energy score')
plot_df