In [None]:
import numpy as np 

In [None]:
from energyclustering.sampling.preprocessing import DataPreprocessor
from dask.distributed import Client
import pandas as pd
import altair as alt
# Turn off Altair's maximum-row check so charts can be built from large data frames.
alt.data_transformers.disable_max_rows()

In [None]:
import warnings
# Ignore every FutureWarning from here on (presumably to keep cell output readable).
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

In [None]:
%matplotlib inline
%config InlineBackend.figure_formats = ['svg']

In [None]:
%load_ext autoreload
%autoreload 2

# The data

In [None]:
# Run the DataPreprocessor builder chain and unpack the four objects returned
# by get_data() into the names used throughout the rest of the notebook.
daily_data_df, data_df, daily_info_df, weather_df = (
    DataPreprocessor()
    .preprocess_info_df('baseline')
    .preprocess_weather_df('baseline')
    .drop_days_with_nan(True)
    # Day subsampling is switched off for this run; the disabled call is kept
    # for reference:
#     .subsample_days(week_reduction_factor = 5)
    # NOTE(review): flagged by the author as "for testing only" -- presumably
    # subsample_years(100) restricts the run to a subset of the data; confirm
    # and remove before producing final results.
    .subsample_years(100)
    .get_data()
)
# Display the dimensions of the daily frame as a quick sanity check.
daily_data_df.shape

In [None]:
# Display the 'FeelsLikeC' column from the 'day_info' column group of daily_info_df.
daily_info_df.loc[:, ('day_info', 'FeelsLikeC')]

# Folds

In [None]:
# Split the row labels of data_df into 3 randomly shuffled folds.
generator = np.random.default_rng(1)  # fixed seed keeps the split reproducible
shuffled = data_df.index.to_numpy(copy=True)  # copy, so shuffling cannot touch data_df's own index
generator.shuffle(shuffled)  # shuffles in place
folds = np.array_split(shuffled, 3)  # array_split also accepts lengths not divisible by 3

In [None]:
from energyclustering.sampling.samplers import ConsumptionDataSampler, MetadataSampler, EnergyvilleDaySelectionBaseline, RandomSamplerBaseline
from energyclustering.sampling.day_of_year_samplers import DailySamplerFromClusterSampler, GenerateSampleDecorator
from sklearn.ensemble import RandomForestClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from pathlib import Path
from energyclustering.sampling.evaluation.evaluation import SamplerEvaluator

In [None]:
# Custom metric
from pyclustering.utils.metric import type_metric, distance_metric;
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.cluster.kmeans import kmeans
def dist(a1, a2):
    """Mean squared difference between two points, skipping NaN positions.

    Parameters
    ----------
    a1, a2 : array-like of numbers
        The two points to compare. Any array-like (numpy array, list, tuple)
        of matching shape is accepted; values are compared position by position.

    Returns
    -------
    float
        ``nanmean((a1 - a2) ** 2)``: positions where either point is NaN are
        left out of the mean. If every position is NaN the result is NaN
        (numpy also emits a RuntimeWarning in that case).
    """
    # Coerce up front so plain sequences work too, not only numpy arrays.
    first = np.asarray(a1, dtype = float)
    second = np.asarray(a2, dtype = float)
    squared_diff = (first - second) ** 2
    return np.nanmean(squared_diff)
    

In [None]:
# Wrap dist in a pyclustering user-defined metric object so it can be handed to kmeans.
custom_metric = distance_metric(type_metric.USER_DEFINED, func=dist)
custom_metric

In [None]:
class CustomKMeans:
    """K-means on top of pyclustering, using the notebook's ``custom_metric``.

    Exposes the small part of the scikit-learn clusterer interface used in this
    notebook: ``fit(data)`` followed by reading ``labels_``.

    Parameters
    ----------
    nb_clusters : int
        Number of initial centers requested from the k-means++ initializer.
    random_state : int or None, optional
        Seed forwarded to the k-means++ initializer so the initial centers are
        reproducible. ``None`` leaves the seeding to pyclustering.
    """

    def __init__(self, nb_clusters, random_state = None):
        self.nb_clusters = nb_clusters
        # Keep the seed: it is needed when the initial centers are drawn in fit().
        self.random_state = random_state

    def fit(self, data):
        """Cluster ``data`` and store one integer label per row in ``labels_``.

        Parameters
        ----------
        data : array-like with one row per instance
            Converted with ``np.asarray`` before clustering.

        Returns
        -------
        CustomKMeans
            ``self``, so calls can be chained as with scikit-learn estimators.
        """
        points = np.asarray(data)
        # Draw the initial centers with k-means++, seeded for reproducibility.
        initial_centers = kmeans_plusplus_initializer(
            points, self.nb_clusters, random_state = self.random_state
        ).initialize()
        # Run k-means from those centers using the user-defined distance.
        kmeans_instance = kmeans(points, initial_centers, metric = custom_metric)
        kmeans_instance.process()
        # get_clusters() yields one list of row positions per cluster; turn
        # that into a flat label vector aligned with the rows of `data`.
        labels = np.zeros(points.shape[0], dtype = int)
        for cluster_idx, instance_indices in enumerate(kmeans_instance.get_clusters()):
            labels[instance_indices] = cluster_idx
        self.labels_ = labels
        return self
    
    


In [None]:
# Try CustomKMeans on its own: 5 clusters, random_state 0, fitted on data_df
# with missing values replaced by 0 first.
clusterer = CustomKMeans(5, 0)
clusterer.fit(data_df.fillna(0))

In [None]:
# Show the cluster label assigned to each row by the fit above.
clusterer.labels_

# Check consumption data clustering in detail

In [None]:
# Build the sampler under inspection from a yearly and a daily ConsumptionDataSampler.
# Each one pairs a clusterer with a shallow decision tree classifier.
sampler = DailySamplerFromClusterSampler(
                yearly_sampler = ConsumptionDataSampler(
                    # Alternative classifier, currently disabled:
#                     classifier = RandomForestClassifier(), 
                    classifier = DecisionTreeClassifier(max_depth = 4, min_samples_leaf = 5), 
                    # Yearly level: 10 clusters with the NaN-aware custom metric.
                    clusterer = CustomKMeans(10, random_state = 0), 
                    info_preprocessing = None
                ), 
                daily_sampler = ConsumptionDataSampler(
                    classifier = DecisionTreeClassifier(max_depth = 4, min_samples_leaf = 25), 
                    # Daily level: 5 clusters with scikit-learn's KMeans.
                    clusterer = KMeans(5, random_state = 0), 
                    info_preprocessing = None
                )
            )

# Fit on the daily data, the full data and the daily info frames loaded earlier.
sampler.fit(daily_data_df, data_df, daily_info_df)


In [None]:
# Size of each yearly cluster (number of entries per cluster label).
sampler.yearly_sampler.clustering.value_counts()

In [None]:
# Select the rows of data_df whose yearly cluster label is neither 0 nor 7.
# NOTE(review): 0 and 7 are hard-coded labels, presumably the large clusters
# seen in one particular run -- confirm they still apply after re-fitting.
clustering = sampler.yearly_sampler.clustering
small_clusters = data_df.loc[clustering.index[~clustering.isin([0,7])]]

In [None]:
# Count the missing values in each row of the small-cluster selection.
small_clusters.isna().sum(axis = 1)

In [None]:
# Same missing-value count for the rows labelled yearly cluster 0, shown as a
# 25-bin histogram of NaNs per row.
big_clusters = data_df.loc[clustering.index[clustering == 0]]
big_clusters.isna().sum(axis =1 ).hist(bins = 25)

In [None]:
# Sizes of the daily clusters held by the daily sampler of yearly cluster 0.
sampler.daily_sampler_per_cluster[0].clustering.value_counts()

In [None]:
# Sizes of the daily clusters held by the daily sampler of yearly cluster 3.
sampler.daily_sampler_per_cluster[3].clustering.value_counts()

# Year selection 

In [None]:
# Draw the yearly sampler's decision tree, labelling the splits with the
# household_info column names.
fig, ax = plt.subplots(figsize = (20,10), dpi = 100)
# feature_names is passed as a plain list because some scikit-learn releases
# reject a pandas Index here. The trailing semicolon keeps the returned
# annotation list out of the cell output.
plot_tree(
    sampler.yearly_sampler.classifier,
    feature_names = list(daily_info_df.loc[:, 'household_info'].columns),
    ax = ax,
);


# Cluster 3
Contains ONLY one profile! Even so, the daily clustering still finds distinct daily patterns.

In [None]:
def plot_daily_cluster(yearly_cluster_idx, daily_cluster_idx):
    """Chart all daily profiles of one daily cluster inside one yearly cluster.

    Reads the daily clustering stored on
    ``sampler.daily_sampler_per_cluster[yearly_cluster_idx]``, picks the rows of
    ``daily_data_df`` labelled ``daily_cluster_idx`` and returns an Altair chart:
    on the left one line per meterID/date combination, on the right a box plot
    of the values per time step.
    """
    labels = sampler.daily_sampler_per_cluster[yearly_cluster_idx].clustering
    member_index = labels[labels == daily_cluster_idx].index

    # Reshape from wide to long so every row is one (meterID, date, time) value.
    long_df = (
        daily_data_df.loc[member_index, :]
        .stack()
        .to_frame('value')
        .rename_axis(('meterID', 'date', 'time'), axis = 0)
        .reset_index()
    )

    # One colour per meterID/date pair, so each profile is drawn as its own line.
    per_profile_color = alt.Color('color_test:N', legend = None)
    lines = (
        alt.Chart(long_df)
        .mark_line()
        .transform_calculate(color_test = "datum.meterID + '-' + datum.date")
        .encode(x = 'time', y = 'value', color = per_profile_color)
    )
    boxes = alt.Chart(long_df).mark_boxplot().encode(x = 'time', y = 'value')

    chart_title = f"yearly_cluster {yearly_cluster_idx}, daily cluster {daily_cluster_idx}"
    return (lines | boxes).properties(title = chart_title)

In [None]:
# Yearly cluster under inspection. Set to 3 to match the "Cluster 3" heading above;
# yearly cluster 2 is covered by the cells that follow.
cluster_idx = 3
# One chart per daily cluster index 0..4 of that yearly cluster.
for i in range(5): 
    plot_daily_cluster(cluster_idx, i).display()

### Look at the classifier

In [None]:
# Draw the decision tree of the daily sampler for the yearly cluster selected
# in cluster_idx, labelling the splits with the day_info column names.
fig, ax = plt.subplots(figsize = (20,10), dpi = 100)
# feature_names is passed as a plain list because some scikit-learn releases
# reject a pandas Index here. The trailing semicolon keeps the returned
# annotation list out of the cell output.
plot_tree(
    sampler.daily_sampler_per_cluster[cluster_idx].classifier,
    feature_names = list(daily_info_df.loc[:, 'day_info'].columns),
    ax = ax,
);


In [None]:
# Yearly cluster under inspection (also read by the classifier cell below).
cluster_idx = 2
# Render one chart for each daily cluster index 0..4.
for daily_idx in range(5):
    daily_chart = plot_daily_cluster(cluster_idx, daily_idx)
    daily_chart.display()

### Look at the classifier

In [None]:
# Draw the decision tree of the daily sampler for the yearly cluster selected
# in cluster_idx, labelling the splits with the day_info column names.
fig, ax = plt.subplots(figsize = (20,10), dpi = 100)
# feature_names is passed as a plain list because some scikit-learn releases
# reject a pandas Index here. The trailing semicolon keeps the returned
# annotation list out of the cell output.
plot_tree(
    sampler.daily_sampler_per_cluster[cluster_idx].classifier,
    feature_names = list(daily_info_df.loc[:, 'day_info'].columns),
    ax = ax,
);


In [None]:
# Yearly cluster under inspection (also read by the classifier cell below).
cluster_idx = 1
# Render one chart for each daily cluster index 0..4.
for daily_idx in range(5):
    daily_chart = plot_daily_cluster(cluster_idx, daily_idx)
    daily_chart.display()

### Look at the classifier

In [None]:
# Draw the decision tree of the daily sampler for the yearly cluster selected
# in cluster_idx, labelling the splits with the day_info column names.
fig, ax = plt.subplots(figsize = (20,10), dpi = 100)
# feature_names is passed as a plain list because some scikit-learn releases
# reject a pandas Index here. The trailing semicolon keeps the returned
# annotation list out of the cell output.
plot_tree(
    sampler.daily_sampler_per_cluster[cluster_idx].classifier,
    feature_names = list(daily_info_df.loc[:, 'day_info'].columns),
    ax = ax,
);


In [None]:
# Yearly cluster under inspection (also read by the classifier cell below).
cluster_idx = 0
# Render one chart for each daily cluster index 0..4.
for daily_idx in range(5):
    daily_chart = plot_daily_cluster(cluster_idx, daily_idx)
    daily_chart.display()

### Look at the classifier

In [None]:
# Draw the decision tree of the daily sampler for the yearly cluster selected
# in cluster_idx, labelling the splits with the day_info column names.
fig, ax = plt.subplots(figsize = (20,10), dpi = 100)
# feature_names is passed as a plain list because some scikit-learn releases
# reject a pandas Index here. The trailing semicolon keeps the returned
# annotation list out of the cell output.
plot_tree(
    sampler.daily_sampler_per_cluster[cluster_idx].classifier,
    feature_names = list(daily_info_df.loc[:, 'day_info'].columns),
    ax = ax,
);


In [None]:
# plot_cluster is not defined anywhere in this notebook; plot_daily_cluster is
# the helper that takes (yearly cluster index, daily cluster index).
# NOTE(review): the daily clusterer configured above is KMeans(5), so daily
# cluster index 10 may select nothing -- confirm the intended index.
plot_daily_cluster(3,10)

In [None]:
# plot_cluster is not defined anywhere in this notebook; plot_daily_cluster is
# the helper that takes (yearly cluster index, daily cluster index).
plot_daily_cluster(3,3)

In [None]:
# plot_cluster is not defined anywhere in this notebook; plot_daily_cluster is
# the helper that takes (yearly cluster index, daily cluster index).
# NOTE(review): the daily clusterer configured above is KMeans(5), so daily
# cluster index 9 may select nothing -- confirm the intended index.
plot_daily_cluster(1,9)

In [None]:
# plot_cluster is not defined anywhere in this notebook; plot_daily_cluster is
# the helper that takes (yearly cluster index, daily cluster index).
# NOTE(review): the daily clusterer configured above is KMeans(5), so daily
# cluster index 6 may select nothing -- confirm the intended index.
plot_daily_cluster(3,6)

In [None]:
# plot_cluster is not defined anywhere in this notebook; plot_daily_cluster is
# the helper that takes (yearly cluster index, daily cluster index).
plot_daily_cluster(3,2)

In [None]:
# plot_cluster is not defined anywhere in this notebook; plot_daily_cluster is
# the helper that takes (yearly cluster index, daily cluster index).
plot_daily_cluster(3,0)

## Look at the tree

In [None]:
# Draw the decision tree of the daily sampler for yearly cluster 3, labelling
# the splits with the day_info column names.
fig, ax = plt.subplots(figsize = (20,10), dpi = 100)
# feature_names is passed as a plain list because some scikit-learn releases
# reject a pandas Index here. The trailing semicolon keeps the returned
# annotation list out of the cell output.
plot_tree(
    sampler.daily_sampler_per_cluster[3].classifier,
    feature_names = list(daily_info_df.loc[:, 'day_info'].columns),
    ax = ax,
);