#### Standard imports

In [None]:
import altair as alt
import numpy as np 
import pandas as pd 
from pathlib import Path 

#### Template for charts

In [None]:
def big_chart(chart, fontsize=20):
    """Apply the notebook's large-font presentation settings to a chart.

    Parameters
    ----------
    chart
        Object exposing ``configure_axis``, ``configure_title``,
        ``configure_legend`` and ``configure_view`` (an Altair chart in
        this notebook).
    fontsize : int, default 20
        Font size used for axis labels and titles, the chart title, and
        legend labels and titles.

    Returns
    -------
    The object returned by the final ``configure_view`` call.
    """
    axis_options = dict(
        grid=False,
        labelFontSize=fontsize,
        titleFontSize=fontsize,
        offset=5,
    )
    legend_options = dict(titleFontSize=fontsize, labelFontSize=fontsize)

    styled = chart.configure_axis(**axis_options)
    styled = styled.configure_title(fontSize=fontsize)
    styled = styled.configure_legend(**legend_options)
    # A zero stroke width hides the outline drawn around the plotting area.
    return styled.configure_view(strokeWidth=0)

def cluster_chart(plot_df):
    """Build an interactive scatter plot of households coloured by cluster.

    Parameters
    ----------
    plot_df
        Data handed to ``alt.Chart``; it must provide the
        ``connection_power``, ``yearly_consumption`` and ``cluster_idx``
        fields referenced by the encodings below.

    Returns
    -------
    The chart after ``.interactive()`` has been applied.
    """
    # NOTE(review): 'category20' offers 20 colours while the notebook sets
    # NB_CLUSTERS = 40, so distinct clusters presumably share colours.
    cluster_color = alt.Color(
        'cluster_idx:N',
        legend=None,
        scale=alt.Scale(scheme='category20'),
    )
    scatter = alt.Chart(plot_df).mark_circle(opacity=1)
    encoded = scatter.encode(
        x='connection_power:Q',
        y='yearly_consumption:Q',
        color=cluster_color,
    )
    return encoded.interactive()

#### Load the data

In [None]:
from energyclustering.sampling.preprocessing import DataPreprocessor
# Run the project's preprocessing pipeline and unpack the four objects that
# get_data() returns.
# NOTE(review): the meaning of the 'baseline' presets and of the True/None
# arguments is defined inside DataPreprocessor, not here - confirm there.
daily_data_df, data_df, daily_info_df, weather_df = (
    DataPreprocessor()
    .preprocess_info_df('baseline')
    .preprocess_weather_df('baseline')
    .drop_days_with_nan(True)
    .subsample_days(week_reduction_factor = None)
    # for testing only! (None presumably disables the year subsampling - confirm)
    .subsample_years(None)
    .get_data()
)
# Household-level info: take the 'household_info' columns, drop repeated rows,
# then remove the 'date' index level.
# NOTE(review): drop_duplicates() compares row values only (not the index), so
# two different meters with identical info would collapse into a single row -
# confirm this cannot happen in the data.
household_info = daily_info_df.loc[:, 'household_info'].drop_duplicates().droplevel('date')

# Last expression of the cell: its value is shown as the cell output.
daily_data_df.shape

## Visualization of profiles in (yearly_consumption, connection_power) space

In [None]:
# Show the household info table as the cell output.
household_info

In [None]:
# Unclustered scatter of every household: connection power on x, yearly
# consumption on y, rendered with the large-font settings.
chart = (
    alt.Chart(household_info)
    .mark_circle()
    .encode(
        x='connection_power:Q',
        y='yearly_consumption:Q',
    )
    .interactive()
)
big_chart(chart)

In [None]:
# Number of clusters requested from every clustering method in this notebook;
# it is also embedded in the file name of the k-medoids label cache.
NB_CLUSTERS = 40 

# K-means clustering based on yearly_consumption and connection_power

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans

# Rescale both attributes to a common range so that neither dominates the
# distance computation merely because of its units.
data_to_cluster = MinMaxScaler().fit_transform(
    household_info[['connection_power', 'yearly_consumption']]
)

# Fixed seed: k-means initialisation is random, so without it the cluster
# assignment (and every chart built from it) changes between runs. The
# k-medoids clustering in this notebook is seeded with 0 as well.
clusterer = KMeans(n_clusters=NB_CLUSTERS, random_state=0).fit(data_to_cluster)

# One cluster label per household, aligned on household_info's index.
labels = pd.Series(clusterer.labels_, index=household_info.index, name='cluster_idx')
labels

In [None]:
# Attach the k-means labels to the two plotted attributes (joined on the index).
plot_df = household_info[['connection_power', 'yearly_consumption']].join(labels)
two_attributes = cluster_chart(plot_df).properties(
    title='Yearly consumption + connection capacity clustering'
)
two_attributes

# K-means clustering based on yearly_consumption 

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans

# Only the yearly consumption is used here; it is rescaled the same way as in
# the two-attribute clustering so both cells follow the same recipe.
data_to_cluster = MinMaxScaler().fit_transform(household_info[['yearly_consumption']])

# Fixed seed: k-means initialisation is random, so without it the cluster
# assignment (and every chart built from it) changes between runs. The
# k-medoids clustering in this notebook is seeded with 0 as well.
clusterer = KMeans(n_clusters=NB_CLUSTERS, random_state=0).fit(data_to_cluster)

# One cluster label per household, aligned on household_info's index.
labels = pd.Series(clusterer.labels_, index=household_info.index, name='cluster_idx')
labels

In [None]:
# Plot in the same two-attribute space, even though only the yearly
# consumption was used to form these clusters.
plot_df = household_info[['yearly_consumption', 'connection_power']].join(labels)
one_attribute = cluster_chart(plot_df).properties(
    title='Yearly consumption clustering'
)
one_attribute

# K-medoids clustering based on consumption timeseries

In [None]:
from numba import jit, float64
from sklearn.metrics.pairwise import pairwise_distances
import kmedoids

In [None]:
%%time 
@jit(float64(float64[:], float64[:]), nogil=True, nopython=True)
def euc_dist_missing(a1, a2):
    """Return the mean squared difference of two 1-D float64 arrays, skipping NaNs.

    Positions where the difference is NaN (i.e. either input is NaN) are
    left out of the mean. Despite the name, no square root is taken, so the
    value is a mean squared difference rather than a Euclidean distance.
    """
    squared_diff = (a1 - a2) ** 2
    return np.nanmean(squared_diff)

def euc_distance_matrix_missing(x):
    """Return the pairwise ``euc_dist_missing`` matrix between the rows of ``x``.

    NaN values are allowed through input validation, and ``n_jobs=-1`` asks
    for all available workers.
    """
    # NOTE(review): scikit-learn 1.6 renamed `force_all_finite` to
    # `ensure_all_finite`; confirm which name the installed version expects.
    return pairwise_distances(
        x,
        metric=euc_dist_missing,
        force_all_finite='allow-nan',
        n_jobs=-1,
    )


# Alternative distance matrices that are currently disabled:
# euc_distance_matrix = lambda x: euclidean_distances(x.fillna(0))
# dtw_distance_matrix = lambda x: dtw.distance_matrix_fast(x.to_numpy(), window = 4)

class CustomKMedoids:
    """K-medoids clusterer that builds its own dissimilarity matrix.

    Parameters
    ----------
    nb_clusters
        Number of clusters passed to ``kmedoids.KMedoids``.
    metric
        Callable that receives the data given to ``fit`` and returns the
        matrix handed to the k-medoids solver.
    random_state
        Forwarded unchanged to ``kmedoids.KMedoids``.

    Attributes
    ----------
    labels_
        ``None`` until ``fit`` has run; afterwards the cluster labels cast
        to integers.
    """

    def __init__(self, nb_clusters, metric, random_state=None):
        self.nb_clusters = nb_clusters
        self.metric = metric
        self.random_state = random_state
        self.labels_ = None

    def fit(self, data):
        """Cluster ``data`` with the 'fasterpam' method and return ``self``."""
        distance_matrix = self.metric(data)
        model = kmedoids.KMedoids(
            self.nb_clusters,
            method='fasterpam',
            random_state=self.random_state,
        )
        fitted = model.fit(distance_matrix)
        self.labels_ = fitted.labels_.astype('int')
        return self
# The timeseries k-medoids labels are cached on disk and reused when present.
# NOTE(review): the cache key contains only NB_CLUSTERS, so a change in the
# data or preprocessing will silently reuse stale labels; the file name also
# says "yearly_consumption" although the labels come from the timeseries.
cache_path = Path() / f'cache/yearly_consumption_clustering_{NB_CLUSTERS}.pkl'
if not cache_path.exists():
    clusterer = CustomKMedoids(
        NB_CLUSTERS,
        euc_distance_matrix_missing,
        random_state=0,
    ).fit(data_df)
    labels = pd.Series(clusterer.labels_, index=data_df.index, name='cluster_idx')
    cache_path.parent.mkdir(exist_ok=True)
    labels.to_pickle(cache_path)
else:
    labels = pd.read_pickle(cache_path)


In [None]:
# reset_index() turns the index into regular columns so the tooltip can refer
# to 'meterID' (presumably the name of household_info's index - confirm).
plot_df = (
    household_info[['yearly_consumption', 'connection_power']]
    .join(labels)
    .reset_index()
)
consumption_chart = (
    cluster_chart(plot_df)
    .properties(title="Consumption TS clustering")
    .encode(tooltip='meterID')
)
consumption_chart

In [None]:
# Select the labels of the households present in household_info, in
# household_info's index order, so they line up row-for-row with the features
# used to fit the classifier in the next cell.
# NOTE(review): `labels` is whatever the most recently executed clustering cell
# assigned; this presumably expects the timeseries k-medoids labels.
keep_labels = labels.loc[household_info.index]

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier

# Learn to predict a household's cluster from its info attributes.
# Despite its name, `tree` is a random forest; the name is kept because later
# cells refer to it. The seed is fixed so that the fitted forest, and the
# importances and probabilities derived from it, are the same on every run.
tree = RandomForestClassifier(criterion='log_loss', random_state=0).fit(household_info, keep_labels)

# Disabled experiment: colour households by the leaf they fall into.
# leafs = pd.Series(tree.apply(household_info), index = household_info.index, name = 'cluster_idx')
# plot_df = household_info[['yearly_consumption', 'connection_power']].join(leafs)
# cluster_chart(plot_df)

In [None]:
# Feature importances of the fitted classifier, labelled with the
# household_info column names (shown as the cell output).
pd.Series(tree.feature_importances_, index = household_info.columns)

In [None]:
def show_sampling_probs_for_instance(meterID):
    """Show where a household lies and how likely each household's cluster is for it.

    Parameters
    ----------
    meterID
        Index label of the household in ``household_info``.

    Returns
    -------
    Two interactive scatter plots side by side. The left one highlights the
    selected household among all households. The right one colours every
    household by the probability that the fitted classifier ``tree`` assigns
    to that household's cluster when given the selected household's info.
    """
    attributes = household_info[['yearly_consumption', 'connection_power']]

    # Left panel: flag the selected household and draw it on top of the others.
    highlight_df = attributes.assign(color=lambda x: x.index == meterID)
    highlight = alt.Chart(highlight_df.reset_index()).mark_circle().encode(
        x='connection_power',
        y='yearly_consumption',
        color=alt.Color('color', scale=alt.Scale(domain=[False, True], range=['gray', 'blue'])),
        opacity=alt.Opacity('color', scale=alt.Scale(domain=[False, True], range=[0.5, 1])),
        tooltip='meterID',
        order=alt.Order('color', sort='ascending'),
    )

    # Keep the query as a DataFrame: the classifier was fitted on a DataFrame,
    # and passing a bare array makes scikit-learn warn about missing feature names.
    info = household_info.loc[[meterID]]

    # predict_proba returns one column per entry of tree.classes_, in that
    # order. Index the probabilities by the class labels (not by position) so
    # the join on 'cluster_idx' stays correct even when some cluster indices
    # do not occur in keep_labels.
    probabilities = pd.Series(
        tree.predict_proba(info)[0],
        index=tree.classes_,
        name='probs',
    )
    prob_df = attributes.join(
        keep_labels.to_frame().join(probabilities, on='cluster_idx')
    )

    # Right panel: every household coloured by the probability of its cluster.
    prob_chart = alt.Chart(prob_df).mark_circle().encode(
        x='connection_power',
        y='yearly_consumption',
        color=alt.Color('probs', scale=alt.Scale(scheme='teals')),
        tooltip='probs',
    )
    return highlight.interactive() | prob_chart.interactive()
    

In [None]:
# Example household; the ID is passed as the string form of a (meter, year) pair.
probs = show_sampling_probs_for_instance("('smartmeter_1989', 2016)")
probs

In [None]:
# Second example household (same string-encoded ID format).
probs = show_sampling_probs_for_instance("('smartmeter_1130', 2017)")
probs

In [None]:
# Third example household (same string-encoded ID format).
probs = show_sampling_probs_for_instance("('smartmeter_1154', 2017)")
probs

# Compare the three clusterings side by side

In [None]:
# Lay the three cluster charts out next to each other
# (`|` concatenates Altair charts horizontally).
two_attributes | one_attribute | consumption_chart