In [None]:
import numpy as np 

In [None]:
from energyclustering.sampling.preprocessing import DataPreprocessor
from energyclustering.sampling.inspection.consumptionclustering import ConsumptionClusteringInspector
from dask.distributed import Client
import pandas as pd
import altair as alt
alt.data_transformers.disable_max_rows()
import matplotlib.pyplot as plt
import seaborn as sns
from energyclustering.sampling.samplers import ConsumptionDataSampler, MetadataSampler, EnergyvilleDaySelectionBaseline, RandomSamplerBaseline
from energyclustering.sampling.day_of_year_samplers import DailySamplerFromClusterSampler, GenerateSampleDecorator
from sklearn.ensemble import RandomForestClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
from pathlib import Path
from energyclustering.sampling.evaluation.evaluation import SamplerEvaluator

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

In [None]:
%matplotlib inline
# %config InlineBackend.figure_formats = ['svg']

In [None]:
%load_ext autoreload
%autoreload 2

# The data

In [None]:
# Run the preprocessing pipeline and unpack the four objects returned by get_data().
daily_data_df, data_df, daily_info_df, weather_df = (
    DataPreprocessor()
    .preprocess_info_df('baseline')
    .preprocess_weather_df('baseline')
    .drop_days_with_nan(True)
    # Day-level subsampling is switched off for this run (kept for reference):
#     .subsample_days(week_reduction_factor = 5)
    # NOTE(review): marked "for testing only" by the author — presumably this limits
    # the run to 1000 yearly profiles; confirm and remove before a full run.
    .subsample_years(1000)
    .get_data()
)
# Display the shape of the daily data as a quick sanity check.
daily_data_df.shape

In [None]:
# Take the 'household_info' columns, drop the 'date' index level and keep only the
# first row for every remaining index value.
household_info = (
    daily_info_df.loc[:, 'household_info']
    .droplevel('date')
    .pipe(lambda frame: frame[~frame.index.duplicated(keep = 'first')])
)

In [None]:
daily_info_df.loc[:, ('day_info', 'FeelsLikeC')]

# Folds

In [None]:
# Shuffle the index of data_df with a fixed seed and split it into three folds.
generator = np.random.default_rng(1)
# copy=True so the in-place shuffle below cannot modify the index of data_df.
shuffled = data_df.index.to_numpy(copy=True)
generator.shuffle(shuffled)
# array_split also works when the number of entries is not divisible by 3.
folds = np.array_split(shuffled, 3)

In [None]:
# The last fold is held out for testing; the first two folds together form the training set.
test_set = folds[2]
train_set = np.concatenate(folds[:2])

## Custom metric for clustering

In [None]:
# Custom metric
from pyclustering.utils.metric import type_metric, distance_metric;
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.cluster.kmeans import kmeans
from energyclustering.clustering.clusterers import MyKMedoids, PrecomputedClustering, PrecomputedDistanceMetricClustering
from pyclustering.cluster.kmedoids import kmedoids
from sklearn.metrics import pairwise_distances
from numba import jit, float64

In [None]:


@jit(float64(float64[:], float64[:]), nopython = True, nogil = True)
def dist(a1, a2):
    """Mean squared difference between two 1-D float64 arrays, ignoring NaN positions.

    A NaN in either input makes the squared difference at that position NaN,
    and ``np.nanmean`` leaves such positions out of the average.

    NOTE(review): if no position is non-NaN in both inputs, the result is
    presumably NaN — confirm that downstream clustering can cope with that.
    """
    squared_diff = (a1 - a2) ** 2
    return np.nanmean(squared_diff)
# Wrap dist as a pyclustering user-defined metric (CustomKMeans passes it to kmeans).
custom_metric = distance_metric(type_metric.USER_DEFINED, func = dist)

# Pairwise dist between all rows of data_df, using all available cores (n_jobs = -1).
# force_all_finite = False lets rows containing NaN pass input validation.
# NOTE(review): newer scikit-learn releases rename this argument to
# `ensure_all_finite` — confirm against the installed version.
custom_distance_matrix = pairwise_distances(data_df.to_numpy(), metric = dist, n_jobs = -1, force_all_finite = False)
# Label rows and columns with the index of data_df.
custom_distance_matrix = pd.DataFrame(custom_distance_matrix, index = data_df.index, columns = data_df.index)



In [None]:
class CustomKMeans:
    """K-means from pyclustering, run with the notebook-level ``custom_metric``.

    Parameters
    ----------
    nb_clusters : int
        Number of initial centers requested from the k-means++ initializer.
    random_state : int or None, optional
        Seed forwarded to the k-means++ initializer. With ``None`` (the default)
        no seed is passed and pyclustering's own default seeding is used.

    Attributes
    ----------
    labels_ : numpy.ndarray of int
        Set by ``fit``: the cluster index of every row of the fitted data.
    """

    def __init__(self, nb_clusters, random_state = None):
        self.nb_clusters = nb_clusters
        # Previously this argument was accepted but thrown away, so a caller
        # asking for a reproducible clustering silently did not get one.
        self.random_state = random_state

    def fit(self, data):
        """Cluster ``data`` and store the resulting labels in ``labels_``.

        Returns ``self`` so that calls can be chained.
        """
        # Only forward the seed when one was given, so the default call is
        # exactly the call that was made before random_state was honoured.
        initializer_kwargs = {}
        if self.random_state is not None:
            initializer_kwargs['random_state'] = self.random_state
        # initial centers from the K-Means++ method
        initial_centers = kmeans_plusplus_initializer(
            data, self.nb_clusters, **initializer_kwargs
        ).initialize()
        # K-Means with the prepared centers and the custom distance
        kmeans_instance = kmeans(data, initial_centers, metric = custom_metric)
        kmeans_instance.process()
        # get_clusters() yields one list of row positions per cluster; turn that
        # into one label per row (rows not listed in any cluster keep label 0).
        labels = np.zeros(data.shape[0], dtype = int)
        for cluster_idx, instance_indices in enumerate(kmeans_instance.get_clusters()):
            labels[instance_indices] = cluster_idx
        self.labels_ = labels
        return self
    
    
class CustomKMedoids:
    """K-medoids from pyclustering, run on a precomputed distance matrix.

    ``fit`` expects an object with ``.shape`` and ``.to_numpy()`` (for example a
    DataFrame) holding the distance matrix, and stores one integer cluster
    index per row in ``labels_``.
    """

    def __init__(self, nb_clusters, random_state = None):
        self.nb_clusters = nb_clusters
        self.random_state = random_state

    def fit(self, data):
        """Cluster the distance matrix ``data``; returns ``self``."""
        nb_instances = data.shape[0]
        # Draw the starting medoids at random, without replacement.
        rng = np.random.default_rng(self.random_state)
        start_medoids = rng.choice(nb_instances, size = self.nb_clusters, replace = False)
        # K-Medoids on the distance matrix, starting from the drawn medoids.
        kmedoids_instance = kmedoids(data.to_numpy(), start_medoids, data_type = 'distance_matrix')
        kmedoids_instance.process()
        # One list of row positions per cluster -> one label per row.
        assignment = np.zeros(nb_instances)
        for label, members in enumerate(kmedoids_instance.get_clusters()):
            assignment[members] = label
        self.labels_ = assignment.astype('int')
        return self
    

# Cluster inspection

In [None]:
from energyclustering.sampling.inspection.classificationinspection import ClassificationInspection

In [None]:
def inspect(clusterer, data_to_use):
    """Fit a ClassificationInspection for ``clusterer`` and display its summaries.

    The inspection is built with a fresh ``RandomForestClassifier()`` and the
    notebook-level ``household_info``, ``train_set`` and ``test_set``. Three
    outputs are displayed: the transposed training cluster sizes, the
    confusion matrix sorted by size, and the classification performance.
    """
    inspection = ClassificationInspection(
        clusterer,
        RandomForestClassifier(),
        data_to_use,
        household_info,
        train_set,
        test_set,
    ).fit_model()
    display(inspection.training_cluster_size_df().T)
    display(inspection.confusion_matrix(sort_by_size = True))
    display(inspection.classification_performance())
    
    
    

In [None]:
# Number of clusters requested from every clusterer inspected in the sections below.
NB_CLUSTERS = 40

## KMeans
Only 2 useful clusters

In [None]:
# Missing values are replaced by 0 before clustering. The seed is fixed, as for the
# samplers further down, so the clusters discussed here can be reproduced.
inspect(KMeans(NB_CLUSTERS, random_state = 0), data_df.fillna(0))

## KMeans with missingness

In [None]:
# data_df is passed with its NaNs: CustomKMeans clusters with custom_metric (dist),
# which averages only over positions that are not NaN.
inspect(CustomKMeans(NB_CLUSTERS), data_df)

## KMedoids

In [None]:
# Missing values are replaced by 0 before clustering. The seed is set explicitly,
# matching the seeded KMedoids used for the daily sampler further down.
inspect(KMedoids(NB_CLUSTERS, random_state = 0), data_df.fillna(0))

## KMedoids with missingness

In [None]:
# CustomKMedoids draws its initial medoids at random; seed it so that the clustering
# on the NaN-aware distance matrix is the same on every run.
inspect(CustomKMedoids(NB_CLUSTERS, random_state = 0), custom_distance_matrix)

## Wasserstein

In [None]:
# Sub-directory that holds the Wasserstein distance matrix.
WASSER = 'full_distance_matrix_wasserstein'
# NOTE(review): absolute, machine-specific path — this cell only runs where that
# location is available; consider a configurable data directory.
directory = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/distance_matrices/')
# The clusterer receives the path of a pickled distance matrix; presumably it clusters
# on that matrix rather than on data_df directly — confirm in the class.
inspect(PrecomputedDistanceMetricClustering(NB_CLUSTERS, directory/WASSER/'full_distance_matrix.pkl'), data_df)

In [None]:
# Two-level sampler: a yearly ConsumptionDataSampler (KMeans, 30 clusters) and a daily
# ConsumptionDataSampler (KMedoids, 10 clusters), each paired with a shallow decision
# tree. A RandomForestClassifier was tried for the yearly level and is not used here.
sampler = DailySamplerFromClusterSampler(
    yearly_sampler = ConsumptionDataSampler(
        classifier = DecisionTreeClassifier(max_depth = 4, min_samples_leaf = 5),
        clusterer = KMeans(30, random_state = 0),
        info_preprocessing = None,
    ),
    daily_sampler = ConsumptionDataSampler(
        classifier = DecisionTreeClassifier(max_depth = 4, min_samples_leaf = 25),
        clusterer = KMedoids(10, random_state = 0),
        info_preprocessing = None,
    ),
)

sampler.fit(daily_data_df, data_df, daily_info_df)


In [None]:
# Count how often each value occurs in the yearly sampler's clustering.
sampler.yearly_sampler.clustering.value_counts().to_frame('#items')


# Year selection 

In [None]:
# Draw the classifier of the yearly sampler, labelling splits with the
# 'household_info' column names.
plt.figure(dpi = 100, figsize = (20, 10))
plot_tree(
    sampler.yearly_sampler.classifier,
    feature_names = daily_info_df.loc[:, 'household_info'].columns,
)


# Cluster 3
Contains ONLY one profile! But it does find distinct daily patterns

In [None]:
daily_data_df

In [None]:
# Keep a handle on the yearly clustering; it is joined onto the plotting frame below.
clustering = sampler.yearly_sampler.clustering
# Show it as a single-column frame.
clustering.to_frame('test')

In [None]:
# Long-format frame for plotting: one row per (meterID, day, timestamp) with the
# value and the yearly cluster index joined on meterID.
# The sample is seeded so the figures below show the same rows on every run, and
# capped at the number of available rows so small test subsets do not raise.
plot_df = (
    daily_data_df
    .sample(min(200, len(daily_data_df)), random_state = 0)
    .stack()
    .rename_axis(('meterID', 'day', 'timestamp'), axis = 0)
    .to_frame('value')
    .join(clustering.rename_axis('meterID', axis = 0).to_frame('cluster_idx'))
    .reset_index()
)
plot_df

In [None]:
# One key per (meterID, day) pair, used as hue so every sampled day gets its own line.
plot_df['color'] = plot_df['meterID'].astype('str') + plot_df['day'].astype('str')

In [None]:
# FacetGrid.map calls the plotting function once per facet, so a categorical plot
# needs an explicit order for every facet to put the same timestamp at the same x
# position. The order of first appearance in plot_df is used.
g = sns.FacetGrid(plot_df, row="cluster_idx", sharey=False, aspect = 3)
g.map(sns.boxplot, "timestamp", "value", order = plot_df['timestamp'].drop_duplicates().tolist())

In [None]:
# One thin line per sampled (meterID, day) within each yearly cluster.
# `size` in sns.lineplot is a semantic grouping variable, not a line width; the
# width of the drawn lines is set through matplotlib's `linewidth`.
g = sns.FacetGrid(plot_df, row="cluster_idx", hue = 'color', sharey=False, aspect = 3)
g.map(sns.lineplot, "timestamp", "value", linewidth = 0.1)

### Look at the classifier

In [None]:
# NOTE(review): in the visible part of this notebook `cluster_idx` is first assigned in a
# later cell (`cluster_idx = 1`), so this cell fails with NameError after Restart & Run All
# — confirm which yearly cluster is meant and assign it before this cell.
plt.figure(figsize = (20,10), dpi = 100)
plot_tree(sampler.daily_sampler_per_cluster[cluster_idx].classifier, feature_names = daily_info_df.loc[:, 'day_info'].columns)


### Look at the classifier

In [None]:
# NOTE(review): this cell repeats the previous tree plot verbatim and also reads
# `cluster_idx` before any visible cell assigns it — presumably a leftover; confirm and remove.
plt.figure(figsize = (20,10), dpi = 100)
plot_tree(sampler.daily_sampler_per_cluster[cluster_idx].classifier, feature_names = daily_info_df.loc[:, 'day_info'].columns)


In [None]:
# Select yearly cluster 1 and display the result of plot_daily_cluster for indices 0-4.
# NOTE(review): `plot_daily_cluster` is not defined in the visible part of this notebook —
# it has to be defined or imported earlier for Restart & Run All to work.
cluster_idx = 1
for i in range(5): 
    plot_daily_cluster(cluster_idx, i).display()

### Look at the classifier

In [None]:
# Draw the daily classifier of the currently selected yearly cluster, labelling
# splits with the 'day_info' column names.
plt.figure(dpi = 100, figsize = (20, 10))
plot_tree(
    sampler.daily_sampler_per_cluster[cluster_idx].classifier,
    feature_names = daily_info_df.loc[:, 'day_info'].columns,
)


In [None]:
# Select yearly cluster 0 and display the result of plot_daily_cluster for indices 0-4.
# NOTE(review): `plot_daily_cluster` is not defined in the visible part of this notebook —
# it has to be defined or imported earlier for Restart & Run All to work.
cluster_idx = 0
for i in range(5): 
    plot_daily_cluster(cluster_idx, i).display()

### Look at the classifier

In [None]:
# Draw the daily classifier of the currently selected yearly cluster, labelling
# splits with the 'day_info' column names.
plt.figure(dpi = 100, figsize = (20, 10))
plot_tree(
    sampler.daily_sampler_per_cluster[cluster_idx].classifier,
    feature_names = daily_info_df.loc[:, 'day_info'].columns,
)


In [None]:
# NOTE(review): `plot_cluster` is not defined in the visible part of this notebook; this cell
# and the similar calls after it need it defined or imported earlier to survive Restart & Run All.
plot_cluster(3,10)

In [None]:
plot_cluster(3,3)

In [None]:
plot_cluster(1,9)

In [None]:
plot_cluster(3,6)

In [None]:
plot_cluster(3,2)

In [None]:
plot_cluster(3,0)

## Look at tree

In [None]:
# Draw the daily classifier of yearly cluster 3, labelling splits with the
# 'day_info' column names.
plt.figure(dpi = 100, figsize = (20, 10))
plot_tree(
    sampler.daily_sampler_per_cluster[3].classifier,
    feature_names = daily_info_df.loc[:, 'day_info'].columns,
)