In [None]:
import numpy as np 

In [None]:
from energyclustering.sampling.preprocessing import DataPreprocessor
from energyclustering.sampling.inspection.consumptionclustering import ConsumptionClusteringInspector
from dask.distributed import Client
import pandas as pd
import altair as alt
alt.data_transformers.disable_max_rows()
import matplotlib.pyplot as plt
import seaborn as sns
from energyclustering.sampling.samplers import ConsumptionDataSampler, MetadataSampler, EnergyvilleDaySelectionBaseline, RandomSamplerBaseline
from energyclustering.sampling.day_of_year_samplers import DailySamplerFromClusterSampler, GenerateSampleDecorator
from sklearn.ensemble import RandomForestClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
from pathlib import Path
from energyclustering.sampling.evaluation.evaluation import SamplerEvaluator

In [None]:
from energyclustering.sampling.inspection.classificationinspection import ClassificationInspection

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
from IPython.display import display, HTML

In [None]:
%matplotlib inline
# %config InlineBackend.figure_formats = ['svg']

In [None]:
%load_ext autoreload
%autoreload 2

# The data

In [None]:
# Run the preprocessing pipeline with the 'baseline' info/weather settings and
# unpack the four frames it returns.
daily_data_df, data_df, daily_info_df, weather_df = (
    DataPreprocessor()
    .preprocess_info_df('baseline')
    .preprocess_weather_df('baseline')
    .drop_days_with_nan(True)
    # no subsampling this time
    .subsample_days(week_reduction_factor=None)
    # for testing only!
    .subsample_years(1000)
    .get_data()
)

# Household info without the 'date' index level; when several rows share the
# same remaining index value only the first one is kept.
household_info = (
    daily_info_df.loc[:, 'household_info']
    .droplevel('date')
    .loc[lambda info: ~info.index.duplicated(keep='first')]
)

# Folds

In [None]:
# Reproducibly shuffle the index of data_df (generator seeded with 1) and cut it
# into three folds. to_numpy(copy=True) guarantees the in-place shuffle works on
# a private copy and leaves data_df.index itself untouched.
generator = np.random.default_rng(1)
shuffled = data_df.index.to_numpy(copy=True)
generator.shuffle(shuffled)
# array_split also accepts a length that is not divisible by 3; the fold sizes
# then differ by at most one element.
folds = np.array_split(shuffled, 3)

In [None]:
# The first two folds together form the training set; the third fold is held out for testing.
train_set = np.concatenate(folds[:2])
test_set = folds[2]

## Custom metric for clustering

In [None]:
# Custom metric
from pyclustering.utils.metric import type_metric, distance_metric;
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.cluster.kmeans import kmeans
from energyclustering.clustering.clusterers import MyKMedoids, PrecomputedClustering, PrecomputedDistanceMetricClustering
from pyclustering.cluster.kmedoids import kmedoids
from sklearn.metrics import pairwise_distances
from numba import jit, float64

In [None]:


@jit(float64(float64[:], float64[:]), nogil=True, nopython=True)
def dist(a1, a2):
    """Return the mean squared element-wise difference of two 1-D float64 arrays.

    The mean is taken with ``np.nanmean``, so terms that evaluate to NaN
    (a NaN in either input at that position) are left out of the average.
    """
    squared_diff = (a1 - a2) ** 2
    return np.nanmean(squared_diff)
# Expose the compiled distance to pyclustering as a user-defined metric.
custom_metric = distance_metric(type_metric.USER_DEFINED, func=dist)

# Pairwise distances between all rows of data_df under `dist`, labelled on both
# axes with the index of data_df. force_all_finite=False makes scikit-learn's
# input validation accept NaN values; n_jobs=-1 asks for all available cores.
custom_distance_matrix = pd.DataFrame(
    pairwise_distances(data_df.to_numpy(), metric=dist, n_jobs=-1, force_all_finite=False),
    index=data_df.index,
    columns=data_df.index,
)



In [None]:
class CustomKMeans:
    """K-Means clustering through pyclustering, using the module-level ``custom_metric``.

    Offers a scikit-learn-like ``fit`` that stores the cluster assignment in ``labels_``.
    """

    def __init__(self, nb_clusters, random_state=None):
        """
        :param nb_clusters: number of initial centers (clusters) to create.
        :param random_state: optional seed for the K-Means++ initialization;
            ``None`` keeps the initializer's default (unseeded) behaviour.
        """
        self.nb_clusters = nb_clusters
        # Previously accepted but silently dropped, which made seeded runs impossible.
        self.random_state = random_state

    def fit(self, data):
        """Cluster ``data`` and store one integer label per row in ``labels_``.

        :param data: the instances to cluster; must expose ``.shape`` and is
            handed to pyclustering unchanged.
        :return: ``self``, so calls can be chained.
        """
        # Forward the seed only when one was given, so the unseeded call is
        # exactly the one made before the seed was supported here.
        init_kwargs = {} if self.random_state is None else {'random_state': self.random_state}
        # initialize initial centers using K-Means++ method
        initial_centers = kmeans_plusplus_initializer(data, self.nb_clusters, **init_kwargs).initialize()
        # create instance of K-Means algorithm with prepared centers
        kmeans_instance = kmeans(data, initial_centers, metric=custom_metric)
        # run cluster analysis and obtain results
        kmeans_instance.process()
        # get_clusters() yields one list of row positions per cluster; turn that
        # into a flat label vector aligned with the rows of `data`.
        labels = np.zeros(data.shape[0], dtype=int)
        for cluster_idx, instance_indices in enumerate(kmeans_instance.get_clusters()):
            labels[instance_indices] = cluster_idx
        self.labels_ = labels
        return self
    
    
class CustomKMedoids:
    """K-Medoids clustering through pyclustering on a precomputed distance matrix.

    Offers a scikit-learn-like ``fit`` that stores the cluster assignment in ``labels_``.
    """

    def __init__(self, nb_clusters, random_state=None):
        """
        :param nb_clusters: number of medoids (clusters) to create.
        :param random_state: seed passed to ``np.random.default_rng`` when the
            initial medoids are drawn.
        """
        self.random_state = random_state
        self.nb_clusters = nb_clusters

    def fit(self, data):
        """Cluster the instances described by the distance matrix ``data``.

        :param data: distance matrix exposing ``.shape`` and ``.to_numpy()``
            (e.g. a pandas DataFrame); it is passed to pyclustering with
            ``data_type='distance_matrix'``.
        :return: ``self``, with ``labels_`` holding one integer label per row.
        """
        nb_instances = data.shape[0]
        # Draw distinct row positions at random to serve as the starting medoids.
        rng = np.random.default_rng(self.random_state)
        start_medoids = rng.choice(nb_instances, size=self.nb_clusters, replace=False)
        # Build the K-Medoids algorithm on the raw distance values and run it.
        kmedoids_instance = kmedoids(data.to_numpy(), start_medoids, data_type='distance_matrix')
        kmedoids_instance.process()
        # get_clusters() yields one list of row positions per cluster; flatten
        # that into a label vector aligned with the rows of `data`.
        labels = np.zeros(nb_instances)
        for label, members in enumerate(kmedoids_instance.get_clusters()):
            labels[members] = label
        self.labels_ = labels.astype('int')
        return self
    

In [None]:
def inspect(yearly_clusterer, daily_clusterer, yearly_data_to_use, daily_data_to_use, min_cluster_size = 10):
    """Inspect a yearly clustering and, per sufficiently large cluster, a daily clustering.

    First a ``ClassificationInspection`` is fitted with ``yearly_clusterer`` and a
    ``RandomForestClassifier`` on ``yearly_data_to_use`` and the module-level
    ``household_info``, ``train_set`` and ``test_set``; its training cluster size
    table is displayed. Then, for every cluster whose ``'#items'`` entry in that
    table exceeds ``min_cluster_size``, a second ``ClassificationInspection`` is
    fitted with ``daily_clusterer`` and a small decision tree on the cluster
    members' rows of ``daily_data_to_use`` and of ``daily_info_df['day_info']``,
    and its line plot, cluster sizes, confusion matrix, tree and classification
    performance are shown.

    The most recent per-cluster inspection is published as the global
    ``inspector``. Nothing is returned.
    """
    global inspector
    yearly_inspection = ClassificationInspection(
        yearly_clusterer,
        RandomForestClassifier(),
        yearly_data_to_use,
        household_info,
        train_set,
        test_set,
    ).fit_model()
    display(yearly_inspection.training_cluster_size_df().T)

    # Only clusters with strictly more than `min_cluster_size` items are examined.
    cluster_sizes = yearly_inspection.training_cluster_size_df()
    clusters_to_investigate = cluster_sizes[cluster_sizes['#items'] > min_cluster_size].index

    for cluster_idx in clusters_to_investigate:
        assignment = yearly_inspection.clustering
        members = assignment[assignment == cluster_idx].index
        # Keep the train/test membership of each instance within the cluster.
        test_members = members.intersection(test_set)
        train_members = members.intersection(train_set)
        member_daily_data = daily_data_to_use.loc[members]
        member_day_info = daily_info_df.loc[members, 'day_info']

        inspector = ClassificationInspection(
            daily_clusterer,
            DecisionTreeClassifier(min_samples_leaf = 25, max_depth = 4, min_impurity_decrease = 0.01),
            member_daily_data,
            member_day_info,
            train_members,
            test_members,
        )
        inspector = inspector.fit_model()

        display(HTML(f'<h1>cluster {cluster_idx}, #items {len(members)}</h1>'))
        inspector.plot_clustering_line(sample = 500)
        display(inspector.training_cluster_size_df().T)
        display(inspector.confusion_matrix(sort_by_size = True))
        inspector.plot_tree()
        display(inspector.classification_performance())
    
    

In [None]:
# Yearly level: K-Means with 100 clusters on data_df with NaN replaced by 0.
# Daily level: K-Means with 20 clusters on daily_data_df as is.
inspect(
    yearly_clusterer=KMeans(100),
    daily_clusterer=KMeans(20),
    yearly_data_to_use=data_df.fillna(0),
    daily_data_to_use=daily_data_df,
    min_cluster_size=5,
)

In [None]:
# Yearly level: K-Medoids with 40 clusters on the precomputed custom distance matrix.
# Daily level: K-Medoids with 20 clusters on daily_data_df with NaN replaced by 0.
inspect(
    yearly_clusterer=CustomKMedoids(40),
    daily_clusterer=KMedoids(20),
    yearly_data_to_use=custom_distance_matrix,
    daily_data_to_use=daily_data_df.fillna(0),
    min_cluster_size=5,
)

In [None]:
WASSER = 'full_distance_matrix_wasserstein'
directory = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/distance_matrices/')

# NB_CLUSTERS was referenced without ever being defined, so this cell raised a
# NameError on a fresh kernel.
# NOTE(review): 40 mirrors the yearly K-Medoids run in the previous cell — confirm the intended value.
NB_CLUSTERS = 40

# inspect() needs four positional arguments (yearly clusterer, daily clusterer,
# yearly data, daily data); the original call supplied only the first and third
# and therefore could not run.
# NOTE(review): the daily clusterer and daily data below are copied from the
# K-Medoids cell above as the closest visible precedent — confirm they are the
# ones intended for the Wasserstein experiment.
inspect(
    PrecomputedDistanceMetricClustering(NB_CLUSTERS, directory/WASSER/'full_distance_matrix.pkl'),
    KMedoids(20),
    data_df,
    daily_data_df.fillna(0),
)