In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np 

In [None]:
from energyclustering.sampling.preprocessing import DataPreprocessor
from dask.distributed import Client
import pandas as pd
import altair as alt

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# The data

In [None]:
# Run the preprocessing pipeline and unpack the four resulting frames.
daily_data_df, data_df, daily_info_df, weather_df = (
    DataPreprocessor()
    .preprocess_info_df('baseline')
    .preprocess_weather_df('baseline')
    .drop_days_with_nan(True)
    # no subsampling of days this time
    .subsample_days(week_reduction_factor=None)
    # for testing only!
    .subsample_years(500)
    .get_data()
)

# Household info without the 'date' level, keeping only the first row for
# every remaining index entry.
household_info = (
    daily_info_df
    .loc[:, 'household_info']
    .droplevel('date')
    .pipe(lambda info: info[~info.index.duplicated(keep='first')])
)
daily_data_df.shape

# Clustering algorithms and distance matrices
After some experimentation I figured out the following things: 
- clustering using DTW distances becomes infeasible for large numbers of instances and clusters
    - Calculating a single DTW distance costs $O(4l)$ for a series of length $l$ (linear because of the warping constraint)
    - KMedoids: needs the complete distance matrix
    - KMedoids BUT there is a fast implementation called FasterPAM which is really fast! (implemented in rust) 
    - KMeans w. barycentric averaging 
    - Spectral clustering: also an option BUT also limited in number of instances 
- clustering with euclidean distances is a lot easier for large number of instances and clusters 
    - KMeans and KMedoids run a lot faster, but with an increasing number of clusters they become more expensive to run as well (even when parallelized)
    - DBSCAN is fast but works based on a radius, which is difficult to define in this case. + it is connectivity based, which is not what we want here!
    - BIRCH would have been an option, but this has a dependence on the number of features which is 96 in our case! (sklearn proposes a practical limit of 20 features)
    - MiniBatchKMeans seems a good option, it runs really fast but only finds an approximate solution (although that should not be too bad for us) 
    
 
    

In [None]:
from dtaidistance import dtw, clustering
import time
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics.pairwise import euclidean_distances
from tqdm import tqdm
import kmedoids


In [None]:
def kmedoids_dtw(data, n_clusters):
    """Cluster ``data`` with dtaidistance's KMedoids on windowed DTW distances.

    Very slow compared to ``kmedoids_fast_dtw``.

    :param data: the series to cluster, handed unchanged to ``fit``
    :param n_clusters: number of clusters (``k``)
    :return: the result of ``clustering.KMedoids.fit``
    """
    dtw_options = {"window": 4}
    model = clustering.KMedoids(
        dtw.distance_matrix_fast,
        dtw_options,
        max_it=10,
        k=n_clusters,
        show_progress=False,
    )
    return model.fit(data)

def kmeans_dba(data, n_clusters):
    """Cluster ``data`` with dtaidistance's KMeans (DBA averaging, DTW window 4).

    :param data: the series to cluster, handed unchanged to ``fit``
    :param n_clusters: number of clusters (``k``)
    :return: the first element of the pair returned by ``fit`` (the cluster assignment)
    """
    model = clustering.KMeans(
        k=n_clusters,
        max_it=10,
        max_dba_it=10,
        dists_options={"window": 4},
        show_progress=False,
    )
    # fit returns a pair; the second element (performed iterations) is not needed
    cluster_idx, _performed_it = model.fit(data, use_c=True, use_parallel=True)
    return cluster_idx

def kmeans_euc(data, n_clusters):
    """Cluster ``data`` with scikit-learn's KMeans and return the fitted labels.

    :param data: feature matrix handed to ``KMeans.fit``
    :param n_clusters: number of clusters
    :return: ``labels_`` of the fitted model
    """
    # fit returns the estimator itself, so the calls can be chained
    return KMeans(n_clusters=n_clusters).fit(data).labels_

def kmedoids_fast_euc(data, n_clusters):
    """Cluster ``data`` with FasterPAM k-medoids on the full Euclidean distance matrix.

    :param data: feature matrix; all pairwise distances are computed up front
    :param n_clusters: number of medoids
    :return: ``labels_`` of the fitted k-medoids result
    """
    distances = euclidean_distances(data)
    solver = kmedoids.KMedoids(n_clusters, method='fasterpam')
    return solver.fit(distances).labels_

def kmedoids_fast_dtw(data, n_clusters):
    """Cluster ``data`` with FasterPAM k-medoids on the windowed DTW distance matrix.

    :param data: the series to cluster; all pairwise DTW distances (window 4) are computed up front
    :param n_clusters: number of medoids
    :return: ``labels_`` of the fitted k-medoids result
    """
    distances = dtw.distance_matrix_fast(data, window=4)
    solver = kmedoids.KMedoids(n_clusters, method='fasterpam')
    return solver.fit(distances).labels_

def minibatchkmeans(data, n_clusters):
    """Cluster ``data`` with scikit-learn's MiniBatchKMeans and return the fitted labels.

    :param data: feature matrix handed to ``MiniBatchKMeans.fit``
    :param n_clusters: number of clusters
    :return: ``labels_`` of the fitted model
    """
    model = MiniBatchKMeans(n_clusters=n_clusters, batch_size=256 * 40)
    return model.fit(data).labels_

# Lookup tables (function name -> function) consumed by the runtime experiments.
all_algorithms = {
    algorithm.__name__: algorithm
    for algorithm in (kmeans_dba, kmeans_euc, kmedoids_fast_euc, kmedoids_fast_dtw, minibatchkmeans)
}
# Same table without kmeans_dba.
fast_algorithms = {
    algorithm.__name__: algorithm
    for algorithm in (kmeans_euc, kmedoids_fast_euc, kmedoids_fast_dtw, minibatchkmeans)
}

In [None]:
def runtime_experiment(sizes, n_clusters, clustering_algorithms):
    """Time every clustering algorithm on random samples of days.

    For each ``size`` in ``sizes``, ``size`` rows are drawn from the global
    ``daily_data_df`` (with replacement, ``random_state=0``). Every algorithm is
    then run once for each ``k`` in ``n_clusters``. The timing table is displayed
    right away and refreshed in place after every single run.

    :param sizes: iterable of sample sizes (number of rows to draw)
    :param n_clusters: iterable of cluster counts
    :param clustering_algorithms: dict mapping an algorithm name to a callable ``f(data, k)``
    :return: DataFrame of elapsed seconds, indexed by ``(size, k)``, one column per algorithm
    """
    timings = pd.DataFrame(
        index=pd.MultiIndex.from_product([sizes, n_clusters]),
        columns=list(clustering_algorithms),
    )
    hfig = display(timings, display_id=True)
    for size in sizes:
        days = daily_data_df.sample(size, random_state=0, replace=True).to_numpy()
        for k in n_clusters:
            for algo_name, cluster in clustering_algorithms.items():
                # perf_counter is monotonic and high resolution, which makes it the
                # right clock for measuring durations (time.time can jump).
                start_time = time.perf_counter()
                cluster(days, k)  # only the duration matters; the labels are discarded
                elapsed = time.perf_counter() - start_time
                timings.loc[(size, k), algo_name] = elapsed
                hfig.update(timings)
    return timings

# All algorithms (some are slow)

In [None]:
# Three sample sizes spaced logarithmically between 2**8 and 2**12, 100 clusters each.
runtime_experiment(
    sizes=np.logspace(8, 12, num=3, base=2, dtype='int'),
    n_clusters=[100],
    clustering_algorithms=all_algorithms,
)

# fast algorithms

In [None]:
# runtime_experiment(np.logspace(11, 20, num = 8, base = 2, dtype = 'int'), [500, 1000, 2000], fast_algorithms)

## Two step clustering
  

In [None]:
from energyclustering.clustering.preclustering import PreClusteringClusterer

def euc_distance_matrix(x):
    """Return the pairwise Euclidean distance matrix between the rows of ``x``."""
    return euclidean_distances(x)

class CustomKMedoids:
    """FasterPAM k-medoids wrapper that builds its own distance matrix.

    ``metric`` is a callable that turns the raw data into the distance matrix
    handed to ``kmedoids.KMedoids``; the resulting labels are exposed as
    ``labels_`` after ``fit``.
    """

    def __init__(self, nb_clusters, metric, random_state = None):
        """Store the configuration; nothing is computed until ``fit`` is called.

        :param nb_clusters: number of medoids
        :param metric: callable ``metric(data)`` returning the distance matrix
        :param random_state: forwarded to ``kmedoids.KMedoids``
        """
        self.nb_clusters = nb_clusters
        self.metric = metric
        self.random_state = random_state
        # set by fit()
        self.labels_ = None

    def fit(self, data):
        """Compute the distance matrix for ``data``, cluster it and return ``self``."""
        distances = self.metric(data)
        solver = kmedoids.KMedoids(
            self.nb_clusters,
            method='fasterpam',
            random_state=self.random_state,
        )
        result = solver.fit(distances)
        self.labels_ = result.labels_.astype('int')
        return self
# Two-step clusterer: MiniBatchKMeans with 5000 clusters as the pre-clusterer,
# seeded Euclidean k-medoids with 40 clusters as the post-clusterer.
clusterer = PreClusteringClusterer(
    pre_clusterer=MiniBatchKMeans(n_clusters=5000, batch_size=40 * 300),
    post_clusterer=CustomKMedoids(nb_clusters=40, metric=euc_distance_matrix, random_state=0),
)

In [None]:
%%time
# Fit on a random (unseeded) sample of 300 * 365 days.
clusterer.fit(
    daily_data_df
    .sample(300 * 365)
    .to_numpy()
)

# Check whether dtw distances help in classification 
We test kmeans, minibatchkmeans, kmedoids and kmedoids_dtw

In [None]:
from energyclustering.sampling.inspection.classificationinspection import ClassificationInspection
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Seeded shuffle of the data_df index, cut into three folds:
# the first two form the training set, the last one the test set.
generator = np.random.default_rng(1)
shuffled = data_df.index.to_numpy(copy=True)
generator.shuffle(shuffled)
folds = np.array_split(shuffled, 3)
train_set = np.concatenate(folds[:2])
test_set = folds[-1]

In [None]:
def accuracy_exp(clusterer, data_to_use, data_to_plot = None):
    """Cluster, fit a random-forest classifier on the result and display a report.

    Relies on the globals ``daily_info_df``, ``daily_data_df``, ``train_set`` and
    ``test_set``. The fitted ``ClassificationInspection`` is stored in the global
    ``inspect`` so that it can be examined afterwards.

    :param clusterer: clustering model handed to ``ClassificationInspection``
    :param data_to_use: data the clustering is computed on
    :param data_to_plot: data attached to the inspection for plotting;
        ``daily_data_df`` is used when omitted
    :return: None; the results are displayed
    """
    global inspect
    day_info = daily_info_df.loc[:, 'day_info']
    inspect = ClassificationInspection(
        clusterer,
        RandomForestClassifier(),
        data_to_use,
        day_info,
        train_set,
        test_set,
    ).fit_model()
    inspect.data = daily_data_df if data_to_plot is None else data_to_plot

    display(inspect.training_cluster_size_df().T)
#     display(inspect.confusion_matrix(sort_by_size = True))
    display(inspect.classification_performance())
    display(inspect.plot_clustering_line(sample = 25))
    

## Do a yearly clustering first

In [None]:
# Pairwise Euclidean distances between the rows of data_df (NaNs replaced by 0),
# labelled with the data_df index.
matrix = pd.DataFrame(
    euclidean_distances(data_df.fillna(0)),
    index=data_df.index,
)
inspect = ClassificationInspection(
    kmedoids.KMedoids(10, method='fasterpam', random_state=0),
    RandomForestClassifier(),
    matrix,
    household_info,
    train_set,
    test_set,
).fit_model()
inspect.data = data_df
display(inspect.cluster_size_df().T)
inspect.plot_yearly_clustering_line()

In [None]:
# Index entries of all profiles that were assigned to cluster 0.
profiles_to_plot = (
    inspect.clustering
    .pipe(lambda labels: labels[labels == 0])
    .index
)
profiles_to_plot

In [None]:
# Keep only the daily profiles of the selected yearly cluster.
daily_data_df_subset = daily_data_df.loc[profiles_to_plot]

# Re-split on the unique values of the first index level of the subset:
# the first two of three folds are used for training, the last one for testing.
# NOTE: this continues the stream of the shared `generator`, so re-running this
# cell on its own produces a different split.
shuffled = daily_data_df_subset.index.get_level_values(0).unique().to_numpy(copy = True)
generator.shuffle(shuffled)
folds = np.array_split(shuffled, 3)
train_set = np.concatenate(folds[:2])
test_set = folds[-1]

In [None]:
# Number of clusters requested from every clusterer in the accuracy experiments.
NB_CLUSTERS = 50

In [None]:
# Seeded so that the clustering (and the report built on it) can be reproduced.
accuracy_exp(
    KMeans(n_clusters = NB_CLUSTERS, random_state = 0),
    daily_data_df_subset,
    daily_data_df_subset,
)

In [None]:
# Windowed DTW distance matrix of the subset, labelled with the subset's index.
matrix = dtw.distance_matrix_fast(daily_data_df_subset.to_numpy(), window = 4)
matrix = pd.DataFrame(matrix, index = daily_data_df_subset.index)

# Seeded (like the yearly k-medoids clustering) so the run is reproducible.
accuracy_exp(
    kmedoids.KMedoids(NB_CLUSTERS, method = 'fasterpam', random_state = 0),
    matrix,
    daily_data_df_subset,
)

In [None]:
# Euclidean distance matrix of the subset, labelled with the subset's index.
matrix = euclidean_distances(daily_data_df_subset.to_numpy())
matrix = pd.DataFrame(matrix, index = daily_data_df_subset.index)

# Seeded (like the yearly k-medoids clustering) so the run is reproducible.
accuracy_exp(
    kmedoids.KMedoids(NB_CLUSTERS, method = 'fasterpam', random_state = 0),
    matrix,
    daily_data_df_subset,
)