# Try to cluster the days and use this clustering to cluster the profiles

In [None]:
from util import *
from visualisation import *
import numpy as np 
import pandas as pd
import altair as alt
alt.renderers.enable('png')
import itertools
import dtaidistance.dtw as dtw
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import adjusted_rand_score
from cluster_visualisation import *
from profile_similarity import *
alt.data_transformers.disable_max_rows()
from tqdm import tqdm
%load_ext autoreload
%autoreload 2

## Read the data and subsample

In [None]:
%%time
# Load the metadata and the consumption data, and order the rows by index so
# that "last" below means last in index order within a profile.
info_df, data_df = read_data_pickle()
data_df = data_df.sort_index()

# Only keep the last year of each profile: flag the final occurrence of every
# value of the first index level and drop all other rows.
profile_ids = data_df.index.get_level_values(0)
last_of_each_profile = ~profile_ids.duplicated(keep = 'last')
data_df = data_df.loc[last_of_each_profile]

# Optional subsampling for quick experiments:
# data_df = data_df.sample(20, random_state = 2134)
print(f"There are {len(data_df)} profiles")
data_df.head()

## Transform the data into a usable format

In [None]:
# Build the day-level table that is clustered below.
# NOTE(review): presumably one row per (profile, day) — confirm in get_day_df.
day_df = get_day_df(data_df)
day_df.head()

In [None]:
# Sanity check: dimensions of the day table.
day_df.shape

## Idea: do outlier detection first 
The idea is that outliers end up assigned to more or less random clusters; it would be better not to assign them to any cluster at all!

## Cluster the days 
*Note: there is quite a significant difference between k-medoids and k-means! ARI is only 0.39*  
- k-means gives more attention to the average behavior (base usage) because the peaks kind of average out 
- k-medoids seems to care more about the distinguishing features and less about the rest 

In [None]:
# Number of day clusters requested from both the k-medoids and the k-means run.
NB_OF_CLUSTERS = 500

In [None]:
%%time
# Cluster the days with k-medoids; keeps the label of every day and the cluster centers.
# The random_state is fixed, presumably to make the run reproducible (confirm in cluster_KMedoids).
labels_kmedoids, centers_kmedoids = cluster_KMedoids(day_df, nb_of_clusters = NB_OF_CLUSTERS, random_state = 10)

In [None]:
%%time 
# Cluster the same days with k-means, using the same number of clusters and seed.
labels_kmeans, centers_kmeans = cluster_KMeans(day_df, nb_of_clusters = NB_OF_CLUSTERS, random_state = 10)
# Replace the centers returned by k-means with the output of get_medoids_per_cluster.
# NOTE(review): presumably this picks an actual day from day_df per cluster, so both
# clusterings expose medoid-style centers — confirm in the helper.
centers_kmeans = get_medoids_per_cluster(labels_kmeans, day_df)

In [None]:
# Agreement between the k-medoids and k-means day clusterings (adjusted Rand index).
adjusted_rand_score(labels_kmedoids.to_numpy(), labels_kmeans.to_numpy())

## Show the clustering

In [None]:
%%time
# daily_clustering_chart(day_subset_df, labels_kmedoids);

In [None]:
%%time 
# daily_clustering_chart(day_subset_df, labels_kmeans)

## Calculate the DTW distances between the medoids 

In [None]:
%%time
# Select which day clustering drives the rest of the notebook. Change ONLY this
# line to switch to the k-means result (labels_kmeans, centers_kmeans).
labels_to_use, medoids_to_use = labels_kmedoids, centers_kmedoids

# Pairwise DTW distances between the selected cluster medoids. The matrix is
# built from medoids_to_use (not from centers_kmedoids directly) so that the
# medoid distances always belong to the same clustering as labels_to_use.
medoid_distances = get_DTW_distance_matrix(medoids_to_use.to_numpy(), window = 6, psi = 0, njobs = 4)
medoid_distances

## Calculate the distance matrix based on a matching problem
The main idea is the following: to calculate the distance between two profiles x and y,
you match the days of x to the days of y and calculate the distance between the matched days. 
- the distance between two days that are in the same cluster is 0 
- the distance between two days that are in different clusters is the distance between the cluster medoids

This is an assignment problem! Days that can be matched within the same cluster have distance 0, so we can just remove these.  
For the rest we make a cost matrix that describes the cost of matching a day from profile 1 to a day from profile 2 (the distance between the cluster medoids)  
and let scipy solve the problem for us :D 

In [None]:
%%time
# Profile-to-profile distances derived from the day cluster labels and the
# distances between the cluster medoids (the matching approach described above).
distance_matrix = profile_distance_matrix_based_on_daily_clustering(labels_to_use, medoid_distances)
distance_matrix.head()

## Cluster the profiles based on this distance matrix

In [None]:
%%time
# Cluster the profiles into 30 groups with k-medoids, passing the profile
# distance matrix as a plain numpy array.
profile_distances = distance_matrix.to_numpy()
full_labels, full_centers = cluster_KMedoids(data_df, 30, profile_distances, random_state = 1435)
# full_labels = cluster_spectral(data_df, distance_matrix.to_numpy(), 50)

# Drop the second index level so the labels are indexed by the first level only,
# then store the clustering on disk.
full_labels.index = full_labels.index.droplevel(1)
full_labels.to_csv('04_28_full_clustering.csv')

In [None]:
# full_labels = pd.read_csv('04_22_full_clustering.csv')
# full_labels = full_labels.set_index('meterID').labels

In [None]:
# Number of profiles per cluster, as a frame indexed by cluster id.
profiles_per_cluster = full_labels.value_counts()
cluster_count = profiles_per_cluster.to_frame('#profiles').rename_axis(index = 'cluster')

# Clusters that contain more than one profile.
has_multiple_profiles = cluster_count['#profiles'] > 1
big_clusters = cluster_count[has_multiple_profiles].index

# Bar chart of the cluster sizes; the cluster id is encoded as a nominal field.
bar_data = cluster_count.reset_index()
alt.Chart(bar_data).mark_bar().encode(x = 'cluster:N', y = '#profiles')

## Show the clustering

In [None]:
# Quick look at the day table.
day_df.head()

In [None]:
# Quick look at the cluster label assigned to each profile.
full_labels.head()

In [None]:
# Quick look at the profile distance matrix.
distance_matrix.head()

In [None]:
# code for distances between medoids 
# Compare every pair of cluster medoids: put the two medoid profiles side by
# side on a shared y-scale, show their profile distance in the title and save
# the figure under pictures/.
# NOTE(review): the pictures/ directory is expected to exist already.
medoid_meters = full_centers.index.get_level_values(0).unique()

# Build the plot of each medoid once, instead of rebuilding both plots for
# every one of the n*(n-1)/2 pairs.
medoid_plots = [all_day_plot(meter, data_df) for meter in medoid_meters]

for idx1, idx2 in itertools.combinations(range(len(medoid_meters)), 2):
    distance = distance_matrix.loc[medoid_meters[idx1], medoid_meters[idx2]]
    chart = (medoid_plots[idx1] | medoid_plots[idx2]).resolve_scale(y='shared').properties(title = f"distance = {distance}")
    chart.save(f'pictures/cluster_{idx1}_with_cluster_{idx2}.png')

In [None]:
# code for distances within cluster 
# For every cluster, plot its medoid next to each profile in that cluster
# (shared y-scale, profile distance in the title) and save the figure under
# pictures/.
# NOTE(review): assumes the i-th entry of medoid_meters is the medoid of
# cluster label i — confirm against cluster_KMedoids.
for cluster_idx, medoid in enumerate(medoid_meters):
    profiles_in_cluster = full_labels[full_labels == cluster_idx].index.unique()
    # The medoid plot does not depend on the profile, so build it once per cluster.
    medoid_plot = all_day_plot(medoid, data_df)
    for profile in profiles_in_cluster:
        profile_plot = all_day_plot(profile, data_df)
        distance = distance_matrix.loc[medoid, profile]
        chart = (medoid_plot | profile_plot).resolve_scale(y='shared').properties(title = f"distance = {distance}")
        chart.save(f'pictures/cluster_{cluster_idx}_with_profile_{profile}.png')

In [None]:
# Visualise the full profile clustering, with max_shown_instances set to 8.
show_clustering(data_df, full_labels.to_frame('labels'), max_shown_instances = 8)

In [None]:
# Same visualisation restricted to the clusters with more than one profile
# (big_clusters), rendered with type 'heatmap' and max_shown_instances set to 4.
show_clustering(data_df, full_labels[full_labels.isin(big_clusters)].to_frame('labels'), max_shown_instances = 4, type = 'heatmap')