# Try to cluster the days and use this clustering to cluster the profiles

In [None]:
from util import *
from visualisation import *
import numpy as np 
import pandas as pd
import altair as alt
alt.renderers.enable('png')
import itertools
import dtaidistance.dtw as dtw
from scipy.optimize import linear_sum_assignment
from cluster_visualisation import *
alt.data_transformers.disable_max_rows()
from tqdm import tqdm
%load_ext autoreload
%autoreload 2

In [None]:
import tslearn

In [None]:
# Read the raw data through the project helper (nrows is passed straight on to read_data).
info_df, data_df = read_data(nrows=200)

# Flag, for every value of the first index level (the profile), only its final row:
# that row is the last year of the profile.
last_of_each_profile = ~data_df.index.get_level_values(0).duplicated(keep='last')

# Keep the flagged rows and draw a reproducible random sample of 15 of them.
data_df = data_df.loc[last_of_each_profile].sample(15, random_state=2134)
data_df.head()

In [None]:
# Build the day-level frame from the selected profiles with the project helper get_day_df
# (presumably one row per profile-day — TODO confirm against get_day_df).
day_df = get_day_df(data_df)
# Preview the first rows.
day_df.head()

## Use a random sample of x days of each profile

In [None]:
# Number of days drawn at random from every (meterID, year) group.
NB_OF_DAYS = 100
# Fixed seed: without it every run samples different days, so the distance matrix,
# the day clustering and the profile clustering derived from it would not be reproducible.
# NOTE: sampling is without replacement, so each group needs at least NB_OF_DAYS rows.
day_subset_df = day_df.groupby(['meterID', 'year']).sample(NB_OF_DAYS, random_state=2134)
# Alternative: use every day instead of a sample.
# day_subset_df = day_df

In [None]:
%%time
# Day-by-day DTW distance matrix for the sampled days (project helper).
# NOTE(review): the positional arguments 4 and 0 are presumably the DTW window and psi,
# matching the window/psi used for the medoid distances further down — confirm in the helper.
distance_matrix = get_DTW_distance_matrix(day_subset_df, 4, 0)
distance_matrix

In [None]:
%%time
# Same call on get_DTW_distance_matrix_old, presumably kept to compare its run time with the
# newer helper (this cell is timed as well).
# NOTE: this rebinds distance_matrix, so when the cells run in order the clustering below
# uses the result of the *old* implementation.
distance_matrix = get_DTW_distance_matrix_old(day_subset_df, 4, 0)
distance_matrix

In [None]:
%%time
# Cluster the sampled days into 50 clusters from the precomputed distance matrix.
# Returns the per-day cluster labels and the cluster centers (used as medoids further down).
# random_state is fixed, presumably to make the clustering reproducible — confirm in cluster_KMedoids.
labels, centers = cluster_KMedoids(day_subset_df, distance_matrix, nb_of_clusters = 50, random_state = 10)

## Show the clustering

In [None]:
# Plot the sampled days together with their cluster labels (project helper).
daily_clustering_chart(day_subset_df, labels)

## Calculate the clustering of profiles based on this
The main idea when calculating the distance between two profiles x and y is the following:
match every day of x to a day of y and sum the distances between the matched days, where
- the distance between two days that are in the same cluster is 0 
- the distance between two days that are in different clusters is the distance between the cluster medoids

This is an assignment problem! Days that can be matched within the same cluster cost 0, so we can remove those matches up front.  
For the remaining days we build a cost matrix that describes the cost of matching a day from profile 1 to a day from profile 2 (the distance between the cluster medoids)  
and let scipy solve the problem for us :D 

In [None]:
def similarity_based_on_daily_clustering(profile1, profile2, labels, centers):
    """Return the distance between two profiles based on the clustering of their days.

    The days of both profiles are matched one-to-one. Matching two days of the
    same cluster costs 0; matching days of different clusters costs the DTW
    distance between the medoids of those clusters. The cheapest matching is
    found by solving a linear assignment problem.

    Parameters
    ----------
    profile1, profile2
        Keys of the two profiles in the first index level of ``labels``.
    labels
        Cluster label of every day, with the profile on the first index level.
        NOTE(review): assumed to be a Series of integer labels — confirm
        against the clustering helper that produces it.
    centers
        Cluster medoids; label ``k`` is looked up by position as
        ``centers.iloc[k]``.

    Returns
    -------
    float
        Total cost of the optimal matching, ``0.0`` when no days are left to
        match on one of the two sides.

    Notes
    -----
    When the profiles do not have the same number of days the cost matrix is
    rectangular and ``linear_sum_assignment`` only matches as many days as the
    smaller side has; the surplus days of the other profile add no cost.
    """
    idx = pd.IndexSlice

    # number of days per cluster for each profile
    counts1 = labels.loc[idx[profile1], :].value_counts()
    counts2 = labels.loc[idx[profile2], :].value_counts()

    # put them in the same frame, one row per cluster used by either profile
    both_counts = counts1.to_frame('labels1').join(counts2.to_frame('labels2'), how='outer')

    # A cluster used by only one profile has a count of 0 for the other one.
    # Leaving it as NaN would turn the whole row into NaN in the subtraction
    # below and silently drop those days from the matching.
    both_counts = both_counts.fillna(0)

    # remove the days that can be matched inside the same cluster (cost 0)
    unmatched = both_counts.sub(both_counts.min(axis=1), axis=0)

    # after the subtraction a cluster has days left on at most one side
    rows = unmatched.loc[unmatched['labels1'] > 0, 'labels1']
    columns = unmatched.loc[unmatched['labels2'] > 0, 'labels2']
    if rows.empty or columns.empty:
        return 0.0

    # DTW distance between the medoids, computed once per pair of clusters
    medoid_distances = np.empty((len(rows), len(columns)))
    for i, row_cluster in enumerate(rows.index):
        medoid1 = centers.iloc[row_cluster].to_numpy()
        for j, column_cluster in enumerate(columns.index):
            medoid2 = centers.iloc[column_cluster].to_numpy()
            medoid_distances[i, j] = dtw.distance(medoid1, medoid2, window=4, psi=0, use_c=True)

    # repeat every cluster once per unmatched day to get the day-level cost matrix
    row_positions = [i for i, times in enumerate(rows) for _ in range(int(times))]
    column_positions = [j for j, times in enumerate(columns) for _ in range(int(times))]
    cost_array = medoid_distances[row_positions][:, column_positions]

    row_ind, col_ind = linear_sum_assignment(cost_array)
    return cost_array[row_ind, col_ind].sum()

In [None]:
# Pairwise distance between all profiles, derived from the clustering of their days.
# NOTE: this rebinds distance_matrix, which until here held the day-level DTW distances.
all_profiles = labels.index.get_level_values(0).unique()
distance_matrix = np.zeros((len(all_profiles), len(all_profiles)))

# Every unordered pair of profiles is evaluated once; the value is written to both
# triangles and the diagonal keeps its initial 0.
for (idx1, meterID1), (idx2, meterID2) in itertools.combinations(enumerate(all_profiles), 2):
    distance = similarity_based_on_daily_clustering(meterID1, meterID2, labels, centers)
    distance_matrix[idx1, idx2] = distance
    distance_matrix[idx2, idx1] = distance

# Label rows and columns with the profile ids.
distance_matrix = pd.DataFrame(distance_matrix, index=all_profiles, columns=all_profiles)
distance_matrix;

In [None]:

# Cluster the profiles from the profile-level distance matrix.
# Alternative kept for reference: k-medoids on the same matrix.
# full_labels, full_centers = cluster_KMedoids(data_df, distance_matrix.to_numpy(), 5)
# NOTE(review): 8 is presumably the number of clusters, and a *distance* matrix is passed;
# confirm that cluster_spectral expects distances rather than an affinity matrix.
full_labels = cluster_spectral(data_df, distance_matrix.to_numpy(), 8)
# Drop the second index level so the labels are indexed by the first level only.
full_labels.index = full_labels.index.droplevel(1)
# Persist the result; the relative path resolves against the current working directory.
full_labels.to_csv('full_clustering.csv')


In [None]:
# Number of profiles per cluster, shown as a one-column table indexed by cluster.
full_labels.value_counts().to_frame('#profiles').rename_axis(index = 'cluster')

In [None]:
# Visualise the profiles together with their cluster assignment (project helper).
show_clustering(data_df, full_labels.to_frame('labels'))