In [None]:
from util import *
from visualisation import *
import numpy as np 
import pandas as pd
import altair as alt
alt.renderers.enable('png')
import itertools
import dtaidistance.dtw as dtw
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import adjusted_rand_score
from cluster_visualisation import *
from profile_similarity import *
alt.data_transformers.disable_max_rows()
from tqdm import tqdm
%load_ext autoreload
%autoreload 2

In [None]:
# Load the info and data frames via the project helper read_data.
# NOTE(review): nrows = 2000 presumably caps how many rows are read - confirm in read_data.
info_df, data_df = read_data(nrows = 2000)
# Only keep the last year of each profile: duplicated(keep = 'last') flags every
# occurrence of a level-0 index value except the final one, so the negation
# selects exactly one (the last) row per level-0 value.
# NOTE(review): assumes index level 0 is the profile id and rows are ordered chronologically - confirm.
last_of_each_profile = ~data_df.index.get_level_values(0).duplicated(keep = 'last')
data_df = data_df.loc[last_of_each_profile]
# Draw 100 rows at random; the fixed random_state makes the subset reproducible.
data_df = data_df.sample(100, random_state = 2134)
print(f"There are {len(data_df)} profiles")
data_df.head()

In [None]:
# Derive day_df from the sampled profiles with the project helper get_day_df and preview it.
# NOTE(review): later cells select rows of day_df by profile id on index level 0;
# presumably each row holds one day of one profile - confirm against get_day_df.
day_df = get_day_df(data_df)
day_df.head()

### Try to find some profiles of interest to use 
- smartmeter_1013: pretty high consumption starting from 7 pm
- smartmeter_120: pretty high consumption before 7 pm
- smartmeter_1250, smartmeter_1254: very low consumption (but with some variance)
- smartmeter_1107: constant low consumption
- smartmeter_1275: almost constant, higher consumption (around 0.5)
- smartmeter_300, smartmeter_234: should be similar, with a peak before 12 and a similar base usage

In [None]:
# Unique values of the first index level of day_df (used below as profile ids).
profiles = day_df.index.get_level_values(0).unique()
# Hand an arbitrary slice of ten profiles to show_profiles for visual inspection.
show_profiles(day_df.loc[profiles[20:30]])

### Cluster the days and compute the medoid distances needed for the similarity metric

In [None]:
%%time
# Number of clusters requested from the k-medoids clustering of day_df.
NB_OF_CLUSTERS = 1000
# Cluster the rows of day_df; the fixed random_state keeps the result reproducible.
# Returns the per-row labels and the cluster centers.
labels_kmedoids, centers_kmedoids = cluster_KMedoids(day_df, nb_of_clusters = NB_OF_CLUSTERS, random_state = 10)
# Pairwise DTW distances between the cluster centers (window = 4, psi = 0, njobs = 2).
medoid_distances = get_DTW_distance_matrix(centers_kmedoids.to_numpy(), window = 4, psi = 0, njobs = 2)


### Calculate the distance matrix

In [None]:
%%time
# Build the profile-to-profile distance matrix from the per-day cluster labels
# and the distances between the cluster medoids, then display it.
distance_matrix = profile_distance_matrix_based_on_daily_clustering(labels_kmedoids, medoid_distances)
distance_matrix

In [None]:
# Distances from smartmeter_10 to every other profile, sorted ascending,
# with the profile's own entry removed.
distances_to_10 = distance_matrix.loc['smartmeter_10'].sort_values().drop('smartmeter_10')


In [None]:
# Compare smartmeter_10 with the profile that has the smallest distance to it.
# NOTE(review): test_pair is defined in a later cell of this notebook; on a fresh
# kernel (Restart & Run All) this call raises NameError unless that cell runs first.
test_pair('smartmeter_10', distances_to_10.index[0])

In [None]:
%%time
# Same pair as the previous cell, but with the distance computed from the DTW
# distances between the profiles' own days instead of between cluster medoids.
# NOTE(review): test_pair_no_approx is defined in a later cell of this notebook; on a
# fresh kernel (Restart & Run All) this call raises NameError unless that cell runs first.
test_pair_no_approx('smartmeter_10', distances_to_10.index[0])

## Test some pairs of profiles separately

In [None]:
# Display the labels returned by cluster_KMedoids.
labels_kmedoids

In [None]:
def test_pair_no_approx(profile1, profile2):
    """Chart two profiles side by side, titled with their distance computed without the medoid approximation.

    Instead of the precomputed distances between cluster medoids, this variant
    computes the DTW distance matrix between the rows of ``day_df`` that belong
    to the two profiles and passes that to
    ``profile_distance_based_on_daily_clustering``.

    Parameters
    ----------
    profile1, profile2 :
        Profile ids used to select rows from ``day_df`` and ``labels_kmedoids``.

    Returns
    -------
    The horizontally concatenated chart of both profiles with a shared y scale
    and the title ``distance=<value>``.
    """
    # DTW distances between all selected rows of the two profiles.
    pair_days = day_df.loc[[profile1, profile2]].to_numpy()
    day_distances = get_DTW_distance_matrix(pair_days, window=4, psi=0, njobs=2)

    # Replace the cluster labels by consecutive integers 0..n-1, one per row.
    # NOTE(review): presumably this makes every label index its own row of
    # day_distances; it relies on labels_kmedoids and day_df returning the rows
    # in the same order - confirm.
    day_labels = labels_kmedoids.loc[[profile1, profile2]].copy()
    day_labels.loc[:] = range(len(day_labels))

    # One panel per profile, built from the long-form version of its rows.
    panels = []
    for profile in (profile1, profile2):
        long_form = day_df.loc[profile].stack().to_frame('value').reset_index()
        panels.append(all_day_chart(long_form).properties(title=profile))
    side_by_side = panels[0] | panels[1]

    distance = profile_distance_based_on_daily_clustering(profile1, profile2, day_labels, day_distances)
    return side_by_side.resolve_scale(y='shared').properties(title=f"distance={distance}")
                                              
def test_pair(profile1, profile2):
    """Chart two profiles side by side, titled with their cluster-based distance.

    The distance comes from ``profile_distance_based_on_daily_clustering`` using
    the notebook-level ``labels_kmedoids`` and ``medoid_distances``. It is
    printed as well as put in the chart title.

    Parameters
    ----------
    profile1, profile2 :
        Profile ids used to select rows from ``day_df``.

    Returns
    -------
    The horizontally concatenated chart of both profiles with a shared y scale
    and the title ``distance=<value>``.
    """
    def day_panel(profile):
        # Long-form frame of the profile's rows, charted and titled with its id.
        long_form = day_df.loc[profile].stack().to_frame('value').reset_index()
        return all_day_chart(long_form).properties(title=profile)

    side_by_side = day_panel(profile1) | day_panel(profile2)
    distance = profile_distance_based_on_daily_clustering(profile1, profile2, labels_kmedoids, medoid_distances)
    print(f"distance={distance}")
    return side_by_side.resolve_scale(y='shared').properties(title=f"distance={distance}")

In [None]:
# Compare the two profiles noted earlier as having high consumption from 7 pm
# (smartmeter_1013) and before 7 pm (smartmeter_120).
test_pair('smartmeter_1013', 'smartmeter_120')

In [None]:
# Compare the two profiles noted earlier as expected to be similar.
test_pair('smartmeter_300', 'smartmeter_234')