# Code to cluster based on a given dissimilarity metric

In [None]:
import numpy as np 
import pandas as pd
import altair as alt
from pathlib import Path
from sklearn_extra.cluster import KMedoids
alt.data_transformers.disable_max_rows()

In [None]:
# Automatically reload external modules whenever their code changes, so the kernel does not have to be restarted.
%load_ext autoreload
%autoreload 2

In [None]:
from distance_measures import distance_scaled_principle_directions, get_svd

In [None]:
# Root folders: the preprocessed input data and the error-detection results.
PRE_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/preprocessed/combined')
RESULT_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/error_detection')
# Create the result folder (and any missing parents) if it does not exist yet.
RESULT_PATH.mkdir(parents=True, exist_ok=True, mode=0o770)

# Files located in the result folder.
result_path = RESULT_PATH.joinpath('cumulative_value_detection.csv')
zero_path = RESULT_PATH.joinpath('zero_interval_is_error.csv')
interval_path = RESULT_PATH.joinpath('intervals_with_info.csv')

# Files located in the preprocessed-data folder.
info_path = PRE_PATH.joinpath('info.csv')
data_path = PRE_PATH.joinpath('data.csv')

# Stop early when one of the required input files is missing.
assert all(p.exists() for p in (info_path, data_path, zero_path)), 'These paths should exist'

In [None]:
# Read only the first 100 rows of each table; the first two CSV columns form the row index.
info_df = pd.read_csv(info_path, nrows=100, index_col=[0, 1])
data_df = pd.read_csv(data_path, nrows=100, index_col=[0, 1])
# Parse the column labels as timestamps and name the column axis 'timestamp'.
data_df.columns = pd.to_datetime(data_df.columns).rename('timestamp')

In [None]:
# Restrict the experiments below to the first 15 rows (profiles) of the data.
data_subset = data_df.head(15)

In [None]:
%%time
from sklearn.metrics import pairwise_distances

# Both distance matrices are computed on the same float array of the selected profiles.
profile_matrix = data_subset.to_numpy(dtype='float')

# Pairwise distances with components_to_use = 10 passed on to the custom metric.
# NaN values are let through input validation ('allow-nan'); 4 parallel jobs are used.
distances = pairwise_distances(
    profile_matrix,
    metric=distance_scaled_principle_directions,
    force_all_finite='allow-nan',
    n_jobs=4,
    components_to_use=10,
)
# Same computation with components_to_use = None
# (presumably "use all components" - confirm in distance_measures).
distances1 = pairwise_distances(
    profile_matrix,
    metric=distance_scaled_principle_directions,
    force_all_finite='allow-nan',
    n_jobs=4,
    components_to_use=None,
)

In [None]:
# Wrap both distance matrices in DataFrames so they can be styled for display.
distance_df, distance1_df = (pd.DataFrame(matrix) for matrix in (distances, distances1))

In [None]:

# Show the distance matrix (components_to_use = 10) as a heat map.
# Positional slicing is used on purpose: a label slice such as .loc[0:15]
# includes its end label and would select 16 rows/columns instead of 15.
# axis=None colours all cells on one common scale.
distance_df.iloc[:15, :15].style.background_gradient(axis = None)

In [None]:

# Show the distance matrix (components_to_use = None) as a heat map.
# Positional slicing is used on purpose: a label slice such as .loc[0:15]
# includes its end label and would select 16 rows/columns instead of 15.
# axis=None colours all cells on one common scale.
distance1_df.iloc[:15, :15].style.background_gradient(axis = None)

In [None]:
# Combined size of every pair among the first 15 profiles:
# entry (i, j) = total of profile i + total of profile j (the diagonal is twice
# the profile's own total).
# Computed with NumPy broadcasting instead of a per-pair Python callback, which
# also avoids returning length-1 arrays where a scalar distance is expected.
profile_totals = data_df.iloc[:15].sum(axis=1).to_numpy(dtype=float)
size_of_profiles = profile_totals[:, np.newaxis] + profile_totals[np.newaxis, :]
# axis=0 colours every column on its own scale.
pd.DataFrame(size_of_profiles).style.background_gradient(axis = 0)

In [None]:
# Row positions (in data_df) of two profile pairs to inspect visually.
similar = (10, 11)
dissimilar = (5, 6)


In [None]:
def plot_profiles(i1, i2): 
    """Draw two profiles from ``data_df`` as lines in one interactive chart.

    Parameters
    ----------
    i1, i2 : int
        Row positions (as used by ``iloc``) of the profiles in ``data_df``.

    Returns
    -------
    An Altair line chart (1000 x 200) with one coloured line per profile.
    """
    # Select the two rows by position and reshape them to long format:
    # one row per (profile, timestamp) with the measurement in 'value'.
    long_df = data_df.iloc[[i1, i2]].stack().to_frame('value').reset_index()
    # Label every profile as "<meterID>,<year>"; this relies on the row index
    # levels of data_df being named 'meterID' and 'year'.
    long_df['ID'] = long_df['meterID'].astype('str') + ',' + long_df['year'].astype('str')

    line_chart = alt.Chart(long_df, width=1000, height=200).mark_line()
    line_chart = line_chart.encode(x='timestamp:T', y='value', color='ID:N')
    # Zooming and panning are enabled on the x axis only (y is not bound).
    return line_chart.interactive(bind_y=False)
# Display the two profiles whose row positions are stored in `dissimilar`.
plot_profiles(*dissimilar)

In [None]:
# Display the two profiles whose row positions are stored in `similar`.
plot_profiles(*similar)