In [None]:
import altair as alt
import numpy as np
import pandas as pd
from pathlib import Path
import itertools
import datetime
import random
from sklearn.decomposition import NMF
idx = pd.IndexSlice
alt.data_transformers.disable_max_rows()
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
from sklearn.neighbors import LocalOutlierFactor
from sklearn.cluster import SpectralClustering

In [None]:
# Enable IPython's autoreload extension so that code imported from external modules is
# reloaded automatically when it changes (without having to restart the kernel).
%load_ext autoreload
%autoreload 2

In [None]:
# Input directory and the two input files read by this notebook.
PRE_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/preprocessed/combined')
info_path = PRE_PATH.joinpath('info.csv')
data_path = PRE_PATH.joinpath('data.csv')

# Result directory (created if it does not exist yet) and the files located in it.
RESULT_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/error_detection')
RESULT_PATH.mkdir(mode=0o770, parents=True, exist_ok=True)
result_path = RESULT_PATH.joinpath('cumulative_value_detection.csv')
zero_path = RESULT_PATH.joinpath('zero_interval_is_error.csv')
interval_path = RESULT_PATH.joinpath('intervals_with_info.csv')

# Fail early when one of the required files is missing.
assert all(path.exists() for path in (info_path, data_path, zero_path)), 'These paths should exist'

# Util methods

In [None]:
def vec_dt_replace(series, year=None, month=None, day=None):
    """Rebuild timestamps with the year, month and/or day replaced.

    Parameters
    ----------
    series : pandas.DatetimeIndex or datetime-typed pandas.Series
        Timestamps to rebuild. A Series is read through its ``.dt`` accessor,
        any other object must expose ``year``, ``month``, ``day``, ``hour`` and
        ``minute`` directly.
    year, month, day : optional
        Replacement for the corresponding component. ``None`` (the default)
        keeps the component found in ``series``.

    Returns
    -------
    pandas.Series
        Timestamps assembled by ``pd.to_datetime`` from the (possibly replaced)
        year/month/day and the original hour and minute. Only these five
        components are passed on, so seconds and smaller units are not kept.
    """
    # A Series only exposes the datetime components through `.dt`; an index has them directly.
    parts = series.dt if isinstance(series, pd.Series) else series
    return pd.to_datetime({
        'year': parts.year if year is None else year,
        'month': parts.month if month is None else month,
        'day': parts.day if day is None else day,
        'hour': parts.hour,
        'minute': parts.minute,
    })

In [None]:
def add_date(series):
    """Parse values formatted as ``%H:%M:%S`` into datetimes with ``pd.to_datetime``."""
    time_format = '%H:%M:%S'
    parsed = pd.to_datetime(series, format=time_format)
    return parsed

In [None]:
def get_DTW_distance_matrix(series, window, psi):
    """Return the pairwise DTW distance matrix of ``series``.

    ``series`` is wrapped with ``SeriesContainer.wrap`` and passed to
    ``dtw.distance_matrix_fast`` together with ``window`` and ``psi``; the
    matrix is requested with ``compact=False``.

    NOTE(review): ``dtw`` and ``SeriesContainer`` are not imported in the cells
    visible here -- presumably they come from dtaidistance; confirm they are in
    scope before this function is called.
    """
    wrapped = SeriesContainer.wrap(series)
    return dtw.distance_matrix_fast(wrapped, window=window, psi=psi, compact=False)

In [None]:
def cluster_timeseries_k_mediods_DTW(series, n_clusters, window, psi):
    """Cluster time series with k-medoids on a precomputed DTW distance matrix.

    Parameters
    ----------
    series
        The time series to cluster; wrapped with ``SeriesContainer.wrap``.
    n_clusters : int
        Number of clusters (medoids) to fit.
    window, psi
        Passed unchanged to ``dtw.distance_matrix_fast``.

    Returns
    -------
    labels
        ``labels_`` of the fitted ``KMedoids`` instance.
    centers
        The entries of the wrapped ``series`` at ``medoid_indices_``.

    NOTE(review): ``dtw`` and ``SeriesContainer`` are not imported in the cells
    visible here -- presumably they come from dtaidistance; confirm.
    """
    # from LinkageTree implementation in dtaidistance
    series = SeriesContainer.wrap(series)
    distance_matrix = dtw.distance_matrix_fast(series, window=window, psi=psi, compact=False)
    # The clusterer needs a full symmetric matrix. Depending on the dtaidistance version
    # the result is either upper triangular (inf elsewhere) or already full, so only the
    # strict upper triangle is kept before mirroring: adding the transpose of an already
    # symmetric matrix would double every distance.
    finite_matrix = np.where(np.isinf(distance_matrix), 0, distance_matrix)
    upper_triangle = np.triu(finite_matrix, k=1)
    full_matrix = upper_triangle + upper_triangle.T
    clusterer = KMedoids(n_clusters, metric='precomputed', init='k-medoids++', max_iter=1000)

    clusterer.fit(full_matrix)
    labels = clusterer.labels_
    centers = series[clusterer.medoid_indices_]
    return labels, centers

## Read the data

In [None]:
# Both CSV files use their first two columns as a (two-level) row index.
info_df = pd.read_csv(info_path, index_col=[0, 1])
data_df = pd.read_csv(data_path, index_col=[0, 1])

# Parse the column labels of the data frame as timestamps and name that axis.
data_df.columns = pd.to_datetime(data_df.columns).rename('timestamp')


In [None]:
# Data source and year of the subset selected below.
DATA_SOURCE = 'EandisVREG'
YEAR = 2016

# get the right subset based on the info df:
# rows whose second index level equals YEAR and whose data_source equals DATA_SOURCE
info16_df = info_df.loc[idx[:, YEAR], :]
info16_df = info16_df[info16_df.data_source == DATA_SOURCE]

# read the corresponding data profiles
data16_df = data_df.loc[info16_df.index, :]


In [None]:
# Data set used by the cells below: the complete data frame (not the data16_df subset).
data_to_use = data_df

## Plot a single profile

In [None]:
# Positional index of the profile (row) to plot.
IDX_TO_PLOT = 10

# Long-format frame of the selected row with a 'Consumption' value column.
data_to_plot = (
    data_to_use
    .iloc[IDX_TO_PLOT]
    .to_frame('Consumption')
    .reset_index()
)

alt.Chart(data_to_plot, width=1500, height=300).mark_line().encode(
    x='timestamp:T',
    y='Consumption',
)

# Group by month and sum

In [None]:
# Sum the values of every profile per calendar month, then divide each profile (row) by
# the sum of its monthly values. The frame is transposed before grouping because
# `groupby(..., axis=1)` is deprecated in recent pandas versions; transposing back
# restores the original orientation (profiles as rows, months as columns).
normalized_monthly_consumption = (
    data_to_use.T
    .groupby(data_to_use.columns.month)
    .sum()
    .T
    .apply(lambda x: x / np.sum(x), axis=1, raw=True)
)

# Normalized monthly values of the profile selected by IDX_TO_PLOT.
alt.Chart(
    normalized_monthly_consumption
    .iloc[IDX_TO_PLOT]
    .to_frame('normalized monthly consumption')
    .reset_index()
).mark_line().encode(
    x=alt.X('timestamp', title='Month'),
    y=alt.Y('normalized monthly consumption', title='Normalized Monthly Consumption'),
)

## Look at outliers
These are profiles that inject more than they consume, so normalizing does not really work for them.

In [None]:
# Raw profiles whose normalized monthly consumption is below -20 in at least one month.
outliers = data_to_use.loc[
    normalized_monthly_consumption
    .loc[lambda frame: frame.lt(-20).any(axis=1)]
    .index
]

alt.Chart(
    outliers.stack().to_frame('value').reset_index()
).mark_line().encode(
    x='timestamp:T',
    y='value',
    color='meterID',
).interactive()

In [None]:
# Row-wise sum: the total of all values of each outlier profile.
outliers.sum(axis = 1)

### Look at the non outliers

In [None]:
# Normalized monthly consumption of every profile that is not in `outliers`.
alt.Chart(
    normalized_monthly_consumption
    .loc[normalized_monthly_consumption.index.difference(outliers.index)]
    .stack()
    .to_frame('value')
    .reset_index()
).mark_line().encode(
    x='timestamp',
    y='value',
    color='meterID',
)

## Cluster these using k-means

In [None]:
# Keep the profiles without missing months and without any negative normalized value.
data_to_cluster = normalized_monthly_consumption.dropna(axis=0)
data_to_cluster = data_to_cluster[(data_to_cluster >= 0).all(axis=1)]

# Seeded k-means with 15 clusters (alternative tried earlier: KMedoids(100)).
clusterer = KMeans(n_clusters=15, random_state=1213)
labels = pd.Series(
    clusterer.fit_predict(data_to_cluster.to_numpy()),
    index=data_to_cluster.index,
    name='cluster_idx',
)

# Number of profiles per cluster, and the clusters holding at least 10 profiles.
cluster_sizes = labels.value_counts().rename_axis('cluster_idx').rename('size')
big_clusters = cluster_sizes[cluster_sizes >= 10]

# Long-format frame with the cluster label and a string id built from the first two index levels.
vis_df = data_to_cluster.stack().to_frame('value').join(labels)
vis_df['id'] = (
    vis_df.index.get_level_values(0).astype('str')
    + ','
    + vis_df.index.get_level_values(1).astype('str')
)
# Optional restriction to the big clusters (disabled):
# vis_df = vis_df[vis_df.cluster_idx.isin(big_clusters.index)]

# One facet per cluster, one line per profile.
# NOTE(review): `thickness` is documented as a tick-mark property; for line marks
# `strokeWidth` is probably what was intended -- confirm before changing the rendering.
alt.Chart(vis_df.reset_index()).mark_line(
    clip=True, opacity=0.4, thickness=0.2
).encode(
    x='timestamp',
    y=alt.Y('value', scale=alt.Scale(domain=[0, 0.3])),
    color=alt.Color('id:N', legend=None, scale=alt.Scale(scheme='rainbow')),
).facet(
    facet='cluster_idx', columns=5
).resolve_scale(
    y='shared', color='independent'
).resolve_axis(
    y='independent'
).resolve_legend('independent')

In [None]:
# Same faceted plot as for all clusters, restricted to a hand-picked set of cluster indices.
visw_df = vis_df[vis_df.cluster_idx.isin([0, 1, 6, 7, 9, 11, 13])]

alt.Chart(visw_df.reset_index()).mark_line(
    clip=True, opacity=0.4, thickness=0.2
).encode(
    x='timestamp',
    y=alt.Y('value', scale=alt.Scale(domain=[0, 0.3])),
    color=alt.Color('id:N', legend=None, scale=alt.Scale(scheme='rainbow')),
).facet(
    facet='cluster_idx', columns=4
).resolve_scale(
    y='shared', color='independent'
).resolve_axis(
    y='independent'
).resolve_legend('independent')

In [None]:
# Hand-picked cluster indices, presumably grouped by the shape of the cluster's monthly
# curve -- confirm.
# NOTE(review): most of these indices are larger than 14, while the KMeans run in this
# notebook uses 15 clusters (labels 0-14); they look like leftovers from a run with more
# clusters -- verify before relying on them.
concave = [18, 34, 46, 74, 85, 91, 95, 97, 98, 106, 148]
straight = [0, 43, 56, 109, 114,134]
convex = [33,130]

In [None]:
# Per cluster and month: mean, minimum and maximum of the normalized monthly consumption.
vis2_df = (
    normalized_monthly_consumption
    .join(labels.astype('int'))
    .groupby('cluster_idx')
    .agg(['mean', 'min', 'max'])
    .stack(level=0)
    .reset_index()
    .rename(columns={'level_1': 'month'})
)
vis2_df

In [None]:
# Per cluster and month: mean, minimum and maximum of the normalized monthly consumption,
# restricted to the clusters listed in big_clusters.
vis2_df = (
    normalized_monthly_consumption
    .join(labels.astype('int'))
    .groupby('cluster_idx')
    .agg(['mean', 'min', 'max'])
    .stack(level=0)
    .reset_index()
    .rename(columns={'level_1': 'month'})
    .loc[lambda frame: frame.cluster_idx.isin(big_clusters.index)]
)
def cluster_summary(clusters_to_show):
    """Build two side-by-side summary charts for the given cluster indices.

    Reads the notebook-level ``vis2_df`` and keeps the rows whose ``cluster_idx``
    is in ``clusters_to_show``. The left chart is an area between the ``min``
    and ``max`` columns per month, the right chart a line of the ``mean`` column
    per month; both are coloured by cluster and share their y scale.

    Returns the horizontally concatenated Altair chart.
    """
    selected = vis2_df[vis2_df.cluster_idx.isin(clusters_to_show)]
    base = alt.Chart(selected, width=600, height=600)

    # min-max band per month
    band = base.mark_area(opacity=0.2).encode(
        x='month:O',
        y='min:Q',
        y2='max:Q',
        color='cluster_idx:N',
    )
    # mean line per month
    mean_line = base.mark_line().encode(
        x='month:O',
        y='mean:Q',
        color='cluster_idx:N',
    )
    return (band | mean_line).resolve_scale(y='shared')
# Summary charts for the cluster indices listed in `concave`.
cluster_summary(concave)

In [None]:
# Summary charts for the cluster indices listed in `convex`.
cluster_summary(convex)

In [None]:
# Summary charts for the cluster indices listed in `straight`.
cluster_summary(straight)

In [None]:
# Big clusters that are not part of any hand-picked shape group.
# `Index.drop` raises a KeyError when a label is not present in the index; the hand-picked
# lists contain indices that are not guaranteed to be among the big clusters, so missing
# labels are ignored instead of aborting the cell.
others = big_clusters.index.drop(convex + concave + straight, errors='ignore').to_list()
cluster_summary(others)

### Remove outliers and cluster

In [None]:
# Local Outlier Factor on the profiles without missing months.
outlier_detector = LocalOutlierFactor(n_neighbors=50, contamination=0.2)
outlier_labels = outlier_detector.fit_predict(
    normalized_monthly_consumption.dropna(axis=0).to_numpy()
)

# Boolean flag per profile: True where the detector returned the label -1.
outliers = pd.Series(
    outlier_labels == -1,
    index=normalized_monthly_consumption.dropna().index,
    name='outlier_labels',
)

# Long-format frame with the outlier flag, plotted in one facet per flag value.
vis_df = (
    normalized_monthly_consumption
    .dropna()
    .stack()
    .to_frame('value')
    .join(outliers)
    .reset_index()
)
alt.Chart(vis_df).mark_line().encode(
    x='timestamp',
    y='value',
    color='meterID',
    facet='outlier_labels',
).resolve_scale(color='independent').resolve_legend('independent')

In [None]:
# Profiles without missing months that were not flagged in `outliers`.
data = normalized_monthly_consumption.dropna().loc[~outliers]


In [None]:
# k-means with 25 clusters on the profiles that remain after outlier removal.
# The seed is fixed (same value as the first k-means run in this notebook) so that the
# cluster assignment is reproducible when the notebook is re-run.
clusterer = KMeans(n_clusters=25, random_state=1213)
# clusterer = KMedoids(15)
labels = clusterer.fit_predict(data.to_numpy())
labels = pd.Series(labels, index=data.index, name='cluster_idx')

# Long-format frame with the cluster label, plotted in one facet per cluster.
vis_df = data.stack().to_frame('value').join(labels)
alt.Chart(vis_df.reset_index()).mark_line().encode(
    x='timestamp',
    y='value',
    color='meterID',
    facet='cluster_idx',
).resolve_scale(
    y='independent', color='independent'
).resolve_axis(
    y='independent'
).resolve_legend('independent')