In [None]:
import altair as alt
import numpy as np
import pandas as pd
from pathlib import Path
import itertools
import datetime
import random
from sklearn.decomposition import NMF
idx = pd.IndexSlice
alt.data_transformers.disable_max_rows()
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
from sklearn.neighbors import LocalOutlierFactor

In [None]:
# Automatically reload code from external modules whenever it changes,
# so edits take effect without having to restart the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Input directory (preprocessed data) and output directory (error detection results).
PRE_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/preprocessed/combined')
RESULT_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/error_detection')
# Create the result directory (rwx for owner and group) when it does not exist yet.
RESULT_PATH.mkdir(parents=True, exist_ok=True, mode=0o770)

# Files located in the result directory.
result_path = RESULT_PATH.joinpath('cumulative_value_detection.csv')
zero_path = RESULT_PATH.joinpath('zero_interval_is_error.csv')
interval_path = RESULT_PATH.joinpath('intervals_with_info.csv')

# Files located in the preprocessed-data directory.
info_path = PRE_PATH.joinpath('info.csv')
data_path = PRE_PATH.joinpath('data.csv')

# Fail early when one of the required input files is missing.
assert all(required.exists() for required in (info_path, data_path, zero_path)), 'These paths should exist'

# Util methods

In [None]:
def vec_dt_replace(series, year=None, month=None, day=None):
    """Vectorised replacement of date components.

    New timestamps are assembled from the ``year``, ``month``, ``day``,
    ``hour`` and ``minute`` attributes of ``series``. Any of ``year``,
    ``month`` or ``day`` that is passed explicitly replaces the corresponding
    attribute. Only hour and minute are carried over for the time of day.

    Parameters
    ----------
    series : object exposing ``year``/``month``/``day``/``hour``/``minute``
        attributes directly (e.g. a ``pd.DatetimeIndex``).
    year, month, day : optional
        Replacement values; ``None`` keeps the value found in ``series``.

    Returns
    -------
    The result of ``pd.to_datetime`` applied to the assembled components.
    """
    components = {
        'year': year if year is not None else series.year,
        'month': month if month is not None else series.month,
        'day': day if day is not None else series.day,
        'hour': series.hour,
        'minute': series.minute,
    }
    return pd.to_datetime(components)

In [None]:
def add_date(series):
    """Parse ``HH:MM:SS`` time strings into datetimes.

    The format contains no date part, so pandas fills in its default date
    (1900-01-01) for every parsed value.
    """
    time_format = '%H:%M:%S'
    parsed = pd.to_datetime(series, format=time_format)
    return parsed

In [None]:
def get_DTW_distance_matrix(series, window, psi):
    """Return the pairwise DTW distance matrix of ``series``.

    ``series`` is wrapped in a ``SeriesContainer`` and handed to
    ``dtw.distance_matrix_fast`` with the given ``window`` and ``psi`` and
    ``compact=False``.

    NOTE(review): ``SeriesContainer`` and ``dtw`` are not imported in the
    visible part of this notebook (presumably they come from dtaidistance);
    confirm they are in scope before calling this function.
    """
    wrapped = SeriesContainer.wrap(series)
    return dtw.distance_matrix_fast(wrapped, window=window, psi=psi, compact=False)

In [None]:
def cluster_timeseries_k_mediods_DTW(series, n_clusters, window, psi, random_state=None):
    """Cluster time series with k-medoids on a precomputed DTW distance matrix.

    Parameters
    ----------
    series : the time series to cluster (wrapped in a ``SeriesContainer``).
    n_clusters : number of medoids / clusters.
    window, psi : forwarded to the DTW distance computation.
    random_state : optional seed forwarded to ``KMedoids``. The
        ``'k-medoids++'`` initialisation is randomised, so pass a fixed value
        to get reproducible clusters. ``None`` keeps the previous behaviour.

    Returns
    -------
    labels : cluster index of every series.
    centers : the series selected as medoids.

    NOTE(review): ``SeriesContainer`` and ``dtw`` are not imported in the
    visible part of this notebook (presumably they come from dtaidistance);
    confirm they are in scope before calling this function.
    """
    # Approach taken from the LinkageTree implementation in dtaidistance.
    series = SeriesContainer.wrap(series)
    # Reuse the shared helper instead of duplicating the distance computation.
    distance_matrix = get_DTW_distance_matrix(series, window, psi)
    # The matrix is expected to be upper triangular with inf in the unfilled
    # entries, while the clusterer needs a full symmetric matrix.
    distance_matrix[np.isinf(distance_matrix)] = 0
    # Adding the transpose mirrors the upper triangle; this works because the
    # diagonal is 0.
    full_matrix = distance_matrix + distance_matrix.T
    clusterer = KMedoids(
        n_clusters,
        metric='precomputed',
        init='k-medoids++',
        max_iter=1000,
        random_state=random_state,
    )
    clusterer.fit(full_matrix)
    labels = clusterer.labels_
    centers = series[clusterer.medoid_indices_]
    return labels, centers

## Read the data

In [None]:
# Both CSV files use their first two columns as a (two-level) row index.
info_df = pd.read_csv(info_path, index_col=[0, 1])
data_df = pd.read_csv(data_path, index_col=[0, 1])
# The column labels of the data file are parsed as timestamps and the column axis is named accordingly.
data_df.columns = pd.to_datetime(data_df.columns).rename('timestamp')


In [None]:
# Selection criteria for the subset analysed in the rest of the notebook.
DATA_SOURCE = 'EandisVREG'
YEAR = 2016
# Get the right subset based on the info df: rows whose second index level
# equals YEAR and whose data_source equals DATA_SOURCE. The constants are used
# here (instead of repeated literals) so that changing them actually takes effect.
info16_df = info_df.loc[idx[:, YEAR], :]
info16_df = info16_df[info16_df.data_source == DATA_SOURCE]

# Read the corresponding data profiles.
data16_df = data_df.loc[info16_df.index, :]


# Clustering of normalized weekly consumption

In [None]:
# Total consumption per ISO week for every profile. The grouping is done on the
# transposed frame because grouping along the columns (``groupby(..., axis=1)``)
# is deprecated in recent pandas versions.
# NOTE(review): ISO week numbers can assign the first days of January to week
# 52/53 of the previous ISO year, so those days end up at the far end of the
# week axis - confirm this is acceptable.
iso_week = data16_df.columns.isocalendar().week
weekly_consumption = data16_df.T.groupby(iso_week).sum().T
# Divide every profile by its total over all weeks so that each row sums to 1.
# A profile whose total is 0 or NaN results in NaN/inf values.
normalized_monthly_consumption = weekly_consumption.div(
    weekly_consumption.sum(axis=1, skipna=False), axis=0
)
alt.Chart(normalized_monthly_consumption.stack().to_frame('value').reset_index()).mark_line().encode(
    x = 'week', 
    y = 'value', 
    color = 'meterID'
)

## Look at outliers
These are profiles that inject more than they consume, so normalizing by the total does not really work.

In [None]:
# Select the raw profiles that have at least one normalized weekly value below -5.
has_extreme_week = (normalized_monthly_consumption < -5).any(axis=1)
outliers = data16_df.loc[normalized_monthly_consumption[has_extreme_week].index]
# Long format (one row per profile and timestamp) for plotting.
outliers_long_df = outliers.stack().to_frame('value').reset_index()
alt.Chart(outliers_long_df).mark_line().encode(
    x='timestamp:T',
    y='value',
    color='meterID',
).interactive()

In [None]:
# Total over all timestamps for each of the selected outlier profiles.
outliers.sum(axis='columns')

### Look at the non outliers

In [None]:
# Every profile of the normalized frame that was not selected as an outlier.
regular_profiles = normalized_monthly_consumption.index.difference(outliers.index)
regular_long_df = (
    normalized_monthly_consumption
    .loc[regular_profiles]
    .stack()
    .to_frame('value')
    .reset_index()
)
alt.Chart(regular_long_df).mark_line().encode(
    x='week',
    y='value',
    color='meterID',
)

## Cluster these using k-medoids

In [None]:
# Only profiles without missing values can be clustered; compute that subset once
# and use it for fitting, for the label index and for the plot so all three agree.
clusterable = normalized_monthly_consumption.dropna(axis = 0)
# Alternative clusterer: KMeans(100)
clusterer = KMedoids(100)
labels = clusterer.fit_predict(clusterable.to_numpy())
labels = pd.Series(labels, index = clusterable.index, name = 'cluster_idx')
# Build the plotting frame from the clustered rows only, so that every plotted
# value has a cluster label to facet on.
vis_df = clusterable.stack().to_frame('value').join(labels)
alt.Chart(vis_df.reset_index()).mark_line().encode(
    x = 'week', 
    y = 'value', 
    color = 'meterID', 
    facet = 'cluster_idx'
).resolve_scale(y = 'independent').resolve_axis(y = 'independent')

### Remove outliers and cluster

In [None]:
# Profiles without missing values; computed once and reused below.
complete_profiles = normalized_monthly_consumption.dropna(axis = 0)
outlier_detector = LocalOutlierFactor(n_neighbors = 50, contamination = 0.2)
outlier_labels = outlier_detector.fit_predict(complete_profiles.to_numpy())
# A label of -1 is treated as "outlier".
# NOTE: this rebinds `outliers`, which an earlier cell used for a DataFrame of raw
# profiles, to a boolean Series; re-running that earlier cell afterwards gives
# different results.
outliers = pd.Series(outlier_labels == -1, index = complete_profiles.index, name = 'outlier_labels')
vis_df = complete_profiles.stack().to_frame('value').join(outliers).reset_index()
# The legend resolution names the `color` channel explicitly; a positional argument
# would depend on the parameter order of the installed altair schema.
alt.Chart(vis_df).mark_line().encode(
    x = 'week', 
    y = 'value', 
    color = 'meterID', 
    facet = 'outlier_labels'
).resolve_scale(color = 'independent').resolve_legend(color = 'independent')

In [None]:
# Keep the complete (NaN-free) profiles that were not flagged as outliers.
inlier_mask = ~outliers
data = normalized_monthly_consumption.dropna().loc[inlier_mask]


In [None]:
# Cluster the remaining (non-outlier) profiles.
# Alternative clusterer: KMeans(25)
clusterer = KMedoids(25)
labels = clusterer.fit_predict(data.to_numpy())
labels = pd.Series(labels, index = data.index, name = 'cluster_idx')
vis_df = data.stack().to_frame('value').join(labels)
# The legend resolution names the `color` channel explicitly; a positional argument
# would depend on the parameter order of the installed altair schema.
alt.Chart(vis_df.reset_index()).mark_line().encode(
    x = 'week', 
    y = 'value', 
    color = 'meterID', 
    facet = 'cluster_idx'
).resolve_scale(y = 'independent', color = 'independent').resolve_axis(y = 'independent').resolve_legend(color = 'independent')