# POC: non-negative matrix factorization
Let's check whether non-negative matrix factorization (NMF) could help us; the data-quality problems in the data are ignored for now.

## Some observations
- Seems to give good results for repetitive profiles (IDX 0 for example) 
- Does not work for profiles with injection: NMF cannot handle negative values.


In [None]:
import altair as alt
import numpy as np
import pandas as pd
from pathlib import Path
import itertools
import datetime
import random
from sklearn.decomposition import NMF
idx = pd.IndexSlice
alt.data_transformers.disable_max_rows()
from dtaidistance import clustering, dtw
from dtaidistance.util import SeriesContainer
from dtaidistance.clustering.kmeans import KMeans
from dtaidistance.dtw_barycenter import dba
from sklearn_extra.cluster import KMedoids
from sklearn.neighbors import LocalOutlierFactor

In [None]:
# this reloads code from external modules automatically if it is changed (without having to restart the kernel)
%load_ext autoreload
%autoreload 2

In [None]:
# Directory holding the preprocessed input data and directory for error-detection results.
PRE_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/preprocessed/combined')
RESULT_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/error_detection')
# Create the result directory (and any missing parents); no error if it already exists.
RESULT_PATH.mkdir(mode = 0o770, parents = True, exist_ok=True)
result_path = RESULT_PATH / 'cumulative_value_detection.csv' 
zero_path = RESULT_PATH / 'zero_interval_is_error.csv'
interval_path = RESULT_PATH /'intervals_with_info.csv'
info_path = PRE_PATH/'info.csv'
data_path = PRE_PATH/'data.csv'
# Fail early when an input file is missing. Only info, data and zero_path are checked;
# result_path and interval_path are not verified here.
assert info_path.exists() and data_path.exists() and zero_path.exists(), 'These paths should exist'

# Util methods

In [None]:
def vec_dt_replace(series, year=None, month=None, day=None):
    """Rebuild datetimes from their fields, optionally overriding year/month/day.

    Args:
        series: Object exposing ``year``, ``month``, ``day``, ``hour`` and
            ``minute`` attributes (for example a ``pd.DatetimeIndex``).
        year: Replacement year, or ``None`` to keep the year of ``series``.
        month: Replacement month, or ``None`` to keep the month of ``series``.
        day: Replacement day, or ``None`` to keep the day of ``series``.

    Returns:
        The result of ``pd.to_datetime`` on the assembled fields. Only hour and
        minute are carried over from the time of day, so seconds and smaller
        units are not preserved.
    """
    overrides = {'year': year, 'month': month, 'day': day}
    # Take the field from the input unless the caller supplied a replacement.
    parts = {
        field: getattr(series, field) if replacement is None else replacement
        for field, replacement in overrides.items()
    }
    parts['hour'] = series.hour
    parts['minute'] = series.minute
    return pd.to_datetime(parts)

In [None]:
def add_date(series):
    """Parse ``HH:MM:SS`` clock values into pandas datetimes.

    Args:
        series: Value(s) accepted by ``pd.to_datetime`` that follow the
            ``%H:%M:%S`` format.

    Returns:
        The parsed datetime(s). Because the format contains no date fields,
        the date part is whatever default pandas fills in (1900-01-01).
    """
    clock_format = '%H:%M:%S'
    parsed = pd.to_datetime(series, format=clock_format)
    return parsed

In [None]:
def get_DTW_distance_matrix(series, window, psi):
    """Compute a full, symmetric, finite pairwise DTW distance matrix.

    ``dtw.distance_matrix_fast`` can fill only the upper triangle and leave
    the remaining entries at ``inf``. Consumers that take a precomputed
    distance matrix (such as ``LocalOutlierFactor(metric='precomputed')``)
    need a complete matrix, so the result is symmetrised before returning.

    Args:
        series: Collection of time series; wrapped with ``SeriesContainer.wrap``.
        window: DTW ``window`` argument, forwarded unchanged.
        psi: DTW ``psi`` argument, forwarded unchanged.

    Returns:
        Square numpy array with ``inf`` entries replaced by 0 and both
        triangles holding the same distances.
    """
    series = SeriesContainer.wrap(series)
    distance_matrix = dtw.distance_matrix_fast(series, window=window, psi=psi, compact=False)
    # Entries that were never computed are inf; zero them so they can be mirrored.
    # NOTE: a genuinely infinite distance would also become 0 here.
    distance_matrix = np.where(np.isinf(distance_matrix), 0, distance_matrix)
    # Element-wise maximum with the transpose mirrors an upper-triangular
    # matrix and leaves an already symmetric matrix unchanged (no doubling).
    return np.maximum(distance_matrix, distance_matrix.T)

In [None]:
def cluster_timeseries_k_mediods_DTW(series, n_clusters, window, psi):
    """Cluster time series with k-medoids on pairwise DTW distances.

    Args:
        series: Collection of time series; wrapped with ``SeriesContainer.wrap``.
        n_clusters: Number of medoids to fit.
        window: DTW ``window`` argument, forwarded unchanged.
        psi: DTW ``psi`` argument, forwarded unchanged.

    Returns:
        Tuple ``(labels, centers)``: the cluster label of every series and the
        series selected as medoids.
    """
    # from LinkageTree implementation in dtaidistance
    wrapped = SeriesContainer.wrap(series)
    upper = dtw.distance_matrix_fast(wrapped, window=window, psi=psi, compact=False)
    # The matrix is upper triangular (inf elsewhere), but the clusterer needs a
    # full matrix: zero the inf entries and add the transpose. Adding is safe
    # because the diagonal is 0.
    upper[np.isinf(upper)] = 0
    symmetric = upper + upper.T

    model = KMedoids(n_clusters, metric='precomputed', init='k-medoids++', max_iter=1000)
    model.fit(symmetric)
    return model.labels_, wrapped[model.medoid_indices_]

## Read the data

In [None]:
# Both CSV files use their first two columns as a two-level row index.
info_df = pd.read_csv(info_path, index_col=[0, 1])
data_df = pd.read_csv(data_path, index_col=[0, 1])
# The column labels of the data file are timestamps stored as text:
# parse them and name the column axis in one go.
data_df.columns = pd.to_datetime(data_df.columns).rename('timestamp')


In [None]:
# Selection criteria for the profiles to analyse; change them here only.
DATA_SOURCE = 'EandisVREG'
YEAR = 2016
# get the right subset based on the info df:
# second index level equal to YEAR, and rows coming from DATA_SOURCE
info16_df = info_df.loc[idx[:, YEAR], :]
info16_df = info16_df[info16_df.data_source == DATA_SOURCE]

# read the corresponding data profiles (same index entries as the info subset)
data16_df = data_df.loc[info16_df.index, :]


## Choose a profile

In [None]:
# Positional index of the profile (row of data16_df) to inspect.
IDX = 4
# Long format: one row per timestamp with the measured value, plus helper
# columns holding the time of day and the calendar date (as text).
profile_to_check = (
    data16_df.iloc[IDX]
    .to_frame('value')
    .assign(
        time=lambda frame: add_date(frame.index.time),
        date=lambda frame: frame.index.date.astype('str'),
    )
)

## Show the profile

In [None]:
# Line chart of the complete selected profile against its timestamps.
# interactive(bind_y = False): the interactive scale binding is not applied to the y axis.
alt.Chart(profile_to_check.reset_index()[['timestamp', 'value']], title = 'full profile', width = 2000, height = 200).mark_line().encode(
    x = 'timestamp:T', 
    y = 'value:Q', 
    
).interactive(bind_y = False)

In [None]:
# All days drawn over a common time-of-day axis, one colored line per date.
# The chart is only stored here; it is displayed next to the average in a later cell.
all_day_chart = alt.Chart(profile_to_check, title = 'All days').mark_line().encode(
    x = 'time:T',
    y = 'value', 
    color = 'date'
)

In [None]:
# Day-by-time matrix: one row per date, one column per time of day.
# Days with at least one missing reading are dropped.
profile_matrix = (
    profile_to_check
    .pivot_table(index='date', columns='time', values='value')
    .dropna(axis=0)
)

In [None]:
# Average all daily profiles with dtaidistance's DTW barycenter averaging (dba).
# The second argument (initial average) is None — presumably the library then
# picks its own starting point; confirm against the dtaidistance docs.
series = SeriesContainer.wrap(profile_matrix.to_numpy())
barycentric_average = pd.Series(dba(series,None), index = profile_matrix.columns)
barycentric_average

In [None]:
# Line chart of the barycentric average over the time of day (stored, shown in the next cell).
average_day_chart = alt.Chart(barycentric_average.to_frame('value').reset_index(), title = 'barycentric average').mark_line().encode(
    x = 'time:T', 
    y = 'value:Q'
)

In [None]:
# Show all days and their barycentric average side by side on shared x and y scales.
(all_day_chart | average_day_chart).resolve_scale(x = 'shared', y= 'shared')

In [None]:
# Flag anomalous days with a Local Outlier Factor on pairwise DTW distances
# (window 8, psi 0) between the daily profiles.
distance_matrix = get_DTW_distance_matrix(profile_matrix.to_numpy(), 8, 0)
# metric = 'precomputed': the detector receives distances instead of feature vectors.
# NOTE(review): contamination = 0.50 presumably makes the detector flag about half of the days — confirm this is intended.
detector = LocalOutlierFactor(n_neighbors = 20, metric = 'precomputed', contamination = 0.50)
labels = detector.fit_predict(distance_matrix)
# A label of -1 is treated as "anomaly"; one boolean per day, indexed by date.
anomaly_labels = pd.Series(labels == -1, index = profile_matrix.index, name = 'anomaly')
# Back to long format (date, time, value) with the anomaly flag joined on for plotting.
profile_vis_cluster = profile_matrix.stack().to_frame('value').join(anomaly_labels).reset_index()
profile_vis_cluster.time = add_date(profile_vis_cluster.time)
# One facet row per anomaly flag value, one colored line per day.
alt.Chart(profile_vis_cluster.reset_index()).mark_line().encode(
    x = 'time:T', 
    y = 'value', 
    color = 'date'
).facet(row = 'anomaly')

## Cluster using kmedoids

In [None]:
# Number of clusters used by both the k-medoids and the k-means cells below.
NB_OF_CLUSTERS = 15

In [None]:

# Cluster the daily profiles with k-medoids on DTW distances (window 8, psi 0).
# The anomaly filter is currently disabled, so every day is clustered.
# non_anomalies = profile_matrix[~anomaly_labels]
non_anomalies = profile_matrix
labels, centers = cluster_timeseries_k_mediods_DTW(non_anomalies.to_numpy(), NB_OF_CLUSTERS, 8, 0)
# Attach the date index to the labels and the time-of-day columns to the medoids.
labels = pd.Series(labels, index = non_anomalies.index, name = 'labels')
centers = pd.DataFrame(centers, columns = non_anomalies.columns)
# Long format for plotting; after reset_index the medoid number is in column 'level_0'.
centers_vis = centers.stack().to_frame('value').reset_index()
profile_vis_cluster = non_anomalies.stack().to_frame('value').join(labels).reset_index()
profile_vis_cluster.time = add_date(profile_vis_cluster.time)

# Medoid chart is only stored here; it is displayed in a later cell.
medoid_chart = alt.Chart(centers_vis).mark_line().encode(
    x = 'time:T', 
    y = 'value', 
    color = 'level_0:N'
)
# Cell output: the days of every cluster, one facet column per cluster label.
alt.Chart(profile_vis_cluster.reset_index()).mark_line().encode(
    x = 'time:T', 
    y = 'value', 
    color = 'date'
).facet(column = 'labels') 

## Cluster using kmeans and barycentric averaging


In [None]:
# Cluster the daily profiles with dtaidistance's KMeans, using the same DTW
# options (window 8, psi 0) as the k-medoids cell.
# The anomaly filter is currently disabled, so every day is clustered.
# non_anomalies = profile_matrix[~anomaly_labels]
non_anomalies = profile_matrix
series = SeriesContainer.wrap(non_anomalies.to_numpy())
model = KMeans(k=NB_OF_CLUSTERS, max_it=10, max_dba_it=10, dists_options={"window": 8,'psi':0})
label_dict, performed_it = model.fit(series, use_c=True, use_parallel=True)


# label_dict is iterated as {cluster key: positions of the member series};
# turn it into one label per day, indexed by date.
# NOTE(review): no dtype is given for the empty Series, so the resulting dtype
# depends on the installed pandas version.
labels = pd.Series(index = non_anomalies.index, name = 'labels')
for key,value in label_dict.items(): 
    labels.iloc[list(value)] = key
# Long format with the cluster label joined on, one facet column per cluster.
profile_vis_cluster = non_anomalies.stack().to_frame('value').join(labels).reset_index()
profile_vis_cluster.time = add_date(profile_vis_cluster.time)
alt.Chart(profile_vis_cluster.reset_index()).mark_line().encode(
    x = 'time:T', 
    y = 'value', 
    color = 'date'
).facet(column = 'labels')

In [None]:
# Cluster means of the k-means model, one row per cluster and one column per time of day.
centroids = pd.DataFrame(model.means, columns = add_date(non_anomalies.columns))
# Long format for plotting; after reset_index the cluster number is in column 'level_0'.
centroid_vis = centroids.stack().to_frame('value').reset_index()
bary_chart = alt.Chart(centroid_vis).mark_line().encode(
    x = 'time:T', 
    y= 'value:Q', 
    color = 'level_0:O'
)
# Compare the k-means barycenters (left) with the k-medoids medoids (right).
bary_chart.properties(title = 'barycenter') | medoid_chart.properties(title = 'medoid')

# decompose using NMF

In [None]:

# Shift every medoid so that its minimum becomes 0. The plotting and
# reconstruction cells further down read `transformed_centers`, so it has to be
# defined here even though the decomposition itself is fitted on the daily profiles.
transformed_centers = centers.apply(lambda x: x - np.min(x), axis = 1, raw = True)
# Fit on the days that were not flagged as anomalies.
matrix = profile_matrix[~anomaly_labels].to_numpy()
# Alternative input: fit on the shifted medoids instead of the daily profiles.
# matrix = transformed_centers.to_numpy()
# alpha controls regularization (pushing weights towards 0 such that representations become sparse)
# NOTE(review): `alpha` and `regularization` were replaced by `alpha_W` / `alpha_H`
# in newer scikit-learn releases — confirm against the installed version.
decomposer = NMF(8, max_iter = 10000, alpha = 0.1, l1_ratio = 1, regularization = 'both').fit(matrix)
print('reconstruction error', decomposer.reconstruction_err_)
# One row per learned component, one column per time of day.
components = decomposer.components_
components_df = pd.DataFrame(components, columns = profile_matrix.columns)
components_df.index.name = 'component_nb'
components_df;

## Show the components

In [None]:

# Long-format frames for plotting: the min-shifted medoids and the NMF components.
transformed_centers_vis = transformed_centers.stack().to_frame('value').reset_index()

component_vis = components_df.stack().to_frame('value').reset_index()
component_vis['time'] = pd.to_datetime(component_vis['time'], format='%H:%M:%S')

# NOTE: this rebinds `medoid_chart`; from here on it shows the min-shifted medoids.
medoid_chart = alt.Chart(transformed_centers_vis).mark_line().encode(
    x = 'time:T', 
    y = 'value', 
    color = 'level_0:N'
)
# Every component in components_df is drawn, so the title reports the actual
# number of components instead of a fixed count.
alt.Chart(component_vis, title = f'{len(components_df)} NMF components').mark_line().encode(
    x = 'time:T', 
    y = 'value:Q', 
    color= 'component_nb:N'
) | medoid_chart

In [None]:
# Weights of every fitted day on each NMF component (one row per row of `matrix`,
# one column per component). The rows carry a default integer index, not the dates.
representation_matrix = pd.DataFrame(decomposer.transform(matrix)).sort_index()
# Show the first 62 rows with a per-row (axis = 1) blue gradient.
representation_matrix[0:62].style.background_gradient(cmap = 'Blues', axis = 1)

# show reconstruction + used components

In [None]:
# Reconstruct one min-shifted medoid from its NMF weights and show which
# weighted components make up that reconstruction.
IDX = 12
transformed = decomposer.transform(transformed_centers.iloc[[IDX]].to_numpy())
# NOTE: despite its name, `original` holds the NMF reconstruction of the row.
original = decomposer.inverse_transform(transformed)
day = transformed_centers.iloc[IDX].to_frame('original_value')
day['after_reconstruction'] = original[0]
# Long format: one row per (time, curve type) pair.
day = day.stack().reset_index()
day.columns = ['time', 'type', 'value']
day.time = add_date(day.time)
print(transformed)
orig_chart = alt.Chart(day).mark_line().encode(
    x = 'time:T', 
    y = 'value:Q', 
    color = 'type:N'
)

# Attach to every component value the weight that component got for this medoid.
vis_df = components_df.stack().to_frame('value').reset_index()
vis_df['weight'] = transformed[0, vis_df.component_nb]
vis_df.time = add_date(vis_df.time)
# Keep only the components that are actually used. The explicit copy makes the
# assignment below write to an independent frame instead of a slice of the
# unfiltered one (avoids pandas' SettingWithCopyWarning).
vis_df = vis_df[vis_df.weight > 0].copy()
vis_df['value'] = vis_df['value']*vis_df['weight']

component_chart = alt.Chart(vis_df).mark_line().encode(
    x = 'time:T', 
    y = 'value', 
    size  = 'weight',
    opacity = 'weight',
    color = 'component_nb:N'
)

# Overlay the original/reconstructed curves with the weighted components.
orig_chart + component_chart

# Show the reconstruction vs real profile

In [None]:
# Reconstruct one (un-shifted) medoid through the NMF model and compare it with the original.
# NOTE(review): `centers` is not min-shifted here, so it may contain negative
# values, which the notes at the top of this notebook say NMF cannot handle — confirm.
IDX = 8
transformed = decomposer.transform(centers.iloc[[IDX]].to_numpy())
# Despite its name, `original` holds the reconstruction obtained from the weights.
original = decomposer.inverse_transform(transformed)
day = centers.iloc[IDX].to_frame('original_value')
day['after_reconstruction'] = original[0]
# Long format: one row per (time, curve type) pair.
day = day.stack().reset_index()
day.columns = ['time', 'type', 'value']
day.time = add_date(day.time)
print(transformed)
alt.Chart(day).mark_line().encode(
    x = 'time:T', 
    y = 'value:Q', 
    color = 'type:N'
)