# POC: non-negative matrix factorization
Let's check whether non-negative matrix factorization (NMF) could help us. For now, we ignore the known quality problems in the data.

## Some observations
- Seems to give good results for repetitive profiles (IDX 0 for example) 
- Does not work for profiles with injection: NMF cannot handle negative values.


In [None]:
import altair as alt
import numpy as np
import pandas as pd
from pathlib import Path
import itertools
import datetime
import random
from sklearn.decomposition import NMF
idx = pd.IndexSlice
alt.data_transformers.disable_max_rows()
from dtaidistance import clustering, dtw
from dtaidistance.util import SeriesContainer
from dtaidistance.clustering.kmeans import KMeans
from dtaidistance.dtw_barycenter import dba
from sklearn_extra.cluster import KMedoids
from sklearn.neighbors import LocalOutlierFactor

In [None]:
# this reloads code from external modules automatically if it is changed (without having to restart the kernel)
%load_ext autoreload
%autoreload 2

In [None]:
# Inputs: the preprocessed, combined profile data.
PRE_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/preprocessed/combined')
info_path = PRE_PATH.joinpath('info.csv')
data_path = PRE_PATH.joinpath('data.csv')

# Error-detection files; the directory is created (with parents) when missing.
RESULT_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/error_detection')
RESULT_PATH.mkdir(mode=0o770, parents=True, exist_ok=True)
result_path = RESULT_PATH.joinpath('cumulative_value_detection.csv')
zero_path = RESULT_PATH.joinpath('zero_interval_is_error.csv')
interval_path = RESULT_PATH.joinpath('intervals_with_info.csv')

# Stop right away when one of the required files is absent.
assert all(
    required.exists() for required in (info_path, data_path, zero_path)
), 'These paths should exist'

# Util methods

In [None]:
def vec_dt_replace(series, year=None, month=None, day=None):
    """Replace date fields of many datetimes at once.

    Args:
        series: Object exposing ``year``, ``month``, ``day``, ``hour`` and
            ``minute`` attributes (for example a ``DatetimeIndex``).
        year: Replacement year; ``None`` keeps the year found in ``series``.
        month: Replacement month; ``None`` keeps the month found in ``series``.
        day: Replacement day; ``None`` keeps the day found in ``series``.

    Returns:
        The result of ``pd.to_datetime`` on the assembled fields. Only hour
        and minute are taken over from the time of day, so seconds and
        smaller units are not carried over.
    """
    overrides = {'year': year, 'month': month, 'day': day}
    parts = {
        field: getattr(series, field) if override is None else override
        for field, override in overrides.items()
    }
    # The time of day always comes from the input itself.
    parts['hour'] = series.hour
    parts['minute'] = series.minute
    return pd.to_datetime(parts)

In [None]:
def add_date(series):
    """Parse ``HH:MM:SS`` time-of-day values into pandas datetimes.

    Args:
        series: Value(s) accepted by ``pd.to_datetime`` that match the
            ``'%H:%M:%S'`` format.

    Returns:
        The parsed datetimes. The format carries no date, so every result
        shares the default date pandas fills in (1900-01-01).
    """
    time_format = '%H:%M:%S'
    return pd.to_datetime(series, format=time_format)

In [None]:
def get_DTW_distance_matrix(series, window, psi):
    """Compute the pairwise DTW distance matrix of a collection of series.

    Args:
        series: Collection of time series; wrapped in a ``SeriesContainer``
            before it is handed to dtaidistance.
        window: Forwarded as ``window`` to ``dtw.distance_matrix_fast``.
        psi: Forwarded as ``psi`` to ``dtw.distance_matrix_fast``.

    Returns:
        The non-compact matrix returned by ``dtw.distance_matrix_fast``,
        passed through unchanged.
    """
    wrapped = SeriesContainer.wrap(series)
    return dtw.distance_matrix_fast(wrapped, window=window, psi=psi, compact=False)

In [None]:
def cluster_timeseries_k_mediods_DTW(series, n_clusters, window, psi):
    """Cluster time series with k-medoids on pairwise DTW distances.

    Args:
        series: Collection of time series; wrapped in a ``SeriesContainer``.
        n_clusters: Number of clusters (medoids) to fit.
        window: Forwarded as ``window`` to ``dtw.distance_matrix_fast``.
        psi: Forwarded as ``psi`` to ``dtw.distance_matrix_fast``.

    Returns:
        A ``(labels, centers)`` tuple: the cluster label of every series and
        the series selected as medoids.
    """
    # from LinkageTree implementation in dtaidistance
    series = SeriesContainer.wrap(series)
    distance_matrix = dtw.distance_matrix_fast(series, window=window, psi=psi, compact=False)
    # Entries the library did not fill are reported as inf; zero them so the
    # matrix can be mirrored below.
    distance_matrix[np.isinf(distance_matrix)] = 0
    # The clusterer needs a full symmetric matrix. Depending on the
    # dtaidistance version the result is either upper triangular or already
    # symmetric. Distances are non-negative, so the elementwise maximum with
    # the transpose mirrors a triangular matrix exactly like adding the
    # transpose would, without doubling the values of a symmetric one.
    full_matrix = np.maximum(distance_matrix, distance_matrix.T)
    clusterer = KMedoids(n_clusters, metric='precomputed', init='k-medoids++', max_iter=1000)

    clusterer.fit(full_matrix)
    labels = clusterer.labels_
    centers = series[clusterer.medoid_indices_]
    return labels, centers

## Read the data

In [None]:
# Read only the first 100 rows of each file; the first two CSV columns
# become the row MultiIndex.
info_df = pd.read_csv(info_path, index_col=[0, 1], nrows=100)
data_df = pd.read_csv(data_path, index_col=[0, 1], nrows=100)
# The column labels of the data file are parsed as timestamps and the
# column axis is named accordingly.
data_df.columns = pd.to_datetime(data_df.columns).rename('timestamp')


## Choose a profile

In [None]:
# Row position of the profile to analyse
# (IDX = 5 is the first example in the ppt).
IDX = 10
# Long frame indexed by timestamp with the measured value, plus the time of
# day (as a datetime, via add_date) and the calendar date (as a string).
profile_to_check = data_df.iloc[IDX].to_frame('value').assign(
    time=lambda frame: add_date(frame.index.time),
    date=lambda frame: frame.index.date.astype('str'),
)

## Show the profile

In [None]:
# Line chart of the selected profile over its whole time range.
# The chart is interactive, with the y-axis left unbound (bind_y = False).
alt.Chart(profile_to_check.reset_index()[['timestamp', 'value']], title = 'full profile', width = 1400, height = 400).mark_line().encode(
    x = 'timestamp:T', 
    y = 'value:Q', 
    
).interactive(bind_y = False)

In [None]:
# Overlay all days of the profile on a shared time-of-day axis:
# one faint line per date.
all_day_chart = alt.Chart(profile_to_check, title = 'All days').mark_line(opacity=0.2, thickness=0.1).encode(
    x = 'time:T',
    y = 'value', 
    # one colour per date from the rainbow scheme; the legend is suppressed
    color = alt.Color('date', scale = alt.Scale(scheme = 'rainbow'), legend = None)
)
# display the chart; it is reused in a later side-by-side comparison
all_day_chart

In [None]:
# Day-by-time matrix: one row per date, one column per time of day.
# Dates with any missing value are dropped.
profile_matrix = profile_to_check.pivot_table(
    values='value', index='date', columns='time'
).dropna(axis='index')

# decompose using NMF

In [None]:

# Plain array view of the day-by-time matrix that is fed to NMF.
matrix = profile_matrix.to_numpy()
# alpha controls regularization (pushing weights towards 0 such that representations become sparse)
# NOTE(review): `alpha` and `regularization` appear to be deprecated since
# scikit-learn 1.0 and removed in 1.2 (superseded by `alpha_W`/`alpha_H`) —
# confirm against the installed version before re-running.
# Fit 8 non-negative components.
decomposer = NMF(8, max_iter = 100000, alpha = 0.1, l1_ratio = 0.9, regularization = 'both').fit(matrix)
print('reconstruction error', decomposer.reconstruction_err_)
# One row per component, one column per time of day (same columns as profile_matrix).
components = decomposer.components_
components_df = pd.DataFrame(components, columns = profile_matrix.columns)
components_df.index.name = 'component_nb'
# display the component table
components_df

In [None]:
# Sum of every component over all times of day (one value per component).
components_df.sum(axis = 1)

In [None]:
# Activation of every component for every day, rows sorted by date.
representation_matrix = pd.DataFrame(
    decomposer.transform(profile_matrix.dropna()),
    index=profile_matrix.index,
).sort_index()

# Median activation per component, ignoring the days on which the component
# is not used at all (exact zeros are masked out).
median_coefficients = (
    representation_matrix.replace(0, np.nan).median(axis=0).rename('scale')
)

# Share of the summed activations that each component accounts for.
total_weight = representation_matrix.sum(axis=0)
total_weight = total_weight.div(total_weight.sum()).rename('total_weight')
total_weight
# median_coefficients

In [None]:
# Long format: one row per (component, time of day) with the component value.
component_vis = components_df.stack().to_frame('value').reset_index()
# Same time parsing as the other cells, through the shared helper.
component_vis['time'] = add_date(component_vis['time'])
# Attach the median activation of each component and scale the component by it.
component_vis = component_vis.join(median_coefficients, on = 'component_nb')
component_vis['scaled_value'] = component_vis['value'] * component_vis['scale']

# All fitted components are drawn (unscaled), so the title no longer claims
# that only the first 5 are shown.
component_chart = alt.Chart(component_vis, title = 'NMF components').mark_line().encode(
    x = 'time:T',
    y = 'value:Q',
    color= alt.Color('component_nb:N', legend = None)
)
# Components next to the raw daily profiles, each with its own colour scale.
(component_chart | all_day_chart).resolve_scale(color = 'independent')

In [None]:
# Show the component chart on its own, with its title set to 'NMF components'.
component_chart.properties(title = 'NMF components')

In [None]:
# Add each component's share of the total activation to the long-format frame.
component_weight_vis = component_vis.join(total_weight, on = 'component_nb')
# Plot the components scaled by their median activation, coloured by that share.
component_weight_chart = alt.Chart(component_weight_vis, title = 'NMF components with importance').mark_line().encode(
    x = 'time:T', 
    y = 'scaled_value:Q', 
#     size = alt.Size('total_weight',scale = alt.Scale(range = [0.5,5])),
    color= alt.Color('total_weight:Q', title = 'Importance')
)
component_weight_chart

In [None]:
# First 62 rows of the activation table, with a blue colour gradient applied along axis 1.
representation_matrix[0:62].style.background_gradient(cmap = 'Blues', axis = 1)

# show reconstruction + used components

In [None]:
# Row position (in profile_matrix) of the day to reconstruct.
# NOTE: this reuses the name IDX, which an earlier cell uses for the profile position.
IDX = 140
# Project the day onto the components and map it back to a daily profile.
transformed = decomposer.transform(profile_matrix.iloc[[IDX]].to_numpy())
original = decomposer.inverse_transform(transformed)
# Long format with the measured and the reconstructed value per time of day.
day = profile_matrix.iloc[IDX].to_frame('original_value')
day['after_reconstruction'] = original[0]
day = day.stack().reset_index()
day.columns = ['time', 'type', 'value']
day.time = add_date(day.time)
print(transformed)
orig_chart = alt.Chart(day).mark_line(thickness = 2).encode(
    x = 'time:T',
    y = 'value:Q',
    color = 'type:N'
)

# Every component weighted by its activation for this day.
vis_df = components_df.stack().to_frame('value').reset_index()
vis_df['weight'] = transformed[0, vis_df.component_nb]
vis_df.time = add_date(vis_df.time)
# Keep only components that contribute noticeably. The explicit copy makes
# the assignment below act on an independent frame, so pandas does not emit
# a SettingWithCopyWarning.
vis_df = vis_df[vis_df.weight > 0.01].copy()
vis_df['value'] = vis_df['value']*vis_df['weight']

component_chart = alt.Chart(vis_df).mark_line().encode(
    x = 'time:T',
    y = 'value',
    color = alt.Color('component_nb:N', scale = alt.Scale(scheme = 'rainbow'))
)

# Measured vs reconstructed day next to the contributing components, on a shared y scale.
(orig_chart | component_chart).resolve_legend('independent').resolve_scale(color = 'independent', y = 'shared')

# Show the reconstruction vs real profile

In [None]:
# Row position (in `centers`) of the profile to reconstruct.
IDX = 8
# NOTE(review): `centers` is not assigned at notebook level anywhere in the
# visible cells (it only exists as a local inside
# cluster_timeseries_k_mediods_DTW), so this cell raises a NameError unless
# it is defined elsewhere — confirm where it should come from.
# Project the selected row onto the NMF components and map it back.
transformed = decomposer.transform(centers.iloc[[IDX]].to_numpy())
original = decomposer.inverse_transform(transformed)
# Long format with the input value and the reconstructed value per time of day.
day = centers.iloc[IDX].to_frame('original_value')
day['after_reconstruction'] = original[0]
day = day.stack().reset_index()
day.columns = ['time', 'type', 'value']
day.time = add_date(day.time)
print(transformed)
# One line per type (original_value / after_reconstruction).
alt.Chart(day).mark_line().encode(
    x = 'time:T', 
    y = 'value:Q', 
    color = 'type:N'
)