# POC: non-negative matrix factorization similarity
The idea is to fit an NMF on the daily load curves of two profiles and then compare the two profiles through their learned component representations.
## Some observations


In [None]:
import altair as alt
import numpy as np
import pandas as pd
from pathlib import Path
import itertools
import datetime
import random
from sklearn.decomposition import NMF
# Shorthand for slicing a MultiIndex with .loc, e.g. df.loc[idx[:, 2016], :]
idx = pd.IndexSlice
# Lift Altair's row limit so that larger data frames can be plotted
alt.data_transformers.disable_max_rows()

In [None]:
# IPython magics: automatically reload code from external modules when it changes,
# so edits take effect without having to restart the kernel
%load_ext autoreload
%autoreload 2

In [None]:
# Directory the input files (info.csv, data.csv) are read from
PRE_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/preprocessed/combined')
# Directory holding error-detection results; created here if it does not exist yet
RESULT_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/error_detection')
RESULT_PATH.mkdir(mode = 0o770, parents = True, exist_ok=True)
# Individual files inside the two directories above
result_path = RESULT_PATH / 'cumulative_value_detection.csv' 
zero_path = RESULT_PATH / 'zero_interval_is_error.csv'
interval_path = RESULT_PATH /'intervals_with_info.csv'
info_path = PRE_PATH/'info.csv'
data_path = PRE_PATH/'data.csv'
# Fail early when an input is missing (result_path and interval_path are not checked)
assert info_path.exists() and data_path.exists() and zero_path.exists(), 'These paths should exist'

In [None]:
def vec_dt_replace(series, year=None, month=None, day=None):
    """Replace date fields of many timestamps at once.

    Args:
        series: object exposing ``year``, ``month``, ``day``, ``hour`` and
            ``minute`` attributes (e.g. a ``DatetimeIndex``).
        year: replacement year, or ``None`` to keep the year of ``series``.
        month: replacement month, or ``None`` to keep the month of ``series``.
        day: replacement day, or ``None`` to keep the day of ``series``.

    Returns:
        The timestamps assembled by ``pd.to_datetime`` from the resulting
        year/month/day/hour/minute fields. Only hours and minutes are carried
        over from the time of day; seconds and smaller units are not passed on.
    """
    fields = {
        'year': year if year is not None else series.year,
        'month': month if month is not None else series.month,
        'day': day if day is not None else series.day,
        'hour': series.hour,
        'minute': series.minute,
    }
    return pd.to_datetime(fields)

In [None]:
def add_date(series):
    """Turn ``HH:MM:SS`` time-of-day values into full timestamps.

    The values are parsed with the format ``%H:%M:%S``; because that format
    carries no date, every result falls on the parser's default date
    (1900-01-01).

    Args:
        series: time-of-day values accepted by ``pd.to_datetime``.

    Returns:
        The parsed timestamps, as returned by ``pd.to_datetime``.
    """
    time_format = '%H:%M:%S'
    return pd.to_datetime(series, format=time_format)

## Read the data

In [None]:
# Only the first NROWS rows of the data file are read (presumably to keep this POC fast)
NROWS = 100
# Both files use their first two columns as a two-level row index
info_df = pd.read_csv(info_path, index_col = [0,1])
data_df = pd.read_csv(data_path, index_col = [0,1], nrows = NROWS)
# The data column labels are timestamps: parse them and name the column axis
data_df.columns = pd.to_datetime(data_df.columns)
data_df.columns.name = 'timestamp'


In [None]:
DATA_SOURCE = 'EandisVREG'
YEAR = 2016
# Restrict the info table to YEAR (second index level) and to DATA_SOURCE,
# using the constants above instead of repeating their values
info16_df = info_df.loc[idx[:, YEAR], :]
info16_df = info16_df[info16_df.data_source == DATA_SOURCE]

# Only 2015 data.
# NOTE(review): the measurements are taken from 2015 while the info table is
# filtered on YEAR (2016) - confirm which year is intended.
data15_df = data_df.loc[idx[:, 2015], :]

In [None]:
# Display the selected subset
data15_df

## Choose two profiles

In [None]:
# 4 and 5 are similar
# Positional row numbers (within data15_df) of the two profiles to compare
IDX1 = 4
IDX2 = 5
# Long format: one row per (profile, timestamp) holding the measured value
profile_to_check = data15_df.iloc[[IDX1, IDX2]].stack().to_frame('value')
# Index level 2 holds the timestamps; split them into time of day and calendar date
profile_to_check['time'] = profile_to_check.index.get_level_values(2).time
profile_to_check['date'] = profile_to_check.index.get_level_values(2).date
# Day matrix: one row per (meterID, date), one column per time of day
profile_matrix = pd.pivot_table(profile_to_check, index = ['meterID', 'date'], columns = 'time', values = 'value')
# Drop every day that has at least one missing value
profile_matrix = profile_matrix.dropna(axis = 0)
profile_matrix

## The profiles plotted

In [None]:
# Line chart of both complete profiles, one colour per meter.
# interactive(bind_y = False): the interactive scale binding is applied to the x axis only
alt.Chart(profile_to_check.reset_index()[['timestamp', 'value', 'meterID']], title = 'full profile', width = 2000, height = 500).mark_line().encode(
    x = 'timestamp:T', 
    y = 'value:Q', 
    color = 'meterID:N'
    
).interactive(bind_y = False)

## Try non-negative matrix decomposition

In [None]:
# Plain array of the day matrix: rows are (meterID, date) days, columns are times of day
matrix = profile_matrix.dropna(axis = 0).to_numpy()
# alpha controls regularization (pushing weights towards 0 such that representations become sparse)
# NOTE(review): newer scikit-learn releases replace `alpha` with `alpha_W` / `alpha_H`
# - confirm against the installed version
NB_OF_COMPONENTS = 1
decomposer = NMF(NB_OF_COMPONENTS, max_iter = 10000, alpha = 0.1, l1_ratio = 1).fit(matrix)
# Learned components, labelled with the time-of-day columns of profile_matrix
components = decomposer.components_
components_df = pd.DataFrame(components, columns = profile_matrix.columns)
components_df.index.name = 'component_nb'
components_df

## Learned components

In [None]:
# Long format for plotting: one row per (component, time of day)
component_vis = components_df.stack().to_frame('value').reset_index()
# Convert the time-of-day labels to timestamps with the shared notebook helper
# (same '%H:%M:%S' parsing as before, now without duplicating it here)
component_vis['time'] = add_date(component_vis['time'])

# All learned components are drawn, so the title no longer claims a fixed count of 5
alt.Chart(component_vis, title = 'learned components').mark_line().encode(
    x = 'time:T',
    y = 'value:Q',
    color = 'component_nb:N'
)

## Component representation for all days 

In [None]:
# Express every day as component weights. The model was fitted on a plain array, so it
# is given a plain array here as well; profile_matrix had its NaN rows dropped when it
# was built, so the transformed rows line up with its index.
# Afterwards the index levels are swapped and sorted, which orders the rows by date first.
representation_matrix = (
    pd.DataFrame(
        decomposer.transform(profile_matrix.to_numpy()),
        index = profile_matrix.index,
    )
    .swaplevel(0, 1, axis = 0)
    .sort_index()
)
# Show the first 62 rows (presumably 31 days x 2 meters - confirm) as a heat map
representation_matrix[0:62].style.background_gradient(cmap = 'Blues')

In [None]:
# Mean weight of every component per meter, computed over all of that meter's days
component_mean = representation_matrix.reset_index().groupby('meterID')[list(range(0,NB_OF_COMPONENTS))].mean()
component_mean

In [None]:
# Euclidean distance between the mean component weights of the first two meters
np.linalg.norm(component_mean.iloc[0].to_numpy() - component_mean.iloc[1].to_numpy())

## Plot a day and the reconstruction

In [None]:
# Positional row of profile_matrix (one meter/day combination) to inspect
IDX = 82
transformed = decomposer.transform(profile_matrix.iloc[[IDX]].to_numpy())
# NOTE: despite its name, `original` holds the day as reconstructed from its weights
original = decomposer.inverse_transform(transformed)
day = profile_matrix.iloc[IDX].to_frame('original_value')
day['after_reconstruction'] = original[0]
# Long format: one row per (time of day, curve type)
day = day.stack().reset_index()
day.columns = ['time', 'type', 'value']
day['time'] = add_date(day['time'])
print(transformed)
orig_chart = alt.Chart(day).mark_line().encode(
    x = 'time:T',
    y = 'value:Q',
    color = 'type:N'
)

# Contribution of every component to this day: component curve scaled by its weight
vis_df = components_df.stack().to_frame('value').reset_index()
vis_df['weight'] = transformed[0, vis_df.component_nb]
vis_df['time'] = add_date(vis_df['time'])
# Keep only components that are active for this day. The explicit copy makes the
# assignment below write to an independent frame rather than to a slice of the
# unfiltered one, which avoids pandas' SettingWithCopyWarning.
vis_df = vis_df[vis_df.weight > 0].copy()
vis_df['value'] = vis_df['value']*vis_df['weight']

component_chart = alt.Chart(vis_df).mark_line(opacity = 0.5).encode(
    x = 'time:T',
    y = 'value',
    color = 'component_nb:N'
)

# Overlay the original/reconstructed day with the weighted components
orig_chart + component_chart