# Generate annotated profiles

In [None]:
import altair as alt
import numpy as np
import pandas as pd
from pathlib import Path
import itertools
import datetime
import random
import sys
import matplotlib.pyplot as plt
# Shorthand for slicing MultiIndex rows, e.g. info_df.loc[idx[:, 2016], :].
idx = pd.IndexSlice
# Lift Altair's default row limit so the large profile tables used below can be plotted.
alt.data_transformers.disable_max_rows()

In [None]:
# IPython autoreload in mode 2: modules imported from external files are reloaded
# automatically whenever their source changes, so the kernel does not need a restart.
%load_ext autoreload
%autoreload 2

In [None]:
# Put the sibling directory at the front of the import path (relative to the notebook's
# working directory); presumably it contains interval_information.py - verify.
sys.path.insert(0, '../handling_zeros_and_nans')
# NOTE(review): get_interval_df is not referenced in the cells visible here.
from interval_information import get_interval_df

In [None]:
# Directory with the preprocessed input data and directory for the error-detection results.
PRE_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/preprocessed/combined')
RESULT_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/error_detection')
# Create the result directory (group-writable) if it does not exist yet.
RESULT_PATH.mkdir(mode=0o770, parents=True, exist_ok=True)

# Files inside the result directory.
result_path = RESULT_PATH / 'cumulative_value_detection.csv'
zero_path = RESULT_PATH / 'zero_interval_is_error.csv'
interval_path = RESULT_PATH / 'intervals_with_info.csv'

# Files inside the preprocessed-data directory.
info_path = PRE_PATH / 'info.csv'
data_path = PRE_PATH / 'data.csv'

# Fail early and name the missing file(s). An explicit raise (instead of assert) also
# survives `python -O`, which strips assert statements.
missing_paths = [str(path) for path in (info_path, data_path, zero_path) if not path.exists()]
if missing_paths:
    raise FileNotFoundError('These paths should exist: ' + ', '.join(missing_paths))

# Read info and data

In [None]:
# Load the info table and the data table; in both files the first two CSV columns
# form the (two-level) row index.
info_df = pd.read_csv(info_path, index_col=[0, 1])
data_df = pd.read_csv(data_path, index_col=[0, 1])

# The column labels of the data table are timestamps: parse them and name the axis.
data_df.columns = pd.to_datetime(data_df.columns).rename('timestamp')


In [None]:
DATA_SOURCE = 'EandisVREG'
YEAR = 2016
# Get the right subset based on the info df: keep the rows whose second index level
# equals YEAR and whose data_source equals DATA_SOURCE. The constants are used here
# (instead of repeating the literals) so that changing them actually changes the subset.
info16_df = info_df.loc[idx[:, YEAR], :]
info16_df = info16_df[info16_df.data_source == DATA_SOURCE]

# Read the corresponding data profiles (same index labels as the filtered info rows).
data16_df = data_df.loc[info16_df.index, :]


# Select example profiles

In [None]:
# Positional row numbers (within data_df) of the two example profiles.
# NOTE(review): the rows are taken from data_df, not from the filtered data16_df - confirm this is intended.
i1 = 501
i2 = 502

# Work on an independent copy so that relabelling it never touches (or warns about) data_df.
data_df_selected = data_df.iloc[[i1, i2]].copy()
display(data_df_selected)

# Collapse the two-level row index into a single readable label '<level 0>, <level 1>'.
data_df_selected.index = data_df_selected.index.map(lambda o: f'{o[0]}, {o[1]}')
data_df_selected.index.name = 'meter'

# Long form (one row per timestamp and meter) is easier for altair.
data_df_selected_longform = data_df_selected.transpose().reset_index().melt('timestamp')
# melt() names its value column 'value' (singular); renaming 'values' was a silent no-op.
data_df_selected_longform.rename(columns={'value': 'consumption (kWh)'}, inplace=True)

# Both profiles over the full period; zooming/panning is bound to the x axis only.
alt.Chart(data_df_selected_longform).mark_line(opacity=0.5) \
    .encode(x='timestamp', y='consumption (kWh):Q', color='meter') \
    .properties(width=600, height=300).interactive(bind_y=False)

In [None]:
# Re-index the (timestamp x meter) table by day of year and time of day.
# The DatetimeIndex accessors are used directly: mapping `lambda o: o.time` over single
# Timestamps would yield bound methods, because Timestamp.time is a method.
# The helper levels go into a MultiIndex instead of extra rows of data_df_selected, so
# that frame is left untouched and the consumption values keep their numeric dtype.
timestamps = data_df_selected.columns
profiles_by_day_and_time = data_df_selected.transpose()
profiles_by_day_and_time.index = pd.MultiIndex.from_arrays(
    [timestamps.dayofyear, timestamps.time], names=['day_of_year', 'time_of_day'])

# One row per day; the columns are (meter, time_of_day).
data_df_selected_unstacked = profiles_by_day_and_time.unstack()

# Long form: one row per (day_of_year, time_of_day, meter).
data_df_selected_stacked = data_df_selected_unstacked.stack().stack().to_frame('consumption (kWh)').reset_index()

# altair needs a full datetime on the time axis, so the time of day is parsed from its
# 'HH:MM:SS' text; this attaches the (meaningless) default date of 1900.
# assign() returns a new frame, so data_df_selected_stacked itself is not modified.
data_df_selected_stacked_toplot = data_df_selected_stacked.assign(
    time_of_day=pd.to_datetime(data_df_selected_stacked['time_of_day'].astype(str), format='%H:%M:%S'))

## Plot:

In [None]:
# Daily profiles overlaid per meter: x is the time of day, one faint line per day,
# coloured by day of year; one facet row per meter; zoom/pan on the x axis only.
# NOTE(review): Vega-Lite documents `thickness` for tick marks; line width is usually
# set with `strokeWidth` - confirm which one is intended.
(
    alt.Chart(data_df_selected_stacked_toplot)
    .mark_line(opacity=0.2, thickness=0.2)
    .encode(
        x=alt.X('time_of_day:T', timeUnit='hoursminutes'),
        y='consumption (kWh):Q',
        color=alt.Color('day_of_year', scale=alt.Scale(scheme='rainbow')),
    )
    .properties(width=600, height=300)
    .facet(row='meter:N')
    .interactive(bind_y=False)
)

# Apply SVD:
**Info:**
1. The rows of w are the principal directions (axes) of the daily time series, and each of them has unit norm. (The columns of w also have unit norm, but this is not relevant to our method.) There are exactly 96 principal directions (and hence 96 principal components) because the data has 96 dimensions (the number of samples in a day).
2. We can scale the rows of w by the singular values in s (i.e., compute diag(s) · w) to take their 'importance' into account, i.e., the variation of the data along these directions. The scaled rows are called *principal components*.

In [None]:
# Per selected meter: the (days x samples-per-day) matrix of daily profiles.
meters_selected = data_df_selected_unstacked.columns.get_level_values(0).unique().values
xx = [data_df_selected_unstacked[meter].values for meter in meters_selected]

# Empty result frames with the same (meter, time_of_day) columns as the input and one
# row per principal direction / component, numbered from 1.
n_directions = xx[0].shape[1]
empty_template = data_df_selected_unstacked.iloc[0:0, :].copy().reindex(range(1, 1 + n_directions))
prd_df_unstacked = empty_template.copy()  # principal directions (unit-norm)
prc_df_unstacked = empty_template.copy()  # principal components (scaled by importance)
prd_df_unstacked.index.name = 'principal_direction_no'
prc_df_unstacked.index.name = 'principal_component_no'

# SVD factors of every profile, in the order of meters_selected.
uu = []
ss = []
ww = []
for i, x in enumerate(xx):
    # Days with at least one missing sample are dropped.
    x = x[~pd.isnull(x).any(axis=1)]
    print(f'For profile {i}, {xx[i].shape[0] - x.shape[0]} of {xx[i].shape[0]} days were removed because of NaNs.')
    # Builtin float: the np.float alias was removed in NumPy 1.24.
    x = np.array(x, dtype=float)
    if x.shape[0] < x.shape[1]:
        # The result frames (and the similarity computed later) need one singular value
        # per time-of-day sample, i.e. at least as many complete days as samples per day.
        raise ValueError(f'Profile {i} has only {x.shape[0]} complete days; '
                         f'at least {x.shape[1]} are needed.')

    # SVD: x = u diag(s) w   (w is already in the transposed form).
    # Sanity check if needed: np.allclose((u[:, :len(s)] * s) @ w, x) should be True.
    u, s, w = np.linalg.svd(x)

    # Correct the signs of the principal directions: when the dot product between a
    # direction (row of w) and [1, 1, ..., 1] - i.e. the row sum - is negative, flip it.
    # Flipping row j of w together with column j of u keeps u diag(s) w == x.
    flip = w.sum(axis=1) < 0
    w[flip, :] *= -1
    u[:, np.flatnonzero(flip)] *= -1

    prd_df_unstacked[meters_selected[i]] = w
    prc_df_unstacked[meters_selected[i]] = s[:, np.newaxis] * w  # same as diag(s) . w
    uu.append(u)
    ss.append(s)
    ww.append(w)

    # Optional debugging plots:
    # plt.figure(); plt.plot(np.transpose(w[0:5,:]))
    # plt.figure(figsize=(10,10)); plt.imshow(u[0:96])

# Long form: one row per (direction/component number, time_of_day, meter).
prd_df_stacked = prd_df_unstacked.stack().stack().to_frame('consumption (kWh)').reset_index()
prc_df_stacked = prc_df_unstacked.stack().stack().to_frame('consumption (kWh)').reset_index()

# altair needs a full datetime on the time axis, so the time of day is parsed from its
# 'HH:MM:SS' text; this attaches the (meaningless) default date of 1900.
# assign() returns new frames, so the *_stacked frames themselves are not modified.
prd_df_stacked_toplot = prd_df_stacked.assign(
    time_of_day=pd.to_datetime(prd_df_stacked['time_of_day'].astype(str), format='%H:%M:%S'))
prc_df_stacked_toplot = prc_df_stacked.assign(
    time_of_day=pd.to_datetime(prc_df_stacked['time_of_day'].astype(str), format='%H:%M:%S'))

# Calculate similarity based on SVD:

## 1. based on principal components:
We calculate all pairwise dot products between the principal components of the two data matrices X_0 and X_1, sum their squares, and scale by 1/(number of columns in W). [Time series distance measures, Spiegel, 2015] [Krzanowski, W. J. Between-Groups Comparison of Principal Components, 1979]

In [None]:
# Columns of r0 / r1 are the principal components of the two profiles, i.e. the
# principal directions (rows of w) scaled by their singular values: (diag(s) . w)^T.
# The previous form diag(s) . w^T scaled the time-of-day coordinates instead; because w
# is orthogonal, the trace then reduced to sum_i (s0_i * s1_i)^2 and ignored the
# directions altogether.
r0 = np.diag(ss[0]).dot(ww[0]).T
r1 = np.diag(ss[1]).dot(ww[1]).T

# r0^T r1 holds all pairwise dot products between the components of the two profiles;
# trace(r0^T r1 r1^T r0) is the sum of their squares. Scaled by 1/(number of rows of r0).
# NOTE(review): a larger value means more aligned components, so this reads as a
# similarity although the printed label says 'dissimilarity' - confirm the wording.
d_1 = (1/r0.shape[0])*np.trace(np.linalg.multi_dot([r0.T, r1, r1.T, r0]))
print('dissimilarity based on principal components:', d_1)


# (1/r0.shape[0])*np.trace(np.linalg.multi_dot([ww[0].T, ww[1]]))

## 2. based on principal directions:
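Similar to method 1, but using the principal directions (which are unit-norm) instead of the principal components. Note that the code below sums the (unsquared) dot products between corresponding directions only, i.e., it computes trace(W_0^T W_1), scaled by 1/(number of columns in W).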

In [None]:
# trace(W_0^T W_1) is the sum of the dot products between the k-th principal direction
# of profile 0 and the k-th principal direction of profile 1; it is scaled by
# 1/(number of rows of r0, defined in the previous cell).
# NOTE(review): larger values mean better aligned directions, although the label
# printed below says 'dissimilarity' - confirm the wording.
d_2 = (1 / r0.shape[0]) * np.trace(np.dot(ww[0].T, ww[1]))
print('dissimilarity based on principal directions:', d_2)


# (1/r0.shape[0])*np.trace(np.linalg.multi_dot([ww[0].T, ww[1]]))

# Plot:

In [None]:
# The plots below keep only the rows whose principal_direction_no / principal_component_no is <= noc.
noc = 10 # number of components

In [None]:
# The first `noc` principal directions per meter: x is the time of day, one line per
# direction, coloured by direction number; one facet row per meter; zoom/pan on x only.
# TODO: something weird about the plot
# NOTE(review): Vega-Lite documents `thickness` for tick marks; line width is usually
# set with `strokeWidth` - confirm which one is intended.
(
    alt.Chart(prd_df_stacked_toplot.loc[lambda frame: frame['principal_direction_no'] <= noc])
    .mark_line(opacity=0.5, thickness=0.2)
    .encode(
        x=alt.X('time_of_day:T', timeUnit='hoursminutes'),
        y='consumption (kWh):Q',
        color=alt.Color(
            'principal_direction_no',
            scale=alt.Scale(scheme='lightgreyred'),
            sort='descending',
        ),
    )
    .properties(width=600, height=300)
    .facet(row='meter:N')
    .interactive(bind_y=False)
)

In [None]:
# The first `noc` principal components per meter: x is the time of day, one line per
# component, coloured by component number; one facet row per meter; zoom/pan on x only.
# TODO: something weird about the plot
# NOTE(review): Vega-Lite documents `thickness` for tick marks; line width is usually
# set with `strokeWidth` - confirm which one is intended.
(
    alt.Chart(prc_df_stacked_toplot.loc[lambda frame: frame['principal_component_no'] <= noc])
    .mark_line(opacity=0.5, thickness=0.2)
    .encode(
        x=alt.X('time_of_day:T', timeUnit='hoursminutes'),
        y='consumption (kWh):Q',
        color=alt.Color(
            'principal_component_no',
            scale=alt.Scale(scheme='lightgreyred'),
            sort='descending',
        ),
    )
    .properties(width=600, height=300)
    .facet(row='meter:N')
    .interactive(bind_y=False)
)