# EnergyVille: seasonal decomposition try-out - v2
- simple missing value imputation by taking the average of the previous day and the next day
- seasonal decomposition of time series
- correlation between the trend component and the weather
- clustering of the seasonal component to represent daily patterns in a year (this may be useful in EnergyVille)
- average daily consumption pattern

## Imports and set-up

In [None]:
# HACK: to use the src, the notebook has to run from the root folder of the repo (maybe this
# is good practice in general), so show the current directory and then move one level up.
# NOTE(review): assumes the notebook is started from a folder directly below the repo root.
# Let me know if you have a better way of doing this.
%pwd
%cd ../

# %qtconsole # open an interactive console

In [None]:
%load_ext autoreload
%autoreload 2

from statsmodels.tsa.seasonal import seasonal_decompose, STL
import altair as alt
from altair.expr import datum
import pandas as pd
import numpy as np
alt.data_transformers.disable_max_rows()
from pathlib import Path

DATA_PATH = Path().absolute() / 'data'/ 'consumption.csv'

import energy_ville.data as evd
import data_cleaning.find_problems_in_data as fpid

import sklearn_extra.cluster as clstr
import copy
from tqdm import tqdm

## Read the data

In [None]:
# Load the master table and the pre-processed readings through the project helpers.
master_table = evd.get_master_table()
# data_reading_full = evd.get_data_reading_full()
df = evd.get_data_reading_preprocessed()

# Only the first element of the iID info is needed here.
iIDs = fpid.get_iID_info()[0]

time_indices_full, time_first, time_last = fpid.get_time_info()

# Re-index on the complete (iID x datetime) grid: missing time samples (if any) are added as NaNs.
df = df.reindex(
    pd.MultiIndex.from_product(
        [df.index.levels[0], time_indices_full],
        names=['iID', 'datetime'],
    )
)

### Fill missing values using seasonal mean
The STL decomposition methods *cannot* handle missing values, so the gaps have to be filled first. Each missing sample is replaced by a seasonal mean, which can be calculated either from all periods or from only the nearest previous and next periods in which the corresponding time sample is not missing.

In [None]:
def calculate_seasonal_mean(ts, n, lr=1):  # function adapted from https://www.machinelearningplus.com/time-series/time-series-analysis-python/
    """
    Fill missing values with the mean of the corresponding samples of other seasonal periods.

    For every NaN the mean of the same-phase samples in the previous periods is used. If none
    of those is available, the same-phase samples of the following periods are used instead.

    ts: 1D array-like of the time series
    n: Seasonal window length of the time series
    lr: Factor by which the seasonal mean is multiplied before it is written into the gap

    Returns a NumPy copy of ts with the gaps filled; a sample stays NaN when no same-phase
    sample is available at all. The input is not modified.
    """
    values = np.asarray(ts, dtype=float)
    out = np.copy(ts)
    for i in np.flatnonzero(np.isnan(values)):
        same_phase = values[i::-n]  # previous seasons only
        if np.isnan(same_phase).all():
            # Nothing usable in the past: look forward instead. The slice has to start at i
            # (not i - 1), otherwise samples of a different phase are averaged in and, for
            # i == 0, the index wraps around to the end of the series.
            same_phase = values[i::n]
        known = same_phase[~np.isnan(same_phase)]
        # Averaging only the known samples avoids the "mean of empty slice" warning.
        out[i] = known.mean() * lr if known.size else np.nan
    return out

def calculate_two_period_seasonal_mean(ts, n):
    """
    Fill missing values by the average of the corresponding time samples in the previous period and the next period

    "Previous" and "next" mean the nearest period in each direction whose same-phase sample is
    not missing. If only one direction has such a sample, that sample is used on its own; if
    neither has one, the value stays NaN.

    ts: 1D array-like of the time series
    n: Seasonal window length of the time series

    Returns a NumPy copy of ts with the gaps filled; the input is not modified.
    """
    # Work on a plain float array so that all look-ups are positional, whatever index a
    # pandas Series argument carries.
    values = np.asarray(ts, dtype=float)
    out = np.copy(ts)
    for i in np.flatnonzero(np.isnan(values)):
        neighbours = []
        # Same-phase samples walking backwards (latest first) and forwards (earliest first).
        for same_phase in (values[i::-n], values[i::n]):
            known = same_phase[~np.isnan(same_phase)]
            if known.size:
                neighbours.append(known[0])
        if neighbours:
            out[i] = np.mean(neighbours)
    return out

# Number of samples per day: the number of sampling intervals divided by the exact time span
# (in days) between the first and the last time index. Using the exact span rather than the
# whole-day count keeps the estimate correct for short series and for series that do not
# end on a day boundary.
# NOTE(review): assumes time_indices_full is a regularly spaced, sorted sequence of timestamps.
number_of_samples_per_day = round(
    (len(time_indices_full) - 1)
    / ((time_indices_full[-1] - time_indices_full[0]) / pd.Timedelta(days=1))
)

# Keep the un-imputed data around for later comparison.
df_orig = copy.deepcopy(df)

# Impute per iID (index level 0); transform applies the function to each column of each group.
# df = df.groupby(level=0).transform(lambda o: calculate_seasonal_mean(o, n=number_of_samples_per_day))
df = df.groupby(level=0).transform(lambda o: calculate_two_period_seasonal_mean(o, n=number_of_samples_per_day))

### Decomposition using STL:
The parameter *period* should be 48 (the number of samples in a day) or 48\*7 for weekly periodicity.

When the parameter *seasonal* is increased, the seasonal component changes more slowly. It has to be an odd number; 91 was chosen initially simply because a season lasts about three months (note that the code below currently uses 181).

The *robust* option makes the fit less sensitive to outliers, which gives a smoother trend signal.

Execution takes a long time for the whole dataset, so for quick experiments the decomposition can be calculated for a small subset of iIDs only (see the commented-out random selection in the next cell).

In [None]:
# Choose the iIDs to decompose: currently all of them; the commented-out line picks 3 at random.
# iID_inds_to_calculate = np.random.choice(len(iIDs), 3, replace=False)
iID_inds_to_calculate = list(range(len(iIDs)))
iIDs_to_calculate = iIDs[iID_inds_to_calculate]

grouped = df.groupby(level=0)

for iID, group in tqdm(grouped):
    if iID not in iIDs_to_calculate:
        continue

    # result = seasonal_decompose(group.Consumption, period = number_of_samples_per_day)
    # NOTE(review): the accompanying markdown text mentions seasonal=91, but 181 is used here
    # -- confirm which value is intended.
    result = STL(
        group.Consumption,
        period=number_of_samples_per_day,
        seasonal=181,
        robust=True,
    ).fit()

    # Store the three components of "Consumption" next to the original columns.
    df.loc[iID, 'Consumption_Seasonal'] = result.seasonal.values
    df.loc[iID, 'Consumption_Trend'] = result.trend.values
    df.loc[iID, 'Consumption_Resid'] = result.resid.values

#### Plot:

In [None]:
# Work on an independent copy of df without the 'Offtake' and 'Injection' columns,
# restricted to a single iID.
plot_df = (
    copy.deepcopy(df)
    .drop(columns=['Offtake', 'Injection'])
    .loc[iIDs_to_calculate[91]]  # select one iID among the computed ones
)

In [None]:
# Long format: one row per (datetime, column) pair, NaNs kept.
plot_df1 = plot_df.stack(dropna=False).reset_index()
plot_df1.columns = ['datetime', 'score_type', 'value']
plot_df1.head()

# One line chart per column, stacked as rows, each with its own y scale.
(
    alt.Chart(plot_df1)
    .mark_line(strokeWidth=0.5)
    .encode(
        x='datetime:T',
        y=alt.Y('value:Q', title=''),
        row=alt.Row('score_type:N', sort='ascending'),
    )
    .properties(width=2000, height=100)
    .resolve_scale(y='independent')
    .interactive()
)  # .facet(row = 'score_type:N', data = plot_df1)

## Cluster seasonal time series:

In [None]:
import sklearn_extra.cluster as clstr

# Calendar date of every sample, used to cut the seasonal component into daily profiles.
plot_df['date'] = pd.to_datetime(plot_df.index.date)

grp = plot_df.groupby(plot_df.date).Consumption_Seasonal
dates = [day for day, _ in grp]
grps = [profile for _, profile in grp]
ts = [profile.values for profile in grps]  # one array of seasonal values per day

# Cluster the daily profiles; the fixed random_state keeps the result reproducible.
n_clusters = 4
kmedoids = clstr.KMedoids(n_clusters=n_clusters, random_state=73).fit(ts)

# Write the cluster label of each day onto all samples of that day.
plot_df['cluster'] = ''
for date, label in zip(dates, kmedoids.labels_):
    plot_df.loc[plot_df.date == date, 'cluster'] = label

# Flag the samples of the days that act as cluster medoids.
plot_df['cluster_medoid'] = False
for medoid_position in kmedoids.medoid_indices_:
    plot_df.loc[plot_df.date == dates[medoid_position], 'cluster_medoid'] = True

plot_df = plot_df.reset_index()

In [None]:
# Seasonal component over the whole year: colour shows the cluster, line size marks medoid days.
(
    alt.Chart(plot_df)
    .mark_line()
    .encode(
        x=alt.X(
            'datetime:T',
            title='date&time',
            scale=alt.Scale(domain=('2016-01-01 00:00:00', '2017-01-01 00:00:00')),
        ),
        y=alt.Y('Consumption_Seasonal:Q', title='seasonal component'),
        color=alt.Color(
            'cluster:O',
            legend=alt.Legend(title='cluster'),
            scale=alt.Scale(scheme='dark2'),
        ),
        size=alt.Size('cluster_medoid:O', legend=alt.Legend(title='cluster medoids')),
    )
    .properties(width=6000, height=200)
    .interactive()
)

In [None]:
# plot_df['timeoftheday'] = plot_df.time.dt.time

# Daily profile of each medoid day, plotted against the time of the day.
(
    alt.Chart(plot_df[plot_df.cluster_medoid])
    .mark_line()
    .encode(
        x=alt.X('hoursminutes(datetime):O', title='time of the day'),
        y=alt.Y('Consumption_Seasonal:Q', title='seasonal component'),
        color=alt.Color(
            'cluster:O',
            legend=alt.Legend(title='cluster medoids'),
            scale=alt.Scale(scheme='dark2'),
        ),
    )
    .properties(width=500, height=200)
    .interactive()
)