# An attempt to consolidate all approaches for detecting cumulative measurements

In [None]:
import altair as alt
import numpy as np
import pandas as pd
from pathlib import Path
import itertools
import datetime
import random
from scipy.signal import find_peaks, find_peaks_cwt
from kde_diffusion import kde1d
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
from sklearn.neighbors import KernelDensity
import warnings
from scipy.stats import norm
# Shorthand used below to slice the (meterID, year) MultiIndex, e.g. idx[:, 2016].
idx = pd.IndexSlice
# Turn off Altair's max-rows check on the data passed to charts.
alt.data_transformers.disable_max_rows()

In [None]:
# this reloads code from external modules automatically if it is changed (without having to restart the kernel)
%load_ext autoreload
%autoreload 2

from interval_information import get_interval_df
from peak_detection import (
    get_connection_and_pv_power_peaks, 
    get_model_based_global_peaks,
    get_similarity_based_peaks
)
from statistical_models import (
    NormalDistribution, 
    AutoKDEDistribution, 
    KDEDistribution,
)

In [None]:
# Folder holding the preprocessed, combined dataset and the two files read from it.
PRE_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/preprocessed/combined')
info_path = PRE_PATH / 'info.csv'
data_path = PRE_PATH / 'data.csv'
# Stop right away when one of the input files is missing.
assert all(path.exists() for path in (info_path, data_path)), 'These paths should exist'

## Plot function 


In [None]:
def plot_profile_with_intervals(meterID, year, period_type_column = None, data = None, daterange = None):
    """Plot one profile as a line chart with its intervals drawn on top.

    Parameters
    ----------
    meterID, year
        Key of the profile; used to look up the measurements in the
        notebook-level ``data16_df`` and the intervals in ``data``.
    period_type_column : str, optional
        Column of ``data`` that determines the interval colour (encoded as a
        nominal field). When omitted, every interval is drawn in blue.
    data : DataFrame, optional
        Interval table that can be indexed with ``(meterID, year)`` and has
        ``start_time`` and ``end_time`` columns. Defaults to the
        notebook-level ``profile_intervals``.
    daterange : pair, optional
        ``(first_month, last_month)``. Restricts the plot to the window that
        runs from day 1 of ``first_month`` to day 1 of ``last_month`` in
        ``year``; only intervals overlapping that window are drawn.

    Returns
    -------
    An interactive Altair chart (intervals as rectangles, interval starts as
    circles, the profile as a line and, when ``data`` has a
    ``connection_power`` column, a horizontal rule at its mean).
    """
    if data is None:
        # NOTE(review): `profile_intervals` is not defined in the visible part
        # of this notebook - confirm it exists before relying on the default.
        data = profile_intervals
    if daterange is not None:
        # Build the window in the requested year (this used to be hard-coded
        # to 2016, so `year` was silently ignored here).
        start_time = f'{year}-{daterange[0]}-1 00:00:00'
        end_time = f'{year}-{daterange[1]}-1 00:00:00'
        profile_df = data16_df.loc[(meterID, year), start_time:end_time]
        periods_for_profile = data.loc[(meterID, year), :]
        # Keep every interval that overlaps the window, even partially.
        overlaps_window = (periods_for_profile['end_time'] > start_time) & (periods_for_profile['start_time'] < end_time)
        periods_for_profile = periods_for_profile[overlaps_window]
    else:
        profile_df = data16_df.loc[(meterID, year), :]
        periods_for_profile = data.loc[(meterID, year), :]

    line = alt.Chart(profile_df.to_frame('value').reset_index()).mark_line().encode(
        x = alt.X('timestamp:T'),
        y = alt.Y('value:Q')
    )
    if period_type_column is None:
        color_encoding = alt.ColorValue('blue')
    else:
        color_encoding = alt.Color(f'{period_type_column}:N')
    plot_df = periods_for_profile.reset_index(drop=True)
    # Rectangles span each interval; circles mark where each interval starts.
    rect = alt.Chart(plot_df).mark_rect(opacity = 0.4).encode(
        x = 'start_time:T',
        x2 = 'end_time:T',
        color = color_encoding
    ) + alt.Chart(plot_df).mark_circle(size = 100).encode(
        x = 'start_time:T',
        # NOTE(review): alt.YValue is a constant visual value, not a data
        # coordinate - confirm the circles end up where intended.
        y = alt.YValue(profile_df.max()),
        color = color_encoding
    )
    chart = rect + line
    if 'connection_power' in periods_for_profile.columns:
        # Horizontal rule at the (mean) connection power of this profile.
        connection_power_line = alt.Chart(periods_for_profile.reset_index()).mark_rule(color = 'black', opacity = 0.8).encode(
            y =  'mean(connection_power):Q'
        )
        chart += connection_power_line
    return chart.properties(width = 2200, title = f"{meterID} in {year}").interactive()



## Confusion matrix
A small helper that cross-tabulates two labelled series so that they can be compared conveniently.

In [None]:
def confusion_matrix(name1, series1, name2, series2):
    """Cross-tabulate two series.

    ``series1`` supplies the rows (axis labelled ``name1``) and ``series2``
    the columns (axis labelled ``name2``); each cell counts how often that
    pair of values occurs together.
    """
    return pd.crosstab(
        index=series1,
        columns=series2,
        rownames=[name1],
        colnames=[name2],
    )

## Read the data

In [None]:
%%time
# Metadata table, indexed by (meterID, year); meterID is read as a string.
info_df = pd.read_csv(info_path, dtype={'meterID':'str'}).set_index(['meterID', 'year'], drop=True)
print(f'#profiles = {info_df.shape[0]}')
# Measurement table with the same (meterID, year) index; all remaining columns
# are converted to timestamps below.
data_df = pd.read_csv(data_path, dtype={'meterID':'str'}).set_index(['meterID', 'year'], drop=True)
data_df.columns = pd.to_datetime(data_df.columns)
# The plot helper relies on this axis name when it encodes 'timestamp:T'.
data_df.columns.name = 'timestamp'


## For development look at subset

In [None]:
# Data source and year that define the development subset.
DATA_SOURCE = 'EandisVREG'
YEAR = 2016
# Select the subset from the info df using the constants above, so that
# changing DATA_SOURCE / YEAR actually changes the selection.
info16_df = info_df.loc[idx[:, YEAR],:]
info16_df = info16_df[info16_df.data_source == DATA_SOURCE]
info16_df

In [None]:
# Read the data profiles whose (meterID, year) key is in the info subset.
data16_df = data_df.loc[info16_df.index, :]
data16_df

In [None]:
# Build the interval table from the subset, keeping both zero and NaN intervals.
# NOTE(review): presumably one row per interval - see
# interval_information.get_interval_df for the exact schema.
interval_df = get_interval_df(data16_df, info16_df, keep_zero = True, keep_nan = True)
interval_df

## Inspect some NaN intervals

In [None]:
# Intervals whose interval_value is NaN, longest first.
nan_intervals = interval_df[interval_df.interval_value.isna()].sort_values('interval_length', ascending = False )
# Unique values of the first index level (the meterIDs), in order of first appearance.
profiles = nan_intervals.index.get_level_values(0).unique()


In [None]:
# Position in `profiles` of the profile to inspect; change it to browse.
IDX = 11
profile = profiles[IDX]
print(profile)
# Colour the intervals by their interval_value.
plot_profile_with_intervals(profile, 2016, data = interval_df, period_type_column = 'interval_value')

## Inspect some zero intervals

In [None]:
# Intervals whose interval_value is exactly zero, longest first.
zero_intervals = interval_df[interval_df.interval_value == 0].sort_values('interval_length', ascending = False )
# Unique values of the first index level (the meterIDs), in order of first appearance.
# Note: this rebinds `profiles` from the NaN-interval cell above.
profiles = zero_intervals.index.get_level_values(0).unique()


In [None]:
# Position in `profiles` of the profile to inspect; change it to browse.
IDX = 100
profile = profiles[IDX]
print(profile)
# Colour the intervals by their interval_value.
plot_profile_with_intervals(profile, 2016, data = interval_df, period_type_column = 'interval_value')

### Special profile: after a missing interval there is a zero 
In this part of the data there is one profile where values after a missing interval are always zero.  
But these zero values are clearly wrong!  
So this might be something we also need to take into account later.  
**For now I just ignore this given that it is only one profile**


In [None]:
# An interval is "special" when the first value after it is zero while the
# second value after it is not.
is_special_interval = (interval_df['0th_value_after_end'] == 0) & (interval_df['1th_value_after_end'] != 0)
intervals_of_interest = interval_df[is_special_interval]
# Build the flag column straight from the row-wise mask. Assigning through
# .loc[intervals_of_interest.index] would flag every row that shares an index
# label with a special interval, not only the special intervals themselves.
zero_after_interval = is_special_interval.to_frame('special_interval')
zero_after_interval;

In [None]:
IDX = 2
# Put the special-interval flag next to the interval information.
vis_df = pd.concat([interval_df, zero_after_interval], axis = 1)
# Profiles with at least one flagged interval (the printed text calls these "detected peaks").
detected_peak_ids = vis_df[vis_df.special_interval].index.get_level_values(0).unique()
print(f'{len(detected_peak_ids)} profiles with a detected peak')
# Profiles with at least one interval that is not flagged.
non_detected_peak_ids = vis_df[~ vis_df.special_interval].index.get_level_values(0).unique()
profile_to_show = detected_peak_ids[IDX]
# The IDX-based choice above is overridden with the one profile that shows this weird scenario.
profile_to_show = 'Sl2clpa0lIpO1Q' # profile where we have this weird scenario 
print(profile_to_show)
# The trailing ';' suppresses the output of the line, so remove it from the next line if you want to see the plot.
plot_profile_with_intervals(profile_to_show, 2016, data = vis_df, period_type_column = "special_interval").properties(height = 400);

# Check peaks due to PV_power and connection_power
If a value after an interval is larger than the connection power or lower than the negative PV_power/connection_power we know for sure it is a cumulative value.  
**For now I assume a power_factor of 1 to convert kVA to kW ($kVA \approx kW$), a lower power_factor will only work better because the threshold becomes 'tighter'!**

In [None]:
# Flag intervals using the connection-power / PV-power rule described above
# (implemented in peak_detection.get_connection_and_pv_power_peaks).
connection_power_peaks = get_connection_and_pv_power_peaks(interval_df)
# Count how often each flag value occurs.
connection_power_peaks.value_counts().to_frame('count').rename_axis(index = ['cumulative_value'])

So clearly this rule only helps to detect very few peaks but these peak detections are correct!

### Visualise some of the detected and non detected peaks

In [None]:
IDX = 1
# Put the connection-power flag next to the interval information.
vis_df = pd.concat([interval_df, connection_power_peaks.to_frame('connection_power_peak')], axis = 1)
# Profiles with at least one flagged interval / with at least one unflagged interval.
detected_peak_ids = vis_df[vis_df.connection_power_peak].index.get_level_values(0).unique()
non_detected_peak_ids = vis_df[~ vis_df.connection_power_peak].index.get_level_values(0).unique()
# Profiles that contain at least one negative measurement.
injection_ids = data16_df[(data16_df < 0).any(axis = 1)].index.get_level_values(0).unique()
# The profile shown is picked from the negative-measurement profiles, not from the two lists above.
profile_to_show = injection_ids[IDX]
# profile_to_show = "Sl2clpSwmYpN1Q" # profile with clear cumulative measurement under connection capacity
print(profile_to_show)
# The trailing ';' suppresses the plot output; remove it to see the chart.
plot_profile_with_intervals(profile_to_show, 2016, data = vis_df, period_type_column = 'connection_power_peak').properties(height = 400);

### Check how this does for the nan intervals longer than 12 hours
For long intervals, this method seems to be enough to get a perfect detection.  
After such a long interval, a cumulative peak (if there is one) always seems to exceed the connection capacity.  
This might be useful later on.

In [None]:
HOURS = 12
# Minimum interval length expressed in measurement periods (4 periods per hour).
periods = HOURS * 4 
IDX = 21
# Keep only the intervals that last at least HOURS hours.
long_intervals = interval_df[interval_df.interval_length >= periods]
# Attach the connection-power flag to the long intervals.
# NOTE(review): if the join leaves missing flags, the column is no longer a plain
# boolean column and the masks below may misbehave - confirm every interval is flagged.
vis_df = long_intervals.join(connection_power_peaks.to_frame('connection_power_peak'))
detected_peak_ids = vis_df[vis_df.connection_power_peak].index.get_level_values(0).unique()
non_detected_peak_ids = vis_df[~ vis_df.connection_power_peak].index.get_level_values(0).unique()
print(f"{len(detected_peak_ids)} profiles with a detected peak")
print(f"{len(non_detected_peak_ids)} profiles with a undetected peak")
# Look at a profile with a long interval that was NOT flagged by the rule.
profile_to_show = non_detected_peak_ids[IDX]
# profile_to_show = "Sl2clpSwmYpN1Q" # profile with clear cumulative measurement under connection capacity
print(profile_to_show)
# The trailing ';' suppresses the plot output; remove it to see the chart.
plot_profile_with_intervals(profile_to_show, 2016, data = vis_df, period_type_column = 'connection_power_peak').properties(height = 400);

# Global context with statistical model 
For this method, a statistical model is fitted on all the measurements (except the measurements after an interval).  
Using this statistical model, we look at the likelihood of a value after a nan interval.  
If the likelihood is very low, the value is considered a cumulative value.  

A normal distribution is not a good fit: a lot of values are close to zero, but there are also some peaks, so a normal distribution cannot capture the distribution of the measurements very well.  

As such, I switched to a KDE. This does a better job because it can capture multimodal distributions (i.e. distributions with more than one peak; you will see an example later in this notebook).

In [None]:
def get_learned_model(row, model, intervals = None):
    """Fit a fresh statistical model on the regular measurements of one profile.

    Parameters
    ----------
    row : Series
        One row of the data frame; ``row.name`` must be ``(meterID, year)``
        and the row index must hold the measurement timestamps.
    model : callable
        Zero-argument factory returning an object with a ``fit`` method.
    intervals : DataFrame, optional
        Interval table that can be indexed with ``(meterID, year)`` and has an
        ``end_time`` column. Defaults to the notebook-level ``nan_intervals``
        (the behaviour before this parameter existed).

    Returns
    -------
    The fitted model instance.
    """
    if intervals is None:
        intervals = nan_intervals
    meterID, year = row.name
    try:
        interval_endings = intervals.loc[(meterID, year), 'end_time']
    except KeyError:
        # This profile has no intervals, so there is nothing to exclude.
        interval_endings = []
    # Leave out the values right after an interval (they might be cumulative)
    # and the missing values; fit on everything else.
    row_normal_values = row.drop(interval_endings, errors = 'ignore').dropna()
    fitted_model = model()
    fitted_model.fit(row_normal_values.to_numpy().T)
    return fitted_model

def get_peaks_with_model(nan_intervals, model, return_models = False):
    """Test the first value after each interval against a per-profile model.

    One model is fitted per row of the notebook-level ``data16_df`` and joined
    onto ``nan_intervals``; rows whose ``first_value_after_end`` equals
    ``'end'`` are skipped. The result holds, per remaining interval, whatever
    the model's ``test_value`` returns for that first value.

    NOTE(review): ``interval_df`` in this notebook exposes
    ``0th_value_after_end`` while this helper reads ``first_value_after_end``
    - confirm which interval table is meant to be passed in.

    Parameters
    ----------
    nan_intervals : DataFrame
        Interval table with ``end_time`` and ``first_value_after_end`` columns.
    model : callable
        Zero-argument model factory, see ``get_learned_model``.
    return_models : bool
        When True, also return the interval table with its ``model`` column.
    """
    # Hand the interval table that was passed in to the fitting step, so the
    # models are fitted against the same intervals that are tested below
    # (previously the notebook-level global was used regardless of the argument).
    models = data16_df.apply(get_learned_model, model = model, intervals = nan_intervals, axis = 1)
    models = nan_intervals[nan_intervals.first_value_after_end != 'end'].join(models.to_frame('model'))
    is_peak = models.apply(lambda row: row['model'].test_value(float(row['first_value_after_end'])), axis = 1)
    if return_models:
        return is_peak, models
    return is_peak

In [None]:
# Global-context detection with a normal distribution per profile.
# NOTE(review): 0.99 is presumably the confidence level used by the test - see statistical_models.NormalDistribution.
global_gauss_peaks, global_gauss_models = get_model_based_global_peaks(data16_df, interval_df, lambda: NormalDistribution(0.99), return_models = True)
# Count how often each outcome occurs.
global_gauss_peaks.value_counts().to_frame('count')

In [None]:
%%time 
# takes around 11 minutes on pinac-d
# this takes a while, we can later switch to faster KDE methods (sklearn is accurate but SLLLOOWWW)
# NOTE(review): the KDEDistribution arguments are presumably (confidence level, bandwidth) - see statistical_models.
global_kde_peaks,global_kde_models = get_model_based_global_peaks(data16_df, interval_df, lambda: KDEDistribution(0.99, 0.07), return_models = True)
# Count how often each outcome occurs.
global_kde_peaks.value_counts().to_frame('count')

### Compare Gauss with KDE

In [None]:
# Rows: Gauss outcome, columns: KDE outcome (built with the notebook's own helper).
confusion_matrix('gauss_peaks', global_gauss_peaks, 'kde_peaks', global_kde_peaks)

So Gauss detects more cumulative values than KDE 

### Show some plots

In [None]:
IDX = 1
vis_df = pd.concat([interval_df, global_gauss_peaks.to_frame('gauss'), global_kde_peaks.to_frame('kde')], axis = 1)
# Intervals that a method did not score are NaN after the concat. Casting NaN
# with astype('bool') gives True, which would label those intervals as
# detected; comparing against True maps missing values to False instead.
vis_df = vis_df.assign(gauss = vis_df['gauss'].eq(True), kde = vis_df['kde'].eq(True))
def convert(a):
    """Join the names of the methods whose flag in ``a`` is set, e.g. 'gauss_kde'."""
    return "_".join(name for name, flagged in zip(['gauss', 'kde'], a) if flagged)
vis_df['detected_by'] = vis_df[['gauss','kde']].apply(convert, axis = 1)
# Intervals flagged by the normal distribution but not by the KDE.
vis_df['gauss_not_kde'] = vis_df['gauss'] & (~vis_df['kde'])
gauss_not_kde_ids = vis_df.query('gauss_not_kde').index.get_level_values(0).unique()
# profile_to_show = 'Sl2clpSwmYpN1Q' # a profile with a clear peak that is not discovered using connection capacity
profile_to_show = gauss_not_kde_ids[IDX]
print(profile_to_show)
# Show the profile first, then the chart of the KDE model fitted for it.
plot_profile_with_intervals(profile_to_show, 2016, data = vis_df, period_type_column = 'detected_by').properties(height = 400).display()
global_kde_models.loc[profile_to_show, 'model'].iloc[0].get_chart()

# Local context with statistical model 
Instead of using all measurements to learn a statistical model, we could also use only the measurements close to the missing interval.  
However, this has an issue: the context size has to be chosen well.  
If the context is chosen too small, we might mark values as weird (cumulative) while in reality they are not.  
This is why I think it is better to stick with the global context.  
But feel free to disagree with me 😉  

# Similarity based method
So this method is going to look for similar days using two distance metrics.  
One distance metric assumes that the value after the interval is a cumulative peak, the other distance metric assumes that the value after the interval is a correct measurement.   
The assumption of the distance metric that finds the best match is assumed to be correct.  

**For now I think the code only works for intervals that start/end on the same day.**

In [None]:
%%time 
# reasonably fast takes 1min30s 
# Keep only the intervals that start and end on the same calendar day.
same_day_intervals = interval_df[interval_df.start_time.dt.date == interval_df.end_time.dt.date]
# Similarity-based detection; the second return value holds the extra information requested with return_all_info.
similarity_peaks, similarity_peaks_info = get_similarity_based_peaks(data16_df, same_day_intervals, return_all_info = True)
similarity_peaks

### Compare with connection_power_peaks

In [None]:
# Restrict the connection-power flags to the intervals scored by the similarity method.
rel_connection_peaks = connection_power_peaks[similarity_peaks.index]
confusion_matrix('connection_power', rel_connection_peaks, 'similarity based', similarity_peaks)

### Compare with global context kde peaks

In [None]:
# Restrict the global KDE flags to the intervals scored by the similarity method.
rel_kde_peaks = global_kde_peaks[similarity_peaks.index]
confusion_matrix('global_kde', rel_kde_peaks, 'similarity', similarity_peaks)

### Conclusion of similarity
So the similarity-based method seems to detect even more peaks than the other methods, and it also detects the peaks that they detect.  
**Important note: for now this method can only handle intervals that start and end on the same day!**  
This method might also work better for profiles that are repetitive!

Special cases:  
- it occurs that both real_distance and cumulative_distance are very low!  
in this case we don't really know which one of the two but it doesn't really matter  
This is often the case for very short intervals (e.g. length 1)

## Show some profiles where kde and similarity differ

In [None]:
# Attach the similarity and KDE flags to every interval. Intervals a method did
# not score have no match in the join and end up as missing values.
vis_df = interval_df.join(similarity_peaks.to_frame('similarity').astype('bool')).join(global_kde_peaks.to_frame('kde').astype('bool'))
# Join the names of the methods whose flag is set.
# Note: this redefines the `convert` helper from the Gauss/KDE cell above.
# NOTE(review): a missing flag is truthy in the `if i` test, so an unscored method is listed as well - confirm this is intended.
def convert(a): 
    return "_".join([s for s,i in zip(['similarity', 'kde'],a) if i])
vis_df['detected_by'] = vis_df[['similarity','kde']].apply(convert, axis = 1)
# Show how many intervals have a True / False / missing KDE flag.
vis_df.kde.value_counts(dropna = False)

In [None]:
IDX = 3
# Only compare intervals that were scored by both methods. After the joins the
# flag columns can be object dtype (they held missing values), and `~` on such
# a column is Python's bitwise NOT on each bool instead of a logical negation,
# so cast back to real booleans once the missing rows are gone.
temp_df = vis_df.dropna(axis = 0, subset = ['similarity', 'kde']).astype({'similarity': 'bool', 'kde': 'bool'})
detected_sim_not_kde = temp_df[temp_df.similarity & ~temp_df.kde].index.get_level_values(0).unique()
detected_kde_not_sim = temp_df[~temp_df.similarity & temp_df.kde].index.get_level_values(0).unique()
# Swap in detected_sim_not_kde to inspect the opposite disagreement.
profile_to_show = detected_kde_not_sim[IDX]
print(profile_to_show)
display(similarity_peaks_info.loc[profile_to_show])
plot_profile_with_intervals(profile_to_show, 2016, data = vis_df, period_type_column = 'detected_by').properties(height = 400)

# Combine all three techniques