# New similarity based cumulative value approach 
This approach first searches for the nearest neighbors, using the context around the interval as a guideline.  
Each nearest neighbor then checks which assumption fits best.  
If most nearest neighbors vote for the same assumption, we mark the value accordingly.  


In [None]:
import altair as alt
import numpy as np
import pandas as pd
from pathlib import Path
import itertools
import datetime
import random
from scipy.signal import find_peaks, find_peaks_cwt
from kde_diffusion import kde1d
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
from sklearn.neighbors import KernelDensity
import warnings
from scipy.stats import norm
idx = pd.IndexSlice
alt.data_transformers.disable_max_rows()

In [None]:
# this reloads code from external modules automatically if it is changed (without having to restart the kernel)
%load_ext autoreload
%autoreload 2

from interval_information import get_interval_df
from peak_detection import (
    get_connection_and_pv_power_peaks, 
    get_model_based_global_peaks,
    get_similarity_based_peaks, 
    construct_search_intervals, 
    add_data_to_search_intervals, 
    sim_known_data, 
    match_knn_then_assumption, 
    get_knn_similarity_based_peaks
    
)
from statistical_models import (
    NormalDistribution, 
    AutoKDEDistribution, 
    KDEDistribution,
)

In [None]:
# Input directory (preprocessed, combined data) and output directory (error detection results)
PRE_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/preprocessed/combined')
RESULT_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/error_detection')

# make sure the output directory exists before anything is written to it
RESULT_PATH.mkdir(mode = 0o770, parents = True, exist_ok = True)
result_path = RESULT_PATH / 'cumulative_value_detection.csv'

# the two input files that are read further down
info_path, data_path = PRE_PATH / 'info.csv', PRE_PATH / 'data.csv'
assert info_path.exists() and data_path.exists(), 'These paths should exist'

## Plot function 


In [None]:
def plot_profile_with_intervals(meterID, year, period_type_column = None, data = None, daterange = None):
    """
    Plot one profile as a line with its intervals drawn on top of it.

    Parameters
    ----------
    meterID, year
        Key of the profile; used to look up the profile in ``data16_df`` and its intervals in ``data``.
    period_type_column
        Column of ``data`` that determines the color of the intervals (encoded as a nominal field).
        When None, every interval is drawn in blue.
    data
        Dataframe with the intervals; needs a ``start_time`` and ``end_time`` column.
        When None, the notebook-level ``profile_intervals`` is used.
        NOTE(review): ``profile_intervals`` is not defined in the visible part of the notebook.
    daterange
        Optional pair (presumably month numbers) that restricts the plot to the period between
        ``2016-<daterange[0]>-1`` and ``2016-<daterange[1]>-1``; only intervals overlapping that period are kept.

    Returns
    -------
    An interactive altair chart of width 2200.
    """
    key = (meterID, year)
    intervals = profile_intervals if data is None else data
    periods_for_profile = intervals.loc[key, :]

    if daterange is None:
        profile_df = data16_df.loc[key, :]
    else:
        start_time = f'2016-{daterange[0]}-1 00:00:00'
        end_time = f'2016-{daterange[1]}-1 00:00:00'
        profile_df = data16_df.loc[key, start_time:end_time]
        # keep the intervals that overlap with the requested period
        ends_after_start = periods_for_profile['end_time'] > start_time
        starts_before_end = periods_for_profile['start_time'] < end_time
        periods_for_profile = periods_for_profile[ends_after_start & starts_before_end]

    # the profile itself
    line = (
        alt.Chart(profile_df.to_frame('value').reset_index())
        .mark_line()
        .encode(x = alt.X('timestamp:T'), y = alt.Y('value:Q'))
    )

    # color of the intervals: fixed blue or determined by the requested column
    if period_type_column is not None:
        color_encoding = alt.Color(f'{period_type_column}:N')
    else:
        color_encoding = alt.ColorValue('blue')

    plot_df = periods_for_profile.reset_index(drop=True)
    # a rectangle from start to end of every interval
    interval_rects = alt.Chart(plot_df).mark_rect(opacity = 0.6).encode(
        x = 'start_time:T',
        x2 = 'end_time:T',
        color = color_encoding,
    )
    # a dot at the start of every interval (y is the constant profile_df.max())
    interval_dots = alt.Chart(plot_df).mark_circle(size = 100).encode(
        x = 'start_time:T',
        y = alt.YValue(profile_df.max()),
        color = color_encoding,
    )
    chart = (interval_rects + interval_dots) + line

    if 'connection_power' in periods_for_profile.columns:
        # the value itself is not used below; the conversion is kept so the behaviour stays unchanged
        connection_power = float(periods_for_profile.connection_power.iat[0])

        # horizontal rule at the mean connection power
        connection_power_line = (
            alt.Chart(periods_for_profile.reset_index())
            .mark_rule(color = 'black', opacity = 0.8)
            .encode(y = 'mean(connection_power):Q')
        )
        chart += connection_power_line
    return chart.properties(width = 2200, title = f"{meterID} in {year}").interactive()



## Confusion matrix
Small table for convenient comparison

In [None]:
def confusion_matrix(name1, series1, name2, series2):
    """Cross-tabulate two series: series1 (labelled name1) on the rows, series2 (labelled name2) on the columns."""
    table = pd.crosstab(
        index = series1,
        columns = series2,
        rownames = [name1],
        colnames = [name2],
    )
    return table

## Read the data

In [None]:
%%time
# profile metadata, indexed by (meterID, year); meterID is read as a string
info_df = (
    pd.read_csv(info_path, dtype={'meterID':'str'})
    .set_index(['meterID', 'year'], drop=True)
)
print(f'#profiles = {info_df.shape[0]}')

# the profiles themselves, same index; the column labels are parsed as timestamps
data_df = (
    pd.read_csv(data_path, dtype={'meterID':'str'})
    .set_index(['meterID', 'year'], drop=True)
)
data_df.columns = pd.to_datetime(data_df.columns).rename('timestamp')


## For development look at subset

In [None]:
# Development subset: a single data source and a single year.
DATA_SOURCE = 'EandisVREG'
YEAR = 2016
# get the right subset based on the info df
# (the constants above are used so that changing them actually changes the subset)
info16_df = info_df.loc[idx[:, YEAR], :]
info16_df = info16_df[info16_df.data_source == DATA_SOURCE]
info16_df

In [None]:
# select the data rows that belong to the profiles of the subset
data16_df = data_df.loc[info16_df.index]
data16_df

## Calculate the intervals with additional information

In [None]:
# continue with only the profiles that contain at least one NaN value
data16_df = data16_df.loc[data16_df.isna().any(axis = 1)]


In [None]:
# interval information of the selected profiles; keep_zero and keep_nan are both switched on
interval_df = get_interval_df(
    data16_df,
    info16_df,
    keep_zero = True,
    keep_nan = True,
)
interval_df

# Check peaks due to PV_power and connection_power
If a value after an interval is larger than the connection power or lower than the negative PV_power/connection_power we know for sure it is a cumulative value.  
**For now I assume a power_factor of 1 to convert kVA to kW ($kVA \approx kW$), a lower power_factor will only work better because the threshold becomes 'tighter'!**

In [None]:
# label the intervals using the connection power / PV power rule and count the outcomes
connection_power_peaks = get_connection_and_pv_power_peaks(interval_df)
(
    connection_power_peaks
        .value_counts()
        .to_frame('count')
        .rename_axis(index = ['cumulative_value'])
)

So clearly this rule only helps to detect very few peaks but these peak detections are correct!

## KDE method

In [None]:
%%time 
# takes around 11 minutes on pinac-d
# slow for now; we can later switch to faster KDE methods (sklearn is accurate but SLLLOOWWW)
global_kde_peaks, global_kde_models = get_model_based_global_peaks(
    data16_df,
    interval_df,
    lambda: KDEDistribution(0.99, 0.07),
    return_models = True,
)
global_kde_peaks.value_counts().to_frame('count')

In [None]:
# show the column types of the interval dataframe
interval_df.dtypes

In [None]:
# container for the result; every entry is assigned in the next cell
low_start_and_low_end = pd.Series(dtype = 'bool', index = interval_df.index)
# intervals for which one of the surrounding values is the marker 'start' or 'end' instead of a number
intervals_w_problems = (
    interval_df[['0th_value_after_end', '1th_value_after_end', 'value_before_start']]
        .isin(['start', 'end'])
        .any(axis = 1)
)
intervals_w_problems

In [None]:
# an interval counts as 'low' when all three surrounding values are smaller than 0.1 in absolute value
low_start_and_low_end.loc[~intervals_w_problems] = (
    interval_df
        .loc[~intervals_w_problems, ['0th_value_after_end', '1th_value_after_end', 'value_before_start']]
        .abs()
        .lt(0.1)
        .all(axis = 1)
)
# intervals with a 'start'/'end' marker can never be 'low'
low_start_and_low_end.loc[intervals_w_problems] = False
low_start_and_low_end

# New similarity based method
This method first looks for the nearest neighbors and then checks which assumption fits best.

In [None]:
%%time 
# reasonably fast: takes 1min40s
# only the intervals that start and end on the same calendar date are considered
same_day_intervals = interval_df.loc[interval_df.start_time.dt.date == interval_df.end_time.dt.date]
similarity_peaks = get_knn_similarity_based_peaks(
    data16_df,
    same_day_intervals,
    context_size = '6H',
    reference_day_window = 50,
    k = 5,
)
similarity_peaks

In [None]:
def match_knn_then_assumption(row, data_df, reference_day_window = 50, context_size = '4H', k = 5):
    """
    Decide whether the value right after an interval is a cumulative or a real value.

    First the k reference days whose known context is most similar to the day with the
    interval are looked up. Each of these days then votes for the assumption
    (cumulative / real) whose expected value lies closest to the observed value.

    Parameters
    ----------
    row : pd.Series
        A single interval: ``row.name`` is a 4-tuple that starts with (meterID, year) and the
        row contains a ``start_time`` and an ``end_time`` timestamp.
    data_df : pd.DataFrame
        The profile data handed to ``construct_search_intervals`` and ``add_data_to_search_intervals``.
    reference_day_window
        Passed on to ``construct_search_intervals``.
    context_size : str
        Passed on to ``construct_search_intervals``; also parsed with ``pd.Timedelta`` to locate the missing day.
    k : int
        Number of best matching reference days that are allowed to vote.

    Returns
    -------
    decision : 'real', 'cumulative' or None
        None when less than 80% of the rows that did not vote 'dont_know' agree (or when there are no such rows).
    relative_vote_count : pd.Series
        Share of every vote among the rows that did not vote 'dont_know'.
    missing_day
        The data of the day that contains the interval.
    best_matches : pd.DataFrame
        The k reference days with the smallest distance to the missing day.
    best_match_info : pd.DataFrame
        Expected values, distances and vote of every best match.
    """
    meterID, year, _, _ = row.name
    # start and end time of the interval INCLUSIVE
    # so start_time is the first NaN value and end_time is the last NaN value
    start_time, end_time = row['start_time']+pd.Timedelta('15min') , row['end_time']-pd.Timedelta('15min')
    # later all timestamps will be put on the same date so do this here as well
    start_time2, end_time2 = start_time.replace(year = 2016, month = 1, day = 1), end_time.replace(year = 2016, month = 1, day =1)

    # make the dataframe with all the relevant data
    # (use the data_df argument: the global data16_df used to be read here, silently ignoring the parameter)
    search_intervals_df = construct_search_intervals(start_time, end_time, reference_day_window, context_size, data_df)
    rel_data_df = add_data_to_search_intervals(meterID, year, search_intervals_df, data_df)

    # separate the missing day from all the other days
    missing_day_key = start_time - pd.Timedelta(context_size)/2
    missing_day = rel_data_df.loc[missing_day_key]
    reference_days = rel_data_df.drop(index = missing_day_key)

    # stats on the missing day
    min_value_missing_day, max_value_missing_day  = abs(missing_day.squeeze().min()), abs(missing_day.squeeze().max())
    # a match further away than this from both assumptions votes 'dont_know'
    max_distance = max(min_value_missing_day, max_value_missing_day) / 2

    # drop reference days with data problems
    # TODO fix for zero days then this is not really correct :)
    reference_days = reference_days.dropna()

    # calculate the distances between the missing day and the reference days
    distances_known_data = reference_days.apply(sim_known_data, axis = 1, missing_day = missing_day.squeeze().to_numpy(), raw = True)

    # sort the distances from small to large
    sorted_distances = distances_known_data.sort_values()

    # take the best k matches
    best_matches = reference_days.loc[sorted_distances.iloc[:k].index]

    # for these matches calculate how well the cumulative and real value assumption fit
    best_match_info = pd.DataFrame(index = best_matches.index)
    peak_time = end_time2 + pd.Timedelta('15min')
    # calculate the expected value after the interval using each assumption
    best_match_info['cumulative'] = best_matches.apply(lambda x: np.sum(x.loc[start_time2: peak_time]), axis = 1)
    best_match_info['real'] = best_matches[peak_time]

    # calculate the difference between the observed value and the expected value
    observed_value =  missing_day.squeeze()[peak_time]
    best_match_info = best_match_info.join(sorted_distances.to_frame('simularity'), how = 'left')
    best_match_info['observed'] = observed_value
    best_match_info['cumulative_distance'] = np.abs(best_match_info['cumulative'] - observed_value)
    best_match_info['real_distance'] = np.abs(best_match_info['real'] - observed_value)

    # let each profile vote ('dont_know' is assigned last so it overrules the other two)
    real_votes = best_match_info.real_distance < best_match_info.cumulative_distance
    cumulative_votes = best_match_info.cumulative_distance < best_match_info.real_distance
    dont_know_votes = best_match_info[['cumulative_distance','real_distance']].min(axis = 1) > max_distance
    best_match_info.loc[real_votes, 'vote']  = 'real'
    best_match_info.loc[cumulative_votes, 'vote'] = 'cumulative'
    best_match_info.loc[dont_know_votes, 'vote'] = 'dont_know'

    # count votes
    votes = best_match_info[best_match_info.vote != 'dont_know']
    vote_count = votes.vote.value_counts()
    relative_vote_count = vote_count/ len(votes)

    # 'dont_know' rows were filtered out above, so a winner is always 'real' or 'cumulative';
    # without any vote the maximum is NaN and the comparison below is False
    decision_certainty = relative_vote_count.max()
    decision = relative_vote_count.idxmax() if decision_certainty >= 0.80 else None

    return decision, relative_vote_count, missing_day, best_matches, best_match_info

# All info df


In [None]:
# Combine the interval information with the outcome of every detection method (one column per method)
vis_df = (
    interval_df
        # columns that are not needed in the overview
        .drop(columns = ['0th_value_after_end', '1th_value_after_end', 'value_before_start', 'PV_power'])
        .join(connection_power_peaks.to_frame('connection_peak'))
        .join(global_kde_peaks.to_frame('kde_peak'))
        .join(similarity_peaks.to_frame('similarity_peak'))
        .join(low_start_and_low_end.to_frame('low_start_and_end'))
        # NOTE(review): filling NaN with NaN looks like a no-op - confirm whether another fill value was intended
        .fillna({'similarity_peak': np.nan})
    )
# write the overview to disk; with OVERWRITE = True an existing result file is replaced
OVERWRITE = True
if OVERWRITE or not result_path.exists(): 
    vis_df.to_csv(result_path)
vis_df

In [None]:
# vis_df = pd.read_csv(result_path)
# vis_df['start_time'] = pd.to_datetime(vis_df['start_time'])
# vis_df['end_time'] = pd.to_datetime(vis_df['end_time'])
# vis_df = vis_df.set_index(['meterID', 'year', 'end', 'start'])

In [None]:
# number of intervals where connection_peak is False and similarity_peak is True
test_df = vis_df[~vis_df.connection_peak & vis_df.similarity_peak]
len(test_df)

In [None]:
def confusion_matrix(name1, series1, name2, series2):
    """Cross-tabulate series1 (rows, labelled name1) against series2 (columns, labelled name2) with dropna = False."""
    table = pd.crosstab(
        index = series1,
        columns = series2,
        rownames = [name1],
        colnames = [name2],
        dropna = False,
    )
    return table


In [None]:
# connection power rule (rows) against the global KDE method (columns)
confusion_matrix(
    name1 = 'connection_power',
    series1 = connection_power_peaks,
    name2 = 'KDE',
    series2 = global_kde_peaks,
)

In [None]:
# restrict the connection power result to the intervals handled by the similarity approach
rel_connection_peaks = connection_power_peaks[similarity_peaks.index]
# missing values get the label 'NA' so that they show up in the table
confusion_matrix(
    name1 = 'connection_power',
    series1 = rel_connection_peaks.fillna('NA'),
    name2 = 'similarity based',
    series2 = similarity_peaks.fillna("NA"),
)

### Compare with global context kde peaks

In [None]:
# restrict the KDE result to the intervals handled by the similarity approach
rel_kde_peaks = global_kde_peaks[similarity_peaks.index]
# missing values get the label 'NA' so that they show up in the table
confusion_matrix(
    name1 = 'global_kde',
    series1 = rel_kde_peaks.fillna('NA'),
    name2 = 'similarity',
    series2 = similarity_peaks.fillna("NA"),
)

# Look at some specific cases

In [None]:
def inspect_similarity_approach(random_interval, reference_day_window = 50, context_size = '4H', k = 5):
    """
    Run the similarity approach on a single interval and visualise how the decision was made.

    Prints the decision, displays the vote shares, the interval itself and the information of
    the best matches, and shows three charts: the profile around the interval with the missing
    day and the best matching reference days highlighted, the missing day with its context,
    and the best matches.

    Parameters
    ----------
    random_interval : pd.Series
        A row of the interval dataframe (needs ``start_time``, ``end_time`` and a name that starts with (meterID, year)).
    reference_day_window, context_size, k
        Passed on to ``match_knn_then_assumption``; ``reference_day_window`` also determines how many
        days of the profile are plotted around the interval.

    Returns
    -------
    None, everything is displayed.
    """
    decision, decision_info, missing_day, best_matches, best_match_info = match_knn_then_assumption(
        random_interval, data16_df, reference_day_window, context_size, k
    )
    print(f"decision = {decision}")
    display(decision_info.to_frame().T)
    display(random_interval.to_frame().T)

    # part of the profile that surrounds the interval
    margin = pd.Timedelta(days = reference_day_window //2+1)
    start, end = random_interval.start_time - margin, random_interval.end_time + margin
    profile_data = data16_df.loc[random_interval.name[:2], start:end]
    profile_data_vis = profile_data.to_frame('value').reset_index()

    # one bar per best matching reference day ...
    bars_df = best_matches.index.to_frame().reset_index(drop = True)
    bars_df['type'] = 'reference_day'
    # ... and one bar for the interval itself, widened with half the context on both sides
    start_missing, end_missing = random_interval[['start_time', 'end_time']]
    start_missing -= pd.Timedelta(context_size)/2
    end_missing += pd.Timedelta(context_size)/2

    # DataFrame.append no longer exists in pandas >= 2.0, pd.concat works in every version
    missing_bar = pd.DataFrame([{'start':start_missing, 'end':end_missing, 'type':'missing_day'}])
    bars_df = pd.concat([bars_df, missing_bar], ignore_index = True)

    full_chart = alt.Chart(bars_df).mark_rect(opacity = 0.6).encode(
            x = 'start:T',
            x2 = 'end:T', 
            color = 'type'
        ) + alt.Chart(profile_data_vis, width = 2000).mark_line().encode(
        x = 'timestamp:T', 
        y = 'value'
    ) 

    missing_day_vis = missing_day.stack(dropna=False).to_frame('value').reset_index().rename(columns = {'level_1':'time'})

    missing_chart = alt.Chart(missing_day_vis, title = 'the missing interval + context').mark_line().encode(
        x='time:T', 
        y= 'value', 
        tooltip = [alt.Tooltip('time', format = '%H:%M'),'value']
    )

    matches_vis =best_matches.stack().to_frame('value').reset_index().rename(columns = {'level_2':'time'})
    matches_chart = alt.Chart(matches_vis, title = 'best matches based on context').mark_line().encode(
        x = 'time:T', 
        y = 'value', 
        color = 'start:N', 
        tooltip = [alt.Tooltip('time', format = '%H:%M'),'value']
    )
    (full_chart.interactive(bind_y =False) & (missing_chart.interactive(bind_x = False) | matches_chart.interactive(bind_x = False)).resolve_scale(y='shared')).resolve_scale(y='shared').display()
    display(best_match_info)

## connection peak is true but similarity peak doesn't know or disagrees

In [None]:
# only the intervals that start and end on the same calendar date
vis_df_same_day = vis_df[vis_df.start_time.dt.date == vis_df.end_time.dt.date]
# connection_peak is True while similarity_peak is False
connection_sim_disagrees = vis_df_same_day[vis_df_same_day.connection_peak & (vis_df_same_day.similarity_peak == False)]
# kde_peak is True while similarity_peak is False
kde_sim_disagrees = vis_df_same_day[vis_df_same_day.kde_peak & (vis_df_same_day.similarity_peak == False)]
# kde_peak is False while similarity_peak is True
kde_false_sim_true = vis_df_same_day[~vis_df_same_day.kde_peak & (vis_df_same_day.similarity_peak)]
# similarity_peak is False
real_detections = vis_df_same_day[vis_df_same_day.similarity_peak == False]
# similarity_peak is missing (NaN)
dont_know_detections = vis_df_same_day[vis_df_same_day.similarity_peak.isna()]
# similarity_peak is missing and low_start_and_end is set
sim_dont_know_low = vis_df_same_day[vis_df_same_day.similarity_peak.isna() & vis_df_same_day.low_start_and_end]
# position of the interval to inspect; the next cell increments it before use
IDX = -1

In [None]:
# every run of this cell moves on to the next interval of the selection
IDX += 1
# pick one of the selections made in the cell above
interval = kde_sim_disagrees.iloc[IDX]
inspect_similarity_approach(
    interval,
    reference_day_window = 50,
    context_size = '6H',
    k = 5,
)

# Make a single prediction per interval
Profile Sl2clpa0lIpO1Q is an exception! The zeros after the NaN interval should be replaced by NaNs and the value after the longer NaN interval should be marked as real.
- For the rest if connection_power_peak can give an answer use that answer
- We won't use the kde results as similarity gives the same result but better 
- So then we apply similarity to the remaining NaNs 
- After the similarity all intervals where beginning and end are low are given a real value prediction 

In [None]:
# all_info is another name for vis_df (no copy is made)
all_info = vis_df
all_info

In [None]:
# one prediction per interval; created without data and filled in step by step by the cells below
final_predictions = pd.Series(index = all_info.index, dtype = 'object')
final_predictions

In [None]:
# step 1: a positive connection peak is decisive, so it is filled in first
final_predictions.loc[all_info.connection_peak] = True
step1 = (
    final_predictions
        .value_counts(dropna = False)
        .to_frame('count')
)
step1

In [None]:
# step 2: intervals without a prediction take over the result of the similarity approach
final_predictions.loc[final_predictions.isna()] = all_info.similarity_peak.loc[final_predictions.isna()]
step2 = (
    final_predictions
        .value_counts(dropna = False)
        .to_frame('count')
)
step2

In [None]:
# step 3: where similarity gave no answer, a positive KDE peak is used
final_predictions.loc[final_predictions.isna() & all_info.kde_peak] = True
step3 = (
    final_predictions
        .value_counts(dropna = False)
        .to_frame('count')
)
step3

In [None]:
# step 4: intervals that are still undecided are marked as real when both the beginning and end value are low
final_predictions.loc[final_predictions.isna() & all_info.low_start_and_end] = False
step4 = (
    final_predictions
        .value_counts(dropna = False)
        .to_frame('count')
)
step4

### Choose an interval

In [None]:
# intervals that start and end on the same calendar date
same_day_intervals = interval_df.loc[interval_df.start_time.dt.date == interval_df.end_time.dt.date]
# position of the interval to look at; some interesting alternatives are listed below
IDX = 100
# IDX +=100
# IDX = 1 # a cumulative peak that would otherwise not be detected
# IDX = 2 # a cumulative peak that would otherwise not be detected
# IDX = 11 # a case where it is not super clear what to do
# IDX = 316 # a case where it is not super clear what to do
# IDX = 176 # a case where it is not super clear what to do but we don't actually care
random_interval = same_day_intervals.iloc[IDX]
print(IDX)
random_interval.to_frame().T

### Let the detection run

In [None]:
# settings of the similarity approach
reference_day_window = 50
context_size = '4H'
k = 5
# The second parameter of match_knn_then_assumption is data_df. It was missing from this call,
# which shifted every following positional argument by one place, so the settings are now
# passed by keyword.
decision, decision_info, missing_day, best_matches, best_match_info = match_knn_then_assumption(
    random_interval,
    data16_df,
    reference_day_window = reference_day_window,
    context_size = context_size,
    k = k,
)



### Inspect the result

In [None]:
# average distance of the best matches to the observed value under each assumption
# (the two numeric columns are selected before averaging: best_match_info also holds the
# string column 'vote', over which mean() cannot be computed)
display(best_match_info[['cumulative_distance','real_distance']].mean().to_frame())
best_match_info