# New similarity-based cumulative value approach 
This approach first searches for the nearest neighbors, using the context as a guideline.  
Each nearest neighbor then checks which assumption fits best.  
If most nearest neighbors vote for the same assumption, we label the value accordingly.  


In [None]:
import altair as alt
from dask.distributed import Client
import numpy as np
import pandas as pd
from pathlib import Path
import itertools
import datetime
import random
from scipy.signal import find_peaks, find_peaks_cwt
from kde_diffusion import kde1d
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
from sklearn.neighbors import KernelDensity
import warnings
from scipy.stats import norm
# Shorthand for slicing a MultiIndex inside .loc (used further down to select one year)
idx = pd.IndexSlice
# Disable Altair's max-rows check on chart data
alt.data_transformers.disable_max_rows()

In [None]:
# this reloads code from external modules automatically if it is changed (without having to restart the kernel)
%load_ext autoreload
%autoreload 2

from interval_information import get_interval_df
from peak_detection import (
    get_cumulative_value_detections,
    get_connection_and_pv_power_peaks, 
    get_model_based_global_peaks,
    get_similarity_based_peaks, 
    construct_search_intervals, 
    add_data_to_search_intervals, 
    sim_known_data, 
    match_knn_then_assumption, 
    get_knn_similarity_based_peaks
    
)
from statistical_models import (
    NormalDistribution, 
    AutoKDEDistribution, 
    KDEDistribution,
)

In [None]:
# Root folders: preprocessed input data and error-detection results
PRE_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/preprocessed/combined')
RESULT_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/error_detection')
# Create the result folder (and any missing parents) if it does not exist yet
RESULT_PATH.mkdir(mode=0o770, parents=True, exist_ok=True)

# Output written by this notebook
result_path = RESULT_PATH / 'cumulative_value_detection.csv'
# Inputs that must already be present
zero_path = RESULT_PATH / 'zero_interval_is_error.csv'
info_path = PRE_PATH / 'info.csv'
data_path = PRE_PATH / 'data.csv'

# Fail early and name exactly which input file(s) are missing
missing_paths = [str(path) for path in (info_path, data_path, zero_path) if not path.exists()]
assert not missing_paths, f'These paths should exist: {missing_paths}'

## Confusion matrix
A small helper that builds a cross-tabulation table for conveniently comparing two series.

In [None]:
def confusion_matrix(name1, series1, name2, series2):
    """Cross-tabulate two series into a labelled contingency table.

    The values of `series1` form the rows (axis named `name1`) and the values
    of `series2` form the columns (axis named `name2`); each cell holds the
    number of positions where that pair of values occurs together.
    """
    row_labels, column_labels = [name1], [name2]
    return pd.crosstab(
        index=series1,
        columns=series2,
        rownames=row_labels,
        colnames=column_labels,
    )

## Read the data

In [None]:
%%time
info_df = pd.read_csv(info_path, dtype={'meterID':'str'}).set_index(['meterID', 'year'], drop=True)
print(f'#profiles = {info_df.shape[0]}')
data_df = pd.read_csv(data_path, dtype={'meterID':'str'}).set_index(['meterID', 'year'], drop=True)
data_df.columns = pd.to_datetime(data_df.columns)
data_df.columns.name = 'timestamp'


## For development, look at a subset

In [None]:
# Subset used during development; change these two constants to look at another slice
DATA_SOURCE = 'EandisVREG'
YEAR = 2016
# get the right subset based on the info df:
# first all profiles of the chosen year (second index level), then only the chosen data source
info16_df = info_df.loc[idx[:, YEAR], :]
info16_df = info16_df[info16_df.data_source == DATA_SOURCE]
info16_df

In [None]:
# Keep only the measurement rows whose (meterID, year) key occurs in the selected metadata
selected_profiles = info16_df.index
data16_df = data_df.loc[selected_profiles, :]
data16_df

# Read the zero error detections

In [None]:
# Zero-interval error detections, indexed per interval (meterID, year, start, end).
# meterID is read as text, like in info_df / data_df, so that these keys can be
# matched against the interval index later on; a numeric-looking ID would
# otherwise be parsed as an integer and never match a string key.
# NOTE(review): 'start' and 'end' are left as parsed by read_csv; confirm they
# have the same dtype as the corresponding levels of the interval index.
zero_detections = pd.read_csv(zero_path, dtype={'meterID': 'str'}).set_index(
    ['meterID', 'year', 'start', 'end'], drop=True
)
zero_detections

## Calculate the intervals with additional information

In [None]:
# Interval table for the selected profiles, with keep_zero and keep_nan switched on
# (presumably so that zero and NaN intervals stay in the table; see get_interval_df)
interval_df = get_interval_df(
    data16_df,
    info16_df,
    keep_zero=True,
    keep_nan=True,
)
interval_df

# Only look at the NaN intervals and the zero intervals detected as error

In [None]:
# Intervals whose value is missing
nan_intervals = interval_df.interval_value.isna()
# Intervals that the zero detection flagged as an error. Intervals without a
# zero detection are explicitly filled with False, which keeps the mask a plain
# boolean series instead of an object series containing NaN.
zero_error_intervals = (
    zero_detections.is_error
    .eq(True)
    .reindex(interval_df.index, fill_value=False)
)

# Keep only the known error intervals
interval_df = interval_df[nan_intervals | zero_error_intervals]

# Profiles (selected on the first index level) that still have at least one interval
data_subset = data16_df.loc[interval_df.index.get_level_values(0).unique()]

# Do cumulative value detection on the known error intervals


In [None]:
# Run the detection with a local Dask client using 4 worker processes;
# the context manager closes the client once the computation is done
with Client(n_workers=4) as client:
    cumulative_value_detection = get_cumulative_value_detections(
        data_subset,
        interval_df,
        client=client,
    )
cumulative_value_detection

# Save the result

In [None]:
# Persist the detections as a CSV with a single value column
detection_frame = cumulative_value_detection.to_frame('followed_by_cumulative_value')
detection_frame.to_csv(result_path)

# Result summary
## Zero detection

In [None]:
# Frequency of each is_error value, missing values included
zero_detections['is_error'].value_counts(dropna=False)

In [None]:
# Frequency of each cumulative-value detection outcome, missing values included
cumulative_value_detection.value_counts(dropna = False)