# Distinguish measurement errors from real zero measurements

So we build upon a couple of intuitions.  
Indicators for an error: 
- **clear cumulative value** *Implemented not adapted*
- **collective zero/NaN interval** *Implemented*  
(the same interval can show up as zero in one profile and as NaN in another) 


Indicators for normal behaviour: 
- **A single zero when there is a consumption sign change**  *Implemented*
- **A single zero surrounded by low consumption** *Implemented*





# Imports and set-up

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import dask.dataframe as dd #conda install dask
from dask.distributed import Client
import altair as alt
import numpy as np
import pandas as pd
from pathlib import Path
import itertools
import datetime
import tqdm
idx = pd.IndexSlice
alt.data_transformers.disable_max_rows()

In [None]:
from interval_information import get_interval_df
from peak_detection import (
    get_cumulative_value_detections, 
    get_connection_and_pv_power_peaks, 
    get_knn_similarity_based_peaks,
    match_knn_then_assumption_parallel
)
from zero_intervals import (
    sign_change_intervals, 
    low_consumption_on_both_sides_intervals, 
    collective_error_intervals
)

In [None]:
# --- Inputs: preprocessed, combined profiles -------------------------------
PRE_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/preprocessed/combined')
# CSV variants of the same inputs (currently unused):
# info_path = PRE_PATH/'info.csv'
# data_path = PRE_PATH/'data.csv'
info_path = PRE_PATH / 'reindexed_info.pkl'
data_path = PRE_PATH / 'reindexed_DST_data.pkl'
# Copy of the data in which the detected errors are replaced by NaN.
result_with_error_path = PRE_PATH / 'reindexed_DST_data_w_errors.pkl'

# --- Outputs: per-interval error verdicts ----------------------------------
RESULT_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/error_detection')
RESULT_PATH.mkdir(parents=True, exist_ok=True, mode=0o770)
zero_error_path = RESULT_PATH / 'zero_interval_is_error.csv'
zero_error_pkl_path = RESULT_PATH / 'zero_interval_is_error.pkl'

# Fail early when the input files are missing.
assert all(path.exists() for path in (info_path, data_path)), 'These paths should exist'

# Helpers

In [None]:
def detection_summary(series):
    """Tabulate how often each value occurs in *series*.

    Missing values are counted as their own category. The returned frame is
    indexed by the distinct values and has two columns: ``count`` (absolute
    number of occurrences) and ``relative`` (share of the total).
    """
    return (
        series.value_counts(dropna=False)
        .to_frame('count')
        .assign(relative=lambda frame: frame['count'] / frame['count'].sum())
    )

In [None]:
def combine_strategies(*args):
    """Merge several per-interval detection results into a single verdict.

    Each positional argument is a Series whose values are ``True``,
    ``False`` or missing (NaN/None). The Series are placed side by side with
    ``pd.concat(..., axis=1)``, so the result covers every index entry that
    occurs in at least one of them.

    Returns an object-dtype Series with, per index entry:

    * ``False`` if at least one strategy says ``False`` -- this takes
      precedence over any ``True`` from another strategy;
    * ``True`` if at least one strategy says ``True`` and none says ``False``;
    * NaN if no strategy gave a verdict.
    """
    strategies = pd.concat(args, axis=1)
    is_normal = strategies.eq(False).any(axis=1)
    is_error = strategies.eq(True).any(axis=1)

    # Starts out all-NaN: entries without any verdict stay missing.
    result = pd.Series(index=strategies.index, dtype='object')
    result[is_error] = True
    # Written last so that a False verdict overrides a True one
    # (same precedence as the original statement order).
    result[is_normal] = False
    return result
    

## Read the data

In [None]:
%%time
# Load the profile metadata and the measurements themselves.
info_df = pd.read_pickle(info_path)
data_df = pd.read_pickle(data_path)

# The columns are the measurement times: parse them and label the axis.
timestamp_columns = pd.to_datetime(data_df.columns, exact=False)
timestamp_columns.name = 'timestamp'
data_df.columns = timestamp_columns


In [None]:
# DATA_SOURCE = 'EandisVREG'
# YEAR = 2016
# # get the right subset based on the info df
# info16_df = info_df.loc[idx[:, 2016],:]
# info16_df = info16_df[info16_df.data_source == 'EandisVREG']

# # read the corresponding data profiles 
# data16_df = data_df.loc[info16_df.index, :]

# info_df.connection_power.astype('float')

## Only investigate timeseries with data problems

In [None]:
# Number of exact-zero readings in every profile.
nb_of_zeros = data_df.eq(0).sum(axis=1)
# Despite its name this is a boolean flag: does the profile contain any NaN?
nb_of_nan = data_df.isna().any(axis=1)
# Keep the profiles that have at least one zero or at least one NaN.
has_data_problem = nb_of_zeros.gt(0) | nb_of_nan
data16_df = data_df.loc[has_data_problem]
# data16_df

## Construct the intervals
In the rest of this notebook we construct the intervals as a dataset, add different attributes/features to them, and investigate whether those features are useful.

In [None]:
%%time
# the DST missing intervals are still in here
# Build the interval table from the profiles and their metadata. Both zero
# and NaN intervals are requested through the keep_* flags.
# NOTE(review): presumably one row per interval -- confirm in
# interval_information.get_interval_df.
interval_df = get_interval_df(data_df, info_df, keep_zero = True, keep_nan = True)
# Last expression of the cell: displayed as the cell output.
interval_df

# Sign change intervals
A single zero is normal if the consumption changes sign 

In [None]:
# Apply the sign-change strategy to every interval.
# NOTE(review): the result is later fed to combine_strategies, so it is
# expected to hold True / False / missing per interval -- confirm in
# zero_intervals.sign_change_intervals.
sign_change_detection = sign_change_intervals(interval_df)
# Show how often each verdict occurs (absolute and relative).
detection_summary(sign_change_detection)


# Short low consumption intervals
A single zero is normal if the consumption on both sides of the interval is small

In [None]:
# Apply the "low consumption on both sides" strategy to every interval.
# NOTE(review): the result is later fed to combine_strategies, so it is
# expected to hold True / False / missing per interval -- confirm in
# zero_intervals.low_consumption_on_both_sides_intervals.
low_consumption_detection = low_consumption_on_both_sides_intervals(interval_df)
# Show how often each verdict occurs (absolute and relative).
detection_summary(low_consumption_detection)

# Current result

In [None]:
# Merge the verdicts of the two strategies run so far into one Series
# (False wins over True, entries without any verdict stay missing).
current_detection = combine_strategies(low_consumption_detection, sign_change_detection)
# Show how often each combined verdict occurs.
detection_summary(current_detection)

# Collective periods based on start time

In [None]:
# Restrict to the zero-valued intervals ...
zero_interval_df = interval_df.query('interval_value == 0')
# ... and drop those the previous two steps already marked as normal (False);
# intervals without a verdict are kept.
not_marked_normal = current_detection != False
rel_interval_df = zero_interval_df[not_marked_normal]
rel_interval_df

In [None]:
# Look for collective zero/NaN periods among the intervals still in play.
# NOTE(review): the meaning of threshold = 2 is not visible here --
# presumably the minimum number of profiles sharing a period; confirm in
# zero_intervals.collective_error_intervals.
collective_data_problems = collective_error_intervals(rel_interval_df, threshold = 2)

# Current result

In [None]:
# Merge the three strategies run so far into one verdict per interval
# (False wins over True, entries without any verdict stay missing).
current_result = combine_strategies(sign_change_detection,low_consumption_detection, collective_data_problems)
# Show how often each combined verdict occurs.
detection_summary(current_result)

# On the remaining zero intervals do cumulative value detection

In [None]:
# Report how many intervals have no verdict yet (missing in current_result).
print(f'there are {current_result.isna().sum()} intervals that are still unknown')

In [None]:
%%time
# Zero intervals that none of the previous strategies could label.
unlabelled_zero_df = interval_df.query('interval_value == 0')
remaining_intervals = unlabelled_zero_df[current_result.isna()]

# Only the profiles that own at least one such interval are needed.
remaining_profile_keys = remaining_intervals.index.get_level_values(0).unique()
data_subset = data16_df.loc[remaining_profile_keys]

cumulative_value_detection = get_cumulative_value_detections(
    data_subset,
    remaining_intervals,
    n_threads=10,
    result_dir=Path() / 'intermediate_results',
)
# Re-align with all zero intervals; intervals that were not examined here
# end up as missing values.
all_zero_interval_index = interval_df.query('interval_value ==0').index
cumulative_value_detection = cumulative_value_detection.reindex(all_zero_interval_index)
cumulative_value_detection

# Final result

In [None]:
# Final verdict per interval: merge all four strategies
# (False wins over True, entries without any verdict stay missing).
result = combine_strategies(sign_change_detection,low_consumption_detection, collective_data_problems, cumulative_value_detection)
# Show how often each final verdict occurs.
detection_summary(result)

In [None]:
# Persist the final verdicts, once as CSV and once as pickle.
is_error_frame = result.to_frame('is_error')
is_error_frame.to_csv(zero_error_path)
is_error_frame.to_pickle(zero_error_pkl_path)

# Include this result in the dataframe

In [None]:
# Display the interval table before the verdicts are attached.
interval_df

In [None]:
# Attach the final verdict to the interval table as an `is_error` column;
# the join is on the index, and intervals without a verdict get a missing value.
is_error_column = result.to_frame('is_error')
interval_with_error = interval_df.join(is_error_column)
interval_with_error

For every detected data problem, fill the interval (including its end value) with NaNs.

In [None]:
# Work on a copy so the original measurements stay untouched.
data_df_with_errors = data_df.copy()

# Only the intervals flagged as errors have to be blanked out.
error_intervals = interval_with_error[interval_with_error.is_error == True]

# Each interval key is a tuple: the first two entries identify the profile
# (row of data_df), the next two are used as integer column positions of the
# first and last reading of the interval (as in the original `.iloc` slice).
for interval_key in tqdm.tqdm(error_intervals.index):
    profile_key = tuple(interval_key[:2])
    first_pos, last_pos = interval_key[2], interval_key[3]
    row_pos = data_df_with_errors.index.get_loc(profile_key)
    # Single positional assignment on the frame itself. The former chained
    # `.loc[...].iloc[...] = ...` wrote into an intermediate Series, so the
    # update could be lost (always the case under pandas Copy-on-Write).
    # The end position is inclusive, hence the +1.
    data_df_with_errors.iloc[row_pos, first_pos:last_pos + 1] = np.nan


In [None]:
# Store the data with the detected errors replaced by NaN.
data_df_with_errors.to_pickle(result_with_error_path)