# Distinguish measurement errors from real zero measurements

So we build upon a couple of intuitions.  
Indicators for an error: 
- clear cumulative value 
- collective zero/NaN interval (it happens that in one profile an interval is zero and in another it is NaN) 
- a zero interval in a profile where zeros are exceptional 

Indicators for normal behaviour: 
- Profiles where zeros are common (and no other indications of an error) 
- **A single zero when there is a consumption sign change**  *Implemented*
- consumption around zero interval is low 
- Very long zero intervals (if not followed by a cumulative value) 




## Imports and set-up

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import altair as alt
import numpy as np
import pandas as pd
from pathlib import Path
import itertools
import datetime
import tqdm
idx = pd.IndexSlice
alt.data_transformers.disable_max_rows()

In [None]:
from interval_information import get_interval_df
from peak_detection import (
    get_cumulative_value_detections, 
    get_connection_and_pv_power_peaks, 
    get_knn_similarity_based_peaks,
)

In [None]:
# Directory with the preprocessed, combined dataset (absolute path; adjust for your environment).
PRE_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/preprocessed/combined')
info_path = PRE_PATH/'info.csv'
data_path = PRE_PATH/'data.csv'
# Stop right here when either file is missing.
assert info_path.exists() and data_path.exists(), 'These paths should exist'

### helpers

In [None]:
def plot_profile_with_intervals(meterID, year, period_type_column = None, data = None, daterange = None):
    """Plot one profile as a line with its intervals drawn on top.

    Parameters
    ----------
    meterID, year
        Key of the profile in the notebook-level ``data16_df`` and of its
        intervals in ``data``.
    period_type_column : str, optional
        Column of ``data`` used (as a nominal field) to colour the intervals.
        When omitted every interval is drawn in blue.
    data : DataFrame, optional
        Interval table that can be selected with ``.loc[(meterID, year), :]``
        and has ``start_time`` / ``end_time`` columns. Defaults to the
        notebook-level ``nan_intervals``.
    daterange : (int, int), optional
        ``(first_month, last_month)``; restricts the plot to the window from
        the first day of ``first_month`` to the first day of ``last_month``
        of ``year``.

    Returns
    -------
    An interactive layered Altair chart (intervals as rectangles plus a
    circle at each interval start, the profile as a line, and a horizontal
    rule at the mean ``connection_power`` when that column is present).
    """
    if data is None:
        data = nan_intervals
    periods_for_profile = data.loc[(meterID, year), :]
    if daterange is not None:
        # Build the window from the requested year instead of a fixed 2016.
        start_time = f'{year}-{daterange[0]}-1 00:00:00'
        end_time = f'{year}-{daterange[1]}-1 00:00:00'
        profile_df = data16_df.loc[(meterID, year), start_time:end_time]
        # keep every interval that overlaps the window
        overlaps_window = (periods_for_profile['end_time'] > start_time) & (periods_for_profile['start_time'] < end_time)
        periods_for_profile = periods_for_profile[overlaps_window]
    else:
        profile_df = data16_df.loc[(meterID, year), :]

    line = alt.Chart(profile_df.to_frame('value').reset_index()).mark_line().encode(
        x = alt.X('timestamp:T'),
        y = alt.Y('value:Q')
    )
    if period_type_column is None:
        color_encoding = alt.ColorValue('blue')
    else:
        color_encoding = alt.Color(f'{period_type_column}:N')
    plot_df = periods_for_profile.reset_index(drop=True)
    # Rectangles span each interval; the circles keep one-timestamp intervals visible.
    rect = alt.Chart(plot_df).mark_rect(opacity = 0.6).encode(
        x = 'start_time:T',
        x2 = 'end_time:T',
        color = color_encoding
    ) + alt.Chart(plot_df).mark_circle(opacity = 0.6).encode(
        x = 'start_time:T',
        y = alt.YValue(profile_df.max()),
        color = color_encoding
    )
    chart = rect + line
    if 'connection_power' in periods_for_profile.columns:
        connection_power_line = alt.Chart(periods_for_profile.reset_index()).mark_rule(color = 'black', opacity = 0.8).encode(
            y =  'mean(connection_power):Q'
        )
        chart += connection_power_line
    return chart.properties(width = 2200, title = f"{meterID} in {year}").interactive()


In [None]:
def confusion_matrix(name1, series1, name2, series2):
    """Cross-tabulate two aligned series.

    ``series1`` provides the rows (axis labelled ``name1``) and ``series2``
    the columns (axis labelled ``name2``); each cell holds the number of
    positions with that combination of values.
    """
    table = pd.crosstab(
        index = series1,
        columns = series2,
        rownames = [name1],
        colnames = [name2],
    )
    return table

## Read the data

In [None]:
# Both files are indexed by their first two columns.
info_df = pd.read_csv(info_path, index_col = [0, 1])
data_df = pd.read_csv(data_path, index_col = [0, 1])
# The column labels are timestamps: parse them and name the column axis.
data_df.columns = pd.to_datetime(data_df.columns).rename('timestamp')


## Handle all data sources and years separately
Of course connection problems need to be in the same year and within the same measurement project, so for now let's use the EandisVREG data of 2016

In [None]:
DATA_SOURCE = 'EandisVREG'
YEAR = 2016
# Get the right subset based on the info df. The constants above are used
# here so that changing them really changes the selected subset.
info16_df = info_df.loc[idx[:, YEAR],:]
info16_df = info16_df[info16_df.data_source == DATA_SOURCE]

# read the corresponding data profiles
data16_df = data_df.loc[info16_df.index, :]


## For this only look at the profiles that have zeros

In [None]:
# number of zero readings per profile, and whether the profile has any NaN
nb_of_zeros = data16_df.eq(0).sum(axis = 1)
nb_of_nan = data16_df.isna().any(axis = 1)
# keep the profiles that contain at least one zero or at least one NaN
has_zero = nb_of_zeros.gt(0)
data16_df = data16_df.loc[has_zero | nb_of_nan]
data16_df

## Construct the intervals
So in the rest of this code we simply construct the intervals as a dataset and add different attributes/features and investigate whether they could be useful or not

In [None]:
%%time
# Build the interval table from the profiles, keeping both the zero and the NaN intervals.
interval_df = get_interval_df(data16_df, info16_df, keep_zero = True, keep_nan = True)
interval_df

# Take a small subset

In [None]:
NB_OF_PROFILES = 100
# First NB_OF_PROFILES distinct values of index level 0 (in order of appearance, not a random sample).
profile_sample = interval_df.index.get_level_values(0).unique()[:NB_OF_PROFILES]
interval2_df = interval_df.loc[profile_sample]
interval2_df

# Fill this df with detection information

In [None]:
# Empty frame with one row per interval of the subset; each detector adds its own column below.
detection_df = pd.DataFrame(index = interval2_df.index)
detection_df

# Short intervals

In [None]:
# Zero intervals of length 1 that have a known value on both sides.
short_zero_intervals = (
    interval2_df
    # the strings 'start' / 'end' become NaN so that dropna removes those rows
    # NOTE(review): presumably these mark intervals touching the profile boundaries - confirm in get_interval_df
    .replace({'start':np.nan, 'end': np.nan})
    .dropna(subset = ['0th_value_after_end', 'value_before_start'])
    .query('interval_value == 0')
    .query('interval_length == 1')
)
# short_zero_intervals

In [None]:
# True when the value before the zero and the value after it have opposite signs.
sign_change_intervals = np.sign(short_zero_intervals['value_before_start']) == - np.sign(short_zero_intervals['0th_value_after_end'])
# absolute and relative number of True / False
count = sign_change_intervals.value_counts(dropna = False).to_frame('count')
count['relative_count'] = count['count']/count['count'].sum()
count

### Add to detection df

In [None]:
# NaN = this rule says nothing; False = interval marked as normal (the combining cells treat False as 'detected_normal').
detection_df['sign_change'] = np.nan
detection_df.loc[sign_change_intervals[sign_change_intervals].index, 'sign_change'] = False


# A single zero with low consumption on both sides

In [None]:
# Values can be negative (the sign-change rule relies on it), so "low" must be
# judged on the magnitude: without abs() a large negative value would count as low.
LOW_CONSUMPTION_THRESHOLD = 0.1
value_before = short_zero_intervals['value_before_start'].astype(float)
value_after = short_zero_intervals['0th_value_after_end'].astype(float)
short_low_consumption = (value_before.abs() < LOW_CONSUMPTION_THRESHOLD) & (value_after.abs() < LOW_CONSUMPTION_THRESHOLD)
# absolute and relative number of True / False
count = short_low_consumption.value_counts(dropna = False).to_frame('count')
count['relative_count'] = count['count']/count['count'].sum()
count

### Confusion with sign change

In [None]:
confusion_matrix('low_consumption', short_low_consumption, 'sign_change', sign_change_intervals)

### Add to detection df

In [None]:
# NaN = this rule says nothing; False = interval marked as normal.
detection_df['short_low_consumption'] = np.nan
detection_df.loc[short_low_consumption[short_low_consumption].index, 'short_low_consumption'] = False
detection_df

In [None]:
# Recompute the summary column from scratch when the cell is re-run. On the
# first run the column does not exist yet, so a missing column must not raise.
detection_df = detection_df.drop(columns = 'detection', errors = 'ignore')
# Element-wise comparison on purpose: NaN ("no verdict") is neither False nor True.
detected_normal = (detection_df == False).any(axis = 1)
detected_cumulative = (detection_df == True).any(axis =1)
detection_df['detection'] = np.nan
detection_df.loc[detected_normal, 'detection'] = False
# assigned last, so a True verdict overrides a False one
detection_df.loc[detected_cumulative, 'detection'] = True
detection_df

In [None]:
# Intervals that none of the rules above gave a verdict for.
remaining_intervals = interval2_df[detection_df.detection.isna()]
remaining_intervals

# Cumulative value detection on the zeros

## Connection power peaks

In [None]:
# Used below as a boolean mask over remaining_intervals.
connection_power_peaks = get_connection_and_pv_power_peaks(remaining_intervals)
connection_power_peaks.value_counts().to_frame('count')

In [None]:
remaining_intervals[connection_power_peaks].interval_value.value_counts(dropna= False).to_frame('count')

## add to detection_df

In [None]:
# NaN = no verdict; True = interval flagged by the connection/PV power rule.
detection_df['connection_power'] = np.nan
detection_df.loc[connection_power_peaks[connection_power_peaks].index, 'connection_power'] = True
detection_df

## Similarity based peaks

In [None]:
%%time
# Keep only the intervals that start and end on the same calendar day.
same_day_intervals = remaining_intervals[remaining_intervals.start_time.dt.date == remaining_intervals.end_time.dt.date]
# Also drop intervals that start in the first three hours of a day (start - 3h falls on the previous date).
# NOTE(review): the name suggests this works around a bug in the similarity detector - confirm.
avoid_bug = same_day_intervals[(same_day_intervals.start_time - pd.Timedelta('3H')).dt.date == same_day_intervals.start_time.dt.date]

# NOTE(review): the meaning of 50, '6H' and 5 is defined in peak_detection.get_knn_similarity_based_peaks.
similarity_peaks = get_knn_similarity_based_peaks(data16_df, avoid_bug, 50,'6H',  5)
similarity_peaks.value_counts(dropna =False).to_frame('count')

In [None]:
# Copy the similarity verdicts; intervals that were not evaluated stay NaN.
detection_df['similarity'] = np.nan
detection_df.loc[similarity_peaks.index, 'similarity'] = similarity_peaks
detection_df

In [None]:
# Summarise the two intuition-based rules: False = normal zero, NaN = no verdict.
intuition_columns = ['sign_change', 'short_low_consumption']
detected_normal = (detection_df[intuition_columns] == False).any(axis = 1)
detected_cumulative = (detection_df[intuition_columns] == True).any(axis =1)
detection_df['intuitions'] = np.nan
detection_df.loc[detected_normal, 'intuitions'] = False
detection_df.loc[detected_cumulative, 'intuitions'] = True
detection_df

# Final verdict by priority: intuitions first, then connection power, then
# similarity. fillna returns a new Series, so the 'intuitions' column of
# detection_df is left untouched (assigning through an alias of that column
# would overwrite it or trigger pandas' SettingWithCopy warning).
detections = (
    detection_df['intuitions']
    .fillna(detection_df['connection_power'])
    .fillna(detection_df['similarity'])
)
detections

In [None]:
# Profiles with at least one interval that the similarity detector flagged True.
# Select the True rows first: the index of the comparison result alone would
# still list every evaluated interval.
detection_true = similarity_peaks[similarity_peaks == True].index.get_level_values(0).unique()
vis_df = interval_df.join(detections.fillna('no_detection').to_frame('cumulative'))
IDX = 1
plot_profile_with_intervals(detection_true[IDX],2016, period_type_column = 'cumulative', data = vis_df, daterange = None)

# Intuition: if a KDE shows that zero values are very uncommon in a profile, the zeros are probably measurement mistakes!

### Visualise beforehand to get some good examples

In [None]:
# Index level 0 of the profiles, ordered from fewest to most zero readings.
profiles_sorted_by_zero_count = (data16_df == 0).sum(axis = 1).sort_values().index.get_level_values(0)
profiles_sorted_by_zero_count

In [None]:
# Position in the sorted list above; a higher IDX means a profile with more zeros.
IDX = 100
plot_profile_with_intervals(profiles_sorted_by_zero_count[IDX], 2016, data = interval_df)

## Third intuition: very long zero periods are disabled meters not measurement errors

To get an idea let's check how long the longest NaN intervals are 

In [None]:
# NOTE(review): full_interval_df is not defined in the visible part of this
# notebook; presumably the interval table of all profiles - confirm.
# Copies are taken because a later cell adds a column to zero_intervals;
# writing into a filtered view triggers pandas' SettingWithCopy warning.
nan_intervals = full_interval_df[full_interval_df.interval_value.isna()].copy()
nan_intervals
zero_intervals = full_interval_df[full_interval_df.interval_value == 0].copy()
zero_intervals

In [None]:
# value_counts() puts the interval length in the index. Name that index
# 'index' explicitly so that reset_index() yields an 'index' column on every
# pandas version (from pandas 2.0 on it would otherwise be called
# 'interval_length' and the 'index:N' / 'index:Q' encodings would find no field).
temp = nan_intervals.interval_length.value_counts().to_frame('count').rename_axis('index')
nan_interval_length = alt.Chart(temp.reset_index(), title = 'NaN interval length').mark_bar().encode(
    x = 'index:N',
    y= 'count'
)
temp = zero_intervals.interval_length.value_counts().to_frame('count').rename_axis('index')
# only zero intervals shorter than 200 timestamps are drawn
zero_interval_length = alt.Chart(temp[temp.index<200].reset_index(), title = 'zero interval length').mark_bar().encode(
    x = 'index:Q',
    y= 'count'
)
(nan_interval_length | zero_interval_length)

So the longest NaN interval is 200 timestamps, but the zero intervals are way longer. So every zero interval longer than 200 timestamps (roughly 2 days) is considered a disabled meter.

In [None]:
# Zero intervals longer than 200 timestamps.
long_zero_intervals = zero_intervals[zero_intervals.interval_length > 200]
long_zero_intervals

In [None]:
# Label the long zero intervals; this assignment leaves all other rows unlabelled.
zero_intervals.loc[long_zero_intervals.index, 'interval_type'] = 'real_long_disabled_meter'

In [None]:
# Distinct values of index level 0 among the long zero intervals.
relevant_profiles = long_zero_intervals.index.get_level_values(0).unique()
# which of those profiles to show
INDEX = 25
print(f'showing profile {INDEX} from {len(relevant_profiles)}')
print(relevant_profiles[INDEX])
plot_profile_with_intervals(relevant_profiles[INDEX], 2016, 'interval_type', data = zero_intervals)

### Again show some results

## Add count to each interval
Count the amount of times that each interval occurs

In [None]:
def _count_profiles_per_interval(intervals):
    # number of rows (profiles) that share each (start, end) pair
    flat = intervals.reset_index()
    per_interval = flat.groupby(['start', 'end'])[['meterID', 'year']].size()
    return per_interval.to_frame('#profiles')

is_nan_interval = profile_intervals.interval_value.isna()
interval_counts = _count_profiles_per_interval(profile_intervals)
nan_interval_counts = _count_profiles_per_interval(profile_intervals[is_nan_interval])
zero_interval_counts = _count_profiles_per_interval(profile_intervals[~is_nan_interval])
print(f'{len(interval_counts)} distinct disconnection periods')
interval_counts

### Plot the amount of profiles that are zero during an interval and their count
So interestingly you can see here that there are also zero intervals that occur for multiple meters! So some of these zeros are also missing values! 
It also seems that there are intervals that show up as zeros in one profile but NaNs in another profile! 

In [None]:
def _profiles_per_interval_histogram(counts, title):
    # bars: x = number of profiles sharing an interval, y = number of such intervals
    return alt.Chart(counts.reset_index(), title = title).mark_bar().encode(
        x = alt.X('#profiles:N'),
        y = alt.Y('count()')
    )

all_chart = _profiles_per_interval_histogram(interval_counts, '#intervals that have x profiles that are missing in the interval')
zero_chart = _profiles_per_interval_histogram(zero_interval_counts, '# zero intervals that have x profiles that are missing in the interval')
nan_chart = _profiles_per_interval_histogram(nan_interval_counts, '# NaN intervals that have x profiles that are missing in the interval')
# stack the three histograms vertically
(all_chart & zero_chart & nan_chart)

### Check if these unique missing intervals are really unique or if they are similar to one of the non-unique intervals

In [None]:
def check_unique(intervals):
    """Return the intervals that occur once and have no near-identical shared interval.

    Parameters
    ----------
    intervals : DataFrame with a single count column, or Series
        Indexed by ``(start, end)``; the value is the number of profiles
        that have this interval.

    Returns
    -------
    list of (start, end)
        Every interval with count 1 for which no interval with count > 1
        exists whose start and end each differ by at most one position.
    """
    # Squeeze only the column axis: a bare squeeze() would turn a one-row
    # frame into a scalar and break the boolean indexing below.
    if isinstance(intervals, pd.DataFrame):
        counts = intervals.squeeze(axis = 1)
    else:
        counts = intervals
    shared_intervals = set(counts[counts > 1].index)
    # every combination of shifting start and end by -1, 0 or +1
    offsets = list(itertools.product([-1, 0, 1], repeat = 2))
    remaining_uniques = []
    for start, end in counts[counts == 1].index:
        has_shared_neighbour = any(
            (start + delta_s, end + delta_e) in shared_intervals
            for delta_s, delta_e in offsets
        )
        if not has_shared_neighbour:
            remaining_uniques.append((start, end))
    return remaining_uniques
# Intervals that occur in exactly one profile and are not within one position of a shared interval.
unique_nans = check_unique(nan_interval_counts)
unique_zeros = check_unique(zero_interval_counts)
print(f'there are {len(unique_nans)} unique NaN intervals that are not similar to a more common NaN interval (of {len(nan_interval_counts[nan_interval_counts.squeeze()==1])} unique intervals)')
print(f'there are {len(unique_zeros)} unique zero intervals that are not similar to a more common zero interval (of {len(zero_interval_counts[zero_interval_counts.squeeze() == 1])} unique intervals)')

In [None]:
# Length (end - start) of every truly unique NaN interval ...
temp = nan_interval_counts.loc[unique_nans].reset_index().eval('end-start').to_frame('interval_length')
# ... and how often each length occurs.
temp = temp.value_counts().sort_index().to_frame('count').reset_index()
alt.Chart(temp, title = 'histogram of the length of the unique profiles').mark_bar().encode(
    x = alt.X('interval_length:N', title = 'Interval length'), 
    y = alt.Y('count:Q', title = '# unique profiles of given length')
)

So this shows that there are intervals that are not similar to any other interval but are still measurement errors.  
We can also see that these unique intervals are not necessarily short.  
**In conclusion, a high count can be indicative of measurement errors but a low count does not necessarily mean that it is a valid measurement.**

## Check if there are intervals that show up as zero and as NaN 

In [None]:
# intervals that occur twice, occur once as a zero interval and once as a nan interval
zero_nan_intervals = profile_intervals.reset_index()[['start', 'end', 'interval_value']].drop_duplicates().groupby(['start', 'end']).size() == 2
zero_nan_intervals = zero_nan_intervals.index[zero_nan_intervals].to_frame(index = False)
zero_nan_intervals['length'] = zero_nan_intervals['end'] - zero_nan_intervals['start']
zero_nan_intervals['length'].value_counts().sort_index()

So there clearly are intervals that occur both as a zero interval and as a nan interval!  
The lengths of these intervals are also not necessarily short (although short intervals are more common)  
This makes me think that there is some weird preprocessing involved

In [None]:
# Inner join: every per-profile interval row whose (start, end) is in zero_nan_intervals.
temp = zero_nan_intervals.set_index(['start', 'end']).join(profile_intervals.reset_index().set_index(['start', 'end', 'meterID', 'year']), how = 'inner' )
temp

//TODO visualize some of these co-occurrences? 

### plot the length of each interval vs its count 

In [None]:
def length_vs_count_scatter_plot(df):
    """Scatter the length of each interval against the number of profiles sharing it.

    ``df`` is indexed by ``start`` / ``end`` (or has them as columns after
    ``reset_index``) and has a ``#profiles`` column. The length
    (``end - start``) is drawn on a logarithmic y axis.
    """
    flat = df.reset_index()
    flat['length'] = flat['end'] - flat['start']
    log_length = alt.Y('length:Q', scale = alt.Scale(type = 'log'))
    points = alt.Chart(flat).mark_circle()
    return points.encode(x = '#profiles:N', y = log_length)
# One scatter plot per interval set, stacked vertically.
length_vs_count_scatter_plot(interval_counts).properties(title = 'all') & length_vs_count_scatter_plot(zero_interval_counts).properties(title = 'zero') & length_vs_count_scatter_plot(nan_interval_counts).properties(title = 'NaN')

So, most of these intervals are unique (there is only one meter that is zero/NaN in this interval)

In [None]:
# add this info to profile_intervals
# (the guard makes the cell safe to re-run: the column is joined only once)
if '#profiles' not in profile_intervals.columns: 
    profile_intervals = profile_intervals.join(interval_counts, on = ['start', 'end'])
profile_intervals

## Check if we can distinguish NaN intervals from 0 intervals
We'll use a decision tree to check if we can learn a function that figures out if a certain interval is zero or NaN given the other features (learned this trick from Elia)

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt # I don't want to :( but I have to

In [None]:
# Tree limited to depth 3.
tree = DecisionTreeClassifier(max_depth = 3)
# Target: NaN interval values are filled with 1, so class 1 = NaN interval and class 0 = zero interval.
y = profile_intervals['interval_value'].fillna(1).astype('int').values
# Features: all remaining columns except the identifiers, the target and the two timestamps.
X = profile_intervals.reset_index().drop(columns = ['meterID', 'year', 'interval_value', 'start_time', 'end_time'])
tree.fit(X.values,y)
plt.figure(figsize = (20,10))
plot_tree(tree, feature_names = X.columns);

I played around a little bit with this but there is nothing to conclude: they are not perfectly separable. 

## Check NaN interval stats

In [None]:
profile_intervals[profile_intervals.interval_value.isna()]

## Investigate the intervals with counts > 600

In [None]:
# Interval rows whose (start, end) is shared by more than 600 profiles.
very_high_count = profile_intervals[profile_intervals['#profiles']> 600]
# the index levels of those rows as plain columns
very_high_count_ids = very_high_count.index.to_frame(index = False)
# the distinct time windows involved
very_high_count[['start_time','end_time']].drop_duplicates().reset_index(drop = True)

In [None]:
# Work on a copy so the flag column does not end up in profile_intervals.
temp_df = profile_intervals.copy()
temp_df['high_count'] = temp_df['#profiles'] > 600
temp_df

# Two example profiles (rows 0 and 3 of the id table), restricted to daterange (1, 4), stacked vertically.
plot_profile_with_intervals(*very_high_count_ids.iloc[0,:2], 'high_count', data = temp_df, daterange = (1,4)) & plot_profile_with_intervals(*very_high_count_ids.iloc[3,:2], 'high_count', data = temp_df, daterange = (1,4))
# plot_profile_with_intervals(*very_high_count_ids.iloc[3,:2], 'high_count', data = temp_df)

## Investigate intervals with counts > 10 and < 35  

In [None]:
# Interval rows shared by more than 10 and fewer than 35 profiles.
high_count = profile_intervals[(profile_intervals['#profiles']> 10) & (profile_intervals['#profiles'] < 35)]
high_count_ids = high_count.index.to_frame(index = False)
# one row per distinct (start_time, end_time, interval_value), ordered by time
intervals_of_interest = high_count.drop_duplicates(['start_time', 'end_time', 'interval_value']).sort_values(['start_time', 'end_time']).reset_index(drop = True)
intervals_of_interest

In [None]:
intervals_of_interest.interval_length.value_counts().to_frame('count')

In [None]:
intervals_of_interest.interval_value.value_counts(dropna=False).to_frame('count')

In [None]:
# Work on a copy so the flag column does not end up in profile_intervals.
temp_df = profile_intervals.copy()
temp_df['high_count'] = (temp_df['#profiles'] > 10)&(temp_df['#profiles'] < 35)
temp_df

# Two example profiles (rows 0 and 3 of the id table), stacked vertically.
plot_profile_with_intervals(*high_count_ids.iloc[0,:2], 'high_count', data = temp_df) & plot_profile_with_intervals(*high_count_ids.iloc[3,:2], 'high_count', data = temp_df)
# plot_profile_with_intervals(*very_high_count_ids.iloc[3,:2], 'high_count', data = temp_df)

## Add value before and after each interval 

In [None]:
data16_df

In [None]:
# profile_intervals['value_after_interval'] =



## Add connection capacity 

In [None]:
# Reset helper: remove the column so the next cell joins it again. A column
# that is not there (first run, or reset already done) must not raise.
profile_intervals = profile_intervals.drop(columns = ['connection_power'], errors = 'ignore')

In [None]:
# Join the per-profile connection power once (the guard makes the cell safe to re-run).
if 'connection_power' not in profile_intervals.columns:
    connection_power = info16_df[['connection_power']]
    profile_intervals = profile_intervals.join(connection_power)
profile_intervals

## Check peaks due to connection_power

In [None]:
# Flag intervals whose following value exceeds the connection power.
if 'connection_power_peak' not in profile_intervals.columns:
    # The string 'end' becomes NaN, and a comparison with NaN is False.
    # np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
    value_after = profile_intervals['value_after_interval'].replace({'end': np.nan})
    profile_intervals['connection_power_peak'] = value_after > profile_intervals['connection_power'].astype('float')
profile_intervals

In [None]:
profile_intervals.connection_power_peak.value_counts().to_frame('count')

So clearly this rule only helps to detect very few peaks

# Let's look a bit deeper at profiles with lots of zeros

In [None]:
# Only the intervals whose value is 0 (this excludes the NaN intervals).
zero_intervals = profile_intervals.query('interval_value == 0')
zero_intervals

## Let's inspect the results by plotting them
So this is not clean anymore

In [None]:
# Zero intervals for which the value after the interval is missing.
profile_intervals_with_zero_followed_by_nan = profile_intervals[(profile_intervals.interval_value == 0)&(profile_intervals.value_after_interval.isna()) ]
profile_intervals_with_zero_followed_by_nan

In [None]:
# Index entries that have at least one zero interval longer than 4 timestamps ...
profiles_with_long_intervals = zero_intervals.index[zero_intervals.interval_length > 4].unique().to_list()
# ... and ALL zero intervals stored under those index entries (short ones included).
long_zero_intervals = zero_intervals.loc[profiles_with_long_intervals, :]

In [None]:
# NOTE(review): plot_profile_with_period_marks is not defined in the visible part of this notebook;
# presumably an earlier name of plot_profile_with_intervals - confirm before running.
plot_profile_with_period_marks(long_zero_intervals.index[-100], data = long_zero_intervals )

In [None]:
data16_df.loc[long_zero_intervals.index[-100]].value_counts()

In [None]:
# Two plots stacked vertically with a shared x scale.
# NOTE(review): plot_profile_with_period_marks is not defined in the visible part of this notebook - confirm.
(plot_profile_with_period_marks(2) & plot_profile_with_period_marks(3)).resolve_scale(x = 'shared')