# Generate annotated profiles

In [None]:
import altair as alt
import numpy as np
import pandas as pd
from pathlib import Path
import itertools
import datetime
import random
# shorthand for slicing MultiIndex levels, e.g. idx[:, year]
idx = pd.IndexSlice
# lift Altair's limit on the number of rows a chart dataset may contain
alt.data_transformers.disable_max_rows()

In [None]:
# this reloads code from external modules automatically if it is changed (without having to restart the kernel)
%load_ext autoreload
%autoreload 2

In [None]:
from interval_information import get_interval_df

In [None]:
# Root directories: preprocessed input data and the error-detection results.
PRE_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/preprocessed/combined')
RESULT_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/error_detection')
RESULT_PATH.mkdir(mode=0o770, parents=True, exist_ok=True)

# Input files that are read by later cells of this notebook.
result_path = RESULT_PATH / 'cumulative_value_detection.csv'
zero_path = RESULT_PATH / 'zero_interval_is_error.csv'
info_path = PRE_PATH / 'info.csv'
data_path = PRE_PATH / 'data.csv'
# Output file written by this notebook.
interval_path = RESULT_PATH / 'intervals_with_info.csv'

# Fail early (and also under `python -O`, where asserts are stripped) when an
# input is missing; result_path is included because it is read unconditionally below.
missing_inputs = [
    str(path)
    for path in (info_path, data_path, zero_path, result_path)
    if not path.exists()
]
if missing_inputs:
    raise FileNotFoundError(f'These paths should exist: {missing_inputs}')

# Read info and data

In [None]:
# Both CSV files use their first two columns as a two-level row index.
# NOTE(review): presumably (meterID, year) -- confirm against the preprocessing step.
info_df = pd.read_csv(info_path, index_col=[0, 1])
data_df = pd.read_csv(data_path, index_col=[0, 1])

# The column labels of the data file are parsed into datetimes and the
# column axis is named 'timestamp' in one go.
data_df.columns = pd.to_datetime(data_df.columns).rename('timestamp')


In [None]:
DATA_SOURCE = 'EandisVREG'
YEAR = 2016

# Select the profiles of the configured year (second index level) and data source.
# The constants above are the single place to change the subset.
info16_df = info_df.loc[idx[:, YEAR], :]
info16_df = info16_df[info16_df.data_source == DATA_SOURCE]

# read the corresponding data profiles
data16_df = data_df.loc[info16_df.index, :]


# Read zero errors and cumulative values

In [None]:
# Both detection files are keyed on the same four columns, so they are loaded
# the same way: read the CSV and move the key columns into the index.
zero_detections, cumulative_value_detections = (
    pd.read_csv(detection_path).set_index(['meterID', 'year', 'start', 'end'], drop=True)
    for detection_path in (zero_path, result_path)
)


## Only investigate timeseries with data problems

In [None]:
# nb of zeros for each profile
# number of zero readings in each profile
nb_of_zeros = data16_df.eq(0).sum(axis=1)
# despite its name this is a boolean flag: True when the profile has at least one NaN
nb_of_nan = data16_df.isna().any(axis=1)
# keep only the profiles that contain a zero or a NaN
data16_df = data16_df[nb_of_zeros.gt(0) | nb_of_nan]
# data16_df

## Construct the intervals
In the rest of this notebook we construct the intervals as a dataset, add different attributes/features to them, and investigate whether these could be useful.

In [None]:
%%time
# Build the interval dataset from the selected profiles and their info.
# NOTE(review): presumably one row per zero/NaN interval, since both keep_zero
# and keep_nan are enabled -- confirm in interval_information.get_interval_df
interval_df = get_interval_df(data16_df, info16_df, keep_zero = True, keep_nan = True)
# interval_df

In [None]:
# Attach the zero-interval and cumulative-value detections to every interval
# (DataFrame.join matches on the index and keeps all rows of interval_df).
intervals = interval_df.join(zero_detections).join(cumulative_value_detections)
# NaN intervals are always treated as errors. The mask is taken from `intervals`
# itself so that it is guaranteed to line up with the rows being assigned.
intervals.loc[intervals.interval_value.isna(), 'is_error'] = True
intervals = intervals.rename(columns={'followed_by_cumulative_value': 'is_cumulative_value'})
intervals

In [None]:
# persist the annotated intervals (index included) to intervals_with_info.csv
intervals.to_csv(interval_path)

# Subsample these at random 

In [None]:
# np.random.seed(331345)
# meterIDs = np.random.choice(intervals.index.get_level_values(0).unique(), 10, replace = False)
# data16_df = data16_df.loc[meterIDs].sort_index()
# intervals = intervals.loc[meterIDs].sort_index()
# del interval_df
# del info16_df

# Plot function

In [None]:
def annotate_profile(meterID, year):
    """Build an interactive Altair chart of one profile with its annotated intervals.

    The function reads two notebook-level DataFrames: ``data16_df`` (the
    consumption values, one column per timestamp) and ``intervals`` (the
    intervals joined with the detection results).

    The returned chart layers:

    * a shaded rectangle from ``start_time`` to ``end_time`` for every
      interval, coloured by interval type (``normal``, ``don't know``, ``error``);
    * a square at the top edge of the plot at every interval start, so that
      very narrow intervals remain visible;
    * for intervals flagged as error, a circle at ``end_time`` /
      ``0th_value_after_end`` coloured by ``is_cumulative_value``;
    * the consumption line itself;
    * horizontal rules at the connection power and at minus the PV power.

    Parameters
    ----------
    meterID, year
        Together they select the profile row in ``data16_df`` and the
        matching rows (first two index levels) in ``intervals``.

    Returns
    -------
    The layered chart, sized 900x400, titled ``"<meterID> in <year>"`` and
    made interactive (zoom/pan).
    """
    # long format: one row per timestamp with the measured value
    profile_df = data16_df.loc[(meterID, year), :].to_frame('value').reset_index()
    profile_intervals = (
        intervals.loc[(meterID, year), :]
        .reset_index(drop=True)
        # intervals without a cumulative-value verdict are shown as "don't know"
        .fillna({'is_cumulative_value': "don't know"})
    )

    # Derive the label that drives the colouring: NaN intervals are always
    # errors, zero intervals follow the (possibly missing) is_error verdict.
    is_nan_interval = profile_intervals.interval_value.isna()
    is_zero_interval = profile_intervals.interval_value == 0
    profile_intervals.loc[is_nan_interval, 'interval_type'] = 'error'
    profile_intervals.loc[is_zero_interval & (profile_intervals.is_error), 'interval_type'] = 'error'
    profile_intervals.loc[is_zero_interval & (profile_intervals.is_error.isna()), 'interval_type'] = "don't know"
    profile_intervals.loc[is_zero_interval & (profile_intervals.is_error == False), 'interval_type'] = 'normal'

    line = alt.Chart(profile_df).mark_line().encode(
        x=alt.X('timestamp:T', title='timestamp'),
        y=alt.Y('value:Q', title='consumption (in kWh)'),
    )

    # shared colour encoding so the shading and the squares use one legend
    interval_type_color = alt.Color(
        'interval_type:N',
        scale=alt.Scale(domain=['normal', "don't know", "error"]),
    )
    period_shading = alt.Chart(profile_intervals).mark_rect(opacity=0.6).encode(
        x='start_time:T',
        x2='end_time:T',
        color=interval_type_color,
        tooltip=['interval_type', 'interval_value'],
    )
    period_dot = alt.Chart(profile_intervals).mark_square(size=100).encode(
        x='start_time:T',
        # A y *value* is a pixel offset from the top of the plot, not a data
        # value, so 0 pins the squares to the top edge for every profile
        # (the profile maximum was only near the top when it was small).
        y=alt.YValue(0),
        color=interval_type_color,
        tooltip=['interval_type', 'interval_value'],
    )

    # is_error can hold True/False/NaN, hence the explicit comparison with True
    error_intervals = profile_intervals[profile_intervals.is_error == True]
    cumulative_value_dot = alt.Chart(error_intervals).mark_circle(size=200).encode(
        x='end_time:T',
        y='0th_value_after_end:Q',
        color=alt.Color('is_cumulative_value:N', scale=alt.Scale(domain=[False, "don't know", True])),
        tooltip=['is_cumulative_value'],
    )

    # The first interval row supplies both powers; the PV power is negated
    # so that its rule is drawn below zero.
    connection_power = profile_intervals.connection_power.iat[0]
    pv_power = -profile_intervals.PV_power.iat[0]
    connection_power_line = alt.Chart(pd.DataFrame({'y': [connection_power, pv_power]})).mark_rule(color='black', opacity=0.8).encode(
        y='y:Q'
    )

    # shading + squares share a legend; the cumulative-value dots get their own
    period_chart = alt.layer(period_shading, period_dot).resolve_legend(color='shared')
    chart = alt.layer(period_chart, cumulative_value_dot, line, connection_power_line).resolve_scale(color='independent').resolve_legend(color='independent')

    return chart.properties(width=900, height=400, title=f"{meterID} in {year}").interactive()



# Some annotated profiles
First of all, when the profile is 0 or NaN the background of the plot is shaded in a color that indicates the detected interval type: 
- **normal**: this is a zero interval that was detected as normal
- **don't know**: our method is not entirely sure; when working with the profiles we will consider these intervals as normal
- **error**: this is a zero or NaN interval due to a measurement error and thus this is missing data 

NaN intervals are always classified as an error, zero intervals can be classified as normal, don't know or error.  
Because some intervals might be very narrow, squares are added at the top of the chart to show the location of the intervals. 

Second, when an interval is classified as an error, the next value in the timeseries gets a dot that indicates whether it is a cumulative value or not: 
- **false**: this is a normal measurement (and thus should be included)
- **don't know**: our method is not entirely sure; when working with the profiles these values will be dropped
- **true**: our method detects these as cumulative measurements; when working with these profiles these values should be dropped (or be interpreted as cumulative values*)

Finally, the connection capacity and the negative PV_power (if known) are shown as horizontal black lines.  
You should be able to zoom in on the charts interactively.


*Note: the profiles shown are chosen at random*

### Profiles with injection

In [None]:
# meterIDs (first index level) of the intervals whose PV_power is known
injection_profiles = (
    interval_df.loc[interval_df.PV_power.notna()]
    .index.get_level_values(0)
    .unique()
)

In [None]:
# first meterID with a known PV_power
annotate_profile(injection_profiles[0],2016)

In [None]:
# second meterID with a known PV_power
annotate_profile(injection_profiles[1],2016)

### Profiles with no injection that still have zeros


In [None]:
# meterIDs without a known PV_power, ordered from most to fewest rows in
# interval_df whose interval_value is 0
no_injection_zero_profiles = interval_df[interval_df.PV_power.isna() & (interval_df.interval_value == 0)].index.get_level_values(0).to_series().value_counts().sort_values(ascending = False).index

#### A lot of zero values

In [None]:
# position 100 in the list ordered by descending number of zero intervals
annotate_profile(no_injection_zero_profiles[100],2016)

#### A few zeros

In [None]:
# position 400 in the same list, i.e. a profile with fewer zero intervals
annotate_profile(no_injection_zero_profiles[400], 2016)

### profiles with NaN intervals with cumulative values

In [None]:
# meterIDs that have at least one NaN interval flagged as followed by a cumulative value
nan_intervals = (
    intervals.loc[intervals.interval_value.isna() & intervals.is_cumulative_value]
    .index.get_level_values(0)
    .unique()
)

In [None]:
# first meterID with a NaN interval followed by a cumulative value
annotate_profile(nan_intervals[0],2016)

In [None]:
# second meterID with a NaN interval followed by a cumulative value
annotate_profile(nan_intervals[1],2016)

### profiles with zero intervals with cumulative values

In [None]:
# meterIDs that have at least one non-NaN interval flagged as followed by a cumulative value
zero_intervals = (
    intervals.loc[intervals.interval_value.notna() & intervals.is_cumulative_value]
    .index.get_level_values(0)
    .unique()
)

In [None]:
# first meterID with a non-NaN interval followed by a cumulative value
annotate_profile(zero_intervals[0],2016)

In [None]:
# second meterID with a non-NaN interval followed by a cumulative value
annotate_profile(zero_intervals[1],2016)