
# Imports and set-up

In [None]:
# Reload edited modules automatically before each cell is executed (autoreload mode 2).
%load_ext autoreload
%autoreload 2

In [None]:
import dask.dataframe as dd #conda install dask
from dask.distributed import Client
import altair as alt
import numpy as np
import pandas as pd
from pathlib import Path
import itertools
import datetime
import tqdm
# Shorthand for slicing on a MultiIndex, e.g. idx[:, 2016].
idx = pd.IndexSlice
# Lift Altair's default limit on the number of rows a chart may contain.
alt.data_transformers.disable_max_rows()

In [None]:
from interval_information import get_interval_df
from peak_detection import (
    get_cumulative_value_detections, 
    get_connection_and_pv_power_peaks, 
    get_knn_similarity_based_peaks,
    match_knn_then_assumption_parallel
)
from zero_intervals import (
    sign_change_intervals, 
    low_consumption_on_both_sides_intervals
)

In [None]:
# Locations of the preprocessed input data and of the result folder.
PRE_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/preprocessed/combined')
RESULT_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/cumulative_value_detection')
info_path = PRE_PATH/'info.csv'
data_path = PRE_PATH/'data.csv'
# Fail early and name the offending file(s); unlike `assert`, this check is
# not stripped when Python runs with -O.
missing_paths = [str(path) for path in (info_path, data_path) if not path.exists()]
if missing_paths:
    raise FileNotFoundError(f"These paths should exist: {', '.join(missing_paths)}")

# Helpers

In [None]:
def plot_intervals(meterID, year, start_time, context = '10D'):
    """Plot the profiles of one or more meters around a given moment.

    Parameters
    ----------
    meterID : scalar or list-like
        Meter ID(s) to draw, matched against the first index level of
        ``data16_df``. Unknown IDs are skipped.
    year : int
        Year of the profiles, matched against the second index level.
    start_time : timestamp-like
        Moment on which the plotted window is centred.
    context : str or timedelta-like
        Amount of time shown before and after ``start_time``.

    Returns
    -------
    Altair chart with one line per meter; only the x-axis is zoomable.
    """
    meter_ids = list(meterID) if pd.api.types.is_list_like(meterID) else [meterID]
    centre = pd.Timestamp(start_time)
    margin = pd.Timedelta(context)

    # Select the requested profiles (rows) and the time window (columns).
    # NOTE(review): assumes the column timestamps are timezone-naive, like
    # the strings used for slicing elsewhere in this notebook - confirm.
    row_mask = (
        data16_df.index.get_level_values(0).isin(meter_ids)
        & (data16_df.index.get_level_values(1) == year)
    )
    timestamps = data16_df.columns
    column_mask = (timestamps >= centre - margin) & (timestamps <= centre + margin)
    window = data16_df.loc[row_mask, column_mask]

    # Altair needs long-form data with string column names, whereas the wide
    # frame has one Timestamp-named column per measurement.
    n_profiles, n_timestamps = window.shape
    long_df = pd.DataFrame({
        'meterID': window.index.get_level_values(0).repeat(n_timestamps).to_numpy(),
        'timestamp': np.tile(window.columns.to_numpy(), n_profiles),
        'value': window.to_numpy().ravel(),
    })

    line = alt.Chart(long_df).mark_line().encode(
        x = 'timestamp:T',
        y = 'value:Q',
        color = 'meterID:N'
    )
    return line.properties(width = 2200).interactive(bind_y = False)
    

In [None]:
def plot_profile_with_intervals(meterID, year, period_type_column = None, data = None, daterange = None):
    """Plot one profile with its intervals drawn on top of it.

    Parameters
    ----------
    meterID, year
        Index key of the profile in ``data16_df`` and of its intervals in
        ``data``.
    period_type_column : str, optional
        Column of ``data`` used (as a nominal field) to colour the intervals.
        When omitted, all intervals are blue.
    data : DataFrame, optional
        Interval table with at least ``start_time`` and ``end_time`` columns.
        Defaults to the global ``nan_intervals``.
        NOTE(review): ``nan_intervals`` is not defined in the visible cells -
        confirm it exists before relying on the default.
    daterange : pair of month numbers, optional
        ``(first_month, last_month)``; restricts the plot to the period from
        the first day of ``first_month`` to the first day of ``last_month``
        of ``year``.

    Returns
    -------
    Interactive Altair chart (intervals, profile line and, if available, the
    connection power as a horizontal rule).
    """
    if data is None:
        data = nan_intervals

    periods_for_profile = data.loc[(meterID, year), :]
    if daterange is None:
        profile_df = data16_df.loc[(meterID, year), :]
    else:
        # Use the requested year instead of a hard-coded 2016.
        start_time = f'{year}-{daterange[0]}-1 00:00:00'
        end_time = f'{year}-{daterange[1]}-1 00:00:00'
        profile_df = data16_df.loc[(meterID, year), start_time:end_time]
        # Keep every interval that overlaps the requested window.
        overlaps_window = (periods_for_profile['end_time'] > start_time) & (periods_for_profile['start_time'] < end_time)
        periods_for_profile = periods_for_profile[overlaps_window]

    line = alt.Chart(profile_df.to_frame('value').reset_index()).mark_line().encode(
        x = alt.X('timestamp:T'),
        y = alt.Y('value:Q')
    )

    if period_type_column is None:
        color_encoding = alt.ColorValue('blue')
    else:
        color_encoding = alt.Color(f'{period_type_column}:N')

    plot_df = periods_for_profile.reset_index(drop=True)
    # Shaded rectangle per interval plus a circle marking each interval start.
    # NOTE(review): alt.YValue sets a constant channel value rather than a
    # data value - confirm the circles end up where intended.
    rect = alt.Chart(plot_df).mark_rect(opacity = 0.6).encode(
        x = 'start_time:T',
        x2 = 'end_time:T',
        color = color_encoding
    ) + alt.Chart(plot_df).mark_circle(opacity = 0.6).encode(
        x = 'start_time:T',
        y = alt.YValue(profile_df.max()),
        color = color_encoding
    )
    chart = rect + line

    if 'connection_power' in periods_for_profile.columns:
        # Horizontal rule at the mean connection power of the plotted intervals.
        connection_power_line = alt.Chart(periods_for_profile.reset_index()).mark_rule(color = 'black', opacity = 0.8).encode(
            y =  'mean(connection_power):Q'
        )
        chart += connection_power_line
    return chart.properties(width = 2200, title = f"{meterID} in {year}").interactive()

In [None]:
def confusion_matrix(name1, series1, name2, series2):
    """Cross-tabulate two series of labels.

    ``series1`` provides the rows (axis named ``name1``) and ``series2`` the
    columns (axis named ``name2``); each cell holds the number of positions
    where that combination of values occurs.
    """
    table = pd.crosstab(
        index=series1,
        columns=series2,
        rownames=[name1],
        colnames=[name2],
    )
    return table

In [None]:
def detection_summary(series):
    """Summarise how often each value (NaN included) occurs in ``series``.

    Returns a DataFrame indexed by the distinct values with two columns:
    ``count`` (absolute frequency) and ``relative`` (share of the total).
    """
    summary = series.value_counts(dropna=False).to_frame('count')
    total = summary['count'].sum()
    summary['relative'] = summary['count'] / total
    return summary

In [None]:
def combine_strategies(*args):
    """Merge the verdicts of several detection strategies into one series.

    Each argument is a series holding True (error), False (normal) or NaN
    (no verdict) per interval. The series are aligned on their index.

    Per interval the combined verdict is:
    - False if at least one strategy says False (normal takes precedence),
    - otherwise True if at least one strategy says True,
    - otherwise NaN.

    Returns an object-dtype Series indexed like the aligned inputs.
    """
    strategies = pd.concat(args, axis = 1)
    any_normal = strategies.eq(False).any(axis = 1)
    any_error = strategies.eq(True).any(axis = 1)

    result = pd.Series(index = strategies.index, dtype = 'object')
    # Order matters: the False assignment comes last so it overrides True.
    result[any_error] = True
    result[any_normal] = False
    return result
    

## Read the data

In [None]:
# Both files use their first two columns as the (two-level) row index.
info_df = pd.read_csv(info_path, index_col=[0, 1])
data_df = pd.read_csv(data_path, index_col=[0, 1])
# The column labels of the data file are parsed as timestamps and the column axis is named accordingly.
data_df.columns = pd.to_datetime(data_df.columns).rename('timestamp')


In [None]:
DATA_SOURCE = 'EandisVREG'
YEAR = 2016
# get the right subset based on the info df: only profiles of YEAR that
# originate from DATA_SOURCE (the constants are used so that changing them
# above actually changes the selection)
info16_df = info_df.loc[idx[:, YEAR],:]
info16_df = info16_df[info16_df.data_source == DATA_SOURCE]

# read the corresponding data profiles
data16_df = data_df.loc[info16_df.index, :]


## Only investigate timeseries with data problems

In [None]:
# Per profile: the number of zero measurements ...
nb_of_zeros = data16_df.eq(0).sum(axis=1)
# ... and whether it contains any missing value (a boolean flag, not a count).
nb_of_nan = data16_df.isna().any(axis=1)
# Keep only the profiles with at least one zero or at least one NaN.
data16_df = data16_df[(nb_of_zeros > 0) | nb_of_nan]

## Construct the intervals
In the rest of this notebook we construct the intervals as a dataset, add different attributes/features to them, and investigate whether each of these is useful.

In [None]:
%%time
# Build the interval table from the selected profiles and their metadata.
# Going by the keyword arguments, both zero and NaN intervals are kept -
# TODO confirm against get_interval_df.
interval_df = get_interval_df(data16_df, info16_df, keep_zero = True, keep_nan = True)
interval_df

# Collective periods based on start time

In [None]:
# Merge the verdicts of the two earlier detection strategies and summarise them.
# NOTE(review): low_consumption_detection and sign_change_detection are not
# defined in the visible cells - confirm they are computed before this cell runs.
current_detection = combine_strategies(low_consumption_detection, sign_change_detection)
detection_summary(current_detection)

In [None]:
# don't look at the intervals we have marked as normal already;
# intervals without a verdict (NaN) compare unequal to False, so they are kept
# together with the ones marked True
rel_interval_df = interval_df[current_detection != False]
rel_interval_df

In [None]:
# Number of remaining intervals per distinct `start` value.
interval_counts = (
    rel_interval_df
    .reset_index()
    .groupby('start')[['meterID', 'year']]
    .size()
)
# Attach that number to every interval as the column `count`.
intervals_with_count = rel_interval_df.join(
    interval_counts.to_frame('count'),
    on=['start'],
)

# Only starts shared by at least 33 intervals are considered collective.
intervals_with_count = intervals_with_count.loc[intervals_with_count['count'] >= 33]

# filter each group of intervals that start on the same moment, only allow intervals with the most common length +- a threshold (2 by default)
def filter_groups(df, threshold = 2):
    """Keep the intervals whose length is close to the group's most common length.

    Parameters
    ----------
    df : DataFrame
        Non-empty group of intervals with an ``interval_length`` column.
    threshold : number
        Maximum allowed deviation (inclusive) from the most common length.

    Returns
    -------
    The rows of ``df`` with ``interval_length`` in
    ``[most_common - threshold, most_common + threshold]``.
    """
    most_common_value = df.interval_length.value_counts().idxmax()
    within_tolerance = df.interval_length.between(most_common_value - threshold, most_common_value + threshold)
    return df[within_tolerance]
# Apply the length filter to each group of intervals with the same start_time;
# droplevel(0) presumably removes the group key that apply() adds to the index.
# NOTE(review): the counts are computed per `start` but the grouping here is
# on `start_time` - confirm both columns identify the same moment.
intervals_with_count = intervals_with_count.groupby('start_time').apply(filter_groups).droplevel(0)
# each of the intervals that remains is thus a collective data problem and is a data error
# (all other intervals keep NaN, i.e. no verdict from this strategy)
collective_data_problems  = pd.Series(index = interval_df.index, dtype = 'object')
collective_data_problems.loc[intervals_with_count.index] = True
detection_summary(collective_data_problems)

In [None]:
# Combine all three strategies and summarise the verdicts for the intervals
# whose interval_value equals 0.
current_result = combine_strategies(sign_change_detection,low_consumption_detection, collective_data_problems)
detection_summary(current_result[interval_df.interval_value == 0])

### Visualise some results 

In [None]:
# Distinct values of the `count` column among the remaining intervals, ascending.
np.sort(intervals_with_count['count'].unique())

In [None]:
# Inspect the remaining intervals whose `count` is exactly 51.
intervals = intervals_with_count[(intervals_with_count['count'] == 51)]
start_times = intervals.start_time.unique()
print(f"len start times = {len(start_times)}")
# Position (in start_times) of the start time to visualise; change it to browse.
START_IDX = 0
start_time = start_times[START_IDX]
print(f"showing start time {start_time}")
intervals_to_plot = intervals[intervals.start_time == start_time]
display(intervals_to_plot)
# Plot every affected meter around the shared start time; YEAR is used instead
# of repeating the literal year so that the cell follows the selection above.
plot_intervals(intervals_to_plot.index.get_level_values(0).unique(), YEAR, start_time = start_time, context = '1D')