# The goal here is to detect connection problems in the data so that we can handle them later
The main idea is to look for periods where multiple meters have zero measurements; these periods are called disconnection periods.



## Imports and set-up

In [None]:
import calendar
import datetime
import itertools
from pathlib import Path

import altair as alt
import numpy as np
import pandas as pd
import tqdm
from scipy.signal import find_peaks
from scipy.stats import norm
# Shorthand for label-based slicing on MultiIndex rows (used below as idx[:, year]).
idx = pd.IndexSlice
# Switch off Altair's max-rows check; the charts below are built from long time series.
alt.data_transformers.disable_max_rows()

In [None]:
# Location of the preprocessed, combined dataset.
PRE_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/preprocessed/combined')
info_path = PRE_PATH / 'info.csv'
data_path = PRE_PATH / 'data.csv'

# Fail early and name the offending path(s). Unlike `assert`, this check is
# not stripped when Python runs with -O.
missing_paths = [str(path) for path in (info_path, data_path) if not path.exists()]
if missing_paths:
    raise FileNotFoundError(f'These paths should exist: {missing_paths}')

## Read the data

In [None]:
# Load the profile metadata; the first two CSV columns become the row MultiIndex.
info_df = pd.read_csv(info_path, index_col = [0,1])
info_df.head()

In [None]:
# Load the measurements; the first two CSV columns become the row MultiIndex,
# every remaining column holds the values for one timestamp.
data_df = pd.read_csv(data_path, index_col = [0,1])
# The column labels come out of the CSV as text: parse them into timestamps.
data_df.columns = pd.to_datetime(data_df.columns)
# Name the column axis; the plotting code later refers to it as 'timestamp'.
data_df.columns.name = 'timestamp'
data_df.head()

In [None]:
# Split the years that occur in the info index into leap and non-leap years.
# The split is computed with calendar.isleap rather than a hard-coded list, so
# leap years other than 2012 and 2016 are classified correctly as well.
years_in_data = list(info_df.index.levels[1])
leap_years = [year for year in years_in_data if calendar.isleap(year)]
non_leap_years = [year for year in years_in_data if not calendar.isleap(year)]
print(f'leap years = {leap_years}')
print(f'non leap years = {non_leap_years}')

## Handle all data sources and years separately
Of course, connection problems need to occur in the same year and within the same measurement project, so for now let's use the EandisVREG data of 2016.

In [None]:
DATA_SOURCE = 'EandisVREG'
YEAR = 2016
# Get the right subset based on the info df.
# The selection is driven by the two constants above, so changing the data
# source or the year only requires editing one place.
info16_df = info_df.loc[idx[:, YEAR], :]
info16_df = info16_df[info16_df.data_source == DATA_SOURCE]
info16_df

In [None]:
# Read the corresponding data profiles: keep exactly the rows of data_df whose
# index entries occur in the selected info subset.
data16_df = data_df.loc[info16_df.index, :]
data16_df

## Look at the amount of NaNs and Zeros

In [None]:
# Per profile (row): number of missing measurements and number of
# measurements that are exactly zero.
nb_of_na = data16_df.isna().sum(axis=1)
nb_of_zeros = data16_df.eq(0).sum(axis=1)

## Look at profiles with potential problems

In [None]:
# Keep only the profiles that contain at least one NaN or at least one zero.
has_nan_or_zero = nb_of_na.gt(0) | nb_of_zeros.gt(0)
data16_df = data16_df.loc[has_nan_or_zero, :]
data16_df

## Construct the intervals
So in the rest of this code we simply construct the intervals as a dataset and add different attributes/features and investigate whether they could be useful or not

In [None]:
# Code to find the intervals in which a profile only contains one given value.
def value_interval(meterID, year, a, value):
    """
    Find every maximal run of consecutive entries of `a` that equal `value`.

    Parameters
    ----------
    meterID, year
        Labels written into the index of the result (one copy per run).
    a
        One-dimensional sequence of measurements (numpy array or pandas Series).
    value
        The value to look for. A missing value (NaN/None) selects the missing
        entries of `a`, because NaN never compares equal to itself.

    Returns
    -------
    pandas.DataFrame
        One row per run, indexed by (meterID, year), with the columns
        'start' (position of the first matching entry, inclusive),
        'end' (position just after the last matching entry, exclusive) and
        'interval_value' (the searched value).
    """
    # Work on a plain ndarray so the result does not depend on pandas-specific
    # methods of the input (Series.view is deprecated).
    values = np.asarray(a)
    if pd.isna(value):
        is_match = pd.isna(values)
    else:
        is_match = values == value
    # Pad both ends with a 0 so that runs touching the borders also produce a
    # start and an end transition.
    padded = np.concatenate(([0], is_match.astype(np.int8), [0]))
    # Every change 0 -> 1 opens a run and every change 1 -> 0 closes it, so the
    # transition positions alternate start, end, start, end, ...
    ranges = np.flatnonzero(np.diff(padded) != 0).reshape(-1, 2)
    df = pd.DataFrame(ranges, columns = ['start', 'end'])
    df['meterID'] = meterID
    df['year'] = year
    df['interval_value'] = value
    return df.set_index(['meterID', 'year'])

def zero_nan_intervals_df(data_df):
    """
    Collect the NaN intervals and the zero intervals of every profile in `data_df`.

    `data_df` is iterated row by row; each row label is unpacked into
    (meterID, year). For every row, `value_interval` is called once for NaN
    and once for 0, and all results are concatenated (per row: NaN intervals
    first, then zero intervals).

    Returns the concatenated dataframe produced by `value_interval`.
    """
    dfs = []
    for (meterID, year), row in data_df.iterrows():
        # np.nan is used because the np.NaN alias no longer exists in NumPy 2.0
        dfs.append(value_interval(meterID, year, row, np.nan))
        dfs.append(value_interval(meterID, year, row, 0))
    return pd.concat(dfs, axis = 0)

# Build the interval table for the selected profiles and add length and time bounds.
profile_intervals = zero_nan_intervals_df(data16_df)
profile_intervals['interval_length'] = profile_intervals['end'] - profile_intervals['start']

# start_time and end_time are both exclusive (this plots well with altair):
# the timestamp of the first entry of the interval is moved 15 minutes back and
# the timestamp of the last entry 15 minutes forward. The last entry
# (`end - 1`) is looked up instead of `end`, because the timestamp at `end`
# might not exist in the columns.
first_timestamp = data16_df.columns[profile_intervals['start']]
last_timestamp = data16_df.columns[profile_intervals['end'] - 1]
profile_intervals['start_time'] = first_timestamp - pd.Timedelta('15min')
profile_intervals['end_time'] = last_timestamp + pd.Timedelta('15min')

profile_intervals = profile_intervals.set_index(['start', 'end'], append=True)
profile_intervals

*notes:*  
- start is inclusive, end is exclusive so the interval is $[start, end[$  
- start_time and end_time are both exclusive $]start\_time, end\_time[$  
This works better for plotting in altair
     

## Remove the missing hour on March 27 due to the change from winter to summer time

In [None]:
# Drop the intervals that span exactly ]02:00, 03:00[ on 2016-03-27.
starts_at_gap = profile_intervals['start_time'] == '2016-03-27 02:00:00'
ends_at_gap = profile_intervals['end_time'] == '2016-03-27 03:00:00'
profile_intervals = profile_intervals[~(starts_at_gap & ends_at_gap)]

## Add the next two values after each interval

In [None]:
def values_after_end(row):
    """
    Return the two measurements that directly follow an interval.

    `row` is one row of the interval table; its label (`row.name`) is unpacked
    into (meterID, year, start, end). The values are read from the global
    `data16_df` at column positions `end` and `end + 1`.

    Returns a tuple (first_value, second_value). A position that lies past the
    last column is reported as the string 'end'.
    """
    meterID, year, _start, end = row.name

    def value_at(position):
        # `end` is exclusive, so `end` or `end + 1` can point past the last
        # column. Only that case is translated to 'end'; any other error (for
        # example an unknown profile) is a real problem and is not hidden.
        try:
            timestamp = data16_df.columns[position]
        except IndexError:
            return 'end'
        return data16_df.at[(meterID, year), timestamp]

    return value_at(end), value_at(end + 1)

# Guarded so that re-running the cell does not add the columns a second time.
if 'first_value_after_end' not in profile_intervals.columns:
    after_value_names = {0:'first_value_after_end', 1:'second_value_after_end'}
    after_values_df = profile_intervals.apply(values_after_end, axis=1, result_type='expand')
    after_values_df = after_values_df.rename(columns=after_value_names)
    profile_intervals = pd.concat([profile_intervals, after_values_df], axis=1)
profile_intervals

## Add connection capacity 

In [None]:
# Guarded so that re-running the cell does not try to join the column twice.
if 'connection_power' not in profile_intervals.columns:
    # single-column frame, joined onto the interval table
    connection_power = info16_df.loc[:, ['connection_power']]
    profile_intervals = profile_intervals.join(connection_power)
profile_intervals

## Check peaks due to connection_power

In [None]:
if 'is_connection_power_peak' not in profile_intervals.columns:
    # An interval is followed by a connection power peak if the first or the
    # second value after the interval is higher than the connection power.
    capacity = profile_intervals['connection_power'].astype('float')
    # The 'end' marker (no value after the interval) is coerced to NaN, so the
    # comparisons below are simply False for it. pd.to_numeric also yields
    # float columns and avoids the np.NaN alias that was removed in NumPy 2.0.
    first_after = pd.to_numeric(profile_intervals['first_value_after_end'], errors='coerce')
    second_after = pd.to_numeric(profile_intervals['second_value_after_end'], errors='coerce')
    profile_intervals['is_connection_power_peak'] = (first_after > capacity) | (second_after > capacity)
profile_intervals

In [None]:
# Count how many intervals fall into each value of the is_connection_power_peak flag.
profile_intervals.is_connection_power_peak.value_counts().to_frame('count')

So clearly this rule only helps to detect very few peaks

### Plotting for profiles with periods

In [None]:
def plot_profile_with_intervals(meterID, year, period_type_column = None, data = None, daterange = None):
    """
    Plot one profile as a line with its intervals drawn on top of it.

    Parameters
    ----------
    meterID, year
        Identify the profile in `data16_df` and the intervals in `data`.
    period_type_column
        Optional column of `data` used (as a nominal field) to colour the
        intervals; without it every interval is drawn in blue.
    data
        Interval table to plot; defaults to the global `profile_intervals`.
    daterange
        Optional pair whose two entries are filled into the month position of
        '<year>-<month>-1 00:00:00' to restrict the plot to that window.
        Only intervals that overlap the window are kept.

    Returns
    -------
    The layered, interactive altair chart.
    """
    if data is None:
        data = profile_intervals
    periods_for_profile = data.loc[(meterID, year), :]
    if daterange is not None:
        # Use the requested year for the window (it used to be fixed to 2016).
        start_time = f'{year}-{daterange[0]}-1 00:00:00'
        end_time = f'{year}-{daterange[1]}-1 00:00:00'
        profile_df = data16_df.loc[(meterID, year), start_time:end_time]
        overlaps_window = (periods_for_profile['end_time'] > start_time) & (periods_for_profile['start_time'] < end_time)
        periods_for_profile = periods_for_profile[overlaps_window]
    else:
        profile_df = data16_df.loc[(meterID, year), :]

    # the measurements themselves
    line = alt.Chart(profile_df.to_frame('value').reset_index()).mark_line().encode(
        x = alt.X('timestamp:T'),
        y = alt.Y('value:Q')
    )
    if period_type_column is None:
        color_encoding = alt.ColorValue('blue')
    else:
        color_encoding = alt.Color(f'{period_type_column}:N')
    plot_df = periods_for_profile.reset_index(drop=True)
    # Each interval is drawn as a rectangle from start_time to end_time, plus a
    # circle at start_time.
    rect = alt.Chart(plot_df).mark_rect(opacity = 0.6).encode(
        x = 'start_time:T',
        x2 = 'end_time:T',
        color = color_encoding
    ) + alt.Chart(plot_df).mark_circle(opacity = 0.6).encode(
        x = 'start_time:T',
        y = alt.YValue(profile_df.max()),
        color = color_encoding
    )
    chart = rect + line
    if 'connection_power' in periods_for_profile.columns:
        # horizontal rule at the (mean) connection power of the plotted intervals
        connection_power_line = alt.Chart(periods_for_profile.reset_index()).mark_rule(color = 'black', opacity = 0.8).encode(
            y =  'mean(connection_power):Q'
        )
        chart += connection_power_line
    return chart.properties(width = 2200, title = f"{meterID} in {year}").interactive()
    

### Using a normal distribution
For each profile figure out thresholds based on the normal distribution

In [None]:
def get_thresholds(row, prominence = 0.5):
    """
    Derive peak thresholds for one profile from a normal distribution.

    The peaks of the profile (missing values filled with 0) are located with
    `find_peaks`, a normal distribution is fitted to the peak values, and the
    bounds of its 99% interval are used as thresholds.

    Parameters
    ----------
    row
        pandas Series holding the measurements of one profile.
    prominence
        Minimal prominence passed on to `find_peaks`.

    Returns
    -------
    (min_thres, max_thres)
        `max_thres` is the upper bound of the interval fitted on the positive
        peaks. `min_thres` is the lower bound of the interval fitted on the
        negative peaks; it is NaN when the profile has no negative values.
        A threshold is also NaN when no peak is found to fit on.
    """
    X = row.fillna(0).to_numpy()

    def peak_value_interval(signal):
        # The peaks are located in `signal`, but the values fitted are always
        # the original measurements at those positions.
        peak_positions, _ = find_peaks(signal, prominence = prominence)
        if peak_positions.size == 0:
            # nothing to fit a distribution on
            return np.nan, np.nan
        mu, std = norm.fit(X[peak_positions])
        return norm.interval(0.99, mu, std)

    _, max_thres = peak_value_interval(X)
    min_thres = np.nan
    if row.min() < 0:
        # This profile has injection, so negative peaks are possible: look for
        # the peaks of the mirrored signal and fit on those values.
        min_thres, _ = peak_value_interval(-X)
    return min_thres, max_thres
# Guarded so that re-running the cell does not try to join the columns twice.
if 'gauss_min_threshold' not in profile_intervals.columns:
    threshold_names = {0:'gauss_min_threshold',1:'gauss_max_threshold'}
    # one (min, max) threshold pair per profile, computed with prominence 0.3
    thresholds = data16_df.apply(get_thresholds, axis=1, result_type='expand', prominence=0.3)
    thresholds = thresholds.rename(columns=threshold_names)
    profile_intervals = profile_intervals.join(thresholds)
profile_intervals

In [None]:
# An interval is followed by a gauss peak if the first value after it lies
# outside the [gauss_min_threshold, gauss_max_threshold] band of its profile.
# The 'end' marker (no value after the interval) is coerced to NaN, for which
# both comparisons are False. This also avoids the np.NaN alias that was
# removed in NumPy 2.0.
first_value = pd.to_numeric(profile_intervals['first_value_after_end'], errors='coerce')
profile_intervals['is_gauss_peak'] = (
    (first_value < profile_intervals['gauss_min_threshold'])
    | (first_value > profile_intervals['gauss_max_threshold'])
)
profile_intervals

### Inspect some results

In [None]:
# `index.levels` keeps labels of rows that were filtered out earlier, so the
# unused ones are dropped first: every meterID listed here still has at least
# one interval to plot.
meterIDs = profile_intervals.index.remove_unused_levels().levels[0]
plot_profile_with_intervals(meterIDs[14], YEAR, 'is_gauss_peak')