# The goal here is to detect connection problems in the data so that we can handle them later
The main idea is to look for periods in which multiple meters report zero measurements at the same time; these periods are called disconnection periods.



## Imports and set-up

In [None]:
import calendar
import datetime
import itertools
from pathlib import Path

import altair as alt
import numpy as np
import pandas as pd
import tqdm
# Shorthand for building MultiIndex slices in .loc lookups.
idx = pd.IndexSlice
# Lift Altair's default limit on the number of rows a chart may embed.
alt.data_transformers.disable_max_rows()

In [None]:
# Directory holding the preprocessed, combined dataset (info table + data table).
PRE_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/preprocessed/combined')
info_path = PRE_PATH/'info.csv'
data_path = PRE_PATH/'data.csv'
# Fail fast with an explicit exception instead of `assert` (which is skipped under `python -O`)
# and report exactly which file is missing.
missing_paths = [str(path) for path in (info_path, data_path) if not path.exists()]
if missing_paths:
    raise FileNotFoundError(f'These paths should exist: {missing_paths}')

## Read the data

In [None]:
# Load the info table; the first two CSV columns become the two-level row index.
info_df = pd.read_csv(info_path, index_col = [0,1])
info_df.head()

In [None]:
# Load the data table; the first two CSV columns become the two-level row index.
data_df = pd.read_csv(data_path, index_col = [0,1])
# Parse the column labels into datetimes and name the column axis 'timestamp'
# (the plotting code further down refers to that name).
data_df.columns = pd.to_datetime(data_df.columns)
data_df.columns.name = 'timestamp'
data_df.head()

In [None]:
# Classify the years found in the second index level with the calendar rule instead of a
# hand-maintained list, so that other leap years (e.g. 2008 or 2020) are not misfiled as non-leap.
# NOTE: leap_years now only contains leap years that actually occur in the index.
years_in_index = [int(year) for year in info_df.index.levels[1]]
leap_years = [year for year in years_in_index if calendar.isleap(year)]
non_leap_years = [year for year in years_in_index if not calendar.isleap(year)]
print(f'leap years = {leap_years}')
print(f'non leap years = {non_leap_years}')

## Handle all data sources and years separately
Of course, connection problems need to be in the same year and within the same measurement project.

In [None]:
# The subset below is driven by these two constants; change them here to analyse another
# measurement project or year.
DATA_SOURCE = 'EandisVREG'
YEAR = 2016
# get the right subset based on the info df:
# first keep the rows of the selected year (second index level), then the selected data source
info16_df = info_df.loc[idx[:, YEAR],:]
info16_df = info16_df[info16_df.data_source == DATA_SOURCE]
info16_df

In [None]:
# read the corresponding data profiles:
# the row labels of the filtered info frame select the matching rows of the data frame
data16_df = data_df.loc[info16_df.index, :]
data16_df

In [None]:
# Replace missing values by 0.
# NOTE: from here on a missing measurement is indistinguishable from a measured zero,
# so it is counted by the zero checks below as well.
data16_df = data16_df.fillna(0)

## Look at the amount of zeros

In [None]:
# Count the zero-valued measurements of every profile and keep the profiles with at least one.
zeros_per_profile = data16_df.eq(0).sum(axis=1)
nb_of_zeros = zeros_per_profile.loc[zeros_per_profile > 0]

print(f'there are {len(nb_of_zeros)} profiles with zero values')
print(f'the average number of zeros in each these profiles is {nb_of_zeros.mean()}')

# Bar chart: binned number of zeros on the x axis, number of profiles on the y axis.
zeros_frame = nb_of_zeros.to_frame('zeros').reset_index()
alt.Chart(zeros_frame).mark_bar().encode(
    x=alt.X('zeros:O', bin=True),
    y=alt.Y('count()'),
)

In [None]:
# drop profiles with no zeros, these do not have to be fixed
# (nb_of_zeros only holds the profiles that have at least one zero, so its index is the keep-list)
data16_df= data16_df.loc[nb_of_zeros.index, :]
data16_df

## Construct the intervals

In [None]:
# code to find intervals with only zeros
def zero_runs(a):
    """Locate the maximal runs of consecutive zeros in a one-dimensional sequence.

    Parameters
    ----------
    a : array-like
        One-dimensional data (list, NumPy array, pandas Series, ...).

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(n_runs, 2)``. Each row holds the positional bounds
        ``[start, end)`` of one run, i.e. ``a[start:end]`` contains only zeros.
        The shape is ``(0, 2)`` when ``a`` contains no zeros.
    """
    # Work on a plain ndarray: positions (not labels) are wanted, and ``Series.view`` is
    # deprecated in pandas, so the boolean mask is converted with ``astype`` instead.
    values = np.asarray(a)
    # 1 where the value is zero; the extra 0 on both sides makes runs that touch an edge
    # still produce both a start and an end transition.
    is_zero = np.concatenate(([0], (values == 0).astype(np.int8), [0]))
    # Every non-zero difference is a transition; they alternate start, end, start, end, ...
    transitions = np.flatnonzero(np.diff(is_zero))
    return transitions.reshape(-1, 2)

def zero_run_df(data_df):
    """Collect the zero runs of every row of ``data_df`` in one long table.

    Parameters
    ----------
    data_df : pandas.DataFrame
        One profile per row; every row is scanned with ``zero_runs``.

    Returns
    -------
    pandas.DataFrame
        One row per zero run with the columns ``profile`` (the row label of the profile in
        ``data_df``), ``start`` and ``end`` (positional bounds as returned by ``zero_runs``).
        A frame without rows is returned when ``data_df`` has no rows.
    """
    runs_per_profile = []
    for label, row in data_df.iterrows():
        runs = zero_runs(row)
        # Repeat the row label once per run so that every run remembers its profile.
        runs_per_profile.append(
            pd.DataFrame(runs, columns=['start', 'end'], index=[label] * runs.shape[0])
        )
    if not runs_per_profile:
        # pd.concat raises a ValueError when there is nothing to concatenate.
        return pd.DataFrame({
            'profile': pd.Series(dtype=object),
            'start': pd.Series(dtype=int),
            'end': pd.Series(dtype=int),
        })
    full_df = pd.concat(runs_per_profile, axis=0).reset_index()
    return full_df.rename(columns={'index': 'profile'})

# One row per (profile, zero run) for all profiles that still contain zeros.
zero_periods = zero_run_df(data16_df)
zero_periods

In [None]:
# zero periods shared by more than DISCONNECTION_PERIOD_THRESHOLD profiles are seen as disconnection periods
DISCONNECTION_PERIOD_THRESHOLD = 30
# One row per distinct (start, end) pair with the number of profiles that have exactly that zero run.
period_counts = zero_periods.groupby(['start', 'end']).count()
disconnection_period_df = period_counts.rename(columns={'profile': 'count'}).reset_index()
print(f'{len(disconnection_period_df)} distinct disconnection periods')
# Keep only the pairs that occur in more profiles than the threshold.
is_frequent = disconnection_period_df['count'] > DISCONNECTION_PERIOD_THRESHOLD
disconnection_period_df = disconnection_period_df[is_frequent]
print(f'{len(disconnection_period_df)} disconnection periods with #profiles > {DISCONNECTION_PERIOD_THRESHOLD}')
disconnection_period_df

In [None]:
# organize the remaining columns (everything but the count) as a set of plain tuples for fast lookup
period_bounds = disconnection_period_df.drop(columns=['count'])
disconnection_periods = set(period_bounds.itertuples(index=False, name=None))


In [None]:
# add a column to the zero periods df that says whether each zero period is a disconnection period or not
# a zero period is a disconnection period if its start and end together deviate by at most MAXIMUM_DEVIATION from a known disconnection period
MAXIMUM_DEVIATION = 2
def is_disconnection_period(row, periods=None, max_deviation=None):
    """Tell whether a zero run (almost) coincides with a known disconnection period.

    A run matches when shifting its start and end by integer offsets whose absolute values
    sum to at most ``max_deviation`` yields a pair that is contained in ``periods``.

    Parameters
    ----------
    row : iterable of exactly three values
        ``(profile, start, end)``; only ``start`` and ``end`` are used.
    periods : container of (start, end) tuples, optional
        Known disconnection periods. Defaults to the module-level ``disconnection_periods``.
    max_deviation : int, optional
        Total allowed deviation. Defaults to the module-level ``MAXIMUM_DEVIATION``.

    Returns
    -------
    bool
        True if a matching disconnection period exists, False otherwise.
    """
    # Resolve the defaults at call time so the function follows the current notebook state.
    if periods is None:
        periods = disconnection_periods
    if max_deviation is None:
        max_deviation = MAXIMUM_DEVIATION
    _, start, end = row
    # Instead of comparing against every known period, probe the small fixed set of
    # candidate (start, end) pairs around this run; each pair is probed exactly once.
    for start_delta in range(-max_deviation, max_deviation + 1):
        remaining = max_deviation - abs(start_delta)
        for end_delta in range(-remaining, remaining + 1):
            if (start + start_delta, end + end_delta) in periods:
                return True
    return False
            
    
# Flag every zero run; axis = 1 hands each row to the function.
# NOTE: the function unpacks a row into exactly three values, so running this cell a second
# time (when the new column already exists) fails.
zero_periods['is_disconnection_period'] = zero_periods.apply(is_disconnection_period, axis = 1)
zero_periods

In [None]:
# Number of zero runs that were flagged as a disconnection period versus not flagged.
zero_periods.is_disconnection_period.value_counts()

In [None]:
# Index the runs by profile and translate the positional bounds into timestamps.
zero_periods = zero_periods.set_index('profile')
timestamps = data16_df.columns
zero_periods['start_time'] = timestamps[zero_periods['start']]
# `end` is exclusive, so end - 1 is the last zero measurement; 15 minutes are added on top of
# its timestamp (presumably the sampling interval, so the mark covers that last measurement).
zero_periods['end_time'] = timestamps[zero_periods['end'] - 1] + pd.Timedelta('15min')
# Length of the run expressed in number of measurements.
zero_periods['period_length'] = zero_periods['end'] - zero_periods['start']

So around 2 in 3 zero periods are detected as disconnection errors.

## Let's inspect the results by plotting them

In [None]:
def plot_profile_with_period_marks(profile_idx):
    """Plot one profile as a line with its zero periods marked as coloured rectangles.

    Parameters
    ----------
    profile_idx : int
        Positional row number of the profile in ``data16_df``.

    Returns
    -------
    Interactive layered Altair chart, 1800 wide: rectangles for the zero periods
    (red when flagged as disconnection period, green otherwise) with the profile line on top.
    """
    series = data16_df.iloc[profile_idx, :]
    # The list-style lookup keeps the result a DataFrame, even for a single zero period.
    periods_of_profile = zero_periods.loc[[series.name], :]

    values_frame = series.to_frame('value').reset_index()
    value_line = (
        alt.Chart(values_frame)
        .mark_line()
        .encode(x=alt.X('timestamp:T'), y=alt.Y('value:Q'))
    )

    flag_colors = alt.Scale(domain=[True, False], range=['red', 'green'])
    period_marks = (
        alt.Chart(periods_of_profile)
        .mark_rect(opacity=0.8)
        .encode(
            x='start_time:T',
            x2='end_time:T',
            color=alt.Color('is_disconnection_period:N', scale=flag_colors),
        )
    )

    layered = period_marks + value_line
    return layered.properties(width=1800).interactive()
    

In [None]:
# Stack the charts of the profiles at positions 2 and 3 vertically (`&`) with a shared x scale.
(plot_profile_with_period_marks(2) & plot_profile_with_period_marks(3)).resolve_scale(x = 'shared')