# The goal here is to detect connection problems in the data so that we can handle them later
The main idea is to look for periods where multiple meters have zero measurements; these periods are called disconnection periods.



## Imports and set-up

In [None]:
import altair as alt
import numpy as np
import pandas as pd
from pathlib import Path
import itertools
import datetime
import tqdm
idx = pd.IndexSlice
alt.data_transformers.disable_max_rows()

In [None]:
# Directory with the preprocessed, combined dataset: one metadata file and one measurement file.
PRE_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/preprocessed/combined')
info_path, data_path = (PRE_PATH / file_name for file_name in ('info.csv', 'data.csv'))
# Fail fast when the data directory is not available.
assert info_path.exists() and data_path.exists(), 'These paths should exist'

## Read the data

In [None]:
# Metadata table: the first two CSV columns become the row MultiIndex.
info_df = pd.read_csv(info_path, index_col=[0, 1])
info_df.head()

In [None]:
# Measurement table: same two-level row index as the metadata, one column per timestamp.
data_df = pd.read_csv(data_path, index_col=[0, 1])
# The CSV header holds the timestamps as text; parse them and name the column axis in one go.
data_df.columns = pd.to_datetime(data_df.columns).rename('timestamp')
data_df.head()

In [None]:
leap_years = [2012, 2016]
# The second index level holds the years; every year that is not listed above counts as a non-leap year.
non_leap_years = list(filter(lambda year: year not in leap_years, info_df.index.levels[1]))
print(f'leap years = {leap_years}')
print(f'non leap years = {non_leap_years}')

## Handle all data sources and years separately
Connection problems can only be shared by meters in the same year and within the same measurement project, so for now let's use the EandisVREG data of 2016

In [None]:
# Measurement project and year under investigation; change these two constants to analyse another subset.
DATA_SOURCE = 'EandisVREG'
YEAR = 2016
# get the right subset based on the info df (use the constants instead of repeating the literals)
info16_df = info_df.loc[idx[:, YEAR], :]
info16_df = info16_df[info16_df.data_source == DATA_SOURCE]
info16_df

In [None]:
# Keep only the measurement rows whose index key is present in the filtered metadata.
selected_profiles = info16_df.index
data16_df = data_df.loc[selected_profiles, :]
data16_df

## Look at the amount of NaNs and Zeros

In [None]:
# number of NaNs for each profile
nb_of_na = data16_df.isna().sum(axis=1)
has_na = nb_of_na > 0
print(f'There are {has_na.sum()} profiles with NaN values')
print(f'The average number of NaNs in each these profiles is {nb_of_na[has_na].mean()}')
# number of exact zeros for each profile
nb_of_zeros = data16_df.eq(0).sum(axis=1)
has_zeros = nb_of_zeros > 0
print(f'There are {has_zeros.sum()} profiles with zeros values')
print(f'The average number of zeros in each these profiles is {nb_of_zeros[has_zeros].mean()}')

In [None]:
# Profiles with 5 NaNs or fewer are left out so the rest of the distribution stays readable.
na_histogram_df = (
    nb_of_na.value_counts()
    .to_frame('count')
    .reset_index()
    .loc[lambda counts: counts['index'] > 5]
)
alt.Chart(na_histogram_df, title= 'histogram of amount of NaNs').mark_bar().encode(
    x = alt.X('index:Q', title = 'amount of NaNs in profile'),
    y = alt.Y('count:Q', title = '#profiles'),
)

Most profiles have 4 NaNs. These are caused by the change from winter to summer time and are NOT a data problem, so they are left out of this plot and handled later.
The profiles with the most NaNs have 1200+ NaN values, which is more than 12 days of missing data in total (we will probably be able to handle this if the NaNs are not consecutive).


In [None]:
# Profiles without any zero are left out; only the distribution of the non-zero counts is shown.
zero_histogram_df = (
    nb_of_zeros.value_counts()
    .to_frame('count')
    .reset_index()
    .loc[lambda counts: counts['index'] > 0]
)
alt.Chart(zero_histogram_df, title= 'histogram of amount of zeros').mark_bar().encode(
    x = 'index:N',
    y = 'count:Q',
)

So most profiles have no zero values (160 profiles). Then there is a small cluster around 100 zero values, followed by a very long tail of up to 35000 zero values, which is almost a full year of zeros (but these are still valid profiles).

## Look at profiles with potential problems

In [None]:
# Keep only the profiles that contain at least one NaN or at least one zero.
data16_df = data16_df.loc[nb_of_na.gt(0) | nb_of_zeros.gt(0), :]
data16_df

## Construct the intervals
So in the rest of this code we simply construct the intervals as a dataset and add different attributes/features and investigate whether they could be useful or not

In [None]:
# code to find intervals with only zeros
def value_interval(meterID, year, a, value):
    """
        Makes a dataframe containing the start and end of each maximal run in `a` that only contains `value`.

        Parameters:
            meterID, year: identifiers stored in the index of the result
            a: 1-D sequence of measurements (numpy array or pandas Series)
            value: the value to look for; pass NaN to look for runs of missing values

        Returns:
            DataFrame indexed by (meterID, year) with one row per run and the columns
            `start` (position of the first element of the run, inclusive),
            `end` (position just after the last element of the run, exclusive) and
            `interval_value` (the value that was searched for).
            The frame is empty when `a` contains no such run.
    """
    # Work on a plain ndarray: calling .view() on a pandas Series is deprecated and
    # reinterprets bytes, whereas astype() below is an explicit bool -> int conversion.
    values = np.asarray(a)
    if pd.isna(value):
        is_match = pd.isna(values)
    else:
        is_match = values == value
    # 1 where the value matches, padded with a 0 on both sides so runs touching the edges are closed.
    padded = np.concatenate(([0], is_match.astype(np.int8), [0]))
    # Every change between 0 and 1 marks the start or the (exclusive) end of a run.
    ranges = np.where(np.diff(padded) != 0)[0].reshape(-1, 2)
    df = pd.DataFrame(ranges, columns = ['start', 'end'])
    df['meterID'] = meterID
    df['year'] = year
    df['interval_value'] = value
    return df.set_index(['meterID', 'year'])

def zero_nan_intervals_df(data_df):
    """
        Collects the NaN runs and the zero runs of every row of `data_df` in a single dataframe.

        Parameters:
            data_df: DataFrame whose row index is (meterID, year) and whose columns are the measurements in order

        Returns:
            The row-wise concatenation of the `value_interval` results for NaN and for 0 of every row.

        Raises:
            ValueError (from pd.concat) when `data_df` has no rows.
    """
    dfs = []
    for (meterID, year), row in data_df.iterrows():
        # np.nan instead of the np.NaN alias, which no longer exists in NumPy 2.0
        dfs.append(value_interval(meterID, year, row, np.nan))
        dfs.append(value_interval(meterID, year, row, 0))
    return pd.concat(dfs, axis = 0)

profile_intervals = zero_nan_intervals_df(data16_df)
profile_intervals['interval_length'] = profile_intervals['end'].sub(profile_intervals['start'])
# start_time and end_time are both exclusive bounds (this plots well with altair):
# start_time lies one quarter before the first timestamp of the interval ...
profile_intervals['start_time'] = data16_df.columns[profile_intervals['start']] - pd.Timedelta('15min')
# ... and end_time one quarter after its last timestamp. The column at position `end` itself
# might not exist, so take the last covered column and step one quarter forward.
profile_intervals['end_time'] = data16_df.columns[profile_intervals['end'] - 1] + pd.Timedelta('15min')
profile_intervals = profile_intervals.set_index(['start', 'end'], append = True)
profile_intervals

*notes:*  
- start is inclusive, end is exclusive, so the interval is [start, end)  
- start_time and end_time are both exclusive, so the interval is (start_time, end_time)  
This works better for plotting in altair
     

## Remove the missing hour on march 27 due to change from winter to summer time 

In [None]:
# Keep every interval except the one that exactly matches the hour skipped by the switch to summer time.
profile_intervals = profile_intervals[
    (profile_intervals.start_time != '2016-03-27 02:00:00')
    | (profile_intervals.end_time != '2016-03-27 03:00:00')
]

## Look at the distribution of interval length
**NaN intervals** are mostly 3 quarters long. Interestingly, among the longer intervals 24 hours is a common length, and so is 48 hours.  
The longest interval is 955 quarters long (which is almost 10 days of missing data).

**Zero intervals** are mostly short (one to ten quarters) but have a much longer tail! This is due to disabled meters, so these are not always data problems. 


In [None]:
# Count every distinct (start, end, value) interval once, then tally the intervals per value and length.
interval_length_count = (
    profile_intervals
    .reset_index()
    .drop_duplicates(['start', 'end', 'interval_value'])
    .groupby(['interval_value', 'interval_length'], dropna = False)
    .size()
    .to_frame('value')
    .reset_index()
    .astype({'interval_value': 'string'})
)
length_bars = alt.Chart(interval_length_count).mark_bar().encode(
    x = 'interval_length:N',
    y = 'value:Q',
)
# One row of bars per interval value, each with its own axes.
length_bars.facet(row = 'interval_value').resolve_scale(x = 'independent', y = 'independent')

## Plotting function 
A plotting function that can be used to plot part of a profile with some intervals marked in different colors

In [None]:
def plot_profile_with_intervals(meterID, year, period_type_column = None, data = None, daterange = None):
    """
    Plot (part of) one profile as a line with its intervals marked on top.

    Parameters
    ----------
    meterID, year
        Key of the profile in the global `data16_df` and of its intervals in `data`.
    period_type_column : str, optional
        Column of `data` that determines the color of each interval (used as a nominal field).
        When omitted all intervals are drawn in blue.
    data : DataFrame, optional
        Interval table with at least `start_time` and `end_time` columns, indexed by
        (meterID, year, ...). Defaults to the global `profile_intervals`.
    daterange : tuple of two month numbers, optional
        (first_month, last_month): only plot from the first day of `first_month` until the
        first day of `last_month` of `year`. The whole profile is plotted when omitted.

    Returns
    -------
    An interactive altair chart: the intervals as rectangles, a circle at the start of every
    interval, the profile as a line and, when `data` has a `connection_power` column, a
    horizontal rule at the mean connection power.
    """
    if data is None:
        data = profile_intervals
    periods_for_profile = data.loc[(meterID, year), :]
    if daterange is not None:
        # Build the bounds from `year` rather than a hard-coded 2016 so other years plot correctly.
        start_time = f'{year}-{daterange[0]}-1 00:00:00'
        end_time = f'{year}-{daterange[1]}-1 00:00:00'
        profile_df = data16_df.loc[(meterID, year), start_time:end_time]
        # Keep every interval that overlaps the requested window.
        overlaps_window = (periods_for_profile['end_time'] > start_time) & (periods_for_profile['start_time'] < end_time)
        periods_for_profile = periods_for_profile[overlaps_window]
    else:
        profile_df = data16_df.loc[(meterID, year), :]

    line = alt.Chart(profile_df.to_frame('value').reset_index()).mark_line().encode(
        x = alt.X('timestamp:T'),
        y = alt.Y('value:Q')
    )
    if period_type_column is None:
        color_encoding = alt.ColorValue('blue')
    else:
        color_encoding = alt.Color(f'{period_type_column}:N')
    plot_df = periods_for_profile.reset_index(drop=True)
    # Rectangles show the extent of each interval; the circles keep very short intervals visible.
    rect = alt.Chart(plot_df).mark_rect(opacity = 0.6).encode(
        x = 'start_time:T',
        x2 = 'end_time:T',
        color = color_encoding
    ) + alt.Chart(plot_df).mark_circle(opacity = 0.6).encode(
        x = 'start_time:T',
        y = alt.YValue(profile_df.max()),
        color = color_encoding
    )
    chart = rect + line
    if 'connection_power' in periods_for_profile.columns:
        connection_power_line = alt.Chart(periods_for_profile.reset_index()).mark_rule(color = 'black', opacity = 0.8).encode(
            y =  'mean(connection_power):Q'
        )
        chart += connection_power_line
    return chart.properties(width = 2200).interactive()
    

## Add count to each interval
Count the amount of times that each interval occurs

In [None]:
# For all intervals, for the NaN intervals only and for the zero intervals only:
# the number of (meterID, year) rows that share each (start, end) pair.
is_nan_interval = profile_intervals.interval_value.isna()
interval_counts, nan_interval_counts, zero_interval_counts = (
    subset.reset_index().groupby(['start', 'end'])[['meterID', 'year']].size().to_frame('#profiles')
    for subset in (
        profile_intervals,
        profile_intervals[is_nan_interval],
        profile_intervals[~is_nan_interval],
    )
)
print(f'{len(interval_counts)} distinct disconnection periods')
interval_counts

### Plot the amount of profiles that are zero during an interval and their count
So interestingly you can see here that there are also zero intervals that occur for multiple meters! So some of these zeros are also missing values! 
It also seems that there are intervals that show up as zeros in one profile but NaNs in another profile! 

In [None]:
# The three bar charts only differ in their data and title, so build them in one pass.
all_chart, zero_chart, nan_chart = (
    alt.Chart(counts.reset_index(), title = chart_title).mark_bar().encode(
        x = alt.X('#profiles:N'),
        y = alt.Y('count()'),
    )
    for counts, chart_title in (
        (interval_counts, '#intervals that have x profiles that are missing in the interval'),
        (zero_interval_counts, '# zero intervals that have x profiles that are missing in the interval'),
        (nan_interval_counts, '# NaN intervals that have x profiles that are missing in the interval'),
    )
)
(all_chart & zero_chart & nan_chart)

### Check if these unique missing intervals are really unique or if they are similar to one of the non-unique intervals

In [None]:
def check_unique(intervals):
    """
    Find the intervals that occur only once and are not similar to a more common interval.

    Two intervals are considered similar when both their start and their end differ by at
    most one position.

    Parameters
    ----------
    intervals : DataFrame
        Single-column table with the number of occurrences of each interval, indexed by (start, end).

    Returns
    -------
    list of (start, end) tuples: the intervals with count 1 that have no similar interval with count > 1.
    """
    # Squeeze the column axis only: a plain squeeze() would collapse a one-row table to a scalar.
    counts = intervals.squeeze(axis = 1)
    non_unique_set = set(counts[counts > 1].index)
    shifts = list(itertools.product([-1, 0, 1], repeat = 2))
    remaining_uniques = []
    for start, end in counts[counts == 1].index:
        has_common_neighbour = any(
            (start + delta_s, end + delta_e) in non_unique_set
            for delta_s, delta_e in shifts
        )
        if not has_common_neighbour:
            remaining_uniques.append((start, end))
    return remaining_uniques
unique_nans = check_unique(nan_interval_counts)
unique_zeros = check_unique(zero_interval_counts)
# Number of intervals that occur in exactly one profile, per interval type, for comparison.
nb_single_nan_intervals = len(nan_interval_counts[nan_interval_counts.squeeze()==1])
nb_single_zero_intervals = len(zero_interval_counts[zero_interval_counts.squeeze() == 1])
print(f'there are {len(unique_nans)} unique NaN intervals that are not similar to a more common NaN interval (of {nb_single_nan_intervals} unique intervals)')
print(f'there are {len(unique_zeros)} unique zero intervals that are not similar to a more common zero interval (of {nb_single_zero_intervals} unique intervals)')

In [None]:
# Length of every NaN interval that is unique and not similar to a more common interval.
unique_nan_lengths = nan_interval_counts.loc[unique_nans].reset_index().eval('end-start')
temp = (
    unique_nan_lengths
    .to_frame('interval_length')
    .value_counts()
    .sort_index()
    .to_frame('count')
    .reset_index()
)
alt.Chart(temp, title = 'histogram of the length of the unique profiles').mark_bar().encode(
    x = alt.X('interval_length:N', title = 'Interval length'),
    y = alt.Y('count:Q', title = '# unique profiles of given length'),
)

So this shows that there are intervals that are not similar to any other interval but are still measurement errors.  
We can also see that these unique intervals are not necessarily short.  
**In conclusion, a high count can be indicative of measurement errors but a low count does not necessarily mean that it is a valid measurement.**

## Check if there are intervals that show up as zero and as NaN 

In [None]:
# After dropping duplicates every (start, end) pair has at most one row per interval value,
# so a pair with two rows occurs once as a zero interval and once as a NaN interval.
distinct_interval_values = profile_intervals.reset_index()[['start', 'end', 'interval_value']].drop_duplicates()
occurs_as_both = distinct_interval_values.groupby(['start', 'end']).size() == 2
zero_nan_intervals = occurs_as_both.index[occurs_as_both].to_frame(index = False)
zero_nan_intervals['length'] = zero_nan_intervals['end'] - zero_nan_intervals['start']
zero_nan_intervals['length'].value_counts().sort_index()

So there clearly are intervals that occur both as a zero interval and as a nan interval!  
The lengths of these intervals are also not necessarily short (although short intervals are more common)  
This makes me think that there is some weird preprocessing involved

In [None]:
# Look up which profiles contain the intervals that occur both as zero and as NaN.
intervals_by_position = profile_intervals.reset_index().set_index(['start', 'end', 'meterID', 'year'])
temp = zero_nan_intervals.set_index(['start', 'end']).join(intervals_by_position, how = 'inner')
temp

**TODO:** visualize some of these co-occurrences.

### plot the length of each interval vs its count 

In [None]:
def length_vs_count_scatter_plot(df):
    """
    Scatter plot of interval length (log scale) against the number of profiles sharing the interval.

    `df` is a count table indexed by (start, end) with a `#profiles` column.
    """
    plot_df = df.reset_index().assign(length = lambda frame: frame['end'] - frame['start'])
    length_axis = alt.Y('length:Q', scale = alt.Scale(type = 'log'))
    return alt.Chart(plot_df).mark_circle().encode(x = '#profiles:N', y = length_axis)
# One scatter plot per interval type, stacked vertically.
(
    length_vs_count_scatter_plot(interval_counts).properties(title = 'all')
    & length_vs_count_scatter_plot(zero_interval_counts).properties(title = 'zero')
    & length_vs_count_scatter_plot(nan_interval_counts).properties(title = 'NaN')
)

So, most of these intervals are unique (there is only one meter that is zero/NaN in this interval)

In [None]:
# Attach the number of profiles sharing each interval; skipped when the column already
# exists so the cell can be re-run without creating a duplicate column.
if '#profiles' not in profile_intervals:
    profile_intervals = profile_intervals.join(interval_counts, on = ['start', 'end'])
profile_intervals

## Check if we can distinguish NaN intervals from 0 intervals
We'll use a decision tree to check if we can learn a function that figures out if a certain interval is zero or NaN given the other features (learned this trick from Elia)

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt # I don't want to :( but I have to

In [None]:
# A shallow tree keeps the plot readable; a fixed random_state makes the fitted tree
# reproducible (scikit-learn permutes the features at every split).
tree = DecisionTreeClassifier(max_depth = 3, random_state = 0)
# Target: 0 for zero intervals, 1 for NaN intervals.
y = profile_intervals['interval_value'].fillna(1).astype('int').values
# Features: every remaining column plus the start/end positions from the index.
X = profile_intervals.reset_index().drop(columns = ['meterID', 'year', 'interval_value', 'start_time', 'end_time'])
tree.fit(X.values,y)
plt.figure(figsize = (20,10))
# Pass the feature names as a plain list; some scikit-learn versions reject a pandas Index here.
plot_tree(tree, feature_names = list(X.columns));

I played around a little bit with this, but there is nothing to conclude: they are not perfectly separable. 

## Check NaN interval stats

In [None]:
# All intervals of missing (NaN) values.
profile_intervals.loc[profile_intervals['interval_value'].isna()]

## Investigate the intervals with counts > 600

In [None]:
# Intervals shared by more than 600 profiles, and the distinct time windows they cover.
very_high_count = profile_intervals.loc[profile_intervals['#profiles'].gt(600)]
very_high_count_ids = very_high_count.index.to_frame(index = False)
very_high_count.loc[:, ['start_time', 'end_time']].drop_duplicates().reset_index(drop = True)

In [None]:
# Flag the intervals shared by more than 600 profiles so they get their own color in the plot.
temp_df = profile_intervals.assign(high_count = profile_intervals['#profiles'] > 600)

# Plot the first four months of two of the affected profiles, one above the other.
first_profile = very_high_count_ids.iloc[0, :2]
fourth_profile = very_high_count_ids.iloc[3, :2]
(
    plot_profile_with_intervals(*first_profile, 'high_count', data = temp_df, daterange = (1,4))
    & plot_profile_with_intervals(*fourth_profile, 'high_count', data = temp_df, daterange = (1,4))
)
# plot_profile_with_intervals(*very_high_count_ids.iloc[3,:2], 'high_count', data = temp_df)

## Investigate intervals with counts > 10 and < 35  

In [None]:
# Intervals shared by more than 10 but fewer than 35 profiles.
profile_count = profile_intervals['#profiles']
high_count = profile_intervals[profile_count.gt(10) & profile_count.lt(35)]
high_count_ids = high_count.index.to_frame(index = False)
# One row per distinct interval, in chronological order.
intervals_of_interest = (
    high_count
    .drop_duplicates(['start_time', 'end_time', 'interval_value'])
    .sort_values(['start_time', 'end_time'])
    .reset_index(drop = True)
)
intervals_of_interest

In [None]:
# How often each interval length occurs among the intervals of interest.
intervals_of_interest['interval_length'].value_counts().to_frame('count')

In [None]:
# Number of zero intervals versus NaN intervals (NaN is kept as its own category).
intervals_of_interest['interval_value'].value_counts(dropna=False).to_frame('count')

In [None]:
# Flag the intervals shared by more than 10 but fewer than 35 profiles.
temp_df = profile_intervals.assign(
    high_count = profile_intervals['#profiles'].gt(10) & profile_intervals['#profiles'].lt(35)
)

# Plot two of the affected profiles over the whole year, one above the other.
first_profile = high_count_ids.iloc[0, :2]
fourth_profile = high_count_ids.iloc[3, :2]
(
    plot_profile_with_intervals(*first_profile, 'high_count', data = temp_df)
    & plot_profile_with_intervals(*fourth_profile, 'high_count', data = temp_df)
)
# plot_profile_with_intervals(*very_high_count_ids.iloc[3,:2], 'high_count', data = temp_df)

## Add value before and after each interval 

In [None]:
# Display the profiles that the interval positions (start/end) refer to.
data16_df

In [None]:
# profile_intervals['value_after_interval'] =
def value_after_end(row):
    """
    Return the measurement that directly follows the interval described by `row`.

    `row.name` must be the (meterID, year, start, end) index key of the interval; `end` is
    the exclusive end position in the columns of the global `data16_df`.
    Returns the string 'end' when the interval runs until the last measurement, so that
    there is no value after it.
    """
    meterID, year, start, end = row.name
    # Derive the bound from the data instead of hard-coding 35136 (the number of quarters in a leap year).
    if end >= len(data16_df.columns):
        return 'end'
    return data16_df.at[(meterID, year), data16_df.columns[end]]
def value_before_start(row):
    """
    Return the measurement that directly precedes the interval described by `row`.

    `row.name` must be the (meterID, year, start, end) index key of the interval.
    Returns the string 'start' when the interval begins at the first measurement.
    """
    meterID, year, start, _end = row.name
    if start != 0:
        return data16_df.at[(meterID, year), data16_df.columns[start - 1]]
    # the interval begins at position 0, so nothing precedes it
    return 'start'
# if 'value_after_interval' not in profile_intervals.columns: 

# Look up, for every interval, the measurement just after and just before it.
for column_name, lookup in (
    ('value_after_interval', value_after_end),
    ('value_before_interval', value_before_start),
):
    profile_intervals[column_name] = profile_intervals.apply(lookup, axis = 1)
profile_intervals


## Add connection capacity 

In [None]:
# Remove a previously joined connection_power column so the join below can be redone.
# errors='ignore' keeps this cell from raising a KeyError on a fresh kernel, where the column does not exist yet.
profile_intervals = profile_intervals.drop(columns = ['connection_power'], errors = 'ignore')

In [None]:
# Join only once, so that re-running the cell does not fail on an already existing column.
if 'connection_power' not in profile_intervals:
    connection_power = info16_df.loc[:, ['connection_power']]
    profile_intervals = profile_intervals.join(connection_power)
profile_intervals

## Check peaks due to connection_power

In [None]:
if 'connection_power_peak' not in profile_intervals.columns:
    # value_after_interval contains the string 'end' for intervals that run until the last
    # measurement; coerce those to NaN so the comparison is numeric (NaN > x is False).
    # pd.to_numeric also avoids the np.NaN alias, which no longer exists in NumPy 2.0.
    value_after = pd.to_numeric(profile_intervals['value_after_interval'], errors = 'coerce')
    profile_intervals['connection_power_peak'] = value_after > profile_intervals['connection_power'].astype('float')
profile_intervals

In [None]:
# Number of intervals that are / are not followed by a value above the connection power.
profile_intervals['connection_power_peak'].value_counts().to_frame('count')

So clearly this rule only helps to detect very few peaks

# Let's look a bit deeper at profiles with lots of zeros

In [None]:
# Only the intervals of zeros (NaN intervals never compare equal to 0).
zero_intervals = profile_intervals[profile_intervals['interval_value'] == 0]
zero_intervals

## Let's inspect the results by plotting them
So this is not clean anymore

In [None]:
# Zero intervals whose next measurement is missing.
is_zero_interval = profile_intervals['interval_value'].eq(0)
followed_by_nan = profile_intervals['value_after_interval'].isna()
profile_intervals_with_zero_followed_by_nan = profile_intervals[is_zero_interval & followed_by_nan]
profile_intervals_with_zero_followed_by_nan

In [None]:
# Zero intervals longer than 4 quarters.
# NOTE(review): the index has (meterID, year, start, end) levels, so these keys identify
# individual intervals rather than whole profiles — confirm this matches the intent of the name.
is_long_interval = zero_intervals['interval_length'] > 4
profiles_with_long_intervals = zero_intervals.index[is_long_interval].unique().to_list()
long_zero_intervals = zero_intervals.loc[profiles_with_long_intervals, :]

In [None]:
# The index key has (meterID, year, start, end) levels; the plotting function needs the profile key only.
# Uses plot_profile_with_intervals: plot_profile_with_period_marks is not defined in this notebook.
example_meterID, example_year = long_zero_intervals.index[-100][:2]
plot_profile_with_intervals(example_meterID, example_year, data = long_zero_intervals)

In [None]:
# data16_df is indexed by (meterID, year) only, so select the profile with the first two
# levels of the interval key instead of the full (meterID, year, start, end) tuple.
data16_df.loc[long_zero_intervals.index[-100][:2]].value_counts()

In [None]:
# NOTE(review): plot_profile_with_period_marks is not defined in the visible part of this notebook,
# so this cell raises a NameError on a fresh kernel unless it is defined elsewhere — TODO confirm.
# Presumably it should call plot_profile_with_intervals, but the meaning of the arguments 2 and 3
# cannot be determined from here.
(plot_profile_with_period_marks(2) & plot_profile_with_period_marks(3)).resolve_scale(x = 'shared')