# Leveraging similar days: try-out
The main idea is to use the most similar day when filling in the missing values in a profile. 

In this notebook I also try using this idea to figure out if the value after a peak is a cumulative measurement or not: 

1. search for the most similar day based on a similarity metric that assumes that the value after the missing interval is a cumulative measurement  
2. search for the most similar day based on a similarity metric that assumes that the value after the missing interval is a normal measurement  

Check which of these two similar days is the most similar and use that one.  
I think this would result in using the similar day from 2. whenever the peak is not abnormal and using the day from 1. if it is abnormal  

It's a bit more adaptive than using a normal distribution learned on similar days, because it takes the context into account  

## Imports and set-up

In [None]:
import altair as alt
import numpy as np
import pandas as pd
from pathlib import Path
import itertools
import datetime
import tqdm
# Shorthand for slicing a MultiIndex, used below as idx[:, year]
idx = pd.IndexSlice
# Lift altair's default limit on the number of rows a chart's dataset may have
alt.data_transformers.disable_max_rows()

In [None]:
# Directory with the preprocessed, combined dataset and the two files read below
PRE_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/preprocessed/combined')
info_path, data_path = PRE_PATH / 'info.csv', PRE_PATH / 'data.csv'
# Fail early when one of the input files is missing
assert all(path.exists() for path in (info_path, data_path)), 'These paths should exist'

## Read the data

In [None]:
# Load the profile metadata; the first two CSV columns form the row MultiIndex
info_df = pd.read_csv(info_path, index_col = [0,1])
info_df.head()

In [None]:
# Load the measurements; the first two CSV columns form the row MultiIndex
data_df = pd.read_csv(data_path, index_col=[0, 1])
# The column labels are read as strings: parse them to timestamps and name the axis
data_df.columns = pd.to_datetime(data_df.columns).rename('timestamp')
data_df.head()

In [None]:
# Split the years that occur in the second index level into leap and non-leap years
leap_years = [2012, 2016]
years_in_data = info_df.index.levels[1]
non_leap_years = [year for year in years_in_data if year not in leap_years]
print(f'leap years = {leap_years}')
print(f'non leap years = {non_leap_years}')

## Handle all data sources and years separately
Of course, connection problems need to be in the same year and within the same measurement project, so for now let's use the EandisVREG data of 2016

In [None]:
# Data source and year under study; change these two constants to analyse another subset
DATA_SOURCE = 'EandisVREG'
YEAR = 2016
# get the right subset based on the info df: first all profiles of YEAR,
# then only the ones that were measured by DATA_SOURCE
info16_df = info_df.loc[idx[:, YEAR], :]
info16_df = info16_df[info16_df.data_source == DATA_SOURCE]
info16_df

In [None]:
# read the corresponding data profiles 
# read the corresponding data profiles: keep the rows of data_df whose index entries occur in info16_df
data16_df = data_df.loc[info16_df.index, :]
data16_df

## Look at the amount of NaNs and Zeros

In [None]:
# nb of zeros for each profile
# per profile (row): the number of missing values and the number of values that are exactly zero
nb_of_na = data16_df.isna().sum(axis='columns')
nb_of_zeros = data16_df.eq(0).sum(axis='columns')

## Look at profiles with potential problems

In [None]:
# keep only the profiles that contain at least one NaN or at least one zero
has_nan_or_zero = nb_of_na.gt(0) | nb_of_zeros.gt(0)
data16_df = data16_df.loc[has_nan_or_zero, :]
data16_df

## Construct the intervals
So in the rest of this code we simply construct the intervals as a dataset and add different attributes/features and investigate whether they could be useful or not

In [None]:
# code to find intervals with only zeros
def value_interval(meterID, year, a, value):
    """
        Makes a dataframe containing the start and end of each maximal run in `a` that only contains `value`.

        Parameters:
            meterID, year: identifiers of the profile; they become the index of the result
            a: one-dimensional sequence of measurements (numpy array or pandas Series)
            value: the value to look for; pass NaN to look for runs of missing values

        Returns:
            A dataframe indexed by (meterID, year) with one row per run and the columns
            'start' (first position of the run, inclusive), 'end' (position right after
            the run, exclusive) and 'interval_value' (the value that was searched for).
    """
    # Work on a plain numpy array: calling .view() on a pandas Series is deprecated
    values = np.asarray(a)
    # NaN never compares equal to itself, so missing values need np.isnan
    if np.isnan(value):
        matches = np.isnan(values)
    else:
        matches = np.equal(values, value)
    # Pad each end with an extra 0 so that runs touching a border also get a start and an end
    padded = np.concatenate(([0], matches.astype(np.int8), [0]))
    # Every change in the padded indicator is a run boundary; boundaries alternate start, end
    ranges = np.flatnonzero(np.diff(padded)).reshape(-1, 2)
    df = pd.DataFrame(ranges, columns=['start', 'end'])
    df['meterID'] = meterID
    df['year'] = year
    df['interval_value'] = value
    return df.set_index(['meterID', 'year'])

def zero_nan_intervals_df(data_df):
    """
        Collects the NaN intervals and the zero intervals of every profile in data_df.

        Each row of data_df is handled as one profile and its index entry is unpacked
        as (meterID, year). Returns the concatenation of the value_interval results,
        for each profile first the NaN intervals and then the zero intervals.
    """
    dfs = []
    for (meterID, year), row in data_df.iterrows():
        # Hand over a plain numpy array so value_interval does not depend on pandas Series methods
        measurements = row.to_numpy()
        # np.nan instead of the np.NaN alias, which was removed in NumPy 2.0
        dfs.append(value_interval(meterID, year, measurements, np.nan))
        dfs.append(value_interval(meterID, year, measurements, 0))
    return pd.concat(dfs, axis=0)

profile_intervals = zero_nan_intervals_df(data16_df)
profile_intervals['interval_length'] = profile_intervals['end'] - profile_intervals['start']
# Both time bounds are exclusive (this plots well with altair): one step before the
# first value of the interval and one step after the last value of the interval
one_step = pd.Timedelta('15min')
first_timestamp = data16_df.columns[profile_intervals['start']]
# Look up the last timestamp inside the interval and add a step afterwards, because
# the timestamp at position `end` might not exist in the columns
last_timestamp = data16_df.columns[profile_intervals['end'] - 1]
profile_intervals['start_time'] = first_timestamp - one_step
profile_intervals['end_time'] = last_timestamp + one_step
profile_intervals = profile_intervals.set_index(['start', 'end'], append=True)
profile_intervals

*notes:*  
- start is inclusive, end is exclusive so the interval is $[start, end[$  
- start_time and end_time are both exclusive $]start\_time, end\_time[$  
This works better for plotting in altair
     

## Remove the missing hour on March 27 due to the change from winter to summer time 

In [None]:
# Drop the interval whose time bounds exactly match the hour skipped by the switch to summer time
# NOTE(review): start_time and end_time are exclusive bounds - confirm that these two literals match the gap as it appears in the data
starts_at_switch = profile_intervals.start_time == '2016-03-27 02:00:00'
ends_at_switch = profile_intervals.end_time == '2016-03-27 03:00:00'
profile_intervals = profile_intervals[~(starts_at_switch & ends_at_switch)]

## Add two next values after each interval

In [None]:
# profile_intervals['value_after_interval'] =
def values_after_end(row):
    """
        Looks up the two measurements that directly follow an interval.

        `row.name` is unpacked as (meterID, year, start, end), where `end` is the column
        position right after the interval. The values are read from data16_df.

        Returns a tuple (first_value, second_value); a value is the string 'end' when
        its position lies beyond the last column of data16_df.
    """
    meterID, year, _start, end = row.name

    def value_at(position):
        # Only a position past the last column is expected here; any other error
        # (e.g. an unknown profile) should surface instead of being reported as 'end'
        try:
            timestamp = data16_df.columns[position]
        except IndexError:
            return 'end'
        return data16_df.at[(meterID, year), timestamp]

    return value_at(end), value_at(end + 1)

# Guard so that re-running this cell does not add the two columns a second time
if 'first_value_after_end' not in profile_intervals.columns:
    after_values_df = profile_intervals.apply(values_after_end, axis=1, result_type='expand')
    after_values_df = after_values_df.rename(
        columns={0: 'first_value_after_end', 1: 'second_value_after_end'}
    )
    profile_intervals = pd.concat([profile_intervals, after_values_df], axis=1)
profile_intervals

## Let's first focus on NaN intervals that start and end on the same day

In [None]:
# An interval counts as same-day when both of its time bounds fall on the same date
same_day_intervals = profile_intervals.start_time.dt.date == profile_intervals.end_time.dt.date
is_nan_interval = profile_intervals.interval_value.isna()
# Copy, because columns are added to this selection later on
nan_intervals = profile_intervals.loc[same_day_intervals & is_nan_interval].copy()
nan_intervals

### Get the missing day data

In [None]:
# Date of each interval, taken from its start time
nan_intervals['date'] = nan_intervals.start_time.dt.date
# One row per distinct (meterID, year, date) combination that has a missing interval
day_keys = nan_intervals.reset_index().loc[:, ['meterID', 'year', 'date']]
days_with_missing_data = day_keys.drop_duplicates()


def get_data(row):
    """
        Returns the measurements of one profile on one day as a numpy array.

        `row` is unpacked as (meterID, year, date); the values are read from data16_df.
    """
    meterID, year, date = row
    # Select the columns through a boolean mask on their dates, because
    # using the date directly in .loc does not work
    on_requested_day = data16_df.columns.date == date
    day_profile = data16_df.loc[(meterID, year), on_requested_day]
    return day_profile.values
    
# Fetch the measurements of every day that has missing data, one day per row
day_data = days_with_missing_data.apply(get_data, axis=1, result_type='expand')
# Label the columns by time of day; the date part of these labels is a fixed placeholder
time_of_day = pd.date_range('2016-01-01', '2016-01-01 23:45', freq='15min')
day_index = pd.MultiIndex.from_frame(days_with_missing_data)
day_data.columns = time_of_day
day_data.index = day_index
# From here on this name holds the day profiles instead of the day keys
days_with_missing_data = day_data
days_with_missing_data

### Match these days with the most similar day based on a certain similarity metric

In [None]:
def sim_as_real_measurement(full_day, missing_day):
    """
        Distance between a complete day and a day with missing values, where the value
        right after each missing interval is treated as a normal measurement.

        Parameters:
            full_day: one-dimensional array with the candidate day (expected to contain no NaNs)
            missing_day: array of the same length in which the missing values are NaN

        Returns:
            The euclidean distance over the known values (without the values right after
            a missing interval) plus the euclidean distance over the values right after
            the missing intervals. Lower means more similar.
    """
    full_day = np.asarray(full_day, dtype=float)
    missing_day = np.asarray(missing_day, dtype=float)
    is_missing = np.isnan(missing_day)
    # Pad each end with an extra 0 so that runs touching a border are detected as well
    padded = np.concatenate(([0], is_missing.astype(np.int8), [0]))
    # Runs start and end where the padded indicator changes (end is exclusive).
    ranges = np.flatnonzero(np.diff(padded)).reshape(-1, 2)
    # Position right after each missing interval; an interval that runs until the
    # end of the day has no such position
    after_interval = ranges[:, 1]
    after_interval = after_interval[after_interval < len(missing_day)]

    # euclidean distance of known values (without value after missing interval)
    # This has to be a boolean mask: an array of 0/1 integers would be interpreted
    # as positions and select elements 0 and 1 over and over again
    values_to_use = ~is_missing
    values_to_use[after_interval] = False
    euclidean = np.linalg.norm(missing_day[values_to_use] - full_day[values_to_use])

    # distances between values after missing intervals
    other_part = np.linalg.norm(missing_day[after_interval] - full_day[after_interval])
    return euclidean + other_part

def sim_as_cumulative_measurement(full_day, missing_day):
    """
        Distance between a complete day and a day with missing values, where the value
        right after each missing interval is treated as a cumulative measurement, i.e.
        as the sum of the consumption during the interval and at the value itself.

        Parameters:
            full_day: one-dimensional array with the candidate day (expected to contain no NaNs)
            missing_day: array of the same length in which the missing values are NaN

        Returns:
            The euclidean distance over the known values (without the values right after
            a missing interval) plus the norm of the differences between the summed
            consumption of full_day and the cumulative values. Lower means more similar.
    """
    full_day = np.asarray(full_day, dtype=float)
    missing_day = np.asarray(missing_day, dtype=float)
    is_missing = np.isnan(missing_day)
    # Pad each end with an extra 0 so that runs touching a border are detected as well
    padded = np.concatenate(([0], is_missing.astype(np.int8), [0]))
    # Runs start and end where the padded indicator changes (end is exclusive).
    ranges = np.flatnonzero(np.diff(padded)).reshape(-1, 2)
    # An interval that runs until the end of the day has no value after it to compare with
    ranges = ranges[ranges[:, 1] < len(missing_day)]

    # euclidean distance of known part (without the cumulative values)
    # This has to be a boolean mask: an array of 0/1 integers would be interpreted
    # as positions and select elements 0 and 1 over and over again
    values_to_use = ~is_missing
    values_to_use[ranges[:, 1]] = False
    euclidean = np.linalg.norm(missing_day[values_to_use] - full_day[values_to_use])

    # distance between cumulative measurements and the sum of the measurement during missing interval
    # (the slice includes position `end`, the slot of the cumulative value itself)
    other_vector = [
        np.sum(full_day[start:end + 1]) - missing_day[end]
        for start, end in ranges
    ]
    other_part = np.linalg.norm(np.asarray(other_vector, dtype=float))
    return euclidean + other_part
        
    
    

In [None]:
# Example day to fill in, and the width (in days) of the window to search around it
missing_day_row = days_with_missing_data.iloc[500, :]
search_window = 90
meterID, year, date = missing_day_row.name
print(meterID)
# Half of the window lies before the day itself and half of it after
half_window = pd.Timedelta(f'{search_window//2}D')
min_date = date - half_window
max_date = date + half_window
full_profile = data16_df.loc[(meterID, year), :]
dates_to_match = pd.date_range(min_date, max_date, freq='1D')
# The measurements of this profile between min_date and max_date, as a one-column frame
all_relevant_days = full_profile.loc[min_date:max_date].to_frame('value')
all_relevant_days

In [None]:
# Reshape the window into one row per date and one column per time of day
relevant_days = all_relevant_days.copy()
relevant_days['date'] = relevant_days.index.date
relevant_days['time'] = relevant_days.index.time
day_by_time = pd.pivot_table(relevant_days, values='value', index='date', columns='time')
# Only days without any missing value are kept as candidates
possible_days = day_by_time.dropna(axis=0, how='any')
possible_days.index = pd.to_datetime(possible_days.index)
possible_days.columns = pd.to_datetime(possible_days.columns, format='%H:%M:%S')
possible_days

In [None]:
# Distance of every candidate day to the day with missing data, under both interpretations
missing_values = missing_day_row.to_numpy()
distances_real_measurement = possible_days.apply(
    sim_as_real_measurement, axis=1, raw=True, missing_day=missing_values
)
distances_cum_measurement = possible_days.apply(
    sim_as_cumulative_measurement, axis=1, raw=True, missing_day=missing_values
)
distances = distances_real_measurement.to_frame('real').assign(cumulative=distances_cum_measurement)
distances

In [None]:
# Position of the smallest distance per interpretation, translated to the matching date
real_position = np.argmin(distances['real'])
cumulative_position = np.argmin(distances['cumulative'])
best_real_match = distances.index[real_position]
best_cumulative_match = distances.index[cumulative_position]
distances.loc[[best_real_match, best_cumulative_match], :]

In [None]:
# Frames behind the four charts
missing_day_frame = missing_day_row.to_frame('value').reset_index()
window_frame = all_relevant_days.reset_index()
best_real_frame = possible_days.loc[best_real_match].to_frame('value').reset_index()
best_cumulative_frame = possible_days.loc[best_cumulative_match].to_frame('value').reset_index()

missing_data = (
    alt.Chart(missing_day_frame, title='missing day')
    .mark_line()
    .encode(x='index:T', y='value')
)
full_month = (
    alt.Chart(window_frame, width=1600, title='days to match with')
    .mark_line()
    .encode(x='timestamp:T', y='value')
    .interactive(bind_y=False)
)
best_day = (
    alt.Chart(best_real_frame, title='best_real_match')
    .mark_line()
    .encode(x='time:T', y='value')
)
best_cumulative_day = (
    alt.Chart(best_cumulative_frame, title='best_cumulative_match')
    .mark_line()
    .encode(x='time:T', y='value')
)
# The search window on top, the three day charts stacked below it on a shared y-scale
full_month & (missing_data & best_day & best_cumulative_day).resolve_scale(y='shared')
# (missing_data & best_day & best_cumulative_day).resolve_scale(y='shared')