# Leveraging similar days: clean version
The main idea: when filling in the missing values of a profile, use the values of the most similar day. 

In this notebook I also try using this idea to figure out if the value after a peak is a cumulative measurement or not: 

1. search for the most similar day based on a similarity metric that assumes that the value after the missing interval is a cumulative measurement  
2. search for the most similar day based on a similarity metric that assumes that the value after the missing interval is a normal measurement  
Check which of these two similar days is the most similar and use that one.  
I think this would result in using the similar day from 2. whenever the peak is not abnormal and using the day from 1. if it is abnormal  

It's a bit more adaptive than using a normal distribution learned on similar days because it takes the context into account  

## Imports and set-up

In [None]:
import altair as alt
import numpy as np
import pandas as pd
from pathlib import Path
import itertools
import datetime
import tqdm
# Shorthand for slicing on MultiIndex levels inside .loc lookups
idx = pd.IndexSlice
# Lift altair's row limit on the data passed to a chart
alt.data_transformers.disable_max_rows()

In [None]:
# Folder with the preprocessed dataset; info.csv and data.csv are read from it below
PRE_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/preprocessed/combined')
info_path = PRE_PATH / 'info.csv'
data_path = PRE_PATH / 'data.csv'
# Fail early and name the offending path(s); an assert would be stripped when running with -O
missing_paths = [str(path) for path in (info_path, data_path) if not path.exists()]
if missing_paths:
    raise FileNotFoundError(f"These paths should exist: {', '.join(missing_paths)}")

## Get the data 

In [None]:
# Read the profile metadata; the first two csv columns become the row MultiIndex
info_df = pd.read_csv(info_path, index_col = [0,1])
info_df.head()

In [None]:
# Read the measurements; the first two csv columns become the row MultiIndex (same layout as info_df)
data_df = pd.read_csv(data_path, index_col = [0,1])
# The column labels are parsed into timestamps, so every column is one measurement time
data_df.columns = pd.to_datetime(data_df.columns)
# The name of the column index becomes the 'timestamp' field used when plotting a profile
data_df.columns.name = 'timestamp'
data_df.head()

### Start with a subset of the data

In [None]:
DATA_SOURCE = 'EandisVREG'
YEAR = 2016
# get the right subset based on the info df: rows of YEAR (second index level) that come from DATA_SOURCE
# (the constants are used directly so that changing them actually changes the subset)
info16_df = info_df.loc[idx[:, YEAR], :]
info16_df = info16_df[info16_df.data_source == DATA_SOURCE]
info16_df

In [None]:
# read the corresponding data profiles 
# read the corresponding data profiles: the rows of data_df whose index appears in info16_df
data16_df = data_df.loc[info16_df.index, :]
data16_df

### Only use profiles with potential problems

In [None]:
# nb of zeros for each profile
# number of NaN values and number of zero values for each profile (row)
nb_of_na = (data16_df.isna()).sum(axis = 1)
nb_of_zeros = (data16_df == 0).sum(axis = 1)

# keep only the profiles that contain at least one NaN or at least one zero
data16_df= data16_df.loc[(nb_of_na>0)| (nb_of_zeros>0), :]
data16_df

## Construct the intervals
So in the rest of this code we simply construct the intervals as a dataset and add different attributes/features and investigate whether they could be useful or not

In [None]:
# code to find intervals with only zeros
def value_interval(meterID, year, a, value):
    """
        Makes a dataframe containing the start and end of each interval (only the longest intervals) that only contains value

        meterID, year: written to every row and used as the index of the result
        a: 1-D sequence of measurements (ndarray, list or pandas Series)
        value: the value to look for; NaN is matched with an is-NaN test because NaN != NaN

        Returns a dataframe indexed by (meterID, year) with columns 'start' (inclusive position),
        'end' (exclusive position) and 'interval_value'; it has no rows when value does not occur in a.
    """
    # Work on a plain ndarray: the boolean->int8 conversion below is then independent of the
    # container type (Series.view is deprecated in pandas, and lists have no .view at all)
    values = np.asarray(a)
    if np.isnan(value):
        is_match = pd.isna(values)
    else:
        is_match = np.equal(values, value)
    # 1 where a equals value, padded with a 0 on both sides so runs touching an edge are closed
    padded = np.concatenate(([0], is_match.astype(np.int8), [0]))
    absdiff = np.abs(np.diff(padded))
    # Runs start and end where absdiff is 1.
    ranges = np.where(absdiff == 1)[0].reshape(-1, 2)
    df = pd.DataFrame(ranges, columns = ['start', 'end'])
    df['meterID'] = meterID
    df['year'] = year
    df['interval_value'] = value
    return df.set_index(['meterID', 'year'])

def zero_nan_intervals_df(data_df): 
    """
        Collects the NaN intervals and the zero intervals of every profile in data_df

        data_df: dataframe with one profile per row, indexed by (meterID, year)

        Returns the concatenation of the value_interval results, per profile first the NaN intervals
        and then the zero intervals.
    """
    dfs = []
    for (meterID, year), row in data_df.iterrows(): 
        # np.nan instead of the np.NaN alias, which no longer exists in NumPy 2.0
        dfs.append(value_interval(meterID, year, row, np.nan))
        dfs.append(value_interval(meterID, year, row, 0))
    return pd.concat(dfs, axis = 0)

# One row per NaN/zero interval of every profile; start and end are column positions in data16_df
profile_intervals = zero_nan_intervals_df(data16_df)
# end is exclusive, so this is the number of measurements in the interval
profile_intervals['interval_length'] = profile_intervals.end - profile_intervals.start
# start time and end time are exclusive! (this plots well with altair, that is why we do it this way)
# start_time: timestamp of the first measurement in the interval, moved back by 15 minutes
profile_intervals['start_time'] = data16_df.columns[profile_intervals['start']] - pd.Timedelta('15min')
# end_time: timestamp of the last measurement in the interval, moved forward by 15 minutes below
profile_intervals['end_time'] = data16_df.columns[profile_intervals['end']-1] # doing it this way because the timestamp we need might not exist in the columns
profile_intervals['end_time'] += pd.Timedelta('15min')
# The index becomes (meterID, year, start, end)
profile_intervals = profile_intervals.set_index(['start', 'end'], append = True)
profile_intervals

*notes:*  
- start is inclusive, end is exclusive so the interval is $[start, end[$  
- start_time and end_time are both exclusive $]start\_time, end\_time[$  
This works better for plotting in altair
     

## Remove the missing hour on march 27 due to change from winter to summer time 

In [None]:
# Intervals that end at 03:00 on March 27 and start at 02:00 or 01:45 are caused by the
# missing hour of the switch to summer time, so they are dropped
ends_after_dst_gap = profile_intervals.end_time == '2016-03-27 03:00:00'
starts_at_dst_gap = (profile_intervals.start_time == '2016-03-27 02:00:00') | (profile_intervals.start_time == '2016-03-27 01:45:00')
profile_intervals = profile_intervals[~(starts_at_dst_gap & ends_after_dst_gap)]

## Add two next values after each interval

In [None]:
# profile_intervals['value_after_interval'] =
def values_after_end(row):
    """
        Looks up the two measurements that directly follow an interval

        row: a row of profile_intervals; its name is the index tuple (meterID, year, start, end)

        Returns (first_value, second_value) read from data16_df at column positions end and end + 1.
        A position that falls outside the profile is reported as the string 'end'.
    """
    meterID, year, _start, end = row.name
    values = []
    for position in (end, end + 1):
        try:
            values.append(data16_df.at[(meterID, year), data16_df.columns[position]])
        except (IndexError, KeyError):
            # position lies past the last column: the interval runs until the end of the profile
            values.append('end')
    return tuple(values)

# Guard so that re-running this cell does not add the two columns a second time
if 'first_value_after_end' not in profile_intervals.columns:
    # values_after_end returns a pair per interval; 'expand' turns it into columns 0 and 1, which are renamed
    after_values_df = profile_intervals.apply(values_after_end, axis = 1, result_type = 'expand').rename(columns = {0:'first_value_after_end', 1:'second_value_after_end'})
    profile_intervals = pd.concat([profile_intervals, after_values_df], axis = 1)
profile_intervals

## Let's first focus on NaN intervals that start and end on the same day

In [None]:
# An interval is a same-day interval when its (exclusive) start_time and end_time fall on the same date
start_dates = profile_intervals['start_time'].dt.date
end_dates = profile_intervals['end_time'].dt.date
same_day_intervals = start_dates == end_dates
# Separate name for the mask, so nan_intervals only ever refers to the resulting dataframe
is_nan_interval = profile_intervals['interval_value'].isna()
nan_intervals = profile_intervals.loc[same_day_intervals & is_nan_interval].copy()
nan_intervals

## The actual matching

### The similarity metrics

In [None]:
def sim_as_real_measurement(full_day, missing_day): 
    """
        Distance between a complete reference day and a day with missing values, assuming that the
        value right after each missing interval is a normal measurement

        full_day: 1-D array without NaNs
        missing_day: 1-D array of the same length in which the missing values are NaN

        Returns the euclidean distance over the known values (without the values right after a
        missing interval) plus the euclidean distance over the values right after the missing intervals.
    """
    full_day = np.asarray(full_day, dtype = float)
    missing_day = np.asarray(missing_day, dtype = float)
    is_missing = np.isnan(missing_day)

    # Position of the first known value after every run of NaNs (the exclusive end of the run).
    # A run that reaches the end of the array has no value after it and is skipped.
    padded = np.concatenate(([0], is_missing.astype(np.int8), [0]))
    run_ends = np.where(np.diff(padded) == -1)[0]
    after_missing = run_ends[run_ends < missing_day.size]

    # euclidean distance of known values (without value after missing interval)
    # known is a boolean mask; indexing with an integer 0/1 array would pick elements 0 and 1 instead
    known = ~is_missing
    known[after_missing] = False
    euclidean = np.linalg.norm(missing_day[known] - full_day[known])

    # distances between values after missing intervals
    other_part = np.linalg.norm(missing_day[after_missing] - full_day[after_missing])
    return euclidean + other_part

def sim_as_cumulative_measurement(full_day, missing_day): 
    """
        Distance between a complete reference day and a day with missing values, assuming that the
        value right after each missing interval is a cumulative measurement

        full_day: 1-D array without NaNs
        missing_day: 1-D array of the same length in which the missing values are NaN

        Returns the euclidean distance over the known values (without the values right after a
        missing interval) plus the norm of the differences between each value after a missing interval
        and the reference day's summed consumption over that interval (that value's position included).
    """
    full_day = np.asarray(full_day, dtype = float)
    missing_day = np.asarray(missing_day, dtype = float)
    is_missing = np.isnan(missing_day)

    # [start, end[ position of every run of NaNs; end is the position of the value after the run
    padded = np.concatenate(([0], is_missing.astype(np.int8), [0]))
    ranges = np.where(np.diff(padded) != 0)[0].reshape(-1, 2)
    # A run that reaches the end of the array has no measurement after it and is skipped
    ranges = ranges[ranges[:, 1] < missing_day.size]

    # euclidean distance of known part
    # known is a boolean mask; indexing with an integer 0/1 array would pick elements 0 and 1 instead
    known = ~is_missing
    known[ranges[:, 1]] = False
    euclidean = np.linalg.norm(missing_day[known] - full_day[known])

    # distance between cumulative measurements and the sum of the measurement during missing interval 
    other_vector = [
        np.sum(full_day[start:end + 1]) - missing_day[end]
        for start, end in ranges
    ]
    other_part = np.linalg.norm(other_vector)
    return euclidean + other_part
        
    
    

### Match function

In [None]:
from math import ceil
def construct_search_intervals(start_time, end_time, reference_day_window, context_size): 
    """
        This function constructs all the parts of the timeseries that we can compare with 
        (same period as the missing period with context_size /2 added to both sides for every day in 'reference_day_window' around the missing day)

        start_time, end_time: timestamps delimiting the missing period
        reference_day_window: total number of days around the missing day to search in (half before, half after, rounded up)
        context_size: anything pd.Timedelta accepts; total amount of context, half of it is put on each side

        Returns a dataframe with columns 'start' and 'end', one row per day (the missing day included).
        Periods that are not completely inside the time range of data16_df are left out.
    """
    reference_day_window_one_side = pd.Timedelta(days = ceil(reference_day_window / 2))
    context_size = pd.Timedelta(context_size)
    context_size_one_side = context_size / 2 
    
    reference_period_length = end_time - start_time + context_size
    
    # the period on the missing day itself starts half the context size before the missing interval
    missing_period_start = start_time - context_size_one_side
    # date_range includes both end points, so going the same number of days back and forward already
    # covers the missing day; adding one more day would make the window one day longer on the future side
    first_reference_period_start = missing_period_start - reference_day_window_one_side
    final_reference_period_start = missing_period_start + reference_day_window_one_side
    search_interval_starts = pd.date_range(first_reference_period_start, final_reference_period_start, freq = 'D')
    search_interval_ends = pd.date_range(first_reference_period_start + reference_period_length, final_reference_period_start + reference_period_length, freq = 'D')
    df = search_interval_starts.to_frame(name = 'start').reset_index(drop = True)
    df['end'] = search_interval_ends
    # filter out intervals that fall outside of known range
    min_date, max_date = data16_df.columns.min(), data16_df.columns.max()
    before_start = df['start'] < min_date 
    after_end = df['end'] > max_date
    return df[~before_start & ~after_end]

def add_data_to_search_intervals(meterID,year, search_interval_df): 
    """
        Builds a dataframe holding the measurements of profile (meterID, year) for every period in search_interval_df

        Each row of the result is one period and is indexed by its (start, end) pair. The columns are
        relabelled with 15 minute timestamps that start on 2016-01-01 at the time of day of the first period.
    """
    profile_key = (meterID, year)

    def slice_profile(period):
        period_start, period_end = period
        return data16_df.loc[profile_key, period_start:period_end].values

    period_values = search_interval_df.apply(slice_profile, axis = 1, result_type = 'expand')

    # use the first period to derive the common column labels
    first_start, first_end = search_interval_df.iloc[0]
    days_spanned = first_end.date() - first_start.date()
    label_start = first_start.replace(year = 2016, month = 1, day = 1)
    label_end = first_end.replace(year = 2016, month = 1, day = 1) + days_spanned
    period_values.columns = pd.date_range(label_start, label_end, freq = '15min')
    period_values.index = pd.MultiIndex.from_frame(search_interval_df)
    return period_values

def match_interval_wrapper(row): 
    """ 
        Adapter that lets match_interval be used in a row-wise pandas apply: 
        takes the profile key from the row's index and the interval from its start_time/end_time columns
    """
    meterID, year, _start, _end = row.name 
    single_row_result = match_interval(meterID, year, row['start_time'], row['end_time'])
    # the result is a one-row dataframe, squeeze turns it into a Series
    return single_row_result.squeeze()

def match_interval(meterID, year, start_time, end_time, reference_day_window = 30, context_size = '4H'):
    """
        Function that will find the best match to the missing interval of meter meterID, year year between start_time and end_time

        reference_day_window, context_size: passed on to construct_search_intervals

        Returns a one-row dataframe with the columns real_distance, cumulative_distance, real_match,
        cumulative_match and best_match ('real' or 'cumulative', whichever distance is smaller).
        When no distances can be calculated a message is printed and all five values are NaN.
    """
    result_columns = ['real_distance', 'cumulative_distance', 'real_match', 'cumulative_match', 'best_match']
    context_size_one_side = pd.Timedelta(context_size)/2
    # the period of the missing day is identified by its start in the index of data_df
    missing_day_start = start_time - context_size_one_side

    def no_match():
        print(f"error in profile {meterID}, {start_time}, {end_time}")
        return pd.DataFrame([[np.nan]*5], columns = result_columns)

    # make the dataframe with all the relevant data
    search_intervals_df = construct_search_intervals(start_time, end_time, reference_day_window, context_size)
    data_df = add_data_to_search_intervals(meterID, year, search_intervals_df)
    
    # seperate the missing day from all the other days
    missing_day = data_df.loc[missing_day_start]
    # drop reference days with data problems (drop returns a new dataframe, data_df is left untouched)
    reference_days = data_df.drop(index = missing_day_start).dropna()
    if reference_days.empty:
        return no_match()
    
    # calculate the similarity between missing day and each reference day
    try:
        missing_values = missing_day.squeeze().to_numpy()
        distances_real_measurement = reference_days.apply(sim_as_real_measurement, axis = 1, missing_day = missing_values, raw = True)
        distances_cum_measurement = reference_days.apply(sim_as_cumulative_measurement, axis = 1, missing_day = missing_values, raw = True)
        distances = distances_real_measurement.to_frame('real')
        distances['cumulative'] = distances_cum_measurement
    except Exception:
        # best effort: one failing profile should not stop the whole apply, but KeyboardInterrupt
        # and SystemExit are no longer swallowed
        return no_match()
    
    # calculate the smallest distances
    best_real_distance, best_cumulative_distance = distances.min(axis = 0)
    # the index holds (start, end) of each reference period; adding half the context gives the start of the matched interval
    best_real_match_date = distances.index[np.argmin(distances['real'])][0] + context_size_one_side
    best_cumulative_match_date = distances.index[np.argmin(distances['cumulative'])][0] + context_size_one_side
    best_match = 'real' if best_real_distance < best_cumulative_distance else 'cumulative'
    return pd.DataFrame([[best_real_distance, best_cumulative_distance, best_real_match_date, best_cumulative_match_date, best_match]], columns = result_columns)
    
    

### Single test

In [None]:
# Try the matching on the first same-day NaN interval
test_interval = nan_intervals.iloc[0]

meterID, year, _, _ = test_interval.name
# read the interval bounds by label instead of by position in the row
start_time = test_interval['start_time']
end_time = test_interval['end_time']
distances = match_interval(meterID, year, start_time, end_time)
distances

### Apply to all the intervals

In [None]:
# The matching is applied to all same-day NaN intervals (test_set is the same object as nan_intervals here)
test_set = nan_intervals
test_set

In [None]:
# Run the matching for every interval; each call yields one row of results
matches = test_set.apply(match_interval_wrapper, axis = 1)
matches

In [None]:
# Put the match results next to the interval attributes
# NOTE(review): running this cell twice adds the match columns a second time
test_set = pd.concat([test_set, matches], axis = 1)
test_set

# plotting some results

In [None]:
def plot_profile_with_intervals(meterID, year, period_type_column = None, data = None, daterange = None):
    """
        Plots one profile as a line with its intervals drawn on top as shaded rectangles

        meterID, year: key of the profile in data16_df and of its intervals in data
        period_type_column: column of data that determines the color of the intervals (all blue when None)
        data: dataframe with the intervals, needs the columns 'start_time' and 'end_time' (nan_intervals when None)
        daterange: optional pair (first_month, last_month); only the part of the profile between the first
                   day of these two months of `year` is plotted, together with the intervals that overlap it

        Returns an interactive altair chart
    """
    if data is None : 
        data = nan_intervals
    profile_df = data16_df.loc[(meterID, year),:]
    periods_for_profile = data.loc[(meterID,year), :]
    if daterange is not None: 
        # build the bounds from the requested year (not a fixed 2016) so they match the plotted profile
        start_time = f'{year}-{daterange[0]}-1 00:00:00'
        end_time = f'{year}-{daterange[1]}-1 00:00:00'
        profile_df = profile_df.loc[start_time:end_time]
        periods_for_profile = periods_for_profile[(periods_for_profile['end_time'] > start_time ) & (periods_for_profile['start_time'] < end_time)]

    line = alt.Chart(profile_df.to_frame('value').reset_index()).mark_line().encode(
        x = alt.X('timestamp:T'), 
        y = alt.Y('value:Q')
    )
    if period_type_column is None: 
        color_encoding = alt.ColorValue('blue') 
    else: 
        color_encoding = alt.Color(f'{period_type_column}:N')
    plot_df = periods_for_profile.reset_index(drop=True)
    # every interval is a rectangle from start_time to end_time plus a circle at its start_time
    rect = alt.Chart(plot_df).mark_rect(opacity = 0.6).encode(
        x = 'start_time:T',
        x2 = 'end_time:T', 
        color = color_encoding
    ) + alt.Chart(plot_df).mark_circle(opacity = 0.6).encode(
        x = 'start_time:T',
        y = alt.YValue(profile_df.max()),
        color = color_encoding
    )
    chart = rect + line
    if 'connection_power' in periods_for_profile.columns: 
        # horizontal rule at the mean connection power of the plotted intervals
        connection_power_line = alt.Chart(periods_for_profile.reset_index()).mark_rule(color = 'black', opacity = 0.8).encode(
            y =  'mean(connection_power):Q'
        )
        chart += connection_power_line
    return chart.properties(width = 2200, title = f"{meterID} in {year}").interactive()

def plot_helper(profile_idx): 
    """Plots the profile_idx-th distinct meter of test_set for 2016, with the intervals colored by best_match"""
    meter_ids = test_set.index.get_level_values(0).unique()
    return plot_profile_with_intervals(meter_ids[profile_idx], 2016, period_type_column = 'best_match', data = test_set)


In [None]:
# Position (among the distinct meters of test_set) of the profile to inspect
PROFILE_TO_CHECK = 28
plot_helper(PROFILE_TO_CHECK)

In [None]:
# Show the intervals and match results of the profile plotted above
meterID = test_set.index.get_level_values(0).unique()[PROFILE_TO_CHECK]
print(meterID)
test_set.loc[meterID]


# calculating some statistics about real/cumulative values

In [None]:
# Number of intervals per best_match value.
# The category column is named explicitly: after value_counts().reset_index() it is only called
# 'index' on pandas < 2.0, so encoding x = 'index' breaks on newer versions
temp = test_set.best_match.value_counts().rename_axis('best_match').reset_index(name = 'count')
alt.Chart(temp).mark_bar().encode(
    x = 'best_match', 
    y = 'count'
)

# Look at the real measurements in depth

In [None]:
# Intervals for which the 'real measurement' assumption gave the smallest distance
real_measurements = test_set[test_set.best_match == 'real']
real_measurements

In [None]:
# exclude the once that have a zero and then a high value 
real_measurements = real_measurements.query('~(first_value_after_end == 0 & second_value_after_end > 1)')
real_measurements

In [None]:
def plot_helper2(index): 
    """Plots the index-th distinct meter of real_measurements for 2016, with the intervals of test_set colored by best_match"""
    meter_ids = real_measurements.index.get_level_values(0).unique()
    return plot_profile_with_intervals(meter_ids[index], 2016, period_type_column = 'best_match', data = test_set)
    

In [None]:
# Position (among the distinct meters of real_measurements) of the profile to inspect
IDX = 5
plot_helper2(IDX)

In [None]:
# Show all intervals and match results of the profile plotted above
meterID = real_measurements.index.get_level_values(0).unique()[IDX]
print(meterID)
test_set.loc[meterID]

### Look at some of the longer intervals 

In [None]:
# Longest intervals first
real_measurements_by_length = real_measurements.sort_values('interval_length', ascending = False)
real_measurements_by_length

In [None]:
def plot_helper3(index): 
    """Plots the index-th distinct meter of real_measurements_by_length for 2016, with the intervals of test_set colored by best_match"""
    meter_ids = real_measurements_by_length.index.get_level_values(0).unique()
    return plot_profile_with_intervals(meter_ids[index], 2016, period_type_column = 'best_match', data = test_set)
    

In [None]:
# Position (among the distinct meters of real_measurements_by_length) of the profile to inspect
IDX = 1
plot_helper3(IDX)

In [None]:
# Show all intervals and match results of the profile plotted above
meterID = real_measurements_by_length.index.get_level_values(0).unique()[IDX]
print(meterID)
test_set.loc[meterID]