In [None]:
from pathlib import Path
from energyclustering.data.preprocessing.interval_information import get_interval_df
from energyclustering.data.preprocessing.peakdetection import replace_connection_and_pv_power_peaks_with_nan
import pandas as pd 
import altair as alt
# Lift Altair's max-rows limit so that long profiles can be passed to a chart
# (plot_profile below charts one complete row of data_df).
alt.data_transformers.disable_max_rows()

In [None]:
# Reload imported modules automatically before each cell execution, so edits to
# the imported project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Candidate locations of the preprocessed data. The first directory that exists
# on the current machine is used as PRE_PATH.
paths = [Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/new_preprocessed/combined'),
        # Lola: simply add your own data directory to this list
        ]
PRE_PATH = next((path for path in paths if path.exists()), None)
if PRE_PATH is None:
    # Fail with an actionable message instead of a bare StopIteration.
    raise FileNotFoundError(
        'None of the candidate data directories exists: '
        + ', '.join(str(path) for path in paths)
    )

# Input files (read from PRE_PATH).
DATA_DF_NAME = 'reindexed_DST_data.pkl'
INFO_DF_NAME = 'reindexed_info.pkl'
# Output files (written to PRE_PATH by the save cell).
RESULT_DF_NAME = 'reindexed_DST_data_subset_no_errors.pkl'
INFO_RESULT_DF_NAME = 'reindexed_info_subset_no_errors.pkl'

assert (PRE_PATH/DATA_DF_NAME).exists() and (PRE_PATH/INFO_DF_NAME).exists(), (
    f'Expected both {DATA_DF_NAME} and {INFO_DF_NAME} in {PRE_PATH}'
)
# NOTE: there is no overwrite guard; the save cell writes the result files
# unconditionally and replaces any existing ones.

## Read the data

In [None]:
# Load the two input frames from PRE_PATH.
# NOTE(review): unpickling executes arbitrary code; only load files from a
# trusted source (presumably these are produced by an earlier project step).
data_df = pd.read_pickle(PRE_PATH/DATA_DF_NAME)
info_df = pd.read_pickle(PRE_PATH/INFO_DF_NAME)

## Fill the missing DST hour with zeros

In [None]:
# Daylight-saving switch moments (02:00) for the measurement years 2010-2017.
# The variable names carry the real year, but every date is written in calendar
# year 2016 -- presumably because all profiles share one common 2016 time axis
# at this point (TODO confirm against the preprocessing that produced the data).
mar_2010, oct_2010 = pd.to_datetime(['2016-03-28 02:00:00', '2016-10-31 02:00:00'])
mar_2011, oct_2011 = pd.to_datetime(['2016-03-27 02:00:00', '2016-10-30 02:00:00'])
mar_2012, oct_2012 = pd.to_datetime(['2016-03-25 02:00:00', '2016-10-28 02:00:00'])
mar_2013, oct_2013 = pd.to_datetime(['2016-03-31 02:00:00', '2016-10-27 02:00:00'])
mar_2014, oct_2014 = pd.to_datetime(['2016-03-30 02:00:00', '2016-10-26 02:00:00'])
mar_2015, oct_2015 = pd.to_datetime(['2016-03-29 02:00:00', '2016-10-25 02:00:00'])
mar_2016, oct_2016 = pd.to_datetime(['2016-03-27 02:00:00', '2016-10-30 02:00:00'])
mar_2017, oct_2017 = pd.to_datetime(['2016-03-26 02:00:00', '2016-10-29 02:00:00'])

# One row per year, looked up with DST_times.loc[year].
DST_times = pd.DataFrame(
    {
        'DST_start': [mar_2010, mar_2011, mar_2012, mar_2013,
                      mar_2014, mar_2015, mar_2016, mar_2017],
        'DST_end': [oct_2010, oct_2011, oct_2012, oct_2013,
                    oct_2014, oct_2015, oct_2016, oct_2017],
    },
    index=range(2010, 2018),
).rename_axis(index='year')
DST_times

In [None]:
def fill_DST_missing_hour_with_zeros(row):
    """Zero-fill the hour that is skipped when daylight saving time starts.

    Meant to be used row-wise, e.g. ``df.apply(..., axis=1)``.

    Parameters
    ----------
    row : pandas.Series
        One profile. ``row.name`` must unpack into ``(meterID, year)`` and the
        Series must be indexed by timestamps.

    Returns
    -------
    pandas.Series
        The same ``row`` object, with every value from ``DST_start`` up to and
        including ``DST_start + 45min`` set to 0 (four values for 15-minute
        data). ``DST_start`` is looked up in the global ``DST_times`` table
        for the row's year.

    Raises
    ------
    KeyError
        If the row's year is not present in ``DST_times``.
    AssertionError
        If any value in the skipped hour is not NaN.

    Notes
    -----
    The row is still modified in place, as before, but it is now returned as
    well. pandas does not support UDFs that mutate the object handed to them
    by ``apply``, so callers should use the return value rather than rely on
    the mutation reaching the original frame.
    """
    meterID, year = row.name
    DST_start = DST_times.loc[year, 'DST_start']
    missing_hour = slice(DST_start, DST_start + pd.Timedelta('45min'))
    # The skipped hour must not contain measurements; anything else means the
    # DST table and the data disagree.
    assert row.loc[missing_hour].isna().all(), (
        f'meter {meterID}, year {year}: expected only NaN in the skipped DST '
        f'hour starting at {DST_start}'
    )
    row.loc[missing_hour] = 0
    return row


In [None]:
# fill all the missing hours with 0
def _fill_DST_hour_and_return(row):
    """Run the DST fill on one row and hand the filled row back to ``apply``."""
    fill_DST_missing_hour_with_zeros(row)
    return row

# Assign the result instead of relying on in-place mutation inside apply:
# pandas does not support UDFs that mutate the passed object, and when the row
# is a copy (e.g. with Copy-on-Write) the fill would otherwise be lost silently.
data_df = data_df.apply(_fill_DST_hour_and_return, axis=1)
data_df

## Remove the additional day in a leap year

In [None]:
# Drop every column that falls on 29 February ...
leap_day_columns = data_df.loc[:, '2016-02-29':'2016-02-29 23:45'].columns
data_df = data_df.drop(columns=leap_day_columns)
# ... and relabel the remaining timestamps to 2015, which is not a leap year,
# so the column labels form a regular calendar year again.
data_df.columns = [timestamp.replace(year=2015) for timestamp in data_df.columns]
data_df

In [None]:
# Visual check of the columns around the removed day: 28 Feb 23:00 through
# 1 Mar 01:00 in the relabelled 2015 calendar.
data_df.loc[:,'2015-02-28 23:00':'2015-03-01 1:00']

## Remove power peaks that we are certain of

In [None]:
# Project helper from energyclustering.data.preprocessing.peakdetection.
# Going by its name it replaces connection-power and PV-power peaks with NaN,
# using info_df for the per-meter metadata -- TODO confirm in the helper itself.
# Rows that contain NaN after this step are removed by the dropna that follows.
data_df = replace_connection_and_pv_power_peaks_with_nan(data_df, info_df)
data_df

## Drop the profiles (rows) that still contain missing values

In [None]:
# Keep only rows without a single NaN; a row with any missing value is
# discarded as a whole.
data_df = data_df.dropna(axis='index', how='any')
data_df

## Take the corresponding rows of info_df

In [None]:
# Restrict info_df to the index labels that are still present in data_df, in
# data_df's order. Assumes info_df and data_df share the same index labels --
# a label missing from info_df raises KeyError.
info_df = info_df.loc[data_df.index]
info_df

## Save the results

In [None]:
# Write the cleaned data and the matching info frame next to the inputs.
# Existing files with these names are overwritten without confirmation.
data_df.to_pickle(PRE_PATH/RESULT_DF_NAME)
info_df.to_pickle(PRE_PATH/INFO_RESULT_DF_NAME)

## Look at some profiles

In [None]:
def plot_profile(profile):
    """Return an interactive Altair line chart of one row of ``data_df``.

    ``profile`` is the row label passed to ``data_df.loc``. The selected row
    is reshaped into a two-column frame (``time``, ``value``) and drawn as a
    1500 px wide line chart. Interaction is bound to the x axis only; the
    y axis stays fixed.
    """
    selected_row = data_df.loc[profile]
    long_form = selected_row.to_frame().reset_index()
    long_form = long_form.set_axis(['time', 'value'], axis=1)

    line_chart = alt.Chart(long_form, width=1500).mark_line()
    line_chart = line_chart.encode(x='time', y='value')
    return line_chart.interactive(bind_y=False)

In [None]:
# Same selection as in the "Take corresponding info_df" step; repeating it is
# harmless and keeps info_df aligned with data_df if only this part of the
# notebook is re-run.
info_df = info_df.loc[data_df.index]
info_df

In [None]:
# Rows per consumer type; missing values are counted too (dropna=False).
info_df.consumer_type.value_counts(dropna = False).to_frame('count')

In [None]:
# Rows per data source (missing values are not counted).
info_df.data_source.value_counts().to_frame('count')

In [None]:
# Rows per tariff type (missing values are not counted).
info_df.tarif_type.value_counts()

In [None]:
# Rows per category (missing values are not counted).
info_df.category.value_counts()