# Simplified handling of data issues
After a rather long and complex solution for handling the data issues, let's simplify and just use the boxplot rule (based on Kostas).

In [None]:
from pathlib import Path
from energyclustering.data.preprocessing.interval_information import get_interval_df
from energyclustering.data.preprocessing.peakdetection import replace_data_problems_with_NaN, get_cumulative_measurements_simple, replace_connection_and_pv_power_peaks_with_nan
import pandas as pd 
import altair as alt
alt.data_transformers.disable_max_rows()

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# Candidate directories holding the preprocessed data; the first one that
# exists on the current machine is used.
paths = [Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/new_preprocessed/combined'),
        # Lola: simply add your path to this list
        ]
# Use a default so a missing directory gives a readable error instead of a
# bare StopIteration.
PRE_PATH = next((path for path in paths if path.exists()), None)
if PRE_PATH is None:
    raise FileNotFoundError(
        f"None of the candidate data directories exist: {[str(path) for path in paths]}"
    )
DATA_DF_NAME = 'reindexed_DST_data.pkl'
RESULT_DF_NAME = 'reindexed_DST_data_masked_errors.pkl'
INFO_DF_NAME = 'reindexed_info.pkl'
# Both input pickles must be present before anything else is run.
assert (PRE_PATH/DATA_DF_NAME).exists() and (PRE_PATH/INFO_DF_NAME).exists(), \
    f"{DATA_DF_NAME} and/or {INFO_DF_NAME} not found in {PRE_PATH}"
# Flag read by the cells below; it has to exist before its first use.
# False means an existing result pickle is reused.
OVERWRITE = False

## Read the data, do the transformation and write the result

In [None]:
# Load the two input pickles from the data directory: first the data table,
# then the info table.
data_df, info_df = (
    pd.read_pickle(PRE_PATH / pickle_name)
    for pickle_name in (DATA_DF_NAME, INFO_DF_NAME)
)

In [None]:
%%time
# The interval table is only needed when the masked result has to be
# (re)computed, i.e. when no result pickle exists yet or OVERWRITE is set.
result_path = PRE_PATH / RESULT_DF_NAME
must_recompute = not result_path.exists() or OVERWRITE
if must_recompute:
    interval_df = get_interval_df(data_df, info_df)

In [None]:
%%time
# Honour an OVERWRITE flag set in an earlier cell. The flag used to be reset
# to False right here, so an existing result could never be overwritten.
# Fall back to False (reuse the cached result) when the flag was never defined.
OVERWRITE = globals().get('OVERWRITE', False)
result_path = PRE_PATH/RESULT_DF_NAME
if result_path.exists() and not OVERWRITE:
    # Reuse the previously written result.
    new_data_df = pd.read_pickle(result_path)
else:
    # Needs interval_df from the previous cell, which is computed under the
    # same condition (no result pickle yet, or OVERWRITE set).
    # NOTE(review): judging by the helper names, this flags problematic
    # intervals with an IQR rule and replaces them, plus connection/PV power
    # peaks, with NaN -- confirm against the helpers.
    is_error = get_cumulative_measurements_simple(data_df, info_df, interval_df, iqr_multiplier = 3)
    new_data_df = replace_data_problems_with_NaN(data_df, interval_df, is_error)
    new_data_df = replace_connection_and_pv_power_peaks_with_nan(new_data_df, info_df)
    new_data_df.to_pickle(result_path)
    # Reset the flag so re-running this cell reads the freshly written result.
    OVERWRITE = False

## Look at the result


In [None]:

def plot_profile(profile_to_check):
    """Chart one profile before and after masking, one row per version.

    Looks up ``profile_to_check`` in the module-level ``data_df`` and
    ``new_data_df``, reshapes both into a long table with the columns
    ``timestamp``, ``type`` ('original' or 'new') and ``value``, and draws
    one line chart per ``type``.

    Args:
        profile_to_check: Row label used with ``.loc`` on both data frames
            (presumably a single profile key, so each lookup yields one
            Series -- verify against the callers).

    Returns:
        An interactive Altair chart; zooming/panning is not bound to the
        y axis.
    """
    raw_series = data_df.loc[profile_to_check]
    masked_series = new_data_df.loc[profile_to_check]

    # Put both versions side by side, one column per version.
    side_by_side = pd.concat(
        [raw_series, masked_series], keys = ['original', 'new'], axis = 1
    )
    side_by_side = side_by_side.rename_axis(index = 'timestamp', columns = 'type')

    # Long format for Altair; dropna=False keeps the NaN entries as rows.
    long_df = side_by_side.stack(dropna=False).to_frame('value').reset_index()

    base_chart = alt.Chart(long_df, width = 1300, title = str(profile_to_check))
    line_chart = base_chart.mark_line().encode(
        x = 'timestamp:T',
        y = 'value',
        row = alt.Row('type:N', sort = 'descending'),
    )
    return line_chart.interactive(bind_y = False)

In [None]:
# Row labels ranked by different criteria, handy for picking profiles to inspect.

# Rows with the most missing values first, in the original and in the masked data.
nan_profiles = data_df.isna().sum(axis = 'columns').sort_values(ascending = False).index
nan_profiles_new = new_data_df.isna().sum(axis = 'columns').sort_values(ascending = False).index

# Rows ordered by their largest value, highest first.
high_consumption = data_df.max(axis = 'columns').sort_values(ascending = False).index

# Number of negative values per row, restricted to rows with at least one
# negative value, fewest first. Unlike the other rankings this stays a Series
# of counts rather than an index.
injection_profiles = (
    data_df.lt(0)
    .sum(axis = 'columns')
    .loc[lambda counts: counts > 0]
    .sort_values(ascending = True)
)

# Rows with the most exact-zero values first.
zero_profiles = data_df.eq(0).sum(axis = 'columns').sort_values(ascending = False).index

In [None]:
# Histogram (100 bins) of the per-row counts of negative values.
injection_profiles.hist(bins = 100)

In [None]:
# Choose the profile to inspect. Swap the active assignment for one of the
# commented alternatives to look at a different kind of profile.
# profile = high_consumption[0]
# profile = high_consumption[30]
# profile = nan_profiles_new[10]
# profile = injection_profiles.index[200]
# profile = zero_profiles[5]
# profile = ('smartmeter_1596',2016) # lots of zeros
profile = ('smartmeter_478', 2014) # used to be non-peaks detected as errors
print(profile)
# Last expression of the cell, so the notebook displays the returned chart.
plot_profile(profile)

In [None]:
# Show the info_df entry of the profile selected above.
info_df.loc[profile]

In [None]:
# Quartiles and interquartile range of the selected profile's original values,
# followed by the bounds q1 - 2*iqr and q3 + 2*iqr.
# NOTE(review): the multiplier here is 2, whereas the detection step above is
# called with iqr_multiplier = 3 -- confirm whether these should match.
data = data_df.loc[profile]
q1, q3 = data.quantile(0.25), data.quantile(0.75)
iqr = q3 - q1
print(f"{q1=}, {q3=}, {iqr=}")
print(f"{q1-2*iqr}, {q3+2*iqr}")