# First look at the data
DONE: 
- Preprocessed all files to a csv with info and data 
- make a combined file
- added year to the energy profiles 

Remaining problems: 
- Somewhere there are two rows containing only NaNs; these still need to be removed

In [None]:
import altair as alt
import numpy as np
import pandas as pd
from pathlib import Path
import datetime
import tqdm
import pyxlsb
alt.data_transformers.disable_max_rows()

In [None]:
# PATH to the profile directory in the fluvius data
# (machine-specific; the commented-out line is an alternative location, presumably on another machine)
# DATA_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/Data-2020-11/FluviusData/profiles')
DATA_PATH = Path('/Users/lolabotman/PycharmProjects/FluviusFullData/profiles') #Path Lola

# PATH to where the preprocessed files should be written
# PREPROCESSED_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/preprocessed_test/infrax')
PREPROCESSED_PATH = Path('/Users/lolabotman/PycharmProjects/FluviusFullData/profiles/preprocessed/infrax')#Path Lola
# Create the output directory (and missing parents) up front; an existing directory is not an error
PREPROCESSED_PATH.mkdir(mode = 0o770, parents = True, exist_ok=True)

# Parse code
This is simply all the code to parse every kind of dataset (not so clean I know)  

In [None]:
# Convert Excel serial date values to datetimes, leaving NaN entries untouched
def to_timestamp(index):
    """Return a list with every non-NaN value of ``index`` converted by ``pyxlsb.convert_date``.

    NaN values are passed through unchanged so the result has the same length
    and order as the input.
    """
    converted = []
    for value in index:
        if np.isnan(value):
            converted.append(value)
        else:
            converted.append(pyxlsb.convert_date(value))
    return converted

# Parse one raw file with the given parser and store the result as two csv files
def transform_and_save(source_path, name, parse_function):
    """Write ``<name>_info.csv`` and ``<name>_data.csv`` into PREPROCESSED_PATH.

    source_path    : path of the raw file handed to ``parse_function``
    name           : stem used for the two output files
    parse_function : callable returning ``(info_df, data_df)``

    Nothing is parsed when both output files already exist. Any exception
    raised while parsing or writing propagates to the caller.
    """
    info_path = PREPROCESSED_PATH / f"{name}_info.csv"
    data_path = PREPROCESSED_PATH / f"{name}_data.csv"

    if info_path.exists() and data_path.exists():
        print(f'{name} already preprocessed')
        return

    info_df, data_df = parse_function(source_path)
    info_df.to_csv(info_path)
    data_df.to_csv(data_path)
    # sanity check that both files really landed on disk
    assert info_path.exists()
    assert data_path.exists()

# In these files, the 24 first rows are meta data and the time series starts on row 26
# There are 30 EANs and hourly measurements for a year (2015 file and 2014 file)
# Hypothesis : measurement values are in kW
def read_infrax_gas(path):
    """Parse an Infrax gas workbook into an info table and a profile table.

    The sheet is read without header, its first column becomes the row labels
    and the table is transposed so that every row is one meter ('EAN_coded').

    Returns ``(info_df, data_df)``, both indexed by (EAN_coded, Jaar) and
    sorted on that index. Raises AssertionError when either index is not unique.
    """
    df = pd.read_excel(path, header = None, parse_dates = True)
    df = df.set_index(df.columns[0])
    smart_meter_df = df.T
    smart_meter_df = smart_meter_df.set_index('EAN_coded')

    # info_df (we get the meta data from the first 24 columns - table has been transposed)
    info_df = smart_meter_df.iloc[:,:24]
    info_df = info_df.loc[:, ~ info_df.columns.isna()]
    info_df = info_df.dropna(how='all', axis = 1) # drop columns with all NaN's
    info_df = info_df.set_index('Jaar', append=True)

    # data_df (we get the actual value data from after column 24)
    data_df = smart_meter_df.iloc[:,24:].copy()
    data_df.columns = pd.to_datetime(data_df.columns).round('1min')
    # 'Jaar' was moved into the info_df index just above, so it has to be read
    # from the index level (info_df.Jaar no longer exists as a column).
    # Both frames still have the same rows in the same order at this point.
    data_df['Jaar'] = info_df.index.get_level_values('Jaar')
    data_df = data_df.set_index(['Jaar'], append = True)

    assert info_df.index.is_unique, 'info_df index should be unique'
    assert data_df.index.is_unique, 'data_df index should be unique'

    # sort on index
    info_df = info_df.sort_index()
    data_df = data_df.sort_index()

    return info_df, data_df

# In these files, the 24 first rows are meta data and the time series starts on row 27
# there are 8 smart meter ids (EAN)
# yearly total is in kWh
# measurement values are in kW
def read_infrax_heatpump(path):
    """Parse the heat-pump smart meter workbook.

    Returns ``(info_df, data_df)``, both indexed by (EAN_coded, Jaar) and
    sorted on that index. Raises AssertionError when either index is not unique.
    """
    raw = pd.read_excel(path, header = None, parse_dates = True)
    # first column holds the row labels; after transposing every row is one meter
    per_meter = raw.set_index(raw.columns[0]).T.set_index('EAN_coded')

    # meta data: the first 25 columns, minus unnamed and completely empty ones
    meta = per_meter.iloc[:, :25]
    has_name = ~meta.columns.isna()
    info_df = (
        meta.loc[:, has_name]
        .dropna(how='all', axis=1)
        .set_index('Jaar', append=True)
    )

    # measurements: everything after the meta data columns
    readings = per_meter.iloc[:, 25:].copy()
    readings.columns = pd.to_datetime(readings.columns).round('1min')
    readings['Jaar'] = info_df.index.get_level_values('Jaar')
    data_df = readings.set_index(['Jaar'], append=True)

    assert info_df.index.is_unique, 'info_df index should be unique'
    assert data_df.index.is_unique, 'data_df index should be unique'

    return info_df.sort_index(), data_df.sort_index()

# Yearly total in kwh
# hyp : Measurement values in kw >> deduced from the fact that (sum of the measurement) = 4*(jaarverbuik in kwh) + max value given in kW
def read_infrax_app_xlsb(path):
    """Parse the second apartment workbook (.xlsb).

    The sheet has no usable meter id, so row labels are prefixed with 'app2_'.
    The year is hard-coded to 2014 and the timestamps are moved to 2016.

    Returns ``(info_df, data_df)``, both indexed by (prefixed id, Jaar) and
    sorted on that index. Raises AssertionError when either index is not unique.
    """
    def tag(meter):
        # prefix keeps these ids distinct from the ones of the other apartment file
        return 'app2_'+ str(meter)

    raw = pd.read_excel(path, header = None, engine='pyxlsb')
    raw = raw.set_index(raw.columns[0])
    per_meter = raw.T

    # info: first 5 columns, unnamed columns dropped
    has_name = ~per_meter.columns[:5].isna()
    info_df = per_meter.iloc[:, :5].loc[:, has_name].rename(index=tag)
    info_df['Jaar'] = [2014] * len(info_df)
    info_df = (
        info_df.set_index('Jaar', append=True)
        .dropna(how='all')  # some rows are completely empty
        .rename(columns={'Max (kW)': 'Piek P (kW)'})  # same column title as the other files
    )

    # data: everything after the info columns
    data_df = per_meter.iloc[:, 5:].copy()
    data_df.columns = to_timestamp(data_df.columns)
    data_df = data_df.dropna(how='all')  # some rows are completely empty
    data_df.columns = data_df.columns.round('1min')
    data_df = data_df.rename(index=tag)
    data_df['Jaar'] = [2014] * len(data_df)
    data_df = data_df.set_index(['Jaar'], append=True)
    # keep 2014 only (the file also holds one day of 2015) ...
    in_2014 = pd.to_datetime(data_df.columns).year == 2014
    data_df = data_df.loc[:, in_2014]
    # ... and relabel the timestamps to 2016 for the final merge
    data_df.columns = data_df.columns.map(lambda t: t.replace(year=2016))

    assert info_df.index.is_unique, 'info_df index should be unique'
    assert data_df.index.is_unique, 'data_df index should be unique'

    return info_df.sort_index(), data_df.sort_index()

# hyp : all measurement values in kW >> deduced from the fact that (sum of the measurement) = 4*(jaarverbuik in kwh)
# jaar verbruik in kWh
def read_infrax_app_xlsx(path):
    """Parse the first apartment workbook (.xlsx).

    Row labels are prefixed with 'app1_'. The year is hard-coded to 2014 and
    the timestamps are moved to 2016.

    Returns ``(info_df, data_df)``, both indexed by (prefixed id, Jaar) and
    sorted on that index. Raises AssertionError when either index is not unique.
    """
    def tag(meter):
        # prefix keeps these ids distinct from the ones of the other apartment file
        return 'app1_'+ str(meter)

    raw = pd.read_excel(path, header = None, parse_dates = True)
    per_meter = raw.set_index(raw.columns[0]).T

    # info: first 7 columns, unnamed columns dropped
    meta = per_meter.iloc[:, :7]
    info_df = meta.loc[:, ~meta.columns.isna()].rename(index=tag)
    info_df['Jaar'] = [2014] * len(info_df)
    info_df = (
        info_df.set_index('Jaar', append=True)
        .rename(columns={'Max (kW)': 'Piek P (kW)'})  # same column title as the other files
    )

    # data: everything after the info columns
    data_df = per_meter.iloc[:, 7:].copy().dropna(how='all')
    data_df.columns = pd.to_datetime(data_df.columns).round('1min')
    data_df = data_df.rename(index=tag)
    data_df['Jaar'] = [2014] * len(data_df)
    data_df = data_df.set_index(['Jaar'], append=True)
    # keep 2014 only (the file also holds one day of 2015) ...
    in_2014 = pd.to_datetime(data_df.columns).year == 2014
    data_df = data_df.loc[:, in_2014]
    # ... and relabel the timestamps to 2016 for the final merge
    data_df.columns = data_df.columns.map(lambda t: t.replace(year=2016))

    assert info_df.index.is_unique, 'info_df index should be unique'
    assert data_df.index.is_unique, 'data_df index should be unique'

    return info_df.sort_index(), data_df.sort_index()

# all other files ending in _coded.xlsb
# measurment values in kW
# yearly total in kWh
def read_infrax_data(path):
    """Parse a generic ``*_coded.xlsb`` workbook.

    Returns ``(info_df, data_df)``, both indexed by (EAN_coded, Jaar) and
    sorted on that index. Raises AssertionError when either index is not unique.
    """
    def relabel_repeated_year(frame):
        # For ids 1290 and 1299 every repeated (EAN_coded, Jaar) row after the
        # first one is moved to year 2012.
        flat = frame.reset_index()
        repeated = flat['EAN_coded'].isin([1290,1299]) & flat.duplicated(subset = ['EAN_coded', 'Jaar'], keep = 'first')
        flat.loc[repeated, 'Jaar'] = 2012
        return flat.set_index(['EAN_coded', 'Jaar'])

    raw = pd.read_excel(path, engine='pyxlsb')
    raw = raw.set_index(raw.columns[0])
    per_meter = raw.T.set_index('EAN_coded')

    # info: first 21 columns, minus unnamed ones and the two free-text columns
    has_name = ~per_meter.columns[:21].isna()
    info_df = (
        per_meter.iloc[:, :21]
        .loc[:, has_name]
        .drop(columns = ['Info installatie', 'Info profiel'])
    )
    info_df['PV vermogen (kW)'] = info_df['PV vermogen (kW)'].replace('/', np.nan)
    info_df = info_df[~ info_df.index.isna()].reset_index()  # drop the row without id
    info_df['EAN_coded'] = info_df['EAN_coded'].astype('int')
    info_df = info_df.set_index(['EAN_coded', 'Jaar'])

    # data: everything from column 23 onwards
    data_df = per_meter.iloc[:, 23:].copy()
    data_df.columns = to_timestamp(data_df.columns)
    data_df = data_df.loc[:, ~data_df.columns.isna()]  # drop the columns with NaT
    data_df.columns = data_df.columns.round('1min')
    data_df = data_df[~ data_df.index.isna()].reset_index()  # drop the row without id
    data_df['EAN_coded'] = data_df['EAN_coded'].astype('int')
    # years are taken positionally from info_df, which holds the same rows in the same order
    data_df['Jaar'] = info_df.index.get_level_values(1)
    data_df = data_df.set_index(['EAN_coded','Jaar'])

    # Handle the ids 1290 en 1299 that have year 2013 twice
    if (1290, 2013) in info_df.index:
        info_df = relabel_repeated_year(info_df)
        data_df = relabel_repeated_year(data_df)

    assert info_df.index.is_unique, 'info_df index should be unique'
    assert data_df.index.is_unique, 'data_df index should be unique'

    return info_df.sort_index(), data_df.sort_index()
    
  

# file information

In [None]:
# this is which parser function to use for which file
# Keys are file stems (file name without extension). The residential files are
# listed twice, once spelled with 'ë' and once with a plain 'e'.
parser_functions = {   
    'Appartement1': read_infrax_app_xlsx,
    'Appartement2': read_infrax_app_xlsb,
    # the two gas files are deliberately disabled, so the parse loop reports them as 'error:'
#     'SLP_profiel S41 2014 (30)_coded': read_infrax_gas,
#     'SLP_profiel S41 2015 (30)_coded': read_infrax_gas,
    'SLPs_professionelen(348)_coded': read_infrax_data,
    'SLPs_residentielen(1675)_coded': read_infrax_data,
    'SLPs_residentiëlen(1675)_coded': read_infrax_data,
    'Slimme meters met WP (en eventueel PV)_coded': read_infrax_heatpump,
    'Slimme meters_professionelen(141)_coded': read_infrax_data,
    'Slimme meters_prosumers(123)_coded': read_infrax_data,
    'Slimme meters_residentielen(1080)_coded': read_infrax_data, 
    'Slimme meters_residentiëlen(1080)_coded': read_infrax_data
}

# this is which preprocessed file name to use
# (stem of the '<name>_info.csv' / '<name>_data.csv' files; both spellings of a
# residential file map to the same output name)
new_filename = { 
    'Appartement1': 'app1',
    'Appartement2': 'app2',
    'SLP_profiel S41 2014 (30)_coded': 'SLP_gas_2014',
    'SLP_profiel S41 2015 (30)_coded': 'SLP_gas_2015',
    'SLPs_professionelen(348)_coded': 'SLP_prof',
    'SLPs_residentielen(1675)_coded': 'SLP_resid',
    'SLPs_residentiëlen(1675)_coded': 'SLP_resid',
    'Slimme meters met WP (en eventueel PV)_coded': 'M_heatpump',
    'Slimme meters_professionelen(141)_coded': 'M_prof',
    'Slimme meters_prosumers(123)_coded': 'M_prosumers',
    'Slimme meters_residentielen(1080)_coded': 'M_resid', 
    'Slimme meters_residentiëlen(1080)_coded': 'M_resid'
}

# Parse it all :D 

In [None]:
import unicodedata

# Walk over every Excel workbook below the Infrax directory and preprocess the
# ones that have a parser registered in `parser_functions`.
infrax_path = DATA_PATH/ "20171219 Profielen Infrax"
translate = dict()
for path in tqdm.tqdm(list(infrax_path.glob('**/*.xlsb'))+ list(infrax_path.glob('**/*.xlsx'))):
    print(path)
    # Some file systems hand back accented names in decomposed form ('e' + combining
    # diaeresis), which never equals a composed 'ë' in the dict keys. Try the
    # raw stem first and then both normal forms, so every file that matched
    # before still matches.
    candidates = (
        path.stem,
        unicodedata.normalize('NFC', path.stem),
        unicodedata.normalize('NFD', path.stem),
    )
    stem = next((candidate for candidate in candidates if candidate in parser_functions), None)
    if stem is not None:
        new_name = new_filename[stem]
        parser = parser_functions[stem]
        transform_and_save(path, new_name, parser)
    else:
        # no parser registered for this file (e.g. the disabled gas files)
        print('error:'+path.stem)
   
    



# Make combined dataframe of relevant profiles
So these profiles are all in the same format so we can easily combine these!  
I add some extra columns to the info dataframe to ensure that we can later recover the different groups if necessary.  
The gas profiles are excluded. The two apartment files (`app1`, `app2`) are included and can be recognised through the `R/P` column.

In [None]:
def combined_info_df():
    """Concatenate all preprocessed info tables and write ``combined_info.csv``.

    Marker columns are added so the origin of a row can be recovered later:
    'heatpump' for the heat-pump meters, 'prosumer' for the prosumers and
    'R/P' set to 'app1' / 'app2' for the two apartment files.
    Returns nothing; the result is only written to PREPROCESSED_PATH.
    """
    def load(filename):
        # every info csv was stored with a two-level index in its first two columns
        return pd.read_csv(PREPROCESSED_PATH/filename, index_col = [0,1])

    heatpump_info = load('M_heatpump_info.csv')
    heatpump_info['heatpump'] = True

    prosumer_info = load('M_prosumers_info.csv').dropna(how='all')
    prosumer_info['prosumer'] = True

    app1_info = load('app1_info.csv')
    app1_info['R/P'] = 'app1'

    app2_info = load('app2_info.csv')
    app2_info['R/P'] = 'app2'

    # the order of the parts fixes the column order of the concatenated table
    parts = [
        load('M_resid_info.csv'),
        load('SLP_resid_info.csv'),
        load('M_prof_info.csv'),
        load('SLP_prof_info.csv'),
        heatpump_info,
        prosumer_info,
        app1_info,
        app2_info,
    ]
    pd.concat(parts).sort_index().to_csv(PREPROCESSED_PATH/'combined_info.csv')

In [None]:
# Build combined_info.csv only when it is not on disk yet (delete the file to force a rebuild)
if not (PREPROCESSED_PATH/'combined_info.csv').exists(): 
    combined_info_df()

In [None]:
# Reload the combined info table for a visual check; no index_col is given, so
# the id and year show up as ordinary columns
comb = pd.read_csv(PREPROCESSED_PATH/'combined_info.csv')
comb

In [None]:
# Build the combined profile table in memory. With OVERWRITE = True this is
# done even when a combined_data.csv is already on disk.
OVERWRITE = True
if OVERWRITE or not (PREPROCESSED_PATH/'combined_data.csv').exists():
    files = ['M_resid_info.csv', 'SLP_resid_info.csv', 'M_prof_info.csv', 'SLP_prof_info.csv', 'M_prosumers_info.csv', 'M_heatpump_info.csv','app1_info.csv', 'app2_info.csv']
    # same stems as the info files: strip the trailing 'info.csv' and append 'data.csv'
    profile_files = [PREPROCESSED_PATH/ f'{file[:-8]}data.csv' for file in files]
    combined_data_df = (
        pd.concat([pd.read_csv(file, index_col = [0,1]) for file in profile_files])
        .dropna(how='all', axis = 0)  # drop profiles without a single measurement
        .sort_index()
        .reset_index()
    )
    # make the year an integer before it goes back into the index
    combined_data_df['Jaar'] = combined_data_df['Jaar'].astype('int')
    combined_data_df = combined_data_df.set_index(['EAN_coded','Jaar'])
    #combined_data_df.to_csv(PREPROCESSED_PATH/'combined_data.csv')

In [None]:
# Show all rows (one per year) of meter id 1001, the first index level
combined_data_df.loc[1001]

In [None]:
# Display the combined profile table
combined_data_df

In [None]:
# Distinct years present in the combined data, in ascending order
combined_data_df.reset_index().sort_values('Jaar')['Jaar'].unique()

### Check how the DST has been treated 

* 2010 : '2010-03-28 02:00:00' & '2010-10-31 02:00:00'
* 2011 : '2011-03-27 02:00:00' & '2011-10-30 02:00:00'
* 2012 : '2012-03-25 02:00:00' & '2012-10-28 02:00:00'
* 2013 : '2013-03-31 02:00:00' & '2013-10-27 02:00:00'
* 2014 : '2014-03-30 02:00:00' & '2014-10-26 02:00:00'
* 2015 : '2015-03-29 02:00:00' & '2015-10-25 02:00:00'
* 2016 : '2016-03-27 02:00:00' & '2016-10-30 02:00:00'

We know that the two apartment files have not been corrected for DST, because the artefacts are still visible (duplicate values in October and missing values in March). Let's remove these files before checking the rest.

In [None]:
# Load the info tables of both apartment files, using the first csv column as index
app1_file = pd.read_csv(PREPROCESSED_PATH/'app1_info.csv', index_col = 0)
app2_file = pd.read_csv(PREPROCESSED_PATH/'app2_info.csv', index_col = 0)

In [None]:
# ids of every apartment profile (first file followed by second file)
sm_id_apps = [*app1_file.index, *app2_file.index]

In [None]:
# Drop the apartment profiles, then the timestamp columns left without any value
without_apps = combined_data_df.drop(index=sm_id_apps)
new_combined_data_df = without_apps.dropna(axis=1, how='all')
new_combined_data_df

In [None]:
def parse_dates(data_df):
    """Turn the column labels of ``data_df`` into datetimes rounded to the minute.

    The frame is modified in place and also returned.
    """
    as_datetime = pd.to_datetime(data_df.columns)
    data_df.columns = as_datetime.round('1min')
    return data_df

In [None]:
#Make df per year >> DST changes on different times.

def _profiles_of_year(year):
    """Profiles recorded in ``year``, indexed by EAN_coded, with datetime columns."""
    in_year = new_combined_data_df.index.get_level_values('Jaar') == year
    return parse_dates(new_combined_data_df[in_year].droplevel('Jaar'))

sm_2010 = _profiles_of_year(2010)
sm_2011 = _profiles_of_year(2011)
sm_2012 = _profiles_of_year(2012)
sm_2013 = _profiles_of_year(2013)
sm_2014 = _profiles_of_year(2014)
sm_2015 = _profiles_of_year(2015)
sm_2016 = _profiles_of_year(2016)

In [None]:
# All profiles carry 2016 timestamps (the common year used for the merge), so
# the DST switch of every real year is expressed as a 2016 date: same month and
# day as in the real year, at 02:00.
mar_2010 = pd.Timestamp('2016-03-28 02:00:00')
oct_2010 = pd.Timestamp('2016-10-31 02:00:00')

mar_2011 = pd.Timestamp('2016-03-27 02:00:00')
oct_2011 = pd.Timestamp('2016-10-30 02:00:00')

mar_2012 = pd.Timestamp('2016-03-25 02:00:00')
oct_2012 = pd.Timestamp('2016-10-28 02:00:00')

mar_2013 = pd.Timestamp('2016-03-31 02:00:00')
oct_2013 = pd.Timestamp('2016-10-27 02:00:00')

mar_2014 = pd.Timestamp('2016-03-30 02:00:00')
oct_2014 = pd.Timestamp('2016-10-26 02:00:00')

mar_2015 = pd.Timestamp('2016-03-29 02:00:00')
oct_2015 = pd.Timestamp('2016-10-25 02:00:00')

mar_2016 = pd.Timestamp('2016-03-27 02:00:00')
oct_2016 = pd.Timestamp('2016-10-30 02:00:00')

ONGOING

In [None]:
def sample_selection(df,start_missing_hour):
    """
        Blank out one candidate hour and cut a three-hour window around it.

        INPUT
        df = dataframe with date as columns (in datetime format) and smart meter id as rows
        start_missing_hour = timestamp of the start of the potentially missing hour

        OUTPUT (tuple of three dataframes)
        df_nan = copy of df with the four quarter-hours of the candidate hour replaced with NaNs
        original_sample_df = columns of df from one hour before to one hour after the candidate hour
        nan_sample_df = the same columns taken from df_nan

        df itself is not modified.
    """
    one_hour = pd.Timedelta(hours=1)

    #potentially missing hour (last quarter-hour starts 45 minutes after the first)
    end_missing_hour = start_missing_hour + pd.Timedelta(minutes=45)

    #full df with the potentially missing hour replaced with NaNs
    #(.loc, because .at only accepts a single row/column label, not a slice)
    df_nan = df.copy()
    df_nan.loc[:, start_missing_hour:end_missing_hour] = np.nan

    #sample one hour before and after the potentially missing hour >> Original + with NaN
    start_sample = start_missing_hour - one_hour
    end_sample = end_missing_hour + one_hour
    original_sample_df = df.loc[:, start_sample:end_sample].copy()
    nan_sample_df = df_nan.loc[:, start_sample:end_sample].copy()

    return df_nan, original_sample_df, nan_sample_df

In [None]:
# Try the helper on the 2010 profiles around the March DST switch
df_nan, original_sample_df, nan_sample_df = sample_selection(sm_2010, mar_2010)

In [None]:
def hyp_1(original_df, start_missing_hour):
    """Check the hypothesis: missing hour = (h-1 + h+1)/2.

    INPUT
    original_df = dataframe with date as columns (in datetime format) and smart meter id as rows
    start_missing_hour = timestamp of the first quarter-hour of the candidate hour

    OUTPUT
    one-row dataframe (index '(h-1 + h+1)/2') with columns
    'is_correct' (True when every value is reproduced exactly) and
    '%error' (fraction of the 4 * n_meters values that are not reproduced).

    The hour before and the hour after are read by label, so a missing
    timestamp raises a KeyError. Values are compared with exact equality;
    a NaN never counts as a match.
    """
    label = '(h-1 + h+1)/2'
    one_hour = pd.Timedelta(hours=1)
    quarter_hours = pd.date_range(start=start_missing_hour, periods=4, freq='15min')

    before = original_df.loc[:, quarter_hours - one_hour].to_numpy(dtype=float)
    after = original_df.loc[:, quarter_hours + one_hour].to_numpy(dtype=float)
    observed = original_df.loc[:, quarter_hours].to_numpy(dtype=float)
    guessed = (before + after) / 2

    nb_nans_to_guess = 4 * len(original_df.index)
    # count the mismatches directly; taking the first count of np.unique would
    # return the number of matches whenever there is no mismatch at all
    false_guess = int((guessed != observed).sum())
    percentage_error = false_guess / nb_nans_to_guess

    result = pd.DataFrame(index=[label], columns=['is_correct', '%error'])
    result.at[label, 'is_correct'] = percentage_error == 0
    result.at[label, '%error'] = percentage_error
    return result


In [None]:
# Hypothesis 1 on the 2010 profiles, March DST switch
result1 = hyp_1(sm_2010, mar_2010)
result1

In [None]:
def hyp_2(original_df, start_missing_hour):
    """Check whether the candidate hour was filled with a pandas interpolation method.

    INPUT
    original_df = dataframe with date as columns (in datetime format) and smart meter id as rows
    start_missing_hour = timestamp of the first quarter-hour of the candidate hour

    OUTPUT
    dataframe with one row per interpolation method and columns
    'is_correct' (True when every value is reproduced exactly) and
    '%error' (fraction of the 4 * n_meters values that are not reproduced).

    Every meter is interpolated on the three-hour sample returned by
    sample_selection (hour before, blanked hour, hour after). Values are
    compared with exact equality; a NaN never counts as a match.
    """
    _, original_sample_df, nan_sample_df = sample_selection(original_df, start_missing_hour)

    method_list = ['linear', 'time', 'index', 'values', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'barycentric', 'polynomial', 'krogh', 'piecewise_polynomial', 'pchip', 'akima', 'spline', 'from_derivatives']
    # 0 means "no order argument"; the other methods are called with this order
    method_order = [0,0,0,0,0,0,0,0,0,0,2,0,2,0,0,2,0]
    dict_method_order = dict(zip(method_list,method_order))

    result = pd.DataFrame(index= method_list,columns = ['is_correct', '%error'])

    # positions 4-7 of the 12-column sample are the blanked quarter-hours
    observed = original_sample_df.iloc[:, 4:8].to_numpy(dtype=float)
    nb_nans_to_guess = 4*len(original_sample_df.index)

    for chosen_meth in method_list:
        chosen_order = dict_method_order[chosen_meth]
        extra_args = {'order': chosen_order} if chosen_order != 0 else {}

        # interpolate every meter separately along its own time axis
        interp_sample = nan_sample_df.apply(
            lambda sm_serie: sm_serie.interpolate(method=chosen_meth, axis=0, **extra_args),
            axis=1,
        )
        guessed = interp_sample.iloc[:, 4:8].to_numpy(dtype=float)

        # count the mismatches directly; taking the first count of np.unique would
        # return the number of matches whenever there is no mismatch at all
        false_guess = int((guessed != observed).sum())
        percentage_error = false_guess/nb_nans_to_guess

        result.at[chosen_meth,'is_correct'] = percentage_error == 0
        result.at[chosen_meth,'%error'] = percentage_error

    return result
    

In [None]:
# Hypothesis 2 (interpolation methods) on the 2010 profiles, March DST switch
output2 = hyp_2(sm_2010, mar_2010)
output2

In [None]:
def hyp_3(original_df, start_missing_hour):
    """Check the hypothesis: missing hour = (h-1), a copy of the previous hour.

    INPUT
    original_df = dataframe with date as columns (in datetime format) and smart meter id as rows
    start_missing_hour = timestamp of the first quarter-hour of the candidate hour

    OUTPUT
    one-row dataframe (index '(h-1)') with columns
    'is_correct' (True when every value is reproduced exactly) and
    '%error' (fraction of the 4 * n_meters values that are not reproduced).

    The previous hour is read by label, so a missing timestamp raises a
    KeyError. Values are compared with exact equality; a NaN never counts as
    a match.
    """
    label = '(h-1)'
    quarter_hours = pd.date_range(start=start_missing_hour, periods=4, freq='15min')

    guessed = original_df.loc[:, quarter_hours - pd.Timedelta(hours=1)].to_numpy(dtype=float)
    observed = original_df.loc[:, quarter_hours].to_numpy(dtype=float)

    nb_nans_to_guess = 4 * len(original_df.index)
    # count the mismatches directly; taking the first count of np.unique would
    # return the number of matches whenever there is no mismatch at all
    false_guess = int((guessed != observed).sum())
    percentage_error = false_guess / nb_nans_to_guess

    result = pd.DataFrame(index=[label], columns=['is_correct', '%error'])
    result.at[label, 'is_correct'] = percentage_error == 0
    result.at[label, '%error'] = percentage_error
    return result

In [None]:
# Hypothesis 3 (previous hour) on the 2010 profiles, March DST switch
output3 = hyp_3(sm_2010, mar_2010)
output3

In [None]:
def hyp_4(original_df, start_missing_hour):
    """Check the hypothesis: missing hour = (h+1), a copy of the next hour.

    INPUT
    original_df = dataframe with date as columns (in datetime format) and smart meter id as rows
    start_missing_hour = timestamp of the first quarter-hour of the candidate hour

    OUTPUT
    one-row dataframe (index '(h+1)') with columns
    'is_correct' (True when every value is reproduced exactly) and
    '%error' (fraction of the 4 * n_meters values that are not reproduced).

    The next hour is read by label, so a missing timestamp raises a KeyError.
    Values are compared with exact equality; a NaN never counts as a match.
    """
    label = '(h+1)'
    quarter_hours = pd.date_range(start=start_missing_hour, periods=4, freq='15min')

    guessed = original_df.loc[:, quarter_hours + pd.Timedelta(hours=1)].to_numpy(dtype=float)
    observed = original_df.loc[:, quarter_hours].to_numpy(dtype=float)

    nb_nans_to_guess = 4 * len(original_df.index)
    # count the mismatches directly; taking the first count of np.unique would
    # return the number of matches whenever there is no mismatch at all
    false_guess = int((guessed != observed).sum())
    percentage_error = false_guess / nb_nans_to_guess

    result = pd.DataFrame(index=[label], columns=['is_correct', '%error'])
    result.at[label, 'is_correct'] = percentage_error == 0
    result.at[label, '%error'] = percentage_error
    return result

In [None]:
# Hypothesis 4 (next hour) on the 2010 profiles, March DST switch
output4 = hyp_4(sm_2010, mar_2010)
output4

In [None]:
def hyp_5(original_df, start_missing_hour):
    """Check the hypothesis: missing hour = same hour of the previous week.

    INPUT
    original_df = dataframe with date as columns (in datetime format) and smart meter id as rows
    start_missing_hour = timestamp of the first quarter-hour of the candidate hour

    OUTPUT
    one-row dataframe (index '(h-7d)') with columns
    'is_correct' (True when every value is reproduced exactly) and
    '%error' (fraction of the 4 * n_meters values that are not reproduced).

    Only the four quarter-hours of the candidate hour are compared. The
    reference hour is read by label, so a missing timestamp raises a KeyError.
    Values are compared with exact equality; a NaN never counts as a match.
    """
    label = '(h-7d)'
    quarter_hours = pd.date_range(start=start_missing_hour, periods=4, freq='15min')

    guessed = original_df.loc[:, quarter_hours - pd.Timedelta(days=7)].to_numpy(dtype=float)
    observed = original_df.loc[:, quarter_hours].to_numpy(dtype=float)

    nb_nans_to_guess = 4 * len(original_df.index)
    # count the mismatches directly; taking the first count of np.unique would
    # return the number of matches whenever there is no mismatch at all
    false_guess = int((guessed != observed).sum())
    percentage_error = false_guess / nb_nans_to_guess

    result = pd.DataFrame(index=[label], columns=['is_correct', '%error'])
    result.at[label, 'is_correct'] = percentage_error == 0
    result.at[label, '%error'] = percentage_error
    return result

In [None]:
# Hypothesis 5 (same hour one week earlier) on the 2010 profiles, March DST switch
output5 = hyp_5(sm_2010, mar_2010)
output5

In [None]:
def hyp_6(original_df, start_missing_hour):
    """Check the hypothesis: missing hour = same hour of the next week.

    INPUT
    original_df = dataframe with date as columns (in datetime format) and smart meter id as rows
    start_missing_hour = timestamp of the first quarter-hour of the candidate hour

    OUTPUT
    one-row dataframe (index '(h+7d)') with columns
    'is_correct' (True when every value is reproduced exactly) and
    '%error' (fraction of the 4 * n_meters values that are not reproduced).

    Only the four quarter-hours of the candidate hour are compared. The
    reference hour is read by label, so a missing timestamp raises a KeyError.
    Values are compared with exact equality; a NaN never counts as a match.
    """
    label = '(h+7d)'
    quarter_hours = pd.date_range(start=start_missing_hour, periods=4, freq='15min')

    guessed = original_df.loc[:, quarter_hours + pd.Timedelta(days=7)].to_numpy(dtype=float)
    observed = original_df.loc[:, quarter_hours].to_numpy(dtype=float)

    nb_nans_to_guess = 4 * len(original_df.index)
    # count the mismatches directly; taking the first count of np.unique would
    # return the number of matches whenever there is no mismatch at all
    false_guess = int((guessed != observed).sum())
    percentage_error = false_guess / nb_nans_to_guess

    result = pd.DataFrame(index=[label], columns=['is_correct', '%error'])
    result.at[label, 'is_correct'] = percentage_error == 0
    result.at[label, '%error'] = percentage_error
    return result

In [None]:
# Hypothesis 6 (same hour one week later) on the 2010 profiles, March DST switch
output6 = hyp_6(sm_2010, mar_2010)
output6

In [None]:
def hyp_7(original_df, start_missing_hour):
    """Test the hypothesis: missing hour = same hour previous day (h-1d).

    The four quarter-hour columns starting at ``start_missing_hour`` are
    refilled with the values found exactly one day earlier, and the refilled
    cells are compared with exact equality against ``original_df``.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Dates as columns (in datetime format) and smart meter ids as rows.
    start_missing_hour : datetime-like
        First quarter-hour of the potentially missing hour.

    Returns
    -------
    pandas.DataFrame
        One row labelled ``'(h-1d)'`` with the columns ``'is_correct'``
        (True when no compared cell differs) and ``'%error'`` (number of
        differing cells divided by ``4 * len(original_sample_df.index)``,
        i.e. a fraction, not a value scaled to 100).
    """
    label = '(h-1d)'
    shift = pd.Timedelta(days=1)
    end_missing_hour = start_missing_hour + pd.Timedelta(minutes=45)

    # Per the original notes, df_nan is original_df with the columns of the
    # potentially missing hour replaced by NaNs. The third return value
    # (nan_sample_df) is not needed for this hypothesis.
    df_nan, original_sample_df, _ = sample_selection(original_df, start_missing_hour)

    # Fill whole columns at once instead of rebuilding the table row by row
    # with DataFrame.append (removed in pandas 2.0 and quadratic in the number
    # of meters).
    interp_table = df_nan.copy()
    for date in pd.date_range(start=start_missing_hour, periods=4, freq='15min'):
        interp_table[date] = df_nan[date - shift].to_numpy()

    bool_output = (
        interp_table.loc[:, start_missing_hour:end_missing_hour]
        == original_df.loc[:, start_missing_hour:end_missing_hour]
    )

    # Count the False cells explicitly. Taking counts[0] from np.unique is
    # wrong when every guess is right: the only unique value is then True and
    # counts[0] would be the number of *correct* guesses.
    false_guess = int(bool_output.size - np.count_nonzero(bool_output.to_numpy()))

    # NOTE(review): the denominator relies on original_sample_df having as many
    # rows as df_nan -- confirm against sample_selection.
    nb_nans_to_guess = 4 * len(original_sample_df.index)
    percentage_error = false_guess / nb_nans_to_guess

    result = pd.DataFrame(index=[label], columns=['is_correct', '%error'])
    result.at[label, 'is_correct'] = percentage_error == 0
    result.at[label, '%error'] = percentage_error
    return result

In [None]:
# Evaluate hypothesis 7 (same hour, one day back) on the 2010 data and display the result.
# hyp_7 returns a single DataFrame; unpacking it into two names would iterate
# over its column labels and bind output7 to the string 'is_correct'.
output7 = hyp_7(sm_2010, mar_2010)
output7

In [None]:
def hyp_8(original_df, start_missing_hour):
    """Test the hypothesis: missing hour = same hour next day (h+1d).

    The four quarter-hour columns starting at ``start_missing_hour`` are
    refilled with the values found exactly one day later, and the refilled
    cells are compared with exact equality against ``original_df``.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Dates as columns (in datetime format) and smart meter ids as rows.
    start_missing_hour : datetime-like
        First quarter-hour of the potentially missing hour.

    Returns
    -------
    pandas.DataFrame
        One row labelled ``'(h+1d)'`` with the columns ``'is_correct'``
        (True when no compared cell differs) and ``'%error'`` (number of
        differing cells divided by ``4 * len(original_sample_df.index)``,
        i.e. a fraction, not a value scaled to 100).
    """
    label = '(h+1d)'
    shift = pd.Timedelta(days=1)
    end_missing_hour = start_missing_hour + pd.Timedelta(minutes=45)

    # Per the original notes, df_nan is original_df with the columns of the
    # potentially missing hour replaced by NaNs. The third return value
    # (nan_sample_df) is not needed for this hypothesis.
    df_nan, original_sample_df, _ = sample_selection(original_df, start_missing_hour)

    # Fill whole columns at once instead of rebuilding the table row by row
    # with DataFrame.append (removed in pandas 2.0 and quadratic in the number
    # of meters).
    interp_table = df_nan.copy()
    for date in pd.date_range(start=start_missing_hour, periods=4, freq='15min'):
        interp_table[date] = df_nan[date + shift].to_numpy()

    bool_output = (
        interp_table.loc[:, start_missing_hour:end_missing_hour]
        == original_df.loc[:, start_missing_hour:end_missing_hour]
    )

    # Count the False cells explicitly. Taking counts[0] from np.unique is
    # wrong when every guess is right: the only unique value is then True and
    # counts[0] would be the number of *correct* guesses.
    false_guess = int(bool_output.size - np.count_nonzero(bool_output.to_numpy()))

    # NOTE(review): the denominator relies on original_sample_df having as many
    # rows as df_nan -- confirm against sample_selection.
    nb_nans_to_guess = 4 * len(original_sample_df.index)
    percentage_error = false_guess / nb_nans_to_guess

    result = pd.DataFrame(index=[label], columns=['is_correct', '%error'])
    result.at[label, 'is_correct'] = percentage_error == 0
    result.at[label, '%error'] = percentage_error
    return result

In [None]:
# Evaluate hypothesis 8 (same hour, one day ahead) on the 2010 data and display the result.
output8 = hyp_8(original_df=sm_2010, start_missing_hour=mar_2010)
output8

In [None]:
def hyp_9(original_df, start_missing_hour):
    """Test the hypothesis: missing hour = average of same hour next week and last week.

    The four quarter-hour columns starting at ``start_missing_hour`` are
    refilled with the mean of the values found seven days later and seven
    days earlier, and the refilled cells are compared with exact equality
    against ``original_df``.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Dates as columns (in datetime format) and smart meter ids as rows.
    start_missing_hour : datetime-like
        First quarter-hour of the potentially missing hour.

    Returns
    -------
    pandas.DataFrame
        One row labelled ``'((h+7d)+(h-7d))/2'`` with the columns
        ``'is_correct'`` (True when no compared cell differs) and ``'%error'``
        (number of differing cells divided by
        ``4 * len(original_sample_df.index)``, i.e. a fraction, not a value
        scaled to 100).
    """
    label = '((h+7d)+(h-7d))/2'
    shift = pd.Timedelta(days=7)
    end_missing_hour = start_missing_hour + pd.Timedelta(minutes=45)

    # Per the original notes, df_nan is original_df with the columns of the
    # potentially missing hour replaced by NaNs. The third return value
    # (nan_sample_df) is not needed for this hypothesis.
    df_nan, original_sample_df, _ = sample_selection(original_df, start_missing_hour)

    # Fill whole columns at once instead of rebuilding the table row by row
    # with DataFrame.append (removed in pandas 2.0 and quadratic in the number
    # of meters).
    interp_table = df_nan.copy()
    for date in pd.date_range(start=start_missing_hour, periods=4, freq='15min'):
        week_ahead = df_nan[date + shift].to_numpy()
        week_back = df_nan[date - shift].to_numpy()
        interp_table[date] = (week_ahead + week_back) / 2

    # NOTE(review): the averaged values are compared with exact equality, so a
    # rounding difference in the stored data counts as a wrong guess.
    bool_output = (
        interp_table.loc[:, start_missing_hour:end_missing_hour]
        == original_df.loc[:, start_missing_hour:end_missing_hour]
    )

    # Count the False cells explicitly. Taking counts[0] from np.unique is
    # wrong when every guess is right: the only unique value is then True and
    # counts[0] would be the number of *correct* guesses.
    false_guess = int(bool_output.size - np.count_nonzero(bool_output.to_numpy()))

    # NOTE(review): the denominator relies on original_sample_df having as many
    # rows as df_nan -- confirm against sample_selection.
    nb_nans_to_guess = 4 * len(original_sample_df.index)
    percentage_error = false_guess / nb_nans_to_guess

    result = pd.DataFrame(index=[label], columns=['is_correct', '%error'])
    result.at[label, 'is_correct'] = percentage_error == 0
    result.at[label, '%error'] = percentage_error
    return result

In [None]:
# Evaluate hypothesis 9 (average of one week ahead and one week back) on the 2010 data and display the result.
output9 = hyp_9(original_df=sm_2010, start_missing_hour=mar_2010)
output9

In [None]:
def hyp_10(original_df, start_missing_hour):
    """Test the hypothesis: missing hour = average of same hour next day and previous day.

    The four quarter-hour columns starting at ``start_missing_hour`` are
    refilled with the mean of the values found one day later and one day
    earlier, and the refilled cells are compared with exact equality against
    ``original_df``.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Dates as columns (in datetime format) and smart meter ids as rows.
    start_missing_hour : datetime-like
        First quarter-hour of the potentially missing hour.

    Returns
    -------
    pandas.DataFrame
        One row labelled ``'((h+1d)+(h-1d))/2'`` with the columns
        ``'is_correct'`` (True when no compared cell differs) and ``'%error'``
        (number of differing cells divided by
        ``4 * len(original_sample_df.index)``, i.e. a fraction, not a value
        scaled to 100).
    """
    label = '((h+1d)+(h-1d))/2'
    shift = pd.Timedelta(days=1)
    end_missing_hour = start_missing_hour + pd.Timedelta(minutes=45)

    # Per the original notes, df_nan is original_df with the columns of the
    # potentially missing hour replaced by NaNs. The third return value
    # (nan_sample_df) is not needed for this hypothesis.
    df_nan, original_sample_df, _ = sample_selection(original_df, start_missing_hour)

    # Fill whole columns at once instead of rebuilding the table row by row
    # with DataFrame.append (removed in pandas 2.0 and quadratic in the number
    # of meters).
    interp_table = df_nan.copy()
    for date in pd.date_range(start=start_missing_hour, periods=4, freq='15min'):
        day_ahead = df_nan[date + shift].to_numpy()
        day_back = df_nan[date - shift].to_numpy()
        interp_table[date] = (day_ahead + day_back) / 2

    # NOTE(review): the averaged values are compared with exact equality, so a
    # rounding difference in the stored data counts as a wrong guess.
    bool_output = (
        interp_table.loc[:, start_missing_hour:end_missing_hour]
        == original_df.loc[:, start_missing_hour:end_missing_hour]
    )

    # Count the False cells explicitly. Taking counts[0] from np.unique is
    # wrong when every guess is right: the only unique value is then True and
    # counts[0] would be the number of *correct* guesses.
    false_guess = int(bool_output.size - np.count_nonzero(bool_output.to_numpy()))

    # NOTE(review): the denominator relies on original_sample_df having as many
    # rows as df_nan -- confirm against sample_selection.
    nb_nans_to_guess = 4 * len(original_sample_df.index)
    percentage_error = false_guess / nb_nans_to_guess

    result = pd.DataFrame(index=[label], columns=['is_correct', '%error'])
    result.at[label, 'is_correct'] = percentage_error == 0
    result.at[label, '%error'] = percentage_error
    return result

In [None]:
# Evaluate hypothesis 10 (average of one day ahead and one day back) on the 2010 data and display the result.
output10 = hyp_10(original_df=sm_2010, start_missing_hour=mar_2010)
output10

In [None]:
def all_hyp(original_df, start_missing_hour):
    """Run hypotheses 1 to 10 on the same inputs and stack their result rows.

    Each ``hyp_N`` function is called with ``original_df`` and
    ``start_missing_hour``, in order from ``hyp_1`` to ``hyp_10``, and the
    returned frames are concatenated with ``pd.concat``.
    """
    hypotheses = (
        hyp_1, hyp_2, hyp_3, hyp_4, hyp_5,
        hyp_6, hyp_7, hyp_8, hyp_9, hyp_10,
    )
    per_hypothesis = [
        hypothesis(original_df, start_missing_hour) for hypothesis in hypotheses
    ]
    return pd.concat(per_hypothesis)


In [None]:
# Run every hypothesis on the 2010 data and display the combined table.
result_2010 = all_hyp(original_df=sm_2010, start_missing_hour=mar_2010)
result_2010

In [None]:
# Run every hypothesis on the 2011 data and display the combined table.
result_2011 = all_hyp(original_df=sm_2011, start_missing_hour=mar_2011)
result_2011

In [None]:
# Run every hypothesis on the 2012 data and display the combined table.
result_2012 = all_hyp(original_df=sm_2012, start_missing_hour=mar_2012)
result_2012

In [None]:
# Run every hypothesis on the 2013 data and display the combined table.
result_2013 = all_hyp(original_df=sm_2013, start_missing_hour=mar_2013)
result_2013

In [None]:
# Run every hypothesis on the 2014 data and display the combined table.
result_2014 = all_hyp(original_df=sm_2014, start_missing_hour=mar_2014)
result_2014

In [None]:
# Run every hypothesis on the 2015 data and display the combined table.
result_2015 = all_hyp(original_df=sm_2015, start_missing_hour=mar_2015)
result_2015

In [None]:
# Run every hypothesis on the 2016 data and display the combined table.
result_2016 = all_hyp(original_df=sm_2016, start_missing_hour=mar_2016)
result_2016