# Combined database of profiles
The goal is to combine all profile datasets into a single database that is easy to load and use.


# Imports 

In [None]:
import altair as alt
import numpy as np
import pandas as pd
from pathlib import Path
import datetime
import tqdm

# Configuration

In [None]:
# Root directory of the preprocessed datasets (Jonas' setup).
PREPROCESSED_DIR = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/new_preprocessed/')
# Lola's setup: use this line instead of the one above.
# PREPROCESSED_DIR = Path('/Users/lolabotman/PycharmProjects/FluviusFullData/profiles/preprocessed')

# One sub-directory per input source, plus the output directory of this notebook.
infrax_path, eandis_vreg_path, eandis_amr_path, result_path = (
    PREPROCESSED_DIR / folder
    for folder in ('infrax', 'eandis2017', 'eandis_AMR', 'combined')
)
# Create the output directory (and missing parents) when it does not exist yet.
result_path.mkdir(mode=0o770, parents=True, exist_ok=True)

In [None]:

def check_float_field(series, nan_allowed = True):
    """Convert a column to floats, repairing the known formatting problems.

    Missing values are dropped first, so the returned Series only contains the
    rows that were not NaN in ``series``. If a direct conversion fails, the
    values are treated as strings: a ',' is replaced by '.', and the
    placeholder '/' is replaced by NaN before converting again.

    Parameters
    ----------
    series : pandas.Series
        The column to convert.
    nan_allowed : bool, default True
        When False, an ``AssertionError`` is raised if ``series`` contains NaN.

    Returns
    -------
    pandas.Series
        Float-typed Series indexed like ``series`` without its NaN rows.

    Raises
    ------
    AssertionError
        If ``nan_allowed`` is False and ``series`` contains at least one NaN.
    ValueError
        If values remain that cannot be converted after the repairs.
    """
    if not nan_allowed and series.isna().any():
        # Raised explicitly (not via `assert`) so the check also runs under `python -O`.
        raise AssertionError('there is at least one NaN value!')
    series = series.dropna()
    try:
        return series.astype('float')
    except (ValueError, TypeError):
        print('converting to float failed!')

    # check for ',' instead of '.'
    has_komma = series.str.contains(',', regex = False, na = False)
    if has_komma.any():
        print(f"Found komma ',' instead of '.' replacing... ({has_komma.sum()} times)")
        # Build a new Series instead of writing into the existing one.
        series = series.where(~has_komma, series.str.replace(',', '.', regex = False))

    has_placeholders = series == '/'
    if has_placeholders.any():
        print(f"Found placeholder '/' replacing with NaN... ({has_placeholders.sum()} times)")
        # `np.nan`: the `np.NAN` alias no longer exists in NumPy 2.0.
        series = series.mask(has_placeholders, np.nan)
    converted = series.astype('float')
    print("SUCCES")
    return converted
    

## Read everything

In [None]:
# Infrax metadata: the first two CSV columns form the index, meter IDs are kept as strings.
infrax_info_df = pd.read_csv(
    infrax_path / 'clean_info.csv',
    dtype={'meterID': 'str'},
    index_col=[0, 1],
)

In [None]:
# Infrax measurements: the first two CSV columns form the index, meter IDs are kept as strings.
infrax_data_df = pd.read_csv(
    infrax_path / 'clean_data.csv',
    dtype={'meterID': 'str'},
    index_col=[0, 1],
)

In [None]:
# Eandis VREG metadata and measurements (the "no night" files), indexed by the first two CSV columns.
eandis_info_df = pd.read_csv(
    eandis_vreg_path / 'clean_info_no_night.csv',
    index_col=[0, 1],
)
eandis_data_df = pd.read_csv(
    eandis_vreg_path / 'clean_data_no_night.csv',
    index_col=[0, 1],
)


In [None]:
# Eandis AMR metadata and measurements, indexed by the first two CSV columns.
amr_info_df = pd.read_csv(
    eandis_amr_path / 'clean_info.csv',
    index_col=[0, 1],
)
amr_data_df = pd.read_csv(
    eandis_amr_path / 'clean_data.csv',
    index_col=[0, 1],
)

# Make info df

## Combine info dfs

In [None]:
# Cast the AMR meter IDs to strings and rebuild the (meterID, year) index on both tables.
amr_info_df, amr_data_df = (
    frame.reset_index().astype({'meterID': 'str'}).set_index(['meterID', 'year'])
    for frame in (amr_info_df, amr_data_df)
)


In [None]:
infrax_info_df.head();

In [None]:
eandis_info_df.head();

In [None]:
amr_info_df.head();

In [None]:
# Stack the metadata of the three sources row-wise into one table.
combined_info_df = pd.concat(
    (infrax_info_df, eandis_info_df, amr_info_df),
    axis=0,
)
combined_info_df

# Check the attributes

In [None]:
combined_info_df.town.value_counts()

In [None]:
combined_info_df['#family_members'].hist()

In [None]:
combined_info_df.consumer_type.value_counts().to_frame('count')

In [None]:
combined_info_df.PV.value_counts(dropna=False).to_frame('count')

In [None]:
combined_info_df.SLP_cat.value_counts(dropna=False).to_frame('count')

In [None]:
combined_info_df.heatpump.value_counts(dropna=False).to_frame('count')

## Connection power and PV power need some processing
Some values are the placeholder '/', and sometimes a comma is used as the decimal separator instead of a point.

In [None]:
# Convert both power columns to floats (repairs ',' decimals and '/' placeholders).
combined_info_df['connection_power'] = check_float_field(combined_info_df['connection_power'])
combined_info_df['PV_power'] = check_float_field(combined_info_df['PV_power'])
# A connection power of 0 is replaced by NaN.
# `np.nan` is used because the `np.NAN` alias no longer exists in NumPy 2.0.
combined_info_df['connection_power'] = combined_info_df['connection_power'].replace({0: np.nan})
combined_info_df[['connection_power', 'PV_power']].hist()

In [None]:
# Persist the combined metadata, both as pickle and as CSV.
combined_info_df.to_pickle(result_path / 'info.pkl')
combined_info_df.to_csv(result_path / 'info.csv')

# Make data df

## Combine data dfs
Leap years are a problem when combining years: we use the columns of a leap year (2016), so profiles from a non-leap year simply have NaN values on February 29th.

In [None]:
infrax_data_df.head()

In [None]:
eandis_data_df.head()

In [None]:
amr_data_df.head()

In [None]:
def vec_dt_replace(series, year=None, month=None, day=None):
    """Vectorised counterpart of ``datetime.replace`` for year, month and day.

    Rebuilds every timestamp from its year/month/day/hour/minute components,
    substituting the given ``year``, ``month`` and/or ``day``. Seconds and
    smaller units are not carried over.

    Parameters
    ----------
    series : pandas.DatetimeIndex or pandas.Series of datetimes
        The timestamps to transform.
    year, month, day : int, optional
        Replacement value for the component; ``None`` keeps the original.

    Returns
    -------
    pandas.Series
        The rebuilt timestamps (indexed like ``series`` when it is a Series).
    """
    # A Series exposes its datetime components through `.dt`, an index exposes them directly.
    parts = series.dt if isinstance(series, pd.Series) else series
    return pd.to_datetime({
        'year': parts.year if year is None else year,
        'month': parts.month if month is None else month,
        'day': parts.day if day is None else day,
        'hour': parts.hour,
        'minute': parts.minute,
    })

In [None]:
# Column timestamps of the Eandis VREG data (t2016) and of the AMR data (t2017).
t2016, t2017 = (
    pd.to_datetime(frame.columns)
    for frame in (eandis_data_df, amr_data_df)
)
# The AMR timestamps with their year replaced by 2016.
t2017_2016 = vec_dt_replace(t2017, year=2016)
t2017_2016

In [None]:
# Relabel the AMR columns with their 2016 timestamps, then align them to the
# Eandis column grid; timestamps absent from the AMR data become NaN columns.
amr_data_16_df = amr_data_df.copy()
amr_data_16_df.columns = t2017_2016
amr_data_16_df = amr_data_16_df.reindex(columns=t2016)
# Show the February columns.
amr_data_16_df.loc[:, amr_data_16_df.columns.month == 2]

In [None]:
# Infrax contains some non-round timestamps, so round them to the minute to correct this issue!
infrax_data_df.columns = pd.to_datetime(infrax_data_df.columns, exact = False).round('min')
eandis_data_df.columns = pd.to_datetime(eandis_data_df.columns)

# Rounding can make several columns share one timestamp: take the mean of those duplicates.
# `groupby(axis=1)` is deprecated since pandas 2.1, so group the transposed
# frame on its index and transpose back.
infrax_data_df = infrax_data_df.T.groupby(level = 0).mean().T


In [None]:
# Stack the measurements of the three sources row-wise and store them as CSV.
combined_data_df = pd.concat(
    (infrax_data_df, eandis_data_df, amr_data_16_df),
    axis=0,
)
combined_data_df.to_csv(result_path / 'data.csv')
combined_data_df

### Check that the indexes of data and info are the same

In [None]:
# Index entries that occur in only one of the two tables; both sets must be empty.
info_no_data = combined_info_df.index.difference(combined_data_df.index)
data_no_info = combined_data_df.index.difference(combined_info_df.index)
assert info_no_data.empty, info_no_data
assert data_no_info.empty

In [None]:
combined_info_df

# New indexing (more uniform and remove EANS)

In [None]:
# combined_info_df = pd.read_csv(result_path/'info.csv', dtype={'meterID':'str'})
# combined_info_df = combined_info_df.set_index(['meterID','year'])
# combined_data_df = pd.read_csv(result_path/'data.csv', dtype={'meterID':'str'})
# combined_data_df = combined_data_df.set_index(['meterID','year'])

In [None]:
# Double-check for mismatches between the indexes of the info and the data table.
info_no_data = combined_info_df.index.difference(combined_data_df.index)
data_no_info = combined_data_df.index.difference(combined_info_df.index)
print(len(info_no_data), len(data_no_info), sep='\n')

### Change index of the data file

In [None]:
# Original meter IDs (first index level) as strings, for the data and the info table.
original_indexes = [str(item) for item in combined_data_df.index.levels[0]]
original_indexes_info = [str(item) for item in combined_info_df.index.levels[0]]
# One anonymous ID per original data meter ID: smartmeter_0, smartmeter_1, ...
new_indexes = [f'smartmeter_{number}' for number in range(len(original_indexes))]
print("Original indexes :")
print(original_indexes[:5])
print('New indexes :')
print(new_indexes[:5])

In [None]:
# Lookup tables between the original meter IDs and the new anonymous IDs, in both directions.
meterID_old_to_new = {old: new for old, new in zip(original_indexes, new_indexes)}
meterID_new_to_old = {new: old for new, old in zip(new_indexes, original_indexes)}

In [None]:
def map_level(df, dct, level=0):
    """Rename the labels of one level of ``df``'s MultiIndex, modifying ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a MultiIndex on its rows; its index is replaced.
    dct : dict
        Maps old labels to new labels; labels without an entry are kept.
    level : int, default 0
        Position of the index level to rename.

    Returns
    -------
    None
    """
    index = df.index
    renamed = [dct.get(item, item) for item in index.levels[level]]
    # `set_levels(..., inplace=True)` no longer exists in pandas 2.0:
    # build the new index and assign it back to the frame instead.
    df.index = index.set_levels(renamed, level=level)

In [None]:
OVERWRITE = True
# Always build the re-indexed table: later cells use it even when the file is not rewritten.
new_index_combined_data_df = combined_data_df.copy()
map_level(new_index_combined_data_df, meterID_old_to_new, level=0)
if OVERWRITE or not (result_path/'reindexed_data.csv').exists():
    # Write the re-indexed copy; `combined_data_df` still carries the original meter IDs.
    new_index_combined_data_df.to_csv(result_path/'reindexed_data.csv')
else:
    print("It has already been previously saved in your folder")

In [None]:
new_index_combined_data_df

In [None]:
new_index_combined_data_df.index.levels[0]

### INFO FILE 

In [None]:
OVERWRITE = True
# Always build the re-indexed table: later cells use it even when the files are not rewritten.
new_index_combined_info_df = combined_info_df.copy()
map_level(new_index_combined_info_df, meterID_old_to_new, level=0)
if OVERWRITE or not (result_path/'reindexed_info.csv').exists():
    new_index_combined_info_df.to_csv(result_path/'reindexed_info.csv')
    new_index_combined_info_df.to_pickle(result_path/'reindexed_info.pkl')
else:
    print("It has already been previously saved in your folder")

In [None]:
new_index_combined_info_df

In [None]:
new_index_combined_info_df.query('consumer_type=="0"')

# Put everything in daylight savings time
The following datasets are in UTC: 
- infrax (all but apartment)
- eandis AMR  

And these are already in local time with DST: 
- eandis2017
- infrax apartment
(the duplicate hour is resolved by taking the average of both hours)


# dates for winter summer time changes

In [None]:
# Start and end of daylight saving time for 2010-2017.
# Every date is written in the year 2016, whatever year it belongs to.
mar_2010, oct_2010 = pd.to_datetime('2016-03-28 02:00:00'), pd.to_datetime('2016-10-31 02:00:00')
mar_2011, oct_2011 = pd.to_datetime('2016-03-27 02:00:00'), pd.to_datetime('2016-10-30 02:00:00')
mar_2012, oct_2012 = pd.to_datetime('2016-03-25 02:00:00'), pd.to_datetime('2016-10-28 02:00:00')
mar_2013, oct_2013 = pd.to_datetime('2016-03-31 02:00:00'), pd.to_datetime('2016-10-27 02:00:00')
mar_2014, oct_2014 = pd.to_datetime('2016-03-30 02:00:00'), pd.to_datetime('2016-10-26 02:00:00')
mar_2015, oct_2015 = pd.to_datetime('2016-03-29 02:00:00'), pd.to_datetime('2016-10-25 02:00:00')
mar_2016, oct_2016 = pd.to_datetime('2016-03-27 02:00:00'), pd.to_datetime('2016-10-30 02:00:00')
mar_2017, oct_2017 = pd.to_datetime('2016-03-26 02:00:00'), pd.to_datetime('2016-10-29 02:00:00')

# One row per year, one column per clock change.
DST_times = pd.DataFrame(
    {
        'DST_start': [mar_2010, mar_2011, mar_2012, mar_2013,
                      mar_2014, mar_2015, mar_2016, mar_2017],
        'DST_end': [oct_2010, oct_2011, oct_2012, oct_2013,
                    oct_2014, oct_2015, oct_2016, oct_2017],
    },
    index=range(2010, 2018),
).rename_axis(index='year')
DST_times

In [None]:
# Short aliases for the re-indexed tables (same objects, no copies are made).
data_df, info_df = new_index_combined_data_df, new_index_combined_info_df


### As a reference some data in DST 
This is what we want to recreate: in March the data from 2:00 to 2:45 (inclusive) is missing because of the shifted hour,
and in October the duplicate hour (2:00 to 2:45) is resolved by taking the mean.

In [None]:
# Profiles of the Eandis VREG source, shown from one hour before until one hour
# after the 2016 DST start.
dst_data = data_df.loc[info_df.query('data_source == "EandisVREG"').index]
DST_start = DST_times.at[2016, 'DST_start']
dst_data.loc[:, DST_start - pd.Timedelta('1h'): DST_start + pd.Timedelta('1h')]

### Take all the data that is in UTC

In [None]:

# Profiles stored in UTC: all Eandis AMR meters, plus the Infrax meters whose
# consumer type is neither "app1" nor "app2".
utc_info = info_df.query(
    'data_source == "EandisAMR" | '
    '(data_source == "Infrax" & consumer_type!= "app1" & consumer_type != "app2")'
)
utc_data = data_df.loc[utc_info.index]
utc_data

### Convert it to DST 

In [None]:
idx = pd.IndexSlice 

# to store the transformed utc data
new_utc_data = utc_data.copy() 
for year in utc_data.index.get_level_values('year').unique():     
    # goal:
    # - have missing values from march 2:00 to march 2:45 inclusive
    # - have the mean of the duplicate hour in october 2:00 to october 2:45
    
    # dst_start: first quarter-hour of summer time (march 2:00)
    # dst_end:   last quarter-hour before the clock is set back (october 1:45)
    dst_start, dst_end = DST_times.loc[year, 'DST_start'], DST_times.loc[year, 'DST_end'] - pd.Timedelta(minutes = 15)
    new_start, new_end = dst_start + pd.Timedelta(hours = 1), dst_end + pd.Timedelta(hours = 1)
    
    # Move march 2:00 - october 1:45 to march 3:00 - october 2:45.
    # The raw values are assigned on purpose: assigning a DataFrame through
    # .loc aligns it on its column labels, which would undo the shift.
    new_utc_data.loc[idx[:,year], new_start:new_end] = utc_data.loc[idx[:,year], dst_start: dst_end].to_numpy()

    # Take the average of the duplicate hour (october 2:00 - 2:45): the shifted
    # values and the original values of that hour. The window starts at 2:00,
    # not at dst_end (1:45), so the 1:45 column keeps its shifted value.
    dup_start = dst_end + pd.Timedelta(minutes = 15)
    new_utc_data.loc[idx[:,year], dup_start:new_end] = (new_utc_data.loc[idx[:,year], dup_start:new_end]  + utc_data.loc[idx[:,year], dup_start:new_end] )/2

    # make the missing hour missing (`np.nan`: `np.NaN` no longer exists in NumPy 2.0)
    new_utc_data.loc[idx[:,year], dst_start:new_start-pd.Timedelta(minutes = 15)] = np.nan


### Put the converted data back into the full dataframe

In [None]:
# Overwrite the rows of the converted profiles in the full table. The assignment
# writes NaN values too, so the missing hour in March is carried over.
# `data_df` is the same object as `new_index_combined_data_df`, which is
# therefore modified as well.
data_df.loc[new_utc_data.index] = new_utc_data 

In [None]:
data_df.columns

### Save this already

In [None]:
# Persist the DST-converted measurements, both as pickle and as CSV.
data_df.to_pickle(result_path / 'reindexed_DST_data.pkl')
data_df.to_csv(result_path / 'reindexed_DST_data.csv')