# Data Preparation

This notebook fills the gaps in the heat pump load data.

Two approaches are implemented for this:

        1) Using Linear Regression to interpolate missing values

        2) Further reduction of data sample to reach full data availability

---

### Imports

In [None]:
import pandas as pd
import numpy as np
import pickle 
import matplotlib.pyplot as plt
from IPython.display import Image
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
# Several cells below assign into frames that were produced by filtering/slicing.
pd.options.mode.chained_assignment = None 

In [None]:
from plot_functions import plot_consumption_resampled, plot_metrics_lr

### Global variables

In [None]:
# Lower bound for the house time index; rows at or before it are dropped when the data is prepared.
# The index is interpreted as Unix epoch seconds elsewhere in this notebook (pd.to_datetime(..., unit='s')),
# which makes this value 2018-06-14 08:30:00 UTC.
INDEX_START = 1528965000
# Load columns that are kept from every house data set.
COLUMNS = ['P_TOT', 'Q_TOT', 'S_TOT', 'PF_TOT']

### Data

In [None]:
# Read the pickled heat pump loads and the pickled weather data.
# NOTE(review): pickle.load can execute arbitrary code - only open files from a trusted source.
with open('Data/heatpump/data_heatpump.pkl', 'rb') as heatpump_file, \
        open('Data/weather/data_weather_v1.pkl', 'rb') as weather_file:
    load_dict = pickle.load(heatpump_file)
    weather_data = pickle.load(weather_file)

---

### Starting point

In [None]:
# Display the stored data-availability figure as the starting point of the analysis.
Image(filename='Data/data_availability>85.png') 

In [None]:
# Start again from the raw pickle so that re-running this cell always gives the same result.
with open('Data/heatpump/data_heatpump.pkl', 'rb') as f:
    load_dict = pickle.load(f)

# For every house: index by timestamp, keep rows after INDEX_START and only the
# columns of interest, then replace negative readings with 0.01.
for key, raw_house in load_dict.items():
    df_house = raw_house.set_index('index')
    df_house = df_house.loc[df_house.index > INDEX_START, COLUMNS]
    df_house = df_house.mask(df_house < 0, 0.01)
    load_dict[key] = df_house

### Possible Solutions

Solution 1: Regression model for filling missing values

Solution 2: Further reduction: remove SFH10, SFH11 and SFH23. New time horizon Nov 18 - Dec 20

(Solution 3: Using removed data sets to fill missing values)

---

### Solution 1: Regression model 

In [None]:
# Houses treated as complete: they are copied into the result without any gap filling.
list_complete = ['SFH12', 'SFH14', 'SFH16', 'SFH18', 'SFH19', 'SFH22', 'SFH27', 'SFH28', 'SFH29', 
                 'SFH3', 'SFH30', 'SFH32', 'SFH34', 'SFH36', 'SFH4', 'SFH9', 'SFH26', 'SFH33']
# Houses with missing data; these are the ones the regression below fills.
list_incomplete = ['SFH5', 'SFH7', 'SFH10', 'SFH11', 'SFH20', 'SFH21', 'SFH23', 'SFH38', 'SFH39']
# The next two lists together contain the same houses as list_incomplete.
# NOTE(review): presumably "unique" = one missing interval and "double" = two - confirm against
# the missing-intervals file (the Solution 3 cell treats SFH5 as a multi-interval house).
list_incomlete_unique = ['SFH5', 'SFH7', 'SFH10', 'SFH11', 'SFH21', 'SFH38', 'SFH39']
list_incomplete_double = ['SFH20', 'SFH23']
# All houses of this data version: complete followed by incomplete.
list_v1 = list_complete + list_incomplete

In [None]:
# Load the missing-interval lookup. The cells below index it by house key, iterate over the
# result and read element 0 / 1 of each entry as the start / end label of a gap.
# NOTE(review): pickle.load can execute arbitrary code - only open files from a trusted source.
with open('Data/missing_intervalls_dict.pkl', 'rb') as f:
    missing_intervalls = pickle.load(f)

In [None]:
# Covered time span, taken from the first and last timestamp (epoch seconds) of house SFH3.
sfh3_index = load_dict['SFH3'].index
start, end = (pd.to_datetime(stamp, unit='s') for stamp in (sfh3_index[0], sfh3_index[-1]))

end - start

Correlation between parameters

In [None]:
# Correlation of load and weather variables, using house SFH39 as the example.
df_house = load_dict['SFH39']

# Inner-join load and weather on their indexes; the merge exposes the join key as
# 'key_0', which is renamed and restored as the index.
data = (
    pd.merge(left=df_house, right=weather_data, how='inner',
             left_on=df_house.index, right_on=weather_data.index)
    .rename(columns={'key_0': 'index'})
    .set_index('index')
)

correlation_matrix = data.corr()
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True)
plt.title('Korrelationsmatrix')
plt.show()

In [None]:
# Same heatmap, restricted to the two load columns and three weather columns of interest.
columns = [
    'P_TOT', 'PF_TOT',
    'WEATHER_TEMPERATURE_TOTAL', 'WEATHER_PRECIPITATION_RATE_TOTAL', 'WEATHER_WIND_SPEED_TOTAL',
]
correlation_matrix = data.loc[:, columns].corr()
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True)
plt.title('Korrelationsmatrix')

plt.show()

Reduced variant

In [None]:
# Reduced variant: fill the gaps of every incomplete house with a linear regression that
# uses only three weather variables as features.
#
# Start again from the raw pickle so that re-running this cell always gives the same result.
# NOTE(review): pickle.load can execute arbitrary code - only open files from a trusted source.
with open('Data/heatpump/data_heatpump.pkl', 'rb') as f:
    load_dict = pickle.load(f)

target_columns = ['P_TOT', 'PF_TOT']
weather_columns = ['WEATHER_TEMPERATURE_TOTAL', 'WEATHER_PRECIPITATION_RATE_TOTAL', 'WEATHER_WIND_SPEED_TOTAL']
data_columns = target_columns + weather_columns

# Index every house by timestamp, keep rows after INDEX_START and the two target
# columns, and raise negative readings to 0.
for key in load_dict:
    df_house = load_dict[key].set_index('index')
    df_house = df_house[df_house.index > INDEX_START]
    load_dict[key] = df_house[target_columns].clip(lower=0)

dict_result = {}

df_metrics_r = pd.DataFrame(columns=['RMSE', 'MSE', 'R2'], index=list_incomplete)

for key in list_incomplete:
    # get load data for house
    df_house = load_dict[key]

    # merge weather and load data to one dataset - train and test data
    data = pd.merge(left=df_house, right=weather_data, how='inner', left_on=df_house.index, right_on=weather_data.index)
    data.rename(columns={'key_0': 'index'}, inplace=True)
    data.set_index('index', inplace=True)
    # Only rows with complete targets and features can be used for training.
    data = data[data_columns].dropna()

    X = data[weather_columns]
    y = data[target_columns]

    # `data` is a merged copy, so filling gaps in df_house below never changes the training
    # set. The model is therefore fitted and evaluated once per house, not once per gap.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    model = LinearRegression()
    model.fit(X_train, y_train)

    # Evaluate on the hold-out split. mean_squared_error returns the MSE (averaged over both
    # targets); the RMSE is its square root. Computing it this way avoids the `squared`
    # keyword and keeps the values under the matching column names.
    test_predictions = model.predict(X_test)
    r2 = r2_score(y_test, test_predictions)
    mse = mean_squared_error(y_test, test_predictions)
    rmse = np.sqrt(mse)
    df_metrics_r.loc[key] = [rmse, mse, r2]

    # Predict every missing interval from the weather data and write it into the house frame.
    intervalls = missing_intervalls[key]
    for intervall in intervalls:
        gap_start, gap_end = intervall[0], intervall[1]
        features_to_predict = weather_data.loc[gap_start:gap_end, weather_columns]
        predictions = model.predict(features_to_predict)
        # Same label slice as used for the features, so the shapes line up.
        df_house.loc[gap_start:gap_end] = predictions

    # add dataframe to dict
    dict_result[key] = df_house

df_metrics_r

Use of all variables 

In [None]:
# Full variant: fill the gaps of every incomplete house with a linear regression that uses
# all weather columns plus calendar features derived from the timestamp.
#
# Start again from the raw pickle so that re-running this cell always gives the same result.
# NOTE(review): pickle.load can execute arbitrary code - only open files from a trusted source.
with open('Data/heatpump/data_heatpump.pkl', 'rb') as f:
    load_dict = pickle.load(f)

target_columns = ['P_TOT', 'PF_TOT']


def _with_time_features(frame):
    """Return a copy of ``frame`` with minute/hour/day/month/year columns.

    The columns are derived from the frame's index, which is read as epoch seconds.
    Working on a copy avoids writing into a slice of another frame.
    """
    timestamps = pd.to_datetime(frame.index, unit='s')
    return frame.assign(
        minute=timestamps.minute,
        hour=timestamps.hour,
        day=timestamps.day,
        month=timestamps.month,
        year=timestamps.year,
    )


# Index every house by timestamp, keep rows after INDEX_START and the two target
# columns, and raise negative readings to 0.
for key in load_dict:
    df_house = load_dict[key].set_index('index')
    df_house = df_house[df_house.index > INDEX_START]
    load_dict[key] = df_house[target_columns].clip(lower=0)

dict_result = {}

df_metrics = pd.DataFrame(columns=['RMSE', 'MSE', 'R2'], index=list_incomplete)

for key in list_incomplete:
    # get load data for house
    df_house = load_dict[key]

    # merge weather and load data to one dataset - train and test data
    data = pd.merge(left=df_house, right=weather_data, how='inner', left_on=df_house.index, right_on=weather_data.index)
    data.rename(columns={'key_0': 'index'}, inplace=True)
    data.set_index('index', inplace=True)

    # create time features; only complete rows can be used for training
    data = _with_time_features(data).dropna()

    # Everything that is not a target is a feature (weather columns + time features).
    feature_columns = [column for column in data.columns if column not in target_columns]
    X = data[feature_columns]
    y = data[target_columns]

    # `data` is a merged copy, so filling gaps in df_house below never changes the training
    # set. The model is therefore fitted and evaluated once per house, not once per gap.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    model = LinearRegression()
    model.fit(X_train, y_train)

    # Evaluate on the hold-out split. mean_squared_error returns the MSE (averaged over both
    # targets); the RMSE is its square root. Computing it this way avoids the `squared`
    # keyword and keeps the values under the matching column names.
    test_predictions = model.predict(X_test)
    r2 = r2_score(y_test, test_predictions)
    mse = mean_squared_error(y_test, test_predictions)
    rmse = np.sqrt(mse)
    df_metrics.loc[key] = [rmse, mse, r2]

    # Predict every missing interval from weather + time features and write it into the frame.
    intervalls = missing_intervalls[key]
    for intervall in intervalls:
        gap_start, gap_end = intervall[0], intervall[1]
        # Select the training columns explicitly so names and order match the fitted model.
        features_to_predict = _with_time_features(weather_data.loc[gap_start:gap_end])[feature_columns]
        predictions = model.predict(features_to_predict)
        # Same label slice as used for the features, so the shapes line up.
        df_house.loc[gap_start:gap_end] = predictions

    # add dataframe to dict
    dict_result[key] = df_house

df_metrics

In [None]:
# Show the metrics table of the reduced (weather-only) model again for comparison.
df_metrics_r

Visualization of modeling results

In [None]:
# Plot the per-house metrics: first the all-variable model, then the reduced model.
plot_metrics_lr(df_metrics)
plot_metrics_lr(df_metrics_r)

In [None]:
# The regression can produce negative values; raise them to 0, store the result and plot
# each filled house together with its missing intervals.
for house in list_incomplete:
    filled = dict_result[house].clip(lower=0)
    dict_result[house] = filled
    plot_consumption_resampled(filled, ['P_TOT', 'PF_TOT'], house, missing_intervalls[house])

In [None]:
# Houses without gaps are taken over unchanged from load_dict.
dict_result.update((house, load_dict[house]) for house in list_complete)

# Persist filled and complete houses together as the cleaned v1 data set.
with open('Data/heatpump/data_heatpump_cleaned_v1.pkl', 'wb') as out_file:
    pickle.dump(dict_result, out_file)

---

### Solution 2: Further reduction of data sample

In [None]:
# Length of the reduced time horizon: from epoch second 1542512700 (2018-11-18 03:45:00 UTC)
# to the last timestamp of house SFH3 in the current load_dict.
start = pd.to_datetime(1542512700, unit='s')
end = pd.to_datetime(load_dict['SFH3'].index[-1], unit='s')

end-start

In [None]:
# Solution 2 starts from the raw pickle; `data` holds the raw per-house frames here.
with open('Data/heatpump/data_heatpump.pkl', 'rb') as f:
    data = pickle.load(f)

load_dict = {}

# Keep every v1 house except the three dropped ones. Each kept house is indexed by
# timestamp, cut to the rows after the end of SFH7's first missing interval, reduced
# to COLUMNS and has its negative readings raised to 0.
for key in list_v1:
    if key not in ['SFH10', 'SFH11', 'SFH23']:
        indexed = data[key].set_index('index')
        after_gap = indexed.index > missing_intervalls['SFH7'][0][1]
        load_dict[key] = indexed.loc[after_gap, COLUMNS].clip(lower=0)

In [None]:
# Plot every remaining house; no missing intervals are passed for highlighting.
for house, frame in load_dict.items():
    plot_consumption_resampled(frame, ['P_TOT', 'PF_TOT'], house, [])

In [None]:
# Save the reduced data set (load_dict built in the Solution 2 cell above) as cleaned v2.
with open('Data/heatpump/data_heatpump_cleaned_v2.pkl', 'wb') as f:
    pickle.dump(load_dict, f)

### Solution 3

Not working; kept as a backup.

In [None]:
# Incomplete house -> donor house. The next cell copies the donor's rows into the
# incomplete house for the duration of each missing interval.
mapping_dict = {
    'SFH39': 'SFH31',
    'SFH38': 'SFH35',
    'SFH23': 'SFH37',
    'SFH21': 'SFH6',
    'SFH20': 'SFH37',
    'SFH11': 'SFH39',
    'SFH10': 'SFH40',
    'SFH7': 'SFH31',
    'SFH5': 'SFH35'
}

In [None]:
# Solution 3: fill each missing interval of an incomplete house with the rows of its donor
# house (see mapping_dict).
#
# NOTE(review): pickle.load can execute arbitrary code - only open files from a trusted source.
with open('Data/heatpump/data_heatpump.pkl', 'rb') as f:
    load_dict = pickle.load(f)
with open('Data/missing_intervalls_dict.pkl', 'rb') as f:
    missing_intervalls_dict = pickle.load(f)

# Houses for which every missing interval is filled; for all other mapped houses only the
# first interval is used.
multi_interval_houses = ['SFH20', 'SFH23', 'SFH5']

load_dict_complete = {}
for key in mapping_dict:
    if key in multi_interval_houses:
        continue
    df_house = load_dict[key].set_index('index')
    df_house = df_house[df_house.index >= INDEX_START]
    # A dedicated local name keeps the notebook-level `missing_intervalls` dict intact.
    first_interval = missing_intervalls_dict[key][0]
    df_replace = load_dict[mapping_dict[key]].set_index('index')
    df_replace = df_replace[(df_replace.index >= first_interval[0])&(df_replace.index <= first_interval[1])]
    df_house.loc[first_interval[0]:first_interval[1]] = df_replace

    load_dict_complete[key] = df_house[COLUMNS]

for key in list_complete:
    df_house = load_dict[key].set_index('index')
    df_house = df_house[df_house.index >= INDEX_START]
    load_dict_complete[key] = df_house[COLUMNS]

for key in multi_interval_houses:
    df_house = load_dict[key].set_index('index')
    # Cut at INDEX_START like the other two loops.
    df_house = df_house[df_house.index >= INDEX_START]
    # The donor frame is the same for every interval of this house, so build it once.
    df_donor = load_dict[mapping_dict[key]].set_index('index')
    for intervall in missing_intervalls_dict[key]:
        df_replace = df_donor[(df_donor.index >= intervall[0])&(df_donor.index <= intervall[1])]
        df_house.loc[intervall[0]:intervall[1]] = df_replace

    load_dict_complete[key] = df_house[COLUMNS]

In [None]:
# Build one frame with a column per house and plot the data availability.
# NOTE(review): check_nan and plot_data_availability are neither defined nor imported in the
# part of the notebook visible here - confirm where they come from before running this cell.
# Start from an empty frame that only carries the time index of SFH10.
df_result = load_dict_complete['SFH10'].reset_index()['index'].to_frame().set_index('index')
# `df` is the house key (a string), not a DataFrame.
for df in load_dict_complete:
    # Add a column named after the house, computed by applying check_nan to P_TOT,
    # and append it to the overview frame.
    load_dict_complete[df][df] = load_dict_complete[df]['P_TOT'].apply(check_nan)
    df_result = pd.concat([df_result, load_dict_complete[df][df]], axis=1)
#df_result.set_index('index', inplace=True)
plot_data_availability(df_result)

------

## Appendix

### Solution 1: Development with example house SFH5

Step 1: combining data

In [None]:
# Take the first incomplete house (list_incomplete[0], i.e. 'SFH5') and keep only its
# complete rows as training data.
# NOTE(review): the cells below index df_5 by timestamp and split off four load columns, so
# this presumably expects load_dict as prepared in the "Starting point" cell - confirm the
# notebook state before running.
df_5 = load_dict[list_incomplete[0]]
df_5_train = df_5.dropna()
df_5_train.head(3)

In [None]:
# Inner-join the training rows with the weather data on their indexes; the merge exposes
# the join key as 'key_0', which is renamed and restored as the index.
data = (
    pd.merge(left=df_5_train, right=weather_data, how='inner',
             left_on=df_5_train.index, right_on=weather_data.index)
    .rename(columns={'key_0': 'index'})
    .set_index('index')
)
data.head()

Adding time dependent variables

In [None]:
# Derive the calendar features from the epoch-second index, converting it only once.
timestamps = pd.to_datetime(data.index, unit='s')
for part in ('minute', 'hour', 'day', 'month', 'year'):
    data[part] = getattr(timestamps, part)

data.head()

Weather data for the time horizon to be predicted

In [None]:
# First missing interval recorded for the example house.
# This relies on `missing_intervalls` still being the per-house lookup loaded from the pickle.
intervalls = missing_intervalls[list_incomplete[0]][0]
print(intervalls)
# Index labels of all rows of df_5 that contain at least one NaN.
index_to_predict = df_5[df_5.isna().any(axis=1)].index
# First and last label with missing data, for comparison with the interval printed above.
print(index_to_predict[0])
print(index_to_predict[-1])

In [None]:
# Weather rows between the first and last label with missing load data, extended by the
# same calendar features that were added to the training data.
features_to_predict = weather_data.loc[index_to_predict[0]: index_to_predict[-1]]
prediction_times = pd.to_datetime(features_to_predict.index, unit='s')
for part in ('minute', 'hour', 'day', 'month', 'year'):
    features_to_predict[part] = getattr(prediction_times, part)
features_to_predict#.head()

Linear regression with scikit learn

In [None]:
# The first four columns are the regression targets, everything after them is a feature.
y = data.iloc[:, :4]
X = data.iloc[:, 4:]

# Hold out 20 % of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Create and train the linear regression model.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict the hold-out rows and score them.
predictions = model.predict(X_test)

r2 = r2_score(y_test, predictions)
# NOTE(review): the `squared` keyword is deprecated in newer scikit-learn releases - confirm
# against the installed version.
rmse = mean_squared_error(y_test, predictions, squared=False)
print('The r2 is: ', r2)
print('The rmse is: ', rmse)

In [None]:
# Predict the targets for the gap and write them into the house frame.
# The assignment expects as many rows in the label slice as there are predictions.
predictions = model.predict(features_to_predict)
df_5.loc[intervalls[0]:intervalls[-1]] = predictions

In [None]:
# Plot the filled example house. Every other call in this notebook passes
# (frame, columns, house key, list of missing intervals), so the column list is added and
# the single interval is wrapped in a list to match that call shape.
# NOTE(review): confirm against the signature in plot_functions.
plot_consumption_resampled(df_5, ['P_TOT', 'PF_TOT'], 'SFH5', [intervalls])