# Data Preparation

This notebook contains code for

1. Load Data Quality: Linear Regression, Reduction of data sample

2. Merge Load and Weather Data

3. Prepare Data for Training: Standardization, Creating Sequences, Train/Validation/Test split

To keep this notebook readable, some functions have been moved to the `utils/` package.

____

### Imports

In [None]:
import pandas as pd
import numpy as np 
import pickle
import re
from IPython.display import Image

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# --- Locations, all relative to the notebook's working directory ---
load_path_WPUQ = "data/WPUQ/heatpumps"
weather_path_WPUQ = "data/WPUQ/weather"
path_concat = "data/raw"        # the raw heat pump pickle is read from here
path_cleaned = "data/cleaned"   # cleaned inputs are read from / results written to here

# Exclusive lower bound applied to the integer time index of every house frame
# (the index values are later converted with unit='s', i.e. epoch seconds).
INDEX_START = 1_528_965_000

# Measurement columns kept from each house frame.
COLUMNS = [
    'P_TOT',
    'Q_TOT',
    'S_TOT',
    'PF_TOT',
]

____

### 1. Load Data Quality

In [None]:
def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # NOTE: pickle can execute arbitrary code on load; only use with trusted project files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Raw per-house heat pump frames, keyed by house name (e.g. 'SFH3').
load_dict = _read_pickle(f'{path_concat}/data_heatpump.pkl')

# Cleaned weather data.
weather_data = _read_pickle(f'{path_cleaned}/data_weather_v1.pkl')

# Missing intervals per house, indexed by house name further below.
missing_intervals = _read_pickle('data/missing_intervalls_dict.pkl')

Starting Point

In [None]:
# Display the pre-rendered PNG from the plots/ folder
# (presumably the data-availability overview the file name refers to).
availability_plot = Image(filename='plots/data_availability>85.png')
availability_plot

In [None]:
# Restrict every raw house frame to the analysis window and the measurement columns.
# NOTE: this cell is not idempotent. After one run the frames no longer have an
# 'index' column, so re-running it without reloading load_dict raises a KeyError.
for key in load_dict:
    df_house = load_dict[key].set_index('index')

    # Keep only rows strictly after the common start timestamp and only the
    # configured columns, in a single selection.
    df_house = df_house.loc[df_house.index > INDEX_START, COLUMNS]

    # Replace negative readings with the small positive placeholder 0.01.
    # mask() returns a new frame, so there is no assignment into a slice
    # (which triggered SettingWithCopyWarning) and no per-column pre-check is needed.
    df_house = df_house.mask(df_house < 0, 0.01)

    load_dict[key] = df_house

### 1.1 Filling Missing Values using Linear Regression 

In [None]:
# Houses whose time series need no gap filling.
list_complete = [
    'SFH12', 'SFH14', 'SFH16', 'SFH18', 'SFH19', 'SFH22',
    'SFH27', 'SFH28', 'SFH29', 'SFH3', 'SFH30', 'SFH32',
    'SFH34', 'SFH36', 'SFH4', 'SFH9', 'SFH26', 'SFH33',
]

# Houses with incomplete time series; these are the ones passed to the gap filling.
list_incomplete = [
    'SFH5', 'SFH7', 'SFH10', 'SFH11', 'SFH20',
    'SFH21', 'SFH23', 'SFH38', 'SFH39',
]

# Partition of list_incomplete into two groups
# (presumably houses with one vs. two missing intervals; verify against missing_intervals).
# The misspelled name `list_incomlete_unique` is kept as is because other code may use it.
list_incomlete_unique = [
    'SFH5', 'SFH7', 'SFH10', 'SFH11',
    'SFH21', 'SFH38', 'SFH39',
]
list_incomplete_double = ['SFH20', 'SFH23']

# All houses of data version 1: complete first, then incomplete.
list_v1 = [*list_complete, *list_incomplete]

Duration

In [None]:
# Time span covered by house SFH3; the index values are interpreted as epoch seconds.
sfh3_index = load_dict['SFH3'].index
start = pd.to_datetime(sfh3_index[0], unit='s')
end = pd.to_datetime(sfh3_index[-1], unit='s')

end - start

In [None]:
from utils.utils import train_and_predict
from utils.plot_utils import plot_metrics_lr, plot_consumption_filled

# Fill the gaps of the incomplete houses (per the section title this is a linear
# regression; the implementation lives in utils.utils and is not shown here).
dict_result, df_metrics = train_and_predict(
    load_dict,
    weather_data,
    missing_intervals,
    list_incomplete,
    include_time_features=False,
)

# Visualize the metrics returned by the fitting step.
plot_metrics_lr(df_metrics)

In [None]:
# Clip the filled series at zero so that no negative values remain,
# then plot each filled house together with its missing intervals.
for house in list_incomplete:
    filled = dict_result[house].clip(lower=0)
    dict_result[house] = filled
    plot_consumption_filled(filled, ['P_TOT', 'PF_TOT'], house, missing_intervals[house])

In [None]:
# The complete houses need no filling: take them over unchanged from load_dict.
dict_result.update({house: load_dict[house] for house in list_complete})

# Persist filled and complete houses together as cleaned data version 1.
with open(f'{path_cleaned}/data_heatpump_cleaned_v1.pkl', 'wb') as out_file:
    pickle.dump(dict_result, out_file)

### 1.2 Further reduction of data sample

Duration

In [None]:
# Time span of the reduced sample, from a fixed start timestamp (epoch seconds)
# to the last SFH3 reading.
# NOTE(review): 1542512700 is presumably the end of the SFH7 gap used as cut-off
# in the next cell; confirm against missing_intervals['SFH7'].
reduced_start_ts = 1542512700
last_sfh3_ts = load_dict['SFH3'].index[-1]

start = pd.to_datetime(reduced_start_ts, unit='s')
end = pd.to_datetime(last_sfh3_ts, unit='s')

end - start

In [None]:
# Re-read the raw data; load_dict is rebuilt from scratch below.
with open(f'{path_concat}/data_heatpump.pkl', 'rb') as raw_file:
    data = pickle.load(raw_file)

# Houses left out of the reduced sample entirely.
dropped_houses = ['SFH10', 'SFH11', 'SFH23']

# Every kept house starts after the end of SFH7's first missing interval.
reduced_start = missing_intervals['SFH7'][0][1]

load_dict = {}
for house in list_v1:
    if house in dropped_houses:
        continue

    load_dict[house] = (
        data[house]
        .set_index('index')
        .loc[lambda frame: frame.index > reduced_start, COLUMNS]
        .clip(lower=0)  # negative readings are set to zero
    )

# Persist the reduced sample as cleaned data version 2.
with open(f'{path_cleaned}/data_heatpump_cleaned_v2.pkl', 'wb') as out_file:
    pickle.dump(load_dict, out_file)

____

### 2. Merge Load and Weather Data

In [None]:
import config

# Lower bounds (epoch seconds) used when merging.
WEATHER_INDEX_MIN = 1528965900   # inclusive bound on the weather index
MERGED_INDEX_MIN = 1546298100    # exclusive bound on the merged frame's "index" column

# Load cleaned heat pump data (version 1, written in section 1.1)
with open(f'{path_cleaned}/data_heatpump_cleaned_v1.pkl', 'rb') as f:
    load_dict = pickle.load(f)

# Load weather data.
# The folder was previously spelled 'Data/...', which only resolves on
# case-insensitive filesystems; use the same lowercase path as everywhere else.
with open(f'{path_cleaned}/data_weather_v1.pkl', 'rb') as f:
    weather_data = pickle.load(f)

# Load building information, indexed by building number
building_info = (
    pd.read_excel("data/cleaned/Gebaeudeinformationen_cleaned.xlsx", index_col=0)
    .set_index("Building number")
)

# The weather rows are the same for every house, so filter them once.
weather_data_filtered = weather_data[weather_data.index >= WEATHER_INDEX_MIN]

load_dict_sorted = {}

# Add building information and merge with weather data, in ascending house number order
for house in sorted(load_dict, key=lambda name: int(re.findall(r'\d+', name)[0])):
    # numeric part of the house name, e.g. 'SFH12' -> 12
    # (named building_id to avoid shadowing the builtin `id`)
    building_id = int(re.findall(r'\d+', house)[0])

    # Add building area, number of inhabitants, and building id to the house's data
    house_df = load_dict[house]
    house_df["area"] = building_info.loc[building_id, "Building area"]
    house_df["inhabitants"] = building_info.loc[building_id, "Number of inhabitants"]
    house_df["building"] = building_id

    # Align house and weather data on their time index, then turn the index
    # into a regular "index" column so it can be filtered and selected.
    merged = pd.concat([house_df, weather_data_filtered], axis=1).reset_index()
    merged = merged[merged["index"] > MERGED_INDEX_MIN]

    load_dict[house] = merged
    load_dict_sorted[house] = merged[config.columns]

# Concatenate data for all houses; the dict keys become the outer index level
data = pd.concat(load_dict_sorted)

# Use the house name (level_0) as the index and drop the old positional index (level_1)
data = data.reset_index().set_index("level_0").drop(columns="level_1")

with open(f'{path_cleaned}/merged_data.pkl', 'wb') as f:
    pickle.dump(data, f)

data.head(3)

____

### 3. Prepare Data for Training

#### 3.1 Data Standardization

In [None]:
# Standardize every column of the merged frame.
# NOTE(review): the scaler is fit on all rows before the train/val/test split in 3.3;
# confirm that using test-set statistics here is intended.
scaler = StandardScaler().fit(data)
scaled_data = scaler.transform(data)

# Wrap the scaled values in a frame with the original column labels.
# Only the columns are passed on, so df_scaled does not carry the index of `data`.
df_scaled = pd.DataFrame(data=scaled_data, columns=data.columns)

In [None]:
from utils.plot_utils import plot_scaling

# Report the size of the scaled array, then compare scaled and unscaled data visually.
print(f"scaled_data.shape: {scaled_data.shape}")
plot_scaling(df_scaled, data, scaler)

#### 3.2 Creating Sequences

In [None]:
from utils.utils import create_daily_sequences
from utils.plot_utils import plot_sequences

# Build the sequences separately for each building, then stack the results.
all_X = []
all_y = []

# Each distinct value of the "building" column is treated as one building.
for building in df_scaled["building"].unique():
    # Filter once and reuse the result (it was previously computed twice,
    # with the first result left unused).
    df_building = df_scaled[df_scaled["building"] == building]
    X_building, y_building = create_daily_sequences(
        df_building,
        config.SEQUENZE_LENGTH,
        config.PREDICTION_LENGTH,
        num_target_var=1,
    )
    # Collect this building's sequences
    all_X.append(X_building)
    all_y.append(y_building)

# Stack the per-building arrays along the first axis.
X = np.concatenate(all_X, axis=0)
y = np.concatenate(all_y, axis=0)

print("Dimensionen X: " + str(X.shape))
print("Dimensionen y: " + str(y.shape))

#### 3.3 Split Training, Validation and Test Data

In [None]:
from utils.utils import train_test_val_data
from utils.plot_utils import plot_with_classification

# Number of distinct index values in the merged frame and number of target variables.
num_target_variables = 1
len_dataset = len(data.index.unique())

X_train, X_val, X_test, y_train, y_val, y_test = train_test_val_data(
    df_scaled, len_dataset, num_target_variables
)

# Report the shape of every split, in the same order as before.
split_shapes = [
    ("X_train", X_train),
    ("X_val", X_val),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_val", y_val),
    ("y_test", y_test),
]
for split_name, split_values in split_shapes:
    print(f"Dimensionen {split_name}: {split_values.shape}")

# Illustrate the split on a single house.
sfh23_rows = data[data.index == "SFH23"]
plot_with_classification(sfh23_rows, train_split=0.5, val_split=0.95, combine=True, title="SFH23")