In [78]:
import pandas as pd
import numpy as np
import datetime as dt
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import mutual_info_regression
from sklearn.linear_model import LinearRegression

In [79]:
import warnings
# Silence pandas PerformanceWarning for the rest of the kernel session.
# NOTE(review): presumably added because of wide-frame column operations —
# confirm it is still needed before keeping a session-wide filter.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [80]:
def mse(y, y_hat):
    """Return the mean squared error between observations `y` and predictions `y_hat`."""
    residual = y - y_hat
    return np.mean(residual ** 2)

def r2(y_train, y, y_hat):
    """Return R^2 of `y_hat` against `y`, using the mean of `y_train` as the baseline.

    The total sum of squares is measured around the *training* mean rather than
    the mean of `y`, so the same baseline is used when scoring other splits.
    """
    baseline = y_train.mean()
    residual_ss = np.sum((y - y_hat) ** 2)
    total_ss = np.sum((y - baseline) ** 2)
    return 1 - residual_ss / total_ss

In [81]:
def geo_difference(df, countries=("FRA", "NLD", "GBR", "DEU")):
    """Append cross-country difference columns to a copy of `df`.

    The country code of a column is the three characters that follow the first
    "-" in its name. For every column and every *other* code in `countries`,
    the counterpart name is built by substituting the code; if that counterpart
    exists in `df`, a column ``"<col>_minus_<counterpart>"`` holding the
    element-wise difference is appended.

    Both directions of each pair are emitted (A minus B and B minus A), so the
    added columns come in sign-flipped pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    countries : sequence of str, optional
        Country codes to pair up. Defaults to the four codes used originally.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by the difference columns.
    """
    n_cols = []
    for s_col in df.columns:
        if not isinstance(s_col, str):
            continue
        dash = s_col.find("-")
        # str.find returns -1 when there is no "-"; slicing from there would
        # silently treat the first three characters as a country code.
        if dash == -1:
            continue
        s_cty = s_col[dash + 1:dash + 4]
        if not s_cty:
            # Name ends in "-": replacing an empty code would corrupt the name.
            continue
        for a_cty in countries:
            if a_cty == s_cty:
                continue
            a_col = s_col.replace(s_cty, a_cty)
            if a_col in df.columns:
                n_cols.append((df[s_col] - df[a_col]).rename(s_col + "_minus_" + a_col))
    # pd.concat builds a new frame, so the caller's `df` is left untouched.
    return pd.concat([df] + n_cols, axis=1)

In [82]:
def get_data(path='../data/EC15_split.h5'):
    """Load the train/validation/test splits and add cross-country differences.

    Parameters
    ----------
    path : str, optional
        HDF5 file holding the keys "train", "validation" and "test".
        Defaults to the location used by this notebook.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train_dif, df_val_dif, df_test_dif)``, each passed through
        `geo_difference`.
    """
    df_train, df_valid, df_test = (
        pd.read_hdf(path, key=key) for key in ("train", "validation", "test")
    )
    df_train_dif = geo_difference(df_train)
    df_val_dif = geo_difference(df_valid)
    df_test_dif = geo_difference(df_test)
    return df_train_dif, df_val_dif, df_test_dif

def get_target_data(df_train_dif, df_val_dif, df_test_dif, target, t):
    """Build feature frames and `t`-step-ahead targets for the three splits.

    For each split the target is column `target` shifted `t` rows towards the
    start, and the last `t` rows are dropped from both target and features so
    that row i of the features lines up with the target value at row i + t.

    Returns
    -------
    tuple
        ``(X_train, X_valid, X_test, y_train, y_valid, y_test)``.
    """
    features = []
    shifted_targets = []
    for frame in (df_train_dif, df_val_dif, df_test_dif):
        n_keep = len(frame) - t
        shifted_targets.append(frame[target].shift(-t)[0:n_keep])
        features.append(frame[0:n_keep])
    return (*features, *shifted_targets)

In [83]:
def subset_selection(k, selection_method, X_train, y_train):
    """Return the names of the `k` columns of `X_train` chosen by SelectKBest.

    Parameters
    ----------
    k : int
        Number of features to keep.
    selection_method : callable
        Score function handed to ``SelectKBest`` (e.g. ``f_regression``).
    X_train : pandas.DataFrame
        Candidate features.
    y_train : array-like
        Target aligned with the rows of `X_train`.

    Returns
    -------
    list
        Selected column names, in the order they appear in `X_train`.
    """
    selector = SelectKBest(selection_method, k=k).fit(X_train, y_train)
    # Read the selection straight from the fitted selector's boolean mask.
    # Matching transformed values back against each column's unique values can
    # pick up unrelated columns that merely share a value, and is quadratic.
    return list(X_train.columns[selector.get_support()])

In [84]:
def train(X, y):
    """Fit an ordinary least-squares LinearRegression on (X, y) and return it."""
    return LinearRegression().fit(X, y)

def train_err(X, y):
    """Fit a single-feature LinearRegression after reshaping X and y to column vectors."""
    feature_column = np.array(X).reshape(-1, 1)
    target_column = np.array(y).reshape(-1, 1)
    return LinearRegression().fit(feature_column, target_column)


def predict(model, error_model, X):
    """Return the model's prediction for X plus the error model's correction.

    The base prediction is fed, as a single column, to `error_model`; its
    output is flattened to one dimension and added to the base prediction.
    """
    base = model.predict(X)
    correction = error_model.predict(np.array(base).reshape(-1, 1))
    return base + correction.reshape(correction.shape[0])

In [85]:
def _align_with_target(frame, target, t):
    """Return (features, y): `target` shifted `t` rows ahead, both without the last `t` rows."""
    n_keep = len(frame) - t
    return frame[0:n_keep], frame[target].shift(-t)[0:n_keep]


def forecast(X_train, X_valid, k, selection_method, target, t):
    """Fit a `t`-step-ahead linear forecast of `target` with an error-correction stage.

    The targets are derived from the frames passed in (same alignment as
    `get_target_data`), so this function does not depend on any variable in
    the notebook namespace. Pass frames that still contain their final `t`
    rows: those rows are consumed here to build the shifted target.

    Parameters
    ----------
    X_train, X_valid : pandas.DataFrame
        Training and validation frames; both must contain column `target`.
    k : int
        Number of features kept by `subset_selection`.
    selection_method : callable
        Score function forwarded to `subset_selection`.
    target : str
        Column to forecast.
    t : int
        Forecast horizon in rows.

    Returns
    -------
    tuple
        ``(y_train, y_valid, y_hat_train, y_hat_valid, pred_train, pred_valid)``
        where ``y_hat_*`` include the error correction and ``pred_*`` do not.
    """
    X_train, y_train = _align_with_target(X_train, target, t)
    X_valid, y_valid = _align_with_target(X_valid, target, t)

    cols = subset_selection(k, selection_method, X_train, y_train)
    print("Target: ", target)
    print("Chosen columns: ", cols)
    X_train = X_train[cols]
    X_valid = X_valid[cols]

    model = train(X_train, y_train)
    pred_train = model.predict(X_train)
    pred_valid = model.predict(X_valid)

    # NOTE(review): the error model is fitted on the true target -> residual,
    # but `predict` feeds it the model's *predictions*. Confirm this mismatch
    # between training and inference inputs is intended.
    error_model = train_err(y_train, y_train - pred_train)
    y_hat_train = predict(model, error_model, X_train)
    y_hat_valid = predict(model, error_model, X_valid)
    return y_train, y_valid, y_hat_train, y_hat_valid, pred_train, pred_valid

In [86]:
def evaluate(y_train, y_valid, y_hat_train, y_hat_valid, y_hat_train_no_err, y_hat_valid_no_err):
    """Print MSE and R2 for train and validation, with and without error correction.

    R2 for both splits is computed against the mean of `y_train`.
    """
    comparisons = [
        ("Train", y_train, y_hat_train, y_hat_train_no_err),
        ("Validation", y_valid, y_hat_valid, y_hat_valid_no_err),
    ]
    for split, actual, corrected, uncorrected in comparisons:
        print(split + " MSE: ", mse(actual, corrected))
        print(split + " MSE (no error correction): ", mse(actual, uncorrected))
    for split, actual, corrected, uncorrected in comparisons:
        print(split + " R2: ", r2(y_train, actual, corrected))
        print(split + " R2 (no error correction): ", r2(y_train, actual, uncorrected))

In [87]:
def _notebook_global(name):
    """Fetch `name` from the module namespace, raising NameError if it is absent."""
    try:
        return globals()[name]
    except KeyError:
        raise NameError(
            f"name '{name}' is not defined; pass it to plot_forecast explicitly"
        ) from None


def plot_forecast(y_hat_train, y_hat_valid, y_hat_train_no_err, y_hat_valid_no_err,
                  y_train=None, y_valid=None):
    """Plot actual vs. forecast series and the validation residuals.

    Parameters
    ----------
    y_hat_train, y_hat_valid : array-like
        Error-corrected forecasts for the train and validation splits.
    y_hat_train_no_err, y_hat_valid_no_err : array-like
        Forecasts without error correction.
    y_train, y_valid : pandas.Series, optional
        Actual targets; their index is used as the x-axis. When omitted, the
        names ``y_train`` / ``y_valid`` are looked up in the notebook namespace
        (legacy behaviour) and a NameError is raised if they are missing.
    """
    if y_train is None:
        y_train = _notebook_global("y_train")
    if y_valid is None:
        y_valid = _notebook_global("y_valid")

    # Take the x-axis from the targets themselves so the plot does not depend
    # on a global frame or on a hard-coded forecast horizon.
    train_index = y_train.index
    valid_index = y_valid.index

    fig, ax = plt.subplots(figsize=(20, 10))
    ax.plot(train_index, y_train, label="Train")
    ax.plot(train_index, y_hat_train, label="Train Forecast")
    ax.plot(train_index, y_hat_train_no_err, label="Train Forecast (no error correction)")
    ax.plot(valid_index, y_valid, label="Validation")
    ax.plot(valid_index, y_hat_valid, label="Validation Forecast")
    ax.plot(valid_index, y_hat_valid_no_err, label="Validation Forecast (no error correction)")
    ax.legend()
    plt.show()

    # Residuals against the actual value, one cloud per forecast variant.
    fig, ax = plt.subplots()
    ax.scatter(x=y_valid, y=y_valid - y_hat_valid, label="Validation residual")
    ax.scatter(x=y_valid, y=y_valid - y_hat_valid_no_err, label="Validation residual (no error correction)")
    ax.set_xlabel("Actual")
    ax.set_ylabel("Actual - forecast")
    ax.legend()
    plt.show()

In [88]:
# Target columns that forecast_all iterates over, one model per entry.
# For a single-target run, use e.g. ["ens_std-GBR_WIND_EC15-24"] instead.
targets = [
    "ens_std-DEU_TEMP_EC15-24",
    "ens_std-FRA_TEMP_EC15-24",
    "ens_std-DEU_WIND_EC15-24",
    "ens_std-FRA_WIND_EC15-24",
]


In [89]:
def forecast_all(targets, k=5, selection_method=f_regression, t=2):
    """Fit and evaluate one forecast per target, printing the metrics.

    Parameters
    ----------
    targets : iterable of str
        Target column names.
    k : int, optional
        Number of features kept by the feature selector.
    selection_method : callable, optional
        Score function used for feature selection.
    t : int, optional
        Forecast horizon in rows.

    Notes
    -----
    The full (un-truncated) frames are handed to `forecast`, which builds the
    shifted targets itself; aligning them here as well would be redundant.
    Plotting is not performed; call `plot_forecast` separately if needed.
    """
    df_train_dif, df_val_dif, _ = get_data()
    for target in targets:
        (y_train, y_valid,
         y_hat_train, y_hat_valid,
         y_hat_train_no_err, y_hat_valid_no_err) = forecast(
            df_train_dif, df_val_dif, k, selection_method, target, t
        )
        evaluate(y_train, y_valid, y_hat_train, y_hat_valid, y_hat_train_no_err, y_hat_valid_no_err)
        print()


In [90]:
# Run the pipeline for every entry in `targets` with forecast_all's defaults
# (k=5, selection_method=f_regression, t=2); metrics are printed per target.
forecast_all(targets)

Target:  ens_std-DEU_TEMP_EC15-24
Chosen columns:  ['ens_std-DEU_TEMP_EC15-48', 'ens_std-FRA_TEMP_EC15-48_minus_ens_std-DEU_TEMP_EC15-48', 'ens_std-GBR_TEMP_EC15-48_minus_ens_std-DEU_TEMP_EC15-48', 'ens_std-DEU_TEMP_EC15-48_minus_ens_std-FRA_TEMP_EC15-48', 'ens_std-DEU_TEMP_EC15-48_minus_ens_std-GBR_TEMP_EC15-48']
Train MSE:  0.010838664244026486
Train MSE (no error correction):  0.00867242570957085
Validation MSE:  0.01164819729400333
Validation MSE (no error correction):  0.010536284414649152
Train R2:  0.35676593722572336
Train R2 (no error correction):  0.48532406782968796
Validation R2:  0.4393597881919189
Validation R2 (no error correction):  0.4928773460130059

Target:  ens_std-FRA_TEMP_EC15-24
Chosen columns:  ['ens_std-FRA_TEMP_EC15-48', 'ens_std-FRA_TEMP_EC15-48_minus_ens_std-GBR_TEMP_EC15-48', 'ens_std-FRA_TEMP_EC15-48_minus_ens_std-DEU_TEMP_EC15-48', 'ens_std-GBR_TEMP_EC15-48_minus_ens_std-FRA_TEMP_EC15-48', 'ens_std-DEU_TEMP_EC15-48_minus_ens_std-FRA_TEMP_EC15-48']
Train M