In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

In [2]:
# HDF5 forecast archives to load. Only the first entry is active; the other
# candidates are kept below as notes and can be moved into the list to enable them.
model_paths = ["../Temp/DEU_TEMP_EC15.h5"]
# Disabled inputs:
#   "../Temp/DEU_TEMP_GEFS.h5"
#   "../Temp/FRA_TEMP_EC15.h5"
#   "../Temp/FRA_TEMP_GEFS.h5"

In [3]:
# Read every configured HDF5 file into its own frame, in the order of model_paths.
models = list(map(pd.read_hdf, model_paths))

In [4]:
def get_model_name(path):
    """Return the file name of ``path`` without its directory and last extension.

    Example: ``"../Temp/DEU_TEMP_EC15.h5"`` -> ``"DEU_TEMP_EC15"``.

    Unlike plain ``rfind`` slicing, this is also correct when the file name has
    no extension (the last character is no longer cut off) and when the only
    dot in the path belongs to a directory component such as ``..``.

    Parameters
    ----------
    path : str
        File path of a model file.

    Returns
    -------
    str
        Base name of the file with the final extension removed.
    """
    import os.path  # local import keeps this cell self-contained

    base_name = os.path.basename(path)
    # splitext only strips the last suffix, so "a.tar.gz" becomes "a.tar".
    return os.path.splitext(base_name)[0]

In [5]:
# Label each loaded frame with the name derived from the file it was read from.
for source_path, model in zip(model_paths, models):
    model["model_name"] = get_model_name(source_path)

In [6]:
# Stack all per-model frames row-wise into one long frame (original row labels are kept).
df_models = pd.concat(models, axis=0)

In [7]:
# Drop the list of per-file frames; their rows were concatenated into df_models above.
del models

In [8]:
# Lead time in whole hours: value time minus forecast time.
# Floor division by a one-hour Timedelta is used instead of
# `.astype('timedelta64[h]')`, which raises on pandas >= 2.0 (only s/ms/us/ns
# resolutions are supported there). Flooring also keeps every negative delta
# negative, so the `>= 0` filter applied afterwards still removes it.
df_models['TimeDelta_h'] = (
    (df_models['ValueDateTime'] - df_models['ForecastDateTime']) // pd.Timedelta(hours=1)
).astype(int)

In [9]:
# Discard columns that none of the feature-engineering cells below read.
df_models = df_models.drop(
    columns=[
        "InsertionDateTime",
        "ObservationDateTime",
        "PartitionId",
        "PointConnectCurveId",
        "ValueDateTime",
        "source",
    ]
)

Remove observations whose time delta is negative, i.e. the value time is earlier than the forecast time. These are presumably data errors.

In [10]:
# Keep only rows with a non-negative lead time; copy so later edits do not touch a slice.
has_valid_delta = df_models["TimeDelta_h"].ge(0)
df_models = df_models[has_valid_delta].copy()
del has_valid_delta

In [11]:
# Empty result frame with one row per distinct forecast time, in ascending order.
# The feature tables built below are left-joined onto this index.
df_final = pd.DataFrame(
    index=sorted(df_models.loc[:, "ForecastDateTime"].unique()),
)

In [12]:
def column_name(columns):
    """Flatten column labels into single strings joined by ``"-"``.

    Each label from ``columns.to_flat_index()`` is converted to one string. A
    tuple label (the MultiIndex case) has its levels stringified and joined
    with ``"-"``. A non-tuple label (a flat Index) is stringified as a whole
    instead of being iterated character by character.

    Parameters
    ----------
    columns : pandas.Index or pandas.MultiIndex
        Column index to flatten.

    Returns
    -------
    list of str
        One flat name per column, in the original order.
    """
    names = []
    for label in columns.to_flat_index():
        if isinstance(label, tuple):
            names.append("-".join(str(level) for level in label))
        else:
            # Flat index: the label is already a single value.
            names.append(str(label))
    return names

# Feature engineering

Model 0 value

In [13]:
# Values of ensemble member 0, reshaped wide: one row per forecast time and one
# column per (model, lead time) pair.
df_model_0 = (
    df_models.loc[df_models["ens_num"] == 0]
    .drop(columns="ens_num")
    .rename(columns={"value": "model_0_value"})
    .pivot(
        index=["ForecastDateTime"],
        columns=["model_name", "TimeDelta_h"],
        values=["model_0_value"],
    )
)
# Collapse the multi-level column labels into flat "value-model-lead" strings.
df_model_0.columns = column_name(df_model_0.columns)

In [14]:
# Left-join the member-0 features on the forecast-time index, then free the intermediate.
df_final = pd.merge(df_final, df_model_0, how="left", left_index=True, right_index=True)
del df_model_0

Ensemble model std

In [15]:
# Ensemble spread: sample standard deviation of `value` over all members other
# than member 0, per model, forecast time and lead time.
# The built-in groupby `std()` replaces a per-group Python lambda passed to
# `apply`; both compute the default (ddof=1) standard deviation, but the
# built-in runs vectorised. Naming the Series directly also removes the
# to_frame + inplace rename step.
df_models_ens_std = (
    df_models.loc[df_models["ens_num"] != 0]
    .groupby(["model_name", "ForecastDateTime", "TimeDelta_h"])["value"]
    .std()
    .rename("ens_std")
    .reset_index()
    .pivot(
        index=["ForecastDateTime"],
        columns=["model_name", "TimeDelta_h"],
        values=["ens_std"],
    )
)
# Collapse the multi-level column labels into flat strings.
df_models_ens_std.columns = column_name(df_models_ens_std.columns)

In [16]:
# Left-join the ensemble-spread features on the forecast-time index, then free the intermediate.
df_final = pd.merge(df_final, df_models_ens_std, how="left", left_index=True, right_index=True)
del df_models_ens_std

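MAE: mean absolute error of the ensemble members (`ens_num != 0`) against the member-0 value at `TimeDelta_h == 0` of the same forecast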

In [17]:
# All members other than member 0, keyed by (forecast time, model) for the join below.
# Non-inplace calls are used throughout so that no frame obtained by indexing
# is mutated in place (avoids SettingWithCopyWarning).
df_models_ens_members = (
    df_models.loc[df_models["ens_num"] != 0]
    .set_index(["ForecastDateTime", "model_name"])
)

# Reference value per (forecast time, model): the member-0 value at lead time 0.
df_target_value = (
    df_models.loc[(df_models["ens_num"] == 0) & (df_models["TimeDelta_h"] == 0)]
    .set_index(["ForecastDateTime", "model_name"])[["value"]]
    .rename(columns={"value": "target"})
)

# Attach the reference to every member row; rows without a reference get NaN.
df_models_mae = df_models_ens_members.merge(df_target_value, left_index=True, right_index=True, how="left")
del df_target_value
del df_models_ens_members

# Mean absolute error per (forecast time, model, lead time). The absolute error
# is computed once for the whole frame and averaged with the built-in groupby
# mean instead of a frame-wise `apply` lambda, which is slow and relies on
# deprecated grouping-column behaviour in recent pandas.
df_models_mae = (
    df_models_mae.reset_index()
    .assign(mae=lambda frame: (frame["value"] - frame["target"]).abs())
    .groupby(["ForecastDateTime", "model_name", "TimeDelta_h"])["mae"]
    .mean()
    .reset_index()
    .pivot(
        index=["ForecastDateTime"],
        columns=["model_name", "TimeDelta_h"],
        values=["mae"],
    )
)
# Collapse the multi-level column labels into flat strings.
df_models_mae.columns = column_name(df_models_mae.columns)

In [18]:
# Left-join the MAE features on the forecast-time index, then free the intermediate.
df_final = pd.merge(df_final, df_models_mae, how="left", left_index=True, right_index=True)
del df_models_mae

# Final dataframe

In [19]:
# Persist the assembled feature table to HDF5 under the key "data".
df_final.to_hdf(
    path_or_buf='data/data.h5',
    key="data",
)