# ELP EU

In [None]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.ensemble import RandomForestRegressor

from settings import TARGET, COLUMNS_TO_DROP, LAGS, DIFFS, N_OUTER_SPLITS, N_INNER_SPLITS, TEST_SIZE
from features import add_time_features, add_lags, add_diffs
from metrics import get_scores, add_scores_to_dict
from plots import plot_data_split, plot_prediction
from clean_data import get_elp_eu_orders_dataframe

# Notebook-wide plot styling: seaborn's 'whitegrid' theme first, then override
# the grid colour it sets with a much lighter grey.
sns.set_style('whitegrid')
matplotlib.rcParams.update({'grid.color': '#eee'})

# Current seaborn colour palette, kept under a notebook-level name.
color_palette = sns.color_palette()

In [None]:
# Load the data frame returned by get_elp_eu_orders_dataframe (imported from
# clean_data).
# NOTE(review): other cells read df.index.day / .month / .year and call
# .resample('ME') on slices of it, so the index is presumably a
# DatetimeIndex — confirm in clean_data.
df = get_elp_eu_orders_dataframe()
# Trailing bare expression: the notebook renders the first rows as cell output.
df.head()

#### Initial data exploration

In [None]:
def _thousands(value, _position):
    """Format a tick value as an integer with comma thousands separators."""
    return format(int(value), ',')


# One figure per calendar component of the index. The x values are that
# component (e.g. the month number) for every row, so seaborn's lineplot
# aggregates all TARGET values sharing the same x value into one point.
for word in ('day', 'day_of_week', 'month', 'year'):
    fig, ax = plt.subplots(figsize=(12, 3))
    sns.lineplot(df, x=getattr(df.index, word), y=df[TARGET], ax=ax)

    ax.set_xlabel('Date')
    ax.set_ylabel('$', rotation=0, labelpad=16)
    # A fresh formatter per axis: formatter objects are bound to the axis
    # they are installed on.
    ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(_thousands))
    ax.set_title(f'Mean value of sum of orders for each day, aggregated by {word}')

    plt.show()

#### Feature engineering

In [None]:
# The feature helpers are called for their side effects only (return values
# are discarded), so they presumably add columns to df in place — verify in
# features.py.
add_time_features(df)
# Lag and difference features are built from the target and the order count,
# using the lag/diff settings imported from settings.
# NOTE(review): if add_diffs uses the current row's TARGET value, the
# resulting columns would leak the target into the features — confirm.
add_lags(df, lags=LAGS, columns=[TARGET, 'count_of_orders'])
add_diffs(df, diffs=DIFFS, columns=[TARGET, 'count_of_orders'])
# Drop, in place, every row that has a missing value in any column
# (presumably the leading rows without enough history for the lags/diffs —
# confirm no other column contributes NaNs).
df.dropna(inplace=True)

#### Data split for nested cross-validation

In [None]:
# Features: everything that remains after removing COLUMNS_TO_DROP.
X = df.drop(COLUMNS_TO_DROP, axis=1)
# Target: kept as a one-column DataFrame (list selector), not a Series.
y = df.loc[:, [TARGET]]

# Two time-ordered splitters sharing the same test window size: the outer one
# produces the test folds, the inner one is used for model selection.
outer_cv, inner_cv = (
    TimeSeriesSplit(n_splits=n_splits, test_size=TEST_SIZE)
    for n_splits in (N_OUTER_SPLITS, N_INNER_SPLITS)
)

In [None]:
# Visualise the nested split: one subplot row per (outer fold, inner fold)
# pair, all sharing the x axis.
fig, ax = plt.subplots(N_OUTER_SPLITS*N_INNER_SPLITS, 1, figsize=(20, 3*N_OUTER_SPLITS*N_INNER_SPLITS), sharex=True)

for outer_fold, (train_and_val_idx, test_idx) in enumerate(outer_cv.split(y)):
    train_and_val, test = y.iloc[train_and_val_idx], y.iloc[test_idx]
    for inner_fold, (train_idx, val_idx) in enumerate(inner_cv.split(train_and_val)):
        # The inner splitter yields positions relative to train_and_val, so
        # index that frame. Indexing y directly only gives the same rows as
        # long as the outer training window starts at row 0 of y.
        train, val = train_and_val.iloc[train_idx], train_and_val.iloc[val_idx]
        # Row-major layout: all inner folds of outer fold 0 first, then fold 1, ...
        ax_idx = outer_fold * N_INNER_SPLITS + inner_fold
        plot_data_split(ax[ax_idx], TARGET, train, val, test)

#### Time series nested cross-validation

In [None]:
# Per-fold score accumulators: one list per metric name. The monthly variant
# additionally tracks 'mape'.
metrics = {name: [] for name in ('rmse', 'mae', 'r2')}
metrics_monthly = {name: [] for name in ('rmse', 'mae', 'mape', 'r2')}

# Hyper-parameter candidates explored by the inner grid search
# (5 * 3 * 2 = 30 combinations).
param_grid = dict(
    n_estimators=[100, 200, 500, 1000, 2000],
    max_depth=[5, 10, None],
    min_samples_split=[2, 5],
)

# One subplot row per outer fold: `ax` for the daily predictions, `axm` for
# the monthly aggregates. `fig` is rebound by the second call, so afterwards
# it refers to the monthly figure only.
fig, ax = plt.subplots(nrows=N_OUTER_SPLITS, ncols=1, figsize=(20, 4 * N_OUTER_SPLITS), sharex=True)
fig, axm = plt.subplots(nrows=N_OUTER_SPLITS, ncols=1, figsize=(20, 4 * N_OUTER_SPLITS), sharex=True)

# Outer loop of the nested cross-validation: each outer fold holds out a test
# window, tunes a random forest on the preceding data with the inner splitter,
# and scores the tuned model on the held-out window.
for outer_fold, (train_idx, test_idx) in enumerate(outer_cv.split(y)):
    print(f'--- Outer fold {outer_fold+1} ---')

    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # Inner loop: exhaustive search over param_grid, validated on inner_cv
    # splits of the outer training data. Any failing fit raises immediately.
    # NOTE(review): both the forest and the search request all cores
    # (n_jobs=-1), which may oversubscribe the CPU — confirm this is intended.
    grid_search = GridSearchCV(
        RandomForestRegressor(n_jobs=-1, random_state=42),
        param_grid,
        scoring='neg_mean_squared_error',
        cv=inner_cv,
        n_jobs=-1,
        verbose=3,
        error_score='raise',
    )

    # The estimator is fitted on the target as a 1-D Series.
    grid_search.fit(X_train, y_train[TARGET])

    print('Parameters of the best model:')
    for key, value in grid_search.best_params_.items():
        print(f'{key}: {value}')
    print()

    # Predictions from the refitted best estimator, aligned to the test index
    # in a one-column frame named 'y'.
    y_pred = pd.DataFrame({'y': grid_search.predict(X_test)}, index=y_test.index)

    # Daily-level scores and plot for this fold.
    scores = get_scores(y_test, y_pred)
    add_scores_to_dict(metrics, scores)

    plot_prediction(ax[outer_fold], TARGET, y_train, y_test, y_pred, linewidth=0.5)

    # Collapse all three series to month-end means, then score and plot again
    # at monthly level. The daily frames are overwritten on purpose.
    y_train, y_test, y_pred = (
        daily.resample('ME').mean() for daily in (y_train, y_test, y_pred)
    )

    scores = get_scores(y_test, y_pred)
    add_scores_to_dict(metrics_monthly, scores)

    plot_prediction(axm[outer_fold], TARGET, y_train, y_test, y_pred)

In [None]:
# Column-wise mean of the collected daily-level scores, shown as cell output.
# NOTE(review): presumably one row per outer fold, assuming
# add_scores_to_dict appends one value per metric per call — confirm.
pd.DataFrame(metrics).mean()

In [None]:
# Column-wise mean of the collected monthly-level scores, shown as cell output.
# NOTE(review): presumably one row per outer fold, assuming
# add_scores_to_dict appends one value per metric per call — confirm.
pd.DataFrame(metrics_monthly).mean()