# ELP EU

In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor

from settings import TARGET, COLUMNS_TO_DROP, LAGS, DIFFS
from features import add_time_features, add_lags, add_diffs
from metrics import get_scores, add_scores_to_dict
from plots import plot_prediction
from clean_data import get_elp_eu_orders_dataframe

# Global plotting look: seaborn's white grid theme first, then soften the grid
# line colour (the override must come after set_style, which sets its own).
sns.set_style(style='whitegrid')
matplotlib.rcParams.update({'grid.color': '#eee'})

# Current seaborn colour cycle, kept for use by plotting code.
color_palette = sns.color_palette()

In [None]:
# Load the ELP EU orders data through the project helper in clean_data.
# Later cells read calendar attributes from df.index and resample on it,
# so the index is expected to be datetime-like.
df = get_elp_eu_orders_dataframe()

# Preview the first five rows (the notebook displays the last expression).
df.head(n=5)

#### Initial data exploration

In [None]:
# Plot TARGET against each calendar component of the index, one figure per
# component. seaborn's lineplot aggregates rows sharing the same x value and
# draws their mean, which is what the title refers to.
for word in ['day', 'day_of_week', 'month', 'year']:
    fig, ax = plt.subplots(1, 1, figsize=(12, 3))
    # getattr(df.index, word) reads e.g. df.index.month, so the index must be
    # datetime-like. Draw on the axes created above instead of relying on
    # pyplot's implicit "current axes".
    sns.lineplot(df, x=getattr(df.index, word), y=df[TARGET], ax=ax)
    # The x axis holds the calendar component, not a date, so label it with
    # the component name ('Day', 'Day of week', 'Month', 'Year').
    ax.set_xlabel(word.replace('_', ' ').capitalize())
    ax.set_ylabel('$', rotation=0, labelpad=16)
    # Show y ticks as integers with thousands separators.
    ax.get_yaxis().set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
    ax.set_title(f'Mean value of sum of orders for each day, aggregated by {word}')
    plt.show()

#### Feature engineering

In [None]:
# Columns from which lag and difference features are derived.
feature_sources = (TARGET, 'count_of_orders')

# The helpers' return values are not assigned, so they presumably modify df
# in place -- confirm in features.py.
add_time_features(df)
# A fresh list is passed to each call so the two calls never share state.
add_lags(df, columns=list(feature_sources), lags=LAGS)
add_diffs(df, columns=list(feature_sources), diffs=DIFFS)

#### Data split & training the model

In [None]:
# Names of the scores collected for every cross-validation fold.
metric_names = ('rmse', 'mae', 'mape', 'r2')

# One independent list per metric; `metrics` is filled from the daily
# predictions and `metrics_monthly` from the monthly-resampled ones.
metrics = {name: [] for name in metric_names}
metrics_monthly = {name: [] for name in metric_names}

# Time-ordered splits: each test fold holds 364 rows (52 * 7) and every
# training fold contains only rows that precede its test fold.
n_splits = 3
tscv = TimeSeriesSplit(test_size=364, n_splits=n_splits)

In [None]:
# Train and evaluate one RandomForestRegressor per time-series fold.
# For each fold the predictions are scored and plotted twice: at the original
# resolution (metrics / ax) and after monthly averaging
# (metrics_monthly / ax_monthly).
# One row of axes per fold; `fig` ends up bound to the monthly figure.
fig, ax = plt.subplots(n_splits, 1, figsize=(20, 4*n_splits), sharex=True)
fig, ax_monthly = plt.subplots(n_splits, 1, figsize=(20, 4*n_splits), sharex=True)

for i, (train_indices, test_indices) in enumerate(tscv.split(df)):
    train, test = df.iloc[train_indices], df.iloc[test_indices]

    # Features are everything except COLUMNS_TO_DROP; the target is kept as a
    # one-column DataFrame so it can be resampled and plotted later.
    X_train = train.drop(columns=COLUMNS_TO_DROP)
    y_train = train[[TARGET]]

    X_test = test.drop(columns=COLUMNS_TO_DROP)
    y_test = test[[TARGET]]

    # NOTE(review): no random_state is set, so scores differ between runs.
    model = RandomForestRegressor(n_estimators=5)
    # Fit on the 1-D target Series rather than the one-column DataFrame.
    model.fit(X_train, y_train[TARGET])

    y_pred = model.predict(X_test)

    # Wrap the raw prediction array so it shares y_test's index and column.
    y_pred = pd.DataFrame(y_pred,
                          columns=[TARGET],
                          index=y_test.index)

    scores = get_scores(y_test, y_pred)
    add_scores_to_dict(metrics, scores)

    plot_prediction(ax[i], TARGET, y_train, y_test, y_pred, linewidth=0.5)

    # Aggregate to month-end means for the monthly view ('ME' = month end).
    y_train = y_train.resample('ME').mean()
    y_test = y_test.resample('ME').mean()
    y_pred = y_pred.resample('ME').mean()

    # Record the monthly scores; the original appended the daily `scores`
    # here, so metrics_monthly merely duplicated metrics.
    scores_monthly = get_scores(y_test, y_pred)
    add_scores_to_dict(metrics_monthly, scores_monthly)

    plot_prediction(ax_monthly[i], TARGET, y_train, y_test, y_pred)