The overall idea is that we will first implement linear and tree-based models and try to ensemble them. We will then implement a recurrent NN and compare the two approaches.
Data normalisation, which is required for linear and NN models, will be done with an sklearn pipeline that includes both the model and the data transformation steps.

In [8]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import optuna

from src.ToyModel import ToyModel
from src.utilities import run_cv
from src.FeatureGenerator import FeatureGenerator

# CV data initialisation

In [3]:
# Feature generator (project class imported from src.FeatureGenerator)
feat_generator = FeatureGenerator()

# CV splitter: test_size=1 and at most 16 training units per fold; n_splits is left at its default
tscv = TimeSeriesSplit(test_size=1, max_train_size=16)

# Full feature table that every CV run below iterates over
features_df = feat_generator.generate_features()

# Column groups handed to run_cv. They are read from the generator only after
# generate_features() has been called, which keeps the original statement order.
cols_di = dict(
    index=feat_generator.index_cols,
    target=feat_generator.target_col,
    feats=feat_generator.shifted_cols + feat_generator.roll_cols,
)


# Baseline

In [3]:
# Baseline: the project's ToyModel, scored with the same CV splitter and column groups
# as the other models so the results are directly comparable.
toy_model = ToyModel()

# Run the baseline through the CV folds (verbose=2 prints the per-fold report)
toy_cv_res = run_cv(
    df=features_df,
    months_cv_split=tscv,
    model=toy_model,
    cols_di=cols_di,
    verbose=2,
)

Fold 0:
  Train months: [13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28], size: 120,494
  Test months: [29],   size: 7,039
  NRMSE:  1.0
  RMSE :  5.1

Fold 1:
  Train months: [14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29], size: 119,696
  Test months: [30],   size: 6,739
  NRMSE:  1.0
  RMSE :  4.4

Fold 2:
  Train months: [15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30], size: 118,557
  Test months: [31],   size: 5,669
  NRMSE:  1.0
  RMSE :  4.4

Fold 3:
  Train months: [16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31], size: 116,851
  Test months: [32],   size: 5,439
  NRMSE:  1.0
  RMSE :  4.9

Fold 4:
  Train months: [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32], size: 114,617
  Test months: [33],   size: 5,668
  NRMSE:  1.0
  RMSE :  3.9


------------------------------
RMSE mean: 4.6
NRMSE mean: 1.0


# Linear model

In [5]:
# Linear model: feature standardisation and the regression live in one sklearn Pipeline,
# so both steps are passed to run_cv as a single estimator.
# (ElasticNet() and Lasso(alpha=.005) were noted as alternative final estimators.)
lin_model = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("lin_model", LinearRegression()),
    ]
)

# Run the pipeline through the CV folds (verbose=2 prints the per-fold report)
lin_cv_res = run_cv(
    df=features_df,
    months_cv_split=tscv,
    model=lin_model,
    cols_di=cols_di,
    verbose=2,
)

Fold 0:
  Train months: [13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28], size: 120,494
  Test months: [29],   size: 7,039
  NRMSE:  0.5
  RMSE :  2.6

Fold 1:
  Train months: [14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29], size: 119,696
  Test months: [30],   size: 6,739
  NRMSE:  0.49
  RMSE :  2.2

Fold 2:
  Train months: [15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30], size: 118,557
  Test months: [31],   size: 5,669
  NRMSE:  0.4
  RMSE :  1.7

Fold 3:
  Train months: [16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31], size: 116,851
  Test months: [32],   size: 5,439
  NRMSE:  0.69
  RMSE :  3.4

Fold 4:
  Train months: [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32], size: 114,617
  Test months: [33],   size: 5,668
  NRMSE:  0.71
  RMSE :  2.7


------------------------------
RMSE mean: 2.5
NRMSE mean: 0.56


# Tree based

In [22]:
# Mean RMSE over the CV folds of the tree model.
# NOTE(review): in the visible notebook `tree_cv_res` is only assigned further down, in the cell that
# runs run_cv on the XGBRegressor, so this cell raises NameError under Restart & Run All. The execution
# counts (22 here, 29 there) also show it was run out of order; consider moving it below that cell.
np.mean(tree_cv_res['rmse'])

2.618315051373108

In [24]:
# Random 30% subsample of the feature rows, used as the data for the Optuna objective below
# (presumably to shorten each trial - confirm).
# random_state is fixed so the same rows are drawn on every run; without it the tuning scores
# change between kernel restarts and trials from different sessions cannot be compared.
df_for_optuna = features_df.sample(n=int(features_df.shape[0] * .3), random_state=42)

In [26]:
def objective(trial):
    """Optuna objective: score one XGBRegressor configuration with run_cv.

    Parameters
    ----------
    trial : object exposing ``suggest_int`` / ``suggest_float``
        Used to draw one value per hyper-parameter listed below.

    Returns
    -------
    The ``np.mean`` of the ``'rmse'`` entry of the run_cv result obtained on
    ``df_for_optuna`` (lower is better; the study minimises it).
    """
    # Hyper-parameters are drawn in this fixed order, one suggest call each.
    search_space = dict(
        max_depth=trial.suggest_int('max_depth', 3, 15),
        learning_rate=trial.suggest_float('learning_rate', 0.001, .6),
        n_estimators=trial.suggest_int('n_estimators', 30, 1500),
        gamma=trial.suggest_float('gamma', 0.01, 1.0),
        reg_alpha=trial.suggest_float('reg_alpha', 0.01, 7.0),
        reg_lambda=trial.suggest_float('reg_lambda', 0.1, 7.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.4, 1.0),
        subsample=trial.suggest_float('subsample', 0.4, 1.0),
    )
    candidate = XGBRegressor(**search_space)

    # A fresh splitter per trial, configured like the one used for the other models.
    month_splitter = TimeSeriesSplit(test_size=1, max_train_size=16)
    fold_scores = run_cv(
        df=df_for_optuna,
        months_cv_split=month_splitter,
        model=candidate,
        cols_di=cols_di,
        verbose=0,
    )

    return np.mean(fold_scores['rmse'])

# Minimise the objective's mean CV RMSE.
# The sampler is given a fixed seed so the sequence of suggested hyper-parameters is repeatable;
# TPESampler is the sampler Optuna uses by default for a single-objective study, so only the
# seeding changes, not the search algorithm.
# NOTE(review): the logged trials took roughly 2.5-4.5 minutes each, so 500 trials is a very long
# run (the recorded run was interrupted manually). TODO: consider passing `timeout=` to optimize().
study = optuna.create_study(
    direction='minimize',
    study_name='regression_2',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=500)
print('Best parameters', study.best_params)

[32m[I 2023-06-20 07:08:03,256][0m A new study created in memory with name: regression_2[0m
[32m[I 2023-06-20 07:12:26,268][0m Trial 0 finished with value: 7.027438311927469 and parameters: {'max_depth': 9, 'learning_rate': 0.5494108848714635, 'n_estimators': 1048, 'gamma': 0.7314309627568815, 'reg_alpha': 6.881351625760832, 'reg_lambda': 0.3041189132619926, 'colsample_bytree': 0.8623198670552571}. Best is trial 0 with value: 7.027438311927469.[0m
[32m[I 2023-06-20 07:14:49,778][0m Trial 1 finished with value: 3.2398158214906063 and parameters: {'max_depth': 10, 'learning_rate': 0.12253799810128614, 'n_estimators': 635, 'gamma': 0.7550611110782909, 'reg_alpha': 0.790196904393736, 'reg_lambda': 3.4934281786530064, 'colsample_bytree': 0.5640316358786255}. Best is trial 1 with value: 3.2398158214906063.[0m
[32m[I 2023-06-20 07:18:07,331][0m Trial 2 finished with value: 5.0647887837327605 and parameters: {'max_depth': 10, 'learning_rate': 0.4943436270535506, 'n_estimators': 819,

KeyboardInterrupt: 

In [29]:
# Same splitter configuration as for the other models
tscv = TimeSeriesSplit(test_size=1, max_train_size=16)

# Fixed XGBRegressor hyper-parameters for the full-data CV run
params = dict(
    max_depth=7,
    learning_rate=0.005298559699829036,
    n_estimators=300,
    gamma=0.7744615794844047,
    reg_alpha=4.7427786108127865,
    reg_lambda=3.9464923699087713,
    colsample_bytree=0.5556916732637939,
)

tree_model = XGBRegressor(**params)

# Run the tree model through the CV folds on the full feature table (verbose=2 prints the per-fold report)
tree_cv_res = run_cv(
    df=features_df,
    months_cv_split=tscv,
    model=tree_model,
    cols_di=cols_di,
    verbose=2,
)

Fold 0:
  Train months: [13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28], size: 120,633
  Test months: [29],   size: 6,921
  NRMSE:  0.49
  RMSE :  2.5

Fold 1:
  Train months: [14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29], size: 119,621
  Test months: [30],   size: 6,680
  NRMSE:  0.45
  RMSE :  2.0

Fold 2:
  Train months: [15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30], size: 118,304
  Test months: [31],   size: 5,830
  NRMSE:  0.42
  RMSE :  1.8

Fold 3:
  Train months: [16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31], size: 116,761
  Test months: [32],   size: 5,343
  NRMSE:  0.67
  RMSE :  3.3

Fold 4:
  Train months: [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32], size: 114,497
  Test months: [33],   size: 5,671
  NRMSE:  0.52
  RMSE :  2.0


------------------------------
RMSE mean: 2.3
NRMSE mean: 0.51


In [None]:
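# NOTE(review): looks like a pasted CV summary (it matches the XGBRegressor run above up to rounding);
# kept as comments so this code cell is valid Python.
# RMSE mean: 2.3
# NRMSE mean: 0.5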

# Recurrent NN