In [1]:
import numpy as np
import pandas as pd
import datetime as dt
from tqdm.notebook import tqdm
from IPython.display import display

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
# from sklearn.ensemble import BaggingRegressor
from sklearn.neural_network import MLPRegressor, MLPClassifier
from xgboost import XGBClassifier, XGBRegressor

from utils.utils import get_file_names, open_files, save_files
from utils.model_processing import *

import warnings
warnings.filterwarnings('ignore')

## First models

### 1. Load datasets

In [2]:
# --- Run configuration -------------------------------------------------------
# Flags selecting which sample start date is used (see the branches below).
with_eur = False
early = False
train_test_ratio = 0.7  # fraction of the START_DATE..END_DATE span that precedes TEST_DATE

if early:
    START_DATE = dt.datetime(1960, 2, 1)
elif with_eur:
    START_DATE = dt.datetime(2001, 2, 1)
else:
    START_DATE = dt.datetime(1992, 1, 1)

# NOTE(review): END_DATE depends on the day the notebook is run, so TEST_DATE
# (and with it the folder name built below) can change between runs.
END_DATE = dt.datetime.today() - pd.offsets.MonthEnd(2)
TEST_DATE = START_DATE + train_test_ratio * (END_DATE - START_DATE)
ROW_SHIFTS = (1, 2, 3, 6, 12)  # appear in the dataset file names as "_t-1_2_3_6_12"

# Default target column. The previous value 'FF_month_avg_diff' is not among the
# Y_test columns listed below this cell; the columns are named 'Fed_rate_*'.
predicted_feature = 'Fed_rate_month_avg_diff'

# --- Locate and load the train / test sets ------------------------------------
path = 'Models/startdate_{:s}_testdate_{:s}/'.format(START_DATE.strftime("%Y"),
                                                     TEST_DATE.strftime("%Y"))
shift_suffix = "_t-" + "_".join(str(shift) for shift in ROW_SHIFTS) + ".csv"
file_names = [stem + shift_suffix for stem in ("X_train", "Y_train", "X_test", "Y_test")]

# NOTE(review): the unpacking assumes open_files returns a dict whose values
# follow the order of `file_names` -- confirm in utils.utils.
X_train, Y_train, X_test, Y_test = open_files(path=path,
                                              file_names=file_names
                                             ).values()

# Parse the 'Date' column of both target frames as datetimes.
Y_train['Date'] = pd.to_datetime(Y_train['Date'])
Y_test['Date'] = pd.to_datetime(Y_test['Date'])
Y_test.dtypes

Fed_rate_month_avg                        float64
Fed_rate_spot_EOM                         float64
Fed_rate_month_avg_diff                   float64
Fed_rate_month_avg_pct_change             float64
Fed_rate_month_avg_diff_3_class           float64
Fed_rate_month_avg_diff_5_class           float64
Fed_rate_month_avg_diff_9_class           float64
Fed_rate_month_avg_trend                  float64
Date                               datetime64[ns]
dtype: object

### 2. Predicted feature, model type and related parameters

In [None]:
def define_cv_params(estimator, params_to_cv=None):
    """Return the hyper-parameter grid to cross-validate for ``estimator``.

    The grid is selected from the estimator's class name. The selected
    parameters and the number of resulting combinations are printed.

    Parameters
    ----------
    estimator : object
        Only ``estimator.__class__.__name__`` is read.
    params_to_cv : collection of str, optional
        When given, only the grid entries whose name is in this collection
        are kept (in the grid's own order). Names that have no grid entry
        are silently ignored.

    Returns
    -------
    dict
        Maps parameter name to its candidate values. Empty for the linear /
        logistic / LDA estimators and for estimators with no grid defined.

    Notes
    -----
    The gradient-boosting grid reads the notebook-level ``X_train`` to bound
    ``min_samples_split``, so ``X_train`` must exist before that branch runs.
    """
    name = estimator.__class__.__name__
    if name in ("LinearDiscriminantAnalysis", "LinearRegression", "LogisticRegression"):
        # Nothing to tune. (The LogisticRegression branch used to read
        # `cv_params == {}`, a comparison that left cv_params unassigned.)
        cv_params = {}
    elif name in ("RandomForestRegressor", "RandomForestClassifier"):
        # NOTE(review): 'auto' for max_features is deprecated / removed in
        # recent scikit-learn releases -- confirm against the installed version.
        cv_params = {
            'n_estimators': [int(x) for x in np.linspace(start=200, stop=2000, num=10)],  # Number of trees
            'max_features': [1, 'auto', 'sqrt', 'log2'],  # Number of features to consider at every split
            'max_depth': [int(x) for x in np.linspace(10, 110, num=6)] + [None],  # maximum depth of a tree
            'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required at each leaf node
        }
    elif name in ("GradientBoostingRegressor", "GradientBoostingClassifier"):
        cv_params = {
            'n_estimators': [int(x) for x in np.linspace(start=10, stop=110, num=6)],  # Number of trees
            'learning_rate': np.logspace(-3, -1, 3),  # weight of each tree in final estimator
            'max_depth': [3, 5, 7],  # maximum depth of a tree
            # min number of obs in node to be considered for a split (uses the global X_train)
            'min_samples_split': np.arange(2, int(np.sqrt(len(X_train.index))), 5),
            'max_features': ['sqrt', 'auto'],  # Number of features to consider at every split
            'min_samples_leaf': [1, 2, 3, 4],  # Minimum number of samples required at each leaf node
            'subsample': np.linspace(start=0.6, stop=1, num=5),
            'ccp_alpha': [0, 1e-4, 1e-2],  # Complexity parameter for tree pruning
        }
    elif name in ("XGBRegressor", "XGBClassifier"):
        cv_params = {
            'n_estimators': [int(x) for x in np.linspace(start=100, stop=1000, num=10)],  # Number of trees
            'eta': np.logspace(-2, -0.5, 3),  # weight of each tree in final estimator
            'max_depth': [3, 6, 10],  # maximum depth of a tree
            'min_child_weight': [1, 3, 6, 10],
            'subsample': np.linspace(start=0.6, stop=1, num=5),
#             'colsample_bytree': [0.3, 0.3, 0.7, 1],
            'gamma': [0, 1, 5, 10],
        }
    else:
        print("\n" + " No cross-validation params defined for this estimator yet ".center(120, "-"))
        cv_params = {}

    if params_to_cv is not None:
        # Keep only the requested parameters, preserving the grid's ordering.
        cv_params = {param: values for param, values in cv_params.items() if param in params_to_cv}

    print("\n" + " Params to be tested: ".center(120, "-"))
    for key, value in cv_params.items():
        print(key, value)

    # Size of the Cartesian product = product of the candidate-list lengths
    # (1 for an empty grid), computed without materialising every combination.
    n_combi = 1
    for values in cv_params.values():
        n_combi *= len(values)
    print("\n" + " # of possible combinations to be cross-validated: {:d}".format(n_combi))
    return cv_params

In [3]:
# Target column passed to cv_model and read from Y_test when scoring.
# Classification alternative: 'Fed_rate_month_avg_diff_5_class'
predicted_feature = 'Fed_rate_month_avg_diff'

# Estimator to cross-validate. Alternatives that can be swapped in:
#   Regression:
#     GradientBoostingRegressor(random_state=SEED)
#     RandomForestRegressor(random_state=SEED)
#   Classification:
#     LinearDiscriminantAnalysis()
#     RandomForestClassifier(random_state=SEED)
#     GradientBoostingClassifier(random_state=SEED)
# NOTE(review): SEED is not defined in this notebook; presumably it comes from
# `from utils.model_processing import *` -- confirm.
estimator = XGBRegressor(random_state=SEED)

# Names of the hyper-parameters to include in the cross-validation grid.
# 'subsample' is currently disabled; a minimal alternative is ['max_depth'].
params_to_cv = [
    'n_estimators',
    'eta',
    'max_depth',
    'min_child_weight',
    'gamma',
]

### 3. Cross-validation

In [None]:
# Run the cross-validation for `estimator` on `predicted_feature`.
# `results_dict` is read by the saving cells further down.
results_dict = cv_model(
    X_train,
    Y_train,
    X_test,
    Y_test,
    predicted_feature=predicted_feature,
    estimator=estimator,
    params_to_cv=params_to_cv,  # list of hyper-parameter names
    n_splits_cv=5,
    plot_feature_importance=True,
    plot_model_perf=True,
    plot_reconstitution=True,
    reconstitution_feature='Fed_rate_month_avg',
    reconstitution_type_of_diff='diff',  # None or 'diff' or 'pct'
)


----------------------------------------------- Feature to be predicted: -----------------------------------------------
Fed_rate_month_avg_diff

------------------------------------------------------ Estimator: ------------------------------------------------------
XGBRegressor

------------------------------------------------- Params to be tested: -------------------------------------------------
n_estimators [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
eta [0.01       0.05623413 0.31622777]
max_depth [3, 6, 10]
min_child_weight [1, 3, 6, 10]
gamma [0, 1, 5, 10]

 # of possible combinations to be cross-validated: 1440

Continue with this cv-params ? (y/n)  y

---------------------------------- 5-folds Cross-validation starting for XGBRegressor-----------------------------------
 Fitting 5 folds for each of 1440 candidates, totalling 7200 fits 

-------------------------------------------------- Folder #1 starting --------------------------------------------------


HBox(children=(FloatProgress(value=0.0, max=1440.0), HTML(value='')))


-------------------------------------------------- Folder #2 starting --------------------------------------------------


HBox(children=(FloatProgress(value=0.0, max=1440.0), HTML(value='')))


-------------------------------------------------- Folder #3 starting --------------------------------------------------


HBox(children=(FloatProgress(value=0.0, max=1440.0), HTML(value='')))


-------------------------------------------------- Folder #4 starting --------------------------------------------------


HBox(children=(FloatProgress(value=0.0, max=1440.0), HTML(value='')))

### 4. Save cv and model

In [None]:
# Persist the best fitted model; the file name embeds its score on the test set.
# (save_files is already imported in the first cell, so it is not re-imported here.)
# The score is taken from the same object that gets saved: the previous code
# called `model.score(...)`, but no `model` variable is defined in this notebook.
best_model = results_dict["best_model_fitted"]

file_name_struct = '{:s}_{:s}_score_{:.3f}'
file_name = file_name_struct.format(str(estimator.__class__.__name__),
                                    predicted_feature.replace("_", ""),
                                    best_model.score(X_test, Y_test[predicted_feature])
                                   ).replace(".", "")  # drop the decimal point of the score

save_files(path='Models/startdate_{:s}_testdate_{:s}/'.format(START_DATE.strftime("%Y"),
                                                             TEST_DATE.strftime("%Y")
                                                            ),
           files={file_name + ".pkl": best_model}
          )


In [None]:
# Persist the cross-validation results as CSV next to the datasets.
file_name_struct = 'cv_{:s}_{:s}'
file_name = file_name_struct.format(str(estimator.__class__.__name__),
                                    predicted_feature.replace("_", "")
                                   ).replace(".", "")

# The key must be a string: the bare name `results_cv` used previously is not
# defined anywhere in this notebook and raised NameError.
# NOTE(review): confirm that cv_model stores the CV table under "results_cv".
save_files(path='Models/startdate_{:s}_testdate_{:s}/'.format(START_DATE.strftime("%Y"),
                                                             TEST_DATE.strftime("%Y")
                                                            ),
           files={file_name + ".csv": results_dict["results_cv"]}
          )