# Prepare a sci-kit grid-search / cross-val pipeline

### load data

In [1]:
import os
from google.colab import drive

drive.mount('/content/gdrive')
os.chdir('gdrive/My Drive/home/qminers')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import matplotlib.pyplot as plt
import pandas as pd
from qminers.utils import prepare_data_v1

prepare_data = prepare_data_v1

In [0]:
financial_df = pd.read_csv('data/sp500_2010_01_01-2014_07_31.csv')
calendar_df = pd.read_csv('data/econ_calendar.csv')

data_train_val, data_test = prepare_data(financial_df, calendar_df)

In [0]:
data_train_val = data_train_val.astype(float)

## Prepare baseline regressor

In [0]:
from sklearn.base import BaseEstimator, RegressorMixin

class BaselineRegressor(BaseEstimator, RegressorMixin):

    def __init__(self):
      pass

    def fit(self, X, y):
        return self

    def predict(self, X):
        # return the last column
        # which in our case predicts the volume in next time-step will be equal to the volume in current time-step
        return X[:, -1]

## Automated training & validation

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, BaggingRegressor, ExtraTreesRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV


scaler, lin_regressor, transformer = None, None, None

estimators = [('preprocessing', scaler), ('model', lin_regressor)]
estimators = Pipeline(estimators)

regr = TransformedTargetRegressor(regressor=estimators,
                                   transformer=transformer)

baseline = [BaselineRegressor()]
lin_models = [
    LinearRegression(),
    Ridge(),
    Lasso(),
    ElasticNet()
]
svrs = [
    SVR(gamma='auto', kernel='linear'),
    SVR(gamma='auto', kernel='rbf')
]
ensembles = [
    AdaBoostRegressor(),
    AdaBoostRegressor(base_estimator=SVR(gamma='auto', kernel='linear')), # takes long
    BaggingRegressor(),
    BaggingRegressor(base_estimator=SVR(gamma='auto', kernel='linear')), # CV score=0.190 +/- 0.08
    BaggingRegressor(base_estimator=SVR(gamma='auto', kernel='rbf')), # CV score=0.168 +/- 0.06
    ExtraTreesRegressor(n_estimators=100),
    RandomForestRegressor(n_estimators=100),
    GradientBoostingRegressor(),
]

models = baseline + lin_models + svrs + ensembles
# models = ensembles

param_grid = {
    'regressor__preprocessing': [MinMaxScaler(), RobustScaler(), StandardScaler()],
    'regressor__model': models,
    'transformer': [MinMaxScaler(), RobustScaler(), StandardScaler()]
}

grid_search = GridSearchCV(
    cv=10,
    scoring=('r2', 'neg_mean_squared_error'),
    refit='r2',
    estimator=regr,
    param_grid=param_grid,
#     return_train_score=True)
    return_train_score=False)

In [7]:
grid_search.fit(data_train_val.iloc[:-1, :], data_train_val.iloc[1:, -1])

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=TransformedTargetRegressor(check_inverse=True, func=None, inverse_func=None,
              regressor=Pipeline(memory=None, steps=[('preprocessing', None), ('model', None)]),
              transformer=None),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'regressor__preprocessing': [MinMaxScaler(copy=True, feature_range=(0, 1)), RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True), StandardScaler(copy=True, with_mean=True, with_std=True)], 'regressor__model': [BaselineRegressor(), LinearRegr...entering=True,
       with_scaling=True), StandardScaler(copy=True, with_mean=True, with_std=True)]},
       pre_dispatch='2*n_jobs', refit='r2', return_train_score=False,
       scoring=('r2', 'neg_mean_squared_error'), verbose=0)

In [13]:
results = pd.DataFrame(grid_search.cv_results_)
selected_columns = [x for x in results.columns if not any([y in x for y in ['split', 'time']])] 
results = results[selected_columns]
results

Unnamed: 0,mean_test_neg_mean_squared_error,mean_test_r2,param_regressor__model,param_regressor__preprocessing,param_transformer,params,rank_test_neg_mean_squared_error,rank_test_r2,std_test_neg_mean_squared_error,std_test_r2
0,-3.733753e+15,0.008618,BaselineRegressor(),"MinMaxScaler(copy=True, feature_range=(0, 1))","MinMaxScaler(copy=True, feature_range=(0, 1))","{'regressor__model': BaselineRegressor(), 'reg...",39,44,2.043626e+15,0.298496
1,-6.301007e+15,-0.421199,BaselineRegressor(),"MinMaxScaler(copy=True, feature_range=(0, 1))","RobustScaler(copy=True, quantile_range=(25.0, ...","{'regressor__model': BaselineRegressor(), 'reg...",52,51,5.970272e+15,0.505098
2,-6.762382e+15,-0.906453,BaselineRegressor(),"MinMaxScaler(copy=True, feature_range=(0, 1))","StandardScaler(copy=True, with_mean=True, with...","{'regressor__model': BaselineRegressor(), 'reg...",55,70,4.803416e+15,1.064273
3,-3.521386e+17,-75.224616,BaselineRegressor(),"RobustScaler(copy=True, quantile_range=(25.0, ...","MinMaxScaler(copy=True, feature_range=(0, 1))","{'regressor__model': BaselineRegressor(), 'reg...",71,71,3.490861e+17,28.207802
4,-3.734534e+15,0.007943,BaselineRegressor(),"RobustScaler(copy=True, quantile_range=(25.0, ...","RobustScaler(copy=True, quantile_range=(25.0, ...","{'regressor__model': BaselineRegressor(), 'reg...",40,46,2.036903e+15,0.298447
5,-3.934004e+15,-0.117342,BaselineRegressor(),"RobustScaler(copy=True, quantile_range=(25.0, ...","StandardScaler(copy=True, with_mean=True, with...","{'regressor__model': BaselineRegressor(), 'reg...",41,47,1.749314e+15,0.393720
6,-4.314417e+17,-113.885848,BaselineRegressor(),"StandardScaler(copy=True, with_mean=True, with...","MinMaxScaler(copy=True, feature_range=(0, 1))","{'regressor__model': BaselineRegressor(), 'reg...",72,72,3.514150e+17,61.277961
7,-4.509365e+15,-0.273075,BaselineRegressor(),"StandardScaler(copy=True, with_mean=True, with...","RobustScaler(copy=True, quantile_range=(25.0, ...","{'regressor__model': BaselineRegressor(), 'reg...",50,50,2.149024e+15,0.456401
8,-3.733461e+15,0.008568,BaselineRegressor(),"StandardScaler(copy=True, with_mean=True, with...","StandardScaler(copy=True, with_mean=True, with...","{'regressor__model': BaselineRegressor(), 'reg...",38,45,2.041611e+15,0.298507
9,-3.263012e+15,0.162149,"LinearRegression(copy_X=True, fit_intercept=Tr...","MinMaxScaler(copy=True, feature_range=(0, 1))","MinMaxScaler(copy=True, feature_range=(0, 1))",{'regressor__model': LinearRegression(copy_X=T...,3,25,2.065706e+15,0.244840


In [9]:
best_row = grid_search.best_index_
print("Best parameter (CV score=%0.3f +/- %.2f):" % (results.loc[best_row]['mean_test_r2'], results.loc[best_row]['std_test_r2'] / 2))

Best parameter (CV score=0.185 +/- 0.08):


In [12]:
for k in grid_search.cv_results_.keys():
  print(k)

mean_fit_time
std_fit_time
mean_score_time
std_score_time
param_regressor__model
param_regressor__preprocessing
param_transformer
params
split0_test_r2
split1_test_r2
split2_test_r2
split3_test_r2
split4_test_r2
split5_test_r2
split6_test_r2
split7_test_r2
split8_test_r2
split9_test_r2
mean_test_r2
std_test_r2
rank_test_r2
split0_test_neg_mean_squared_error
split1_test_neg_mean_squared_error
split2_test_neg_mean_squared_error
split3_test_neg_mean_squared_error
split4_test_neg_mean_squared_error
split5_test_neg_mean_squared_error
split6_test_neg_mean_squared_error
split7_test_neg_mean_squared_error
split8_test_neg_mean_squared_error
split9_test_neg_mean_squared_error
mean_test_neg_mean_squared_error
std_test_neg_mean_squared_error
rank_test_neg_mean_squared_error


In [0]:
results = pd.DataFrame(grid_search.cv_results_)
selected_columns = [x for x in results.columns if not any([y in x for y in ['param', 'split', 'time']])] 
results = results[selected_columns]
results

Unnamed: 0,mean_test_neg_mean_squared_error,mean_test_r2,rank_test_neg_mean_squared_error,rank_test_r2,std_test_neg_mean_squared_error,std_test_r2
0,-3.263012e+15,0.162149,4,27,2.065706e+15,0.244840
1,-3.263012e+15,0.162149,3,26,2.065706e+15,0.244840
2,-3.263012e+15,0.162149,2,28,2.065706e+15,0.244840
3,-3.263012e+15,0.162149,9,32,2.065706e+15,0.244840
4,-3.263012e+15,0.162149,10,34,2.065706e+15,0.244840
5,-3.263012e+15,0.162149,5,33,2.065706e+15,0.244840
6,-3.263012e+15,0.162149,6,30,2.065706e+15,0.244840
7,-3.263012e+15,0.162149,8,31,2.065706e+15,0.244840
8,-3.263012e+15,0.162149,7,29,2.065706e+15,0.244840
9,-3.395644e+15,0.172103,20,13,2.442510e+15,0.194064


In [0]:
best_row = grid_search.best_index_
print("Best parameter (CV score=%0.3f +/- %.2f):" % (results.loc[best_row]['mean_test_r2'], results.loc[best_row]['std_test_r2'] / 2))

Best parameter (CV score=0.183 +/- 0.08):


In [0]:
best_row = grid_search.best_index_
print("Best parameter (CV score=%0.3f +/- %.2f):" % (results.loc[best_row]['mean_test_r2'], results.loc[best_row]['std_test_r2'] / 2))

Best parameter (CV score=0.190 +/- 0.08):


In [0]:
best_row = grid_search.best_index_
print("Best parameter (CV score=%0.3f +/- %.2f):" % (results.loc[best_row]['mean_test_r2'], results.loc[best_row]['std_test_r2'] / 2))

Best parameter (CV score=0.187 +/- 0.09):


In [0]:
print(grid_search.best_estimator_)

TransformedTargetRegressor(check_inverse=True, func=None, inverse_func=None,
              regressor=Pipeline(memory=None,
     steps=[('preprocessing', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False))]),
              transformer=RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True))


In [0]:
print(grid_search.best_estimator_)

TransformedTargetRegressor(check_inverse=True, func=None, inverse_func=None,
              regressor=Pipeline(memory=None,
     steps=[('preprocessing', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', BaggingRegressor(base_estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, ver...mators=10, n_jobs=None, oob_score=False,
         random_state=None, verbose=0, warm_start=False))]),
              transformer=StandardScaler(copy=True, with_mean=True, with_std=True))


## Eval best estimator on test

In [0]:
from sklearn.metrics import mean_squared_error

data_test_x, data_test_y = data_test.iloc[:-1, :], data_test.iloc[1:, -1]
test_r2 = grid_search.best_estimator_.score(data_test_x, data_test_y)
print('Test R2:\t%.3f' % test_r2)

test_mse = mean_squared_error(data_test_y, grid_search.best_estimator_.predict(data_test_x))
print('Test MSE:\t%E' % test_mse)

Test R2:	-0.095
Test MSE:	1.490505E+15


In [0]:
from sklearn.metrics import mean_squared_error

data_test_x, data_test_y = data_test.iloc[:-1, :], data_test.iloc[1:, -1]
test_r2 = grid_search.best_estimator_.score(data_test_x, data_test_y)
print('Test R2:\t%.3f' % test_r2)

test_mse = mean_squared_error(data_test_y, grid_search.best_estimator_.predict(data_test_x))
print('Test MSE:\t%E' % test_mse)

Test R2:	-0.209
Test MSE:	1.646470E+15


  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
