# Prepare a scikit-learn grid-search / cross-validation pipeline

### Load data

In [1]:
import os
from google.colab import drive

# Mount Google Drive (prompts for an OAuth code on first run), then switch to
# the project directory. The target is given as an absolute path under the
# mount point so that re-running this cell is idempotent; the relative form
# 'gdrive/My Drive/...' only resolves while the working directory is still the
# parent of the mount point (/content).
drive.mount('/content/gdrive')
os.chdir('/content/gdrive/My Drive/home/qminers')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
import matplotlib.pyplot as plt
import pandas as pd
from qminers.utils import prepare_data_v1

# Alias the versioned helper so the cells below call `prepare_data` and the
# version in use can be swapped in this one place.
prepare_data = prepare_data_v1

In [0]:
# Read the two raw CSV inputs; paths are relative to the current working directory.
financial_df = pd.read_csv('data/sp500_2010_01_01-2014_07_31.csv')
calendar_df = pd.read_csv('data/econ_calendar.csv')

# `prepare_data` returns two objects, unpacked here as the train/validation
# frame and the held-out test frame.
data_train_val, data_test = prepare_data(financial_df, calendar_df)

In [0]:
# Cast every column to float; the converted frame replaces the original one.
data_train_val = data_train_val.astype(float)

### Prepare a custom cross-validation split generator
- we want each cross-validation fold to cover a continuous time interval
- so that the validation data doesn't overlap with the training data

In [0]:
def split_generator(training_data, n_splits=5):
  """Split `training_data` into `n_splits` contiguous, non-overlapping folds.

  Rows are sliced positionally (`.iloc`) in their existing order, so each fold
  covers one continuous block of rows. The first `n_splits - 1` folds each hold
  `len(training_data) // n_splits` rows; the last fold additionally takes any
  remainder rows.

  Args:
    training_data: object exposing `.shape[0]` and positional `.iloc` slicing
      (e.g. a pandas DataFrame).
    n_splits: number of folds to produce; must be at least 1.

  Returns:
    A list of `n_splits` slices of `training_data`, in row order.

  Raises:
    ValueError: if `n_splits` is smaller than 1.
  """
  if n_splits < 1:
    raise ValueError('n_splits must be at least 1, got %r' % (n_splits,))

  n_rows = training_data.shape[0]
  fraction_size = n_rows // n_splits

  # Fold boundaries: 0, size, 2*size, ..., (n_splits-1)*size, n_rows.
  # Ending on n_rows (not n_splits*size) puts the remainder in the last fold.
  boundaries = [i * fraction_size for i in range(n_splits)]
  boundaries.append(n_rows)

  return [training_data.iloc[idx_start:idx_end]
          for idx_start, idx_end in zip(boundaries[:-1], boundaries[1:])]

## Automated training & validation

In [0]:
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler


# Placeholders only: the concrete feature scaler, regressor and target
# transformer are injected by the grid search through `param_grid` below.
scaler, lin_regressor, transformer = None, None, None

# Feature pipeline: rescale the inputs, then fit a linear model.
estimators = Pipeline([('rescale', scaler), ('lin_regressor', lin_regressor)])

# Wrap the pipeline so the target is passed through `transformer` as well.
regr = TransformedTargetRegressor(regressor=estimators,
                                   transformer=transformer)

# Keys address nested parameters: `regressor__<step>` swaps a whole pipeline
# step, `transformer` swaps the target transformer. 3 * 2 * 3 = 18 candidates.
param_grid = {
    'regressor__rescale': [MinMaxScaler(), RobustScaler(), StandardScaler()],
    'regressor__lin_regressor': [LinearRegression(), Ridge()],
    'transformer': [MinMaxScaler(), RobustScaler(), StandardScaler()]
}

grid_search = GridSearchCV(
    estimator=regr,
    param_grid=param_grid,
    cv=10,
    scoring=('r2', 'neg_mean_squared_error'),
    # Two scorers are evaluated; the best candidate is chosen (and refit) by R2.
    refit='r2',
    return_train_score=False)

In [111]:
# Features: every row except the last (all columns). Target: the last column
# from the second row onward. Row i is therefore paired with the last-column
# value of row i + 1, i.e. a one-step-ahead prediction.
grid_search.fit(data_train_val.iloc[:-1, :], data_train_val.iloc[1:, -1])

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=TransformedTargetRegressor(check_inverse=True, func=None, inverse_func=None,
              regressor=Pipeline(memory=None, steps=[('rescale', None), ('lin_regressor', None)]),
              transformer=None),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'regressor__rescale': [MinMaxScaler(copy=True, feature_range=(0, 1)), RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True), StandardScaler(copy=True, with_mean=True, with_std=True)], 'regressor__lin_regressor': [LinearRegression(copy_X=True,...entering=True,
       with_scaling=True), StandardScaler(copy=True, with_mean=True, with_std=True)]},
       pre_dispatch='2*n_jobs', refit='r2', return_train_score=False,
       scoring=('r2', 'neg_mean_squared_error'), verbose=0)

In [112]:
# Tabulate the cross-validation results and keep only the aggregate score
# columns: anything whose name mentions 'param', 'split' or 'time' is dropped.
results = pd.DataFrame(grid_search.cv_results_)
selected_columns = [
    column for column in results.columns
    if all(fragment not in column for fragment in ('param', 'split', 'time'))
]
results = results[selected_columns]
results

Unnamed: 0,mean_test_neg_mean_squared_error,mean_test_r2,rank_test_neg_mean_squared_error,rank_test_r2,std_test_neg_mean_squared_error,std_test_r2
0,-3263012000000000.0,0.162149,3,11,2065706000000000.0,0.24484
1,-3263012000000000.0,0.162149,2,10,2065706000000000.0,0.24484
2,-3263012000000000.0,0.162149,1,12,2065706000000000.0,0.24484
3,-3263012000000000.0,0.162149,8,16,2065706000000000.0,0.24484
4,-3263012000000000.0,0.162149,9,18,2065706000000000.0,0.24484
5,-3263012000000000.0,0.162149,4,17,2065706000000000.0,0.24484
6,-3263012000000000.0,0.162149,5,14,2065706000000000.0,0.24484
7,-3263012000000000.0,0.162149,7,15,2065706000000000.0,0.24484
8,-3263012000000000.0,0.162149,6,13,2065706000000000.0,0.24484
9,-3395644000000000.0,0.172103,17,2,2442510000000000.0,0.194064


In [113]:
# `best_index_` is the row of `cv_results_` (and so of `results`) for the
# winning candidate. Note the printed spread is HALF the standard deviation
# of the R2 score across folds, not the standard deviation itself.
best_row = grid_search.best_index_
print("Best parameter (CV score=%0.3f +/- %.2f):" % (results.loc[best_row]['mean_test_r2'], results.loc[best_row]['std_test_r2'] / 2))

Best parameter (CV score=0.172 +/- 0.10):


In [114]:
# Show the winning configuration (feature scaler, regressor, target transformer).
print(grid_search.best_estimator_)

TransformedTargetRegressor(check_inverse=True, func=None, inverse_func=None,
              regressor=Pipeline(memory=None,
     steps=[('rescale', MinMaxScaler(copy=True, feature_range=(0, 1))), ('lin_regressor', Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))]),
              transformer=StandardScaler(copy=True, with_mean=True, with_std=True))


## Evaluate the best estimator on the test set

In [125]:
from sklearn.metrics import mean_squared_error

# Same one-step-ahead pairing as used for training: features are all rows but
# the last, the target is the last column from the second row onward.
data_test_x, data_test_y = data_test.iloc[:-1, :], data_test.iloc[1:, -1]
# The estimator's own `score` is reported below as R2.
test_r2 = grid_search.best_estimator_.score(data_test_x, data_test_y)
print('Test R2:\t%.3f' % test_r2)

# MSE is computed on the estimator's predictions against the shifted target.
test_mse = mean_squared_error(data_test_y, grid_search.best_estimator_.predict(data_test_x))
print('Test MSE:\t%E' % test_mse)

Test R2:	-0.095
Test MSE:	1.490505E+15
