# Prepare a scikit-learn training & cross-validation pipeline

### load data

In [1]:
import os
from google.colab import drive

# Mount Google Drive and switch into the project directory.
# The target is given as an absolute path under the mount point so this cell
# can be re-run safely: a relative path only resolves on the first run, before
# the working directory has been changed.
drive.mount('/content/gdrive')
os.chdir('/content/gdrive/My Drive/home/qminers')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
import matplotlib.pyplot as plt
import pandas as pd
from qminers.utils import prepare_data_v1

# Bind the versioned helper to a version-neutral name; the data-loading cell
# below calls `prepare_data(...)`.
prepare_data = prepare_data_v1

In [0]:
# Read the two raw CSV inputs (price data and the economic calendar) ...
financial_df, calendar_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/sp500_2010_01_01-2014_07_31.csv',
                     'data/econ_calendar.csv')
)

# ... and let the helper turn them into a train/validation frame and a test frame.
data_train_val, data_test = prepare_data(financial_df, calendar_df)

In [0]:
# Cast every column to float in one vectorized call. This builds a new frame
# instead of assigning column-by-column into the existing one, so the cell is
# idempotent and cannot trigger chained-assignment warnings.
data_train_val = data_train_val.astype(float)

### prepare a custom cross-val split generator
- we want each cross-validation fold to cover a continuous time interval
- this makes sure the validation data doesn't overlap with the training data

In [0]:
def split_generator(training_data, n_splits=5):
  """Split `training_data` into `n_splits` contiguous, non-overlapping folds.

  Rows are sliced positionally (`.iloc`) in their existing order, so each fold
  is one uninterrupted run of rows. The first `n_splits - 1` folds each hold
  `len(training_data) // n_splits` rows; the last fold also takes the
  remainder, so the folds together cover every row exactly once.

  Args:
    training_data: object exposing `.shape[0]` and `.iloc` positional slicing
      (e.g. a pandas DataFrame).
    n_splits: number of folds to produce; must be at least 1.

  Returns:
    A list of `n_splits` slices of `training_data`, in row order. If there are
    fewer rows than `n_splits`, the leading folds are empty and the last fold
    holds all rows.

  Raises:
    ValueError: if `n_splits` is smaller than 1.
  """
  if n_splits < 1:
    raise ValueError("n_splits must be at least 1, got %r" % (n_splits,))

  n_rows = training_data.shape[0]
  fraction_size = n_rows // n_splits

  # Fold i starts at i * fraction_size; the final boundary is the end of the
  # data so the last fold absorbs any rows left over by the integer division.
  boundaries = [i * fraction_size for i in range(n_splits)]
  boundaries.append(n_rows)

  return [training_data.iloc[start:end]
          for start, end in zip(boundaries[:-1], boundaries[1:])]

In [8]:
# Preview the first rows of every column except the last one
# (later cells use the last column as the regression target).
data_train_val.head().iloc[:, :-1]

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,High Impact Expected,Low Impact Expected,Medium Impact Expected,Non-Economic
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2010-01-04,112.370003,113.389999,111.510002,113.330002,94.545494,6,33,15,0
2010-01-05,113.260002,113.68,112.849998,113.629997,94.795784,6,15,27,0
2010-01-06,113.519997,113.989998,113.43,113.709999,94.862526,21,21,12,3
2010-01-07,113.5,114.330002,113.18,114.190002,95.262955,18,9,21,0
2010-01-08,113.889999,114.620003,113.660004,114.57,95.579971,10,20,10,0


In [0]:
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso


In [56]:
import numpy as np
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.compose import TransformedTargetRegressor

# Feature pipeline: rescale the inputs, then fit an ordinary least-squares model.
estimators = [('rescale', MinMaxScaler()), ('regress', LinearRegression())]
pipe = Pipeline(estimators)

# NOTE(review): `transformer` was used below without being defined in any
# visible cell, so a fresh kernel raised NameError here. The original choice
# is unknown; MinMaxScaler mirrors the feature scaling step - confirm.
transformer = MinMaxScaler()

# Wrap the pipeline so the target is transformed before fitting and
# predictions are mapped back to the original scale.
regr = TransformedTargetRegressor(regressor=pipe,
                                  transformer=transformer)

# Pair each row's columns (X) with the NEXT row's last-column value (y):
# X drops the final row and y drops the first, so both have equal length.
scores = cross_validate(regr,
                        data_train_val.iloc[:-1, :],
                        data_train_val.iloc[1:, -1],
                        cv=10,
                        scoring=('r2', 'neg_mean_squared_error'),
                        return_train_score=True)
scores

{'fit_time': array([0.00916386, 0.0042336 , 0.00396919, 0.00380468, 0.00396013,
        0.00374627, 0.00371051, 0.00370169, 0.00371027, 0.00378084]),
 'score_time': array([0.00531697, 0.00354648, 0.00358963, 0.0036037 , 0.00378752,
        0.00335908, 0.00340104, 0.00342178, 0.00333643, 0.00350308]),
 'test_neg_mean_squared_error': array([-6.04232711e+15, -3.81087123e+15, -2.76883174e+15, -2.75995329e+15,
        -8.07440241e+15, -2.45549652e+15, -1.29686963e+15, -1.86826090e+15,
        -2.01909083e+15, -1.53401467e+15]),
 'test_r2': array([ 0.4112371 ,  0.35510909, -0.0309428 ,  0.26651839,  0.42453434,
         0.31821347,  0.09401754, -0.19533945, -0.29942077,  0.27756256]),
 'train_neg_mean_squared_error': array([-2.65257883e+15, -2.89697548e+15, -3.01515474e+15, -3.01045656e+15,
        -2.54440519e+15, -3.04403663e+15, -3.17260014e+15, -3.11398612e+15,
        -3.09344309e+15, -3.15564253e+15]),
 'train_r2': array([0.60077489, 0.57951399, 0.60351728, 0.60015572, 0.48920453,
    

In [0]:
from sklearn.model_selection import GridSearchCV


# Base pipeline: a rescaling step followed by a regression step. The grid
# search below swaps whole step objects in and out by step name.
scaler = MinMaxScaler()
lin_regressor = LinearRegression()
estimators = [('rescale', scaler), ('regress', lin_regressor)]
pipe = Pipeline(steps=estimators)

# Candidate objects for each named pipeline step.
param_grid = {
    'rescale': [MinMaxScaler(), RobustScaler(), StandardScaler()],
    'regress': [LinearRegression()],
}

# 10-fold search scored by R^2; refit=False means no final model is refitted
# on the full data after the search.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='r2',
    cv=10,
    refit=False,
)

In [65]:
# Run the search. X is every row but the last; y is the last column shifted by
# one row, so each sample is paired with the next row's target value.
grid_search.fit(
    X=data_train_val.iloc[:-1, :],
    y=data_train_val.iloc[1:, -1],
)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('rescale', MinMaxScaler(copy=True, feature_range=(0, 1))), ('regress', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'rescale': [MinMaxScaler(copy=True, feature_range=(0, 1)), RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True), StandardScaler(copy=True, with_mean=True, with_std=True)], 'regress': [LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)]},
       pre_dispatch='2*n_jobs', refit=False, return_train_score='warn',
       scoring='r2', verbose=0)

In [67]:
# Report the best cross-validated score found by the search (R^2, per the
# `scoring='r2'` setting). Only the score is printed here; the matching
# parameters are printed in the next cell.
print("Best parameter (CV score=%0.3f):" % grid_search.best_score_)

Best parameter (CV score=0.162):


In [69]:
# Show which step objects (scaler / regressor) produced the best score.
print(grid_search.best_params_)

{'regress': LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False), 'rescale': MinMaxScaler(copy=True, feature_range=(0, 1))}
