# Imports

In [2]:
import pandas as pd
import numpy as np
np.random.seed(2121)

from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [3]:
# import files
rdf_ydf = pd.read_csv('../data/rdf_ydf_preprocessed.csv')

In [4]:
rdf_ydf.set_index('Unnamed: 0', inplace=True)
rdf_ydf.index.name = None

In [5]:
rdf_target = 'runs_total_pg'
rdf_cols = ['g_of', 'runs_bat_pg_1yr', 'runs_bat_pg_2yr', 'runs_bat_pg_3yr',
       'runs_br_pg_1yr', 'runs_dp_pg_2yr', 'runs_dp_pg_3yr',
       'runs_defense_pg_1yr', 'runs_defense_pg_2yr', 'runs_defense_pg_3yr',
       'runs_position_pg_1yr', 'runs_position_pg_2yr', 'g_c_share_1yr',
       'g_c_share_2yr', 'g_c_share_3yr', 'g_1b_share_2yr', 'g_cf_share_1yr',
       'g_of_3yr', 'g_dh_share_1yr', 'g_cof_share_1yr', 'pc_0', 'pc_1', 'pc_2',
       'pc_3', 'pc_4']

y_rdf_train = rdf_ydf[rdf_ydf.is_train==1][rdf_target]
X_rdf_train = rdf_ydf[rdf_ydf.is_train==1][rdf_cols]
y_rdf_test = rdf_ydf[rdf_ydf.is_train==0][rdf_target]
X_rdf_test = rdf_ydf[rdf_ydf.is_train==0][rdf_cols]


ydf_target = 'opprpg'
ydf_cols = ['opprpg_1yr', 'opprpg_2yr', 'opprpg_3yr']

y_ydf_train = rdf_ydf[rdf_ydf.is_train==1][ydf_target]
X_ydf_train = rdf_ydf[rdf_ydf.is_train==1][ydf_cols]
y_ydf_test = rdf_ydf[rdf_ydf.is_train==0][ydf_target]
X_ydf_test = rdf_ydf[rdf_ydf.is_train==0][ydf_cols]

In [83]:
odf_target = 'waa_pg'
odf_cols = [c for c in rdf_ydf.columns if c not in ydf_cols
                    if c not in rdf_cols
                    if c not in [rdf_target, ydf_target,odf_target,'is_train']]

y_odf_train = rdf_ydf[rdf_ydf.is_train==1][odf_target]
X_odf_train = rdf_ydf[rdf_ydf.is_train==1][odf_cols]
y_odf_test = rdf_ydf[rdf_ydf.is_train==0][odf_target]
X_odf_test = rdf_ydf[rdf_ydf.is_train==0][odf_cols]

# RDF Modeling

## Linear Regression

In [27]:
# initialize
lr_rdf = LinearRegression()

# fit
lr_rdf.fit(X_rdf_train, y_rdf_train)

# score
lr_rdf.score(X_rdf_test,y_rdf_test)

0.3729301744261698

### Lasso Regularization

In [9]:
# initialize
lasso_cv = LassoCV()

# cv
scores = cross_val_score(lasso_cv, X_rdf_train, y_rdf_train)

# fit
lasso_cv.fit(X_rdf_train, y_rdf_train)

# best attributes
print(f'Best Alpha: {lasso_cv.alpha_:.4f}')
print(f'CV Score: {scores.mean():.4f}')

Best Alpha: 0.0007
CV Score: 0.3526


In [10]:
# generate features to select
lasso_cv.score(X_rdf_test, y_rdf_test)

0.3710231633117884

In [11]:
lasso_cols = [col for col, coef in zip(X_rdf_train.columns, lasso_cv.coef_) if coef]

In [12]:
lr.fit(X_rdf_train[lasso_cols], y_rdf_train)

LinearRegression()

In [13]:
lr.score(X_rdf_test[lasso_cols],y_rdf_test)

0.37225301284549295

## Random Forest

In [14]:
# initialize
rf = RandomForestRegressor()

# fit
rf.fit(X_rdf_train, y_rdf_train)

# score
rf.score(X_rdf_test,y_rdf_test)

0.32941430249385495

In [15]:
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV

In [16]:
param_grid = {'max_depth': [3,5,10],
              'min_samples_split': [2,5,10]}

rf = RandomForestRegressor(random_state=1212)

sh = HalvingGridSearchCV(rf, param_grid, cv=5,
                         factor=2, resource='n_estimators',
                         min_resources=50,
                         max_resources=1000, verbose=3)

sh.fit(X_rdf_train, y_rdf_train)

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 5
min_resources_: 50
max_resources_: 1000
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 9
n_resources: 50
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END max_depth=3, min_samples_split=2, n_estimators=50;, score=(train=0.312, test=0.258) total time=   0.3s
[CV 2/5] END max_depth=3, min_samples_split=2, n_estimators=50;, score=(train=0.338, test=0.168) total time=   0.3s
[CV 3/5] END max_depth=3, min_samples_split=2, n_estimators=50;, score=(train=0.310, test=0.281) total time=   0.4s
[CV 4/5] END max_depth=3, min_samples_split=2, n_estimators=50;, score=(train=0.315, test=0.235) total time=   0.3s
[CV 5/5] END max_depth=3, min_samples_split=2, n_estimators=50;, score=(train=0.315, test=0.272) total time=   0.3s
[CV 1/5] END max_depth=3, min_samples_split=5, n_estimators=50;, score=(train=0.312, test=0.258) total time=   0.3s
[CV 2/5] END max_depth=3, min_samples_split=5, 

[CV 3/5] END max_depth=10, min_samples_split=10, n_estimators=100;, score=(train=0.746, test=0.351) total time=   1.6s
[CV 4/5] END max_depth=10, min_samples_split=10, n_estimators=100;, score=(train=0.750, test=0.342) total time=   1.6s
[CV 5/5] END max_depth=10, min_samples_split=10, n_estimators=100;, score=(train=0.754, test=0.315) total time=   1.6s
----------
iter: 2
n_candidates: 3
n_resources: 200
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5] END max_depth=10, min_samples_split=5, n_estimators=200;, score=(train=0.792, test=0.309) total time=   3.5s
[CV 2/5] END max_depth=10, min_samples_split=5, n_estimators=200;, score=(train=0.813, test=0.214) total time=   3.6s
[CV 3/5] END max_depth=10, min_samples_split=5, n_estimators=200;, score=(train=0.794, test=0.353) total time=   3.6s
[CV 4/5] END max_depth=10, min_samples_split=5, n_estimators=200;, score=(train=0.792, test=0.337) total time=   3.3s
[CV 5/5] END max_depth=10, min_samples_split=5, n_estimator

HalvingGridSearchCV(estimator=RandomForestRegressor(random_state=1212),
                    factor=2, max_resources=1000, min_resources=50,
                    param_grid={'max_depth': [3, 5, 10],
                                'min_samples_split': [2, 5, 10]},
                    resource='n_estimators', verbose=3)

In [17]:
sh.best_params_

{'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 400}

In [18]:
# initialize
rf = RandomForestRegressor(n_estimators=400,
                           max_depth=10,
                           min_samples_split=10)

# fit
rf.fit(X_rdf_train, y_rdf_train)

# score
rf.score(X_rdf_test, y_rdf_test)

0.3410445229029072

In [19]:
pd.DataFrame(rf.feature_importances_, index=X_rdf_train.columns).sort_values(0,ascending=False).head(10)

Unnamed: 0,0
runs_bat_pg_1yr,0.2701
runs_bat_pg_2yr,0.098971
runs_defense_pg_2yr,0.081175
runs_defense_pg_1yr,0.07521
runs_bat_pg_3yr,0.048577
runs_br_pg_1yr,0.045247
runs_position_pg_1yr,0.043583
runs_position_pg_2yr,0.038433
runs_defense_pg_3yr,0.034535
pc_0,0.031558


## SVM

In [20]:
# initialize
svr = SVR()

# fit
svr.fit(X_rdf_train, y_rdf_train)

# score
svr.score(X_rdf_test,y_rdf_test)

0.23794974889437492

In [23]:
param_grid = {'epsilon': np.logspace(-2,0,3),
              'C': np.logspace(-2,0,3),
              'gamma': ['scale','auto']}

svr = SVR()

gs = GridSearchCV(svr, param_grid, cv=5,
                  verbose=3)

gs.fit(X_rdf_train, y_rdf_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END .C=0.01, epsilon=0.01, gamma=scale;, score=0.283 total time=   0.3s
[CV 2/5] END .C=0.01, epsilon=0.01, gamma=scale;, score=0.221 total time=   0.3s
[CV 3/5] END .C=0.01, epsilon=0.01, gamma=scale;, score=0.313 total time=   0.3s
[CV 4/5] END .C=0.01, epsilon=0.01, gamma=scale;, score=0.283 total time=   0.3s
[CV 5/5] END .C=0.01, epsilon=0.01, gamma=scale;, score=0.293 total time=   0.3s
[CV 1/5] END ..C=0.01, epsilon=0.01, gamma=auto;, score=0.245 total time=   0.3s
[CV 2/5] END ..C=0.01, epsilon=0.01, gamma=auto;, score=0.194 total time=   0.3s
[CV 3/5] END ..C=0.01, epsilon=0.01, gamma=auto;, score=0.271 total time=   0.3s
[CV 4/5] END ..C=0.01, epsilon=0.01, gamma=auto;, score=0.244 total time=   0.3s
[CV 5/5] END ..C=0.01, epsilon=0.01, gamma=auto;, score=0.260 total time=   0.3s
[CV 1/5] END ..C=0.01, epsilon=0.1, gamma=scale;, score=0.269 total time=   0.1s
[CV 2/5] END ..C=0.01, epsilon=0.1, gamma=scale;

GridSearchCV(cv=5, estimator=SVR(),
             param_grid={'C': array([0.01, 0.1 , 1.  ]),
                         'epsilon': array([0.01, 0.1 , 1.  ]),
                         'gamma': ['scale', 'auto']},
             verbose=3)

In [24]:
gs.best_params_

{'C': 0.1, 'epsilon': 0.1, 'gamma': 'scale'}

In [25]:
gs.best_score_

0.31977747172827514

In [26]:
# initialize
svr = SVR(C=0.1, epsilon = 0.1, gamma = 'scale')

# fit
svr.fit(X_rdf_train, y_rdf_train)

# score
svr.score(X_rdf_test,y_rdf_test)

0.31822964825614153

# YDF Modeling

## Linear Regression

In [30]:
# initialize
lr_ydf = LinearRegression()

# fit
lr_ydf.fit(X_ydf_train, y_ydf_train)

# score
lr_ydf.score(X_ydf_test,y_ydf_test)

0.6086594412093911

### Lasso Regularization

In [31]:
# initialize
lasso_cv = LassoCV()

# cv
scores = cross_val_score(lasso_cv, X_ydf_train, y_ydf_train)

# fit
lasso_cv.fit(X_ydf_train, y_ydf_train)

# best attributes
print(f'Best Alpha: {lasso_cv.alpha_:.4f}')
print(f'CV Score: {scores.mean():.4f}')

Best Alpha: 0.0004
CV Score: 0.6205


In [32]:
# generate features to select
lasso_cv.score(X_ydf_test, y_ydf_test)

0.6087043647058756

In [33]:
lasso_cols = [col for col, coef in zip(X_ydf_train.columns, lasso_cv.coef_) if coef]

In [34]:
lr.fit(X_ydf_train[lasso_cols], y_ydf_train)

LinearRegression()

In [35]:
lr.score(X_ydf_test[lasso_cols],y_ydf_test)

0.6086594412093911

## Random Forest

In [37]:
# initialize
rf = RandomForestRegressor()

# fit
rf.fit(X_ydf_train, y_ydf_train)

# score
rf.score(X_ydf_test,y_ydf_test)

0.9595523258778794

In [48]:
param_grid = {'max_depth': [3,5,10, None],
              'min_samples_split': [2,5,10],
              'n_estimators': [50,100,200]}

rf = RandomForestRegressor(random_state=1212)

sh = GridSearchCV(rf, param_grid, cv=5,
                  verbose=3)

sh.fit(X_ydf_train, y_ydf_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END max_depth=3, min_samples_split=2, n_estimators=50;, score=0.720 total time=   0.0s
[CV 2/5] END max_depth=3, min_samples_split=2, n_estimators=50;, score=0.750 total time=   0.0s
[CV 3/5] END max_depth=3, min_samples_split=2, n_estimators=50;, score=0.708 total time=   0.0s
[CV 4/5] END max_depth=3, min_samples_split=2, n_estimators=50;, score=0.709 total time=   0.0s
[CV 5/5] END max_depth=3, min_samples_split=2, n_estimators=50;, score=0.718 total time=   0.0s
[CV 1/5] END max_depth=3, min_samples_split=2, n_estimators=100;, score=0.722 total time=   0.1s
[CV 2/5] END max_depth=3, min_samples_split=2, n_estimators=100;, score=0.750 total time=   0.1s
[CV 3/5] END max_depth=3, min_samples_split=2, n_estimators=100;, score=0.711 total time=   0.1s
[CV 4/5] END max_depth=3, min_samples_split=2, n_estimators=100;, score=0.710 total time=   0.1s
[CV 5/5] END max_depth=3, min_samples_split=2, n_estimators=100;, scor

[CV 5/5] END max_depth=5, min_samples_split=10, n_estimators=100;, score=0.829 total time=   0.2s
[CV 1/5] END max_depth=5, min_samples_split=10, n_estimators=200;, score=0.859 total time=   0.4s
[CV 2/5] END max_depth=5, min_samples_split=10, n_estimators=200;, score=0.862 total time=   0.4s
[CV 3/5] END max_depth=5, min_samples_split=10, n_estimators=200;, score=0.837 total time=   0.4s
[CV 4/5] END max_depth=5, min_samples_split=10, n_estimators=200;, score=0.844 total time=   0.4s
[CV 5/5] END max_depth=5, min_samples_split=10, n_estimators=200;, score=0.829 total time=   0.4s
[CV 1/5] END max_depth=10, min_samples_split=2, n_estimators=50;, score=0.944 total time=   0.0s
[CV 2/5] END max_depth=10, min_samples_split=2, n_estimators=50;, score=0.949 total time=   0.1s
[CV 3/5] END max_depth=10, min_samples_split=2, n_estimators=50;, score=0.951 total time=   0.1s
[CV 4/5] END max_depth=10, min_samples_split=2, n_estimators=50;, score=0.949 total time=   0.1s
[CV 5/5] END max_depth=1

[CV 4/5] END max_depth=None, min_samples_split=10, n_estimators=50;, score=0.952 total time=   0.1s
[CV 5/5] END max_depth=None, min_samples_split=10, n_estimators=50;, score=0.954 total time=   0.0s
[CV 1/5] END max_depth=None, min_samples_split=10, n_estimators=100;, score=0.947 total time=   0.2s
[CV 2/5] END max_depth=None, min_samples_split=10, n_estimators=100;, score=0.951 total time=   0.2s
[CV 3/5] END max_depth=None, min_samples_split=10, n_estimators=100;, score=0.955 total time=   0.2s
[CV 4/5] END max_depth=None, min_samples_split=10, n_estimators=100;, score=0.951 total time=   0.2s
[CV 5/5] END max_depth=None, min_samples_split=10, n_estimators=100;, score=0.954 total time=   0.2s
[CV 1/5] END max_depth=None, min_samples_split=10, n_estimators=200;, score=0.947 total time=   0.5s
[CV 2/5] END max_depth=None, min_samples_split=10, n_estimators=200;, score=0.951 total time=   0.5s
[CV 3/5] END max_depth=None, min_samples_split=10, n_estimators=200;, score=0.956 total time=

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=1212),
             param_grid={'max_depth': [3, 5, 10, None],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [50, 100, 200]},
             verbose=3)

In [49]:
sh.best_params_

{'max_depth': None, 'min_samples_split': 10, 'n_estimators': 200}

In [50]:
# initialize
rf = RandomForestRegressor(n_estimators=200,
                           max_depth=None,
                           min_samples_split=10)

# fit
rf.fit(X_ydf_train, y_ydf_train)

# score
rf.score(X_ydf_test, y_ydf_test)

0.9594747553118195

## SVM

In [51]:
# initialize
svr = SVR()

# fit
svr.fit(X_ydf_train, y_ydf_train)

# score
svr.score(X_ydf_test,y_ydf_test)

0.728590464905441

In [57]:
param_grid = {'epsilon': np.logspace(-3,1,3),
              'C': np.logspace(1,3,3),
              'gamma': ['scale','auto']}

svr = SVR()

gs = GridSearchCV(svr, param_grid, cv=5,
                  verbose=3)

gs.fit(X_ydf_train, y_ydf_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END C=10.0, epsilon=0.001, gamma=scale;, score=0.726 total time=   0.6s
[CV 2/5] END C=10.0, epsilon=0.001, gamma=scale;, score=0.771 total time=   0.7s
[CV 3/5] END C=10.0, epsilon=0.001, gamma=scale;, score=0.725 total time=   0.8s
[CV 4/5] END C=10.0, epsilon=0.001, gamma=scale;, score=0.721 total time=   0.7s
[CV 5/5] END C=10.0, epsilon=0.001, gamma=scale;, score=0.766 total time=   0.8s
[CV 1/5] END .C=10.0, epsilon=0.001, gamma=auto;, score=0.728 total time=   0.6s
[CV 2/5] END .C=10.0, epsilon=0.001, gamma=auto;, score=0.771 total time=   0.7s
[CV 3/5] END .C=10.0, epsilon=0.001, gamma=auto;, score=0.725 total time=   0.8s
[CV 4/5] END .C=10.0, epsilon=0.001, gamma=auto;, score=0.721 total time=   0.7s
[CV 5/5] END .C=10.0, epsilon=0.001, gamma=auto;, score=0.766 total time=   0.8s
[CV 1/5] END ..C=10.0, epsilon=0.1, gamma=scale;, score=0.770 total time=   0.4s
[CV 2/5] END ..C=10.0, epsilon=0.1, gamma=scale;

GridSearchCV(cv=5, estimator=SVR(),
             param_grid={'C': array([  10.,  100., 1000.]),
                         'epsilon': array([1.e-03, 1.e-01, 1.e+01]),
                         'gamma': ['scale', 'auto']},
             verbose=3)

In [58]:
gs.best_params_

{'C': 100.0, 'epsilon': 0.001, 'gamma': 'scale'}

In [59]:
gs.best_score_

0.764522987906367

In [60]:
# initialize
svr = SVR(C=100, epsilon = 0.001, gamma = 'scale')

# fit
svr.fit(X_ydf_train, y_ydf_train)

# score
svr.score(X_ydf_test,y_ydf_test)

0.7899773689061964

# Combined Modeling

In [104]:
y_rdf_hat = pd.Series(lr_rdf.predict(X_rdf_test),
                      index=X_rdf_test.index,
                      name='runs_total_pg')

In [105]:
# initialize
rf = RandomForestRegressor()

# fit
rf.fit(X_ydf_train, y_ydf_train)

# score
y_ydf_hat = pd.Series(rf.predict(X_ydf_test),
                      index=X_ydf_test.index,
                      name='opprpg')

In [106]:
X_train =\
pd.merge(
    pd.merge(X_odf_train,y_rdf_train,
             how = 'inner',
             left_index=True, right_index=True),
    y_ydf_train,
    how = 'inner',
    left_index=True, right_index=True)

In [107]:
X_test =\
pd.merge(
    pd.merge(X_odf_test,y_rdf_hat,
             how = 'inner',
             left_index=True, right_index=True),
    y_ydf_hat,
    how = 'inner',
    left_index=True, right_index=True)

In [108]:
y_train, y_test = y_odf_train, y_odf_test

## Linear Regression

In [109]:
# initialize
lr = LinearRegression()

# fit
lr.fit(X_train, y_train)

# score
lr.score(X_test,y_test)

0.3720853814385181

### Lasso Regularization

In [110]:
# initialize
lasso_cv = LassoCV()

# cv
scores = cross_val_score(lasso_cv, X_train, y_train)

# fit
lasso_cv.fit(X_train, y_train)

# best attributes
print(f'Best Alpha: {lasso_cv.alpha_:.4f}')
print(f'CV Score: {scores.mean():.4f}')

Best Alpha: 0.0000
CV Score: 0.9923


In [112]:
# generate features to select
lasso_cv.score(X_test, y_test)

0.3726768110825699

In [113]:
lasso_cols = [col for col, coef in zip(X_train.columns, lasso_cv.coef_) if coef]

In [114]:
lr.fit(X_train[lasso_cols], y_train)

LinearRegression()

In [115]:
lr.score(X_test[lasso_cols],y_test)

0.3720853814385181

## Random Forest

In [116]:
# initialize
rf = RandomForestRegressor()

# fit
rf.fit(X_train, y_train)

# score
rf.score(X_test,y_test)

0.37230916707983397

In [118]:
param_grid = {'max_depth': [3,5,10, None],
              'min_samples_split': [2,5,10]}

rf = RandomForestRegressor(random_state=1212)

sh = HalvingGridSearchCV(rf, param_grid, cv=5,
                         factor=2, resource='n_estimators',
                         min_resources=50,
                         max_resources=1000, verbose=3)

sh.fit(X_train, y_train)

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 5
min_resources_: 50
max_resources_: 1000
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 12
n_resources: 50
Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END max_depth=3, min_samples_split=2, n_estimators=50;, score=(train=0.976, test=0.976) total time=   0.1s
[CV 2/5] END max_depth=3, min_samples_split=2, n_estimators=50;, score=(train=0.976, test=0.968) total time=   0.1s
[CV 3/5] END max_depth=3, min_samples_split=2, n_estimators=50;, score=(train=0.977, test=0.972) total time=   0.1s
[CV 4/5] END max_depth=3, min_samples_split=2, n_estimators=50;, score=(train=0.975, test=0.975) total time=   0.1s
[CV 5/5] END max_depth=3, min_samples_split=2, n_estimators=50;, score=(train=0.975, test=0.975) total time=   0.1s
[CV 1/5] END max_depth=3, min_samples_split=5, n_estimators=50;, score=(train=0.976, test=0.976) total time=   0.1s
[CV 2/5] END max_depth=3, min_samples_split=5

[CV 3/5] END max_depth=10, min_samples_split=10, n_estimators=100;, score=(train=0.998, test=0.994) total time=   0.7s
[CV 4/5] END max_depth=10, min_samples_split=10, n_estimators=100;, score=(train=0.998, test=0.995) total time=   0.7s
[CV 5/5] END max_depth=10, min_samples_split=10, n_estimators=100;, score=(train=0.998, test=0.995) total time=   0.7s
[CV 1/5] END max_depth=10, min_samples_split=5, n_estimators=100;, score=(train=0.999, test=0.995) total time=   0.8s
[CV 2/5] END max_depth=10, min_samples_split=5, n_estimators=100;, score=(train=0.999, test=0.995) total time=   0.7s
[CV 3/5] END max_depth=10, min_samples_split=5, n_estimators=100;, score=(train=0.999, test=0.995) total time=   0.8s
[CV 4/5] END max_depth=10, min_samples_split=5, n_estimators=100;, score=(train=0.999, test=0.995) total time=   0.8s
[CV 5/5] END max_depth=10, min_samples_split=5, n_estimators=100;, score=(train=0.999, test=0.995) total time=   0.7s
[CV 1/5] END max_depth=None, min_samples_split=5, n_e

HalvingGridSearchCV(estimator=RandomForestRegressor(random_state=1212),
                    factor=2, max_resources=1000, min_resources=50,
                    param_grid={'max_depth': [3, 5, 10, None],
                                'min_samples_split': [2, 5, 10]},
                    resource='n_estimators', verbose=3)

In [119]:
sh.best_params_

{'max_depth': None, 'min_samples_split': 2, 'n_estimators': 400}

In [120]:
# initialize
rf = RandomForestRegressor(n_estimators=400,
                           max_depth=None,
                           min_samples_split=2)

# fit
rf.fit(X_train, y_train)

# score
rf.score(X_test, y_test)

0.3720883744126777

In [121]:
pd.DataFrame(rf.feature_importances_, index=X_train.columns).sort_values(0,ascending=False).head(10)

Unnamed: 0,0
runs_total_pg,0.993725
opprpg,0.003342
year_id,0.001081
waa_pg_2yr,0.000519
waa_pg_3yr,0.000502
waa_pg_1yr,0.000495
age,0.000336


## SVM

In [122]:
# initialize
svr = SVR()

# fit
svr.fit(X_train, y_train)

# score
svr.score(X_test,y_test)

-0.052946710119773366

In [145]:
param_grid = {'epsilon': np.logspace(0,2,3),
              'C': np.logspace(-11,-9,3),
              'gamma': ['scale','auto']}

svr = SVR()

gs = GridSearchCV(svr, param_grid, cv=5,
                  verbose=3)

gs.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END C=1e-11, epsilon=1.0, gamma=scale;, score=-0.044 total time=   0.0s
[CV 2/5] END C=1e-11, epsilon=1.0, gamma=scale;, score=-0.074 total time=   0.0s
[CV 3/5] END C=1e-11, epsilon=1.0, gamma=scale;, score=-0.002 total time=   0.0s
[CV 4/5] END C=1e-11, epsilon=1.0, gamma=scale;, score=-0.059 total time=   0.0s
[CV 5/5] END C=1e-11, epsilon=1.0, gamma=scale;, score=-0.057 total time=   0.0s
[CV 1/5] END .C=1e-11, epsilon=1.0, gamma=auto;, score=-0.044 total time=   0.0s
[CV 2/5] END .C=1e-11, epsilon=1.0, gamma=auto;, score=-0.074 total time=   0.0s
[CV 3/5] END .C=1e-11, epsilon=1.0, gamma=auto;, score=-0.002 total time=   0.0s
[CV 4/5] END .C=1e-11, epsilon=1.0, gamma=auto;, score=-0.059 total time=   0.0s
[CV 5/5] END .C=1e-11, epsilon=1.0, gamma=auto;, score=-0.057 total time=   0.0s
[CV 1/5] END C=1e-11, epsilon=10.0, gamma=scale;, score=-0.044 total time=   0.0s
[CV 2/5] END C=1e-11, epsilon=10.0, gamma=scale

GridSearchCV(cv=5, estimator=SVR(),
             param_grid={'C': array([1.e-11, 1.e-10, 1.e-09]),
                         'epsilon': array([  1.,  10., 100.]),
                         'gamma': ['scale', 'auto']},
             verbose=3)

In [144]:
gs.best_params_

{'C': 1e-10, 'epsilon': 10.0, 'gamma': 'scale'}

In [147]:
gs.best_score_

-0.04709369427739065

In [146]:
# initialize
svr = SVR(C=1e-10, epsilon = 10, gamma = 'scale')

# fit
svr.fit(X_train, y_train)

# score
svr.score(X_test,y_test)

-0.05294671011975538