# Imports

In [42]:
import numpy as np
import pandas as pd
np.random.seed(2121)

from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, LassoCV, Lasso
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [21]:
batpos = pd.read_csv('../data/batpos_full_preprocessed.csv')
baseline = pd.read_csv('../data/X_test_full_for_baseline.csv')

In [22]:
# set our index
for df in [batpos,baseline]:
    df.set_index('Unnamed: 0', inplace=True)
    df.index.name = None

In [23]:
# decompose into X and y, test and train
target = 'waa_pg'

y_train = batpos[batpos.is_train==1][target]
X_train = batpos[batpos.is_train==1]\
            [[c for c in batpos.columns if c != target]]

y_test = batpos[batpos.is_train==0][target]
X_test = batpos[batpos.is_train==0]\
            [[c for c in batpos.columns if c != target]]

# Baseline

Our baseline for modeling will be a more intuitive feeling one: the weighted average of the past three `waa_pg`. If asking how a player would perform, it would be reasonable to look closest at the last year, but also take the previous years into account with a marginal impact.

In [24]:
baseline = (baseline['waa_pg_1yr']*3 + baseline['waa_pg_2yr']*2\
                + baseline['waa_pg_3yr'])/6

In [26]:
# calculate baseline score
r2_score(y_test, baseline)

0.2958886964785129

**Note** Our baseline value is <span style='color:blue'><u>**29.59%**</u></span>

# Linear Regression

Start with a simple Linear Regression with no normalization.

In [27]:
# initialize
lr = LinearRegression()

# fit
lr.fit(X_train, y_train)

# score
lr.score(X_test,y_test)

0.39017970554780557

## Lasso Regularization

In [32]:
# initialize
lasso_cv = LassoCV()

# cv
scores = cross_val_score(lasso_cv, X_train, y_train)

# fit
lasso_cv.fit(X_train, y_train)

# best attributes
print(f'Best Alpha: {lasso_cv.alpha_:.4f}')
print(f'CV Score: {scores.mean():.4f}')

Best Alpha: 0.0003
CV Score: 0.3661


In [33]:
# generate features to select
lasso_cv.score(X_test,y_test)

0.3935131050271592

In [38]:
lasso_cols = [col for col, coef in zip(X_train.columns, lasso_cv.coef_) if coef]

In [39]:
lr.fit(X_train[lasso_cols], y_train)

LinearRegression()

In [41]:
lr.score(X_test[lasso_cols],y_test)

0.3914652994341199

In [43]:
lasso = Lasso(alpha=lasso_cv.alpha_)

In [44]:
lasso.fit(X_train, y_train)

Lasso(alpha=0.00029650414105832014)

In [45]:
lasso.score(X_test, y_test)

0.3935131050271592

# Random Forest

In [49]:
# initialize
rf = RandomForestRegressor()

# fit
rf.fit(X_train, y_train)

# score
rf.score(X_test,y_test)

0.3517548672173675

In [51]:
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV

In [63]:
param_grid = {'max_depth': [3,5,10],
              'min_samples_split': [2,5,10]}

rf = RandomForestRegressor(random_state=1212)

sh = HalvingGridSearchCV(rf, param_grid, cv=5,
                         factor=2, resource='n_estimators',
                         min_resources=50,
                         max_resources=1000, verbose=3)

sh.fit(X_train, y_train)

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 5
min_resources_: 50
max_resources_: 1000
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 9
n_resources: 50
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END max_depth=3, min_samples_split=2, n_estimators=50;, score=(train=0.390, test=0.266) total time=   0.5s
[CV 2/5] END max_depth=3, min_samples_split=2, n_estimators=50;, score=(train=0.369, test=0.356) total time=   0.5s
[CV 3/5] END max_depth=3, min_samples_split=2, n_estimators=50;, score=(train=0.382, test=0.333) total time=   0.5s
[CV 4/5] END max_depth=3, min_samples_split=2, n_estimators=50;, score=(train=0.377, test=0.348) total time=   0.5s
[CV 5/5] END max_depth=3, min_samples_split=2, n_estimators=50;, score=(train=0.376, test=0.324) total time=   0.5s
[CV 1/5] END max_depth=3, min_samples_split=5, n_estimators=50;, score=(train=0.390, test=0.266) total time=   0.5s
[CV 2/5] END max_depth=3, min_samples_split=5, 

[CV 3/5] END max_depth=5, min_samples_split=10, n_estimators=100;, score=(train=0.496, test=0.343) total time=   1.8s
[CV 4/5] END max_depth=5, min_samples_split=10, n_estimators=100;, score=(train=0.495, test=0.343) total time=   1.8s
[CV 5/5] END max_depth=5, min_samples_split=10, n_estimators=100;, score=(train=0.490, test=0.333) total time=   1.8s
----------
iter: 2
n_candidates: 3
n_resources: 200
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5] END max_depth=5, min_samples_split=2, n_estimators=200;, score=(train=0.501, test=0.266) total time=   3.7s
[CV 2/5] END max_depth=5, min_samples_split=2, n_estimators=200;, score=(train=0.490, test=0.374) total time=   3.7s
[CV 3/5] END max_depth=5, min_samples_split=2, n_estimators=200;, score=(train=0.499, test=0.345) total time=   3.6s
[CV 4/5] END max_depth=5, min_samples_split=2, n_estimators=200;, score=(train=0.498, test=0.346) total time=   3.7s
[CV 5/5] END max_depth=5, min_samples_split=2, n_estimators=200;, 

HalvingGridSearchCV(estimator=RandomForestRegressor(random_state=1212),
                    factor=2, max_resources=1000, min_resources=50,
                    param_grid={'max_depth': [3, 5, 10],
                                'min_samples_split': [2, 5, 10]},
                    resource='n_estimators', verbose=3)

In [64]:
sh.best_params_

{'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 400}

In [65]:
# initialize
rf = RandomForestRegressor(n_estimators=400,
                           max_depth=5,
                           min_samples_split=2)

# fit
rf.fit(X_train, y_train)

# score
rf.score(X_test,y_test)

0.3598561648249313

In [66]:
pd.DataFrame(rf.feature_importances_, index=X_train.columns).sort_values(0,ascending=False).head(10)

Unnamed: 0,0
waa_pg_1yr,0.498273
waa_pg_2yr,0.251163
waa_pg_3yr,0.043511
age,0.022621
runs_dp_pg_2yr,0.011742
runs_bat_pg_3yr,0.011106
runs_defense_pg_1yr,0.010519
runs_bat_pg_1yr,0.010134
runs_bat_pg_2yr,0.008247
runs_br_pg_1yr,0.008006


# SVM

In [68]:
# initialize
svr = SVR()

# fit
svr.fit(X_train, y_train)

# score
svr.score(X_test,y_test)

-0.02602472079240159

In [81]:
param_grid = {'epsilon': np.logspace(-3,-1,3),
              'C': np.logspace(-3,-1,3),
              'gamma': ['scale','auto']}

svr = SVR()

gs = GridSearchCV(svr, param_grid, cv=5,
                  verbose=3)

gs.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END C=0.001, epsilon=0.001, gamma=scale;, score=0.279 total time=   0.4s
[CV 2/5] END C=0.001, epsilon=0.001, gamma=scale;, score=0.337 total time=   0.4s
[CV 3/5] END C=0.001, epsilon=0.001, gamma=scale;, score=0.290 total time=   0.4s
[CV 4/5] END C=0.001, epsilon=0.001, gamma=scale;, score=0.311 total time=   0.4s
[CV 5/5] END C=0.001, epsilon=0.001, gamma=scale;, score=0.291 total time=   0.3s
[CV 1/5] END C=0.001, epsilon=0.001, gamma=auto;, score=0.261 total time=   0.3s
[CV 2/5] END C=0.001, epsilon=0.001, gamma=auto;, score=0.315 total time=   0.4s
[CV 3/5] END C=0.001, epsilon=0.001, gamma=auto;, score=0.269 total time=   0.3s
[CV 4/5] END C=0.001, epsilon=0.001, gamma=auto;, score=0.288 total time=   0.4s
[CV 5/5] END C=0.001, epsilon=0.001, gamma=auto;, score=0.271 total time=   0.3s
[CV 1/5] END C=0.001, epsilon=0.01, gamma=scale;, score=0.264 total time=   0.1s
[CV 2/5] END C=0.001, epsilon=0.01, gamma=s

GridSearchCV(cv=5, estimator=SVR(),
             param_grid={'C': array([0.001, 0.01 , 0.1  ]),
                         'epsilon': array([0.001, 0.01 , 0.1  ]),
                         'gamma': ['scale', 'auto']},
             verbose=3)

In [82]:
gs.best_params_

{'C': 0.01, 'epsilon': 0.01, 'gamma': 'scale'}

In [83]:
gs.best_score_

0.32589192875336814

In [84]:
# initialize
svr = SVR(C=0.01, epsilon = 0.01, gamma = 'scale')

# fit
svr.fit(X_train, y_train)

# score
svr.score(X_test,y_test)

0.34022866463091095