# Imports

In [1]:
import pandas as pd
import numpy as np
np.random.seed(2121)

from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score

from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV

In [2]:
# Read both inputs: the preprocessed modelling table and the frame that
# feeds the weighted-average baseline later in the notebook.
baseline = pd.read_csv('../data/X_test_rdf_ydf_for_baseline.csv')
rdf_ydf = pd.read_csv('../data/rdf_ydf_preprocessed.csv')

In [3]:
# Use the saved 'Unnamed: 0' column as the row index of both frames and drop
# the index name. The membership guard plus reassignment (instead of
# set_index(..., inplace=True)) keeps this cell safe to re-run: once the
# column has been promoted it no longer exists, and a second in-place call
# would raise KeyError.
if 'Unnamed: 0' in rdf_ydf.columns:
    rdf_ydf = rdf_ydf.set_index('Unnamed: 0')
rdf_ydf.index.name = None

if 'Unnamed: 0' in baseline.columns:
    baseline = baseline.set_index('Unnamed: 0')
baseline.index.name = None

In [4]:
# Boolean row selectors for the two halves of the is_train flag.
train_rows = rdf_ydf.is_train == 1
test_rows = rdf_ydf.is_train == 0

# RDF: predict runs_total_pg from the *_1yr/_2yr/_3yr run-component columns.
rdf_target = 'runs_total_pg'
rdf_cols = [
    'runs_bat_pg_1yr', 'runs_bat_pg_2yr', 'runs_bat_pg_3yr',
    'runs_br_pg_1yr', 'runs_br_pg_2yr', 'runs_br_pg_3yr',
    'runs_dp_pg_1yr', 'runs_dp_pg_2yr', 'runs_dp_pg_3yr',
    'runs_defense_pg_1yr', 'runs_defense_pg_2yr', 'runs_defense_pg_3yr',
    'runs_position_pg_1yr', 'runs_position_pg_2yr', 'runs_position_pg_3yr',
]

y_rdf_train = rdf_ydf.loc[train_rows, rdf_target]
X_rdf_train = rdf_ydf.loc[train_rows, rdf_cols]
y_rdf_test = rdf_ydf.loc[test_rows, rdf_target]
X_rdf_test = rdf_ydf.loc[test_rows, rdf_cols]


# YDF: predict opprpg from its own *_1yr/_2yr/_3yr columns.
ydf_target = 'opprpg'
ydf_cols = ['opprpg_1yr', 'opprpg_2yr', 'opprpg_3yr']

y_ydf_train = rdf_ydf.loc[train_rows, ydf_target]
X_ydf_train = rdf_ydf.loc[train_rows, ydf_cols]
y_ydf_test = rdf_ydf.loc[test_rows, ydf_target]
X_ydf_test = rdf_ydf.loc[test_rows, ydf_cols]

In [5]:
odf_target = 'waa_pg'

# The "other" features are every column that is not already an RDF/YDF
# feature, one of the three targets, or the is_train flag. Column order of
# rdf_ydf is preserved.
excluded = set(ydf_cols) | set(rdf_cols) | {rdf_target, ydf_target, odf_target, 'is_train'}
odf_cols = [c for c in rdf_ydf.columns if c not in excluded]

train_rows = rdf_ydf.is_train == 1
test_rows = rdf_ydf.is_train == 0

y_odf_train = rdf_ydf.loc[train_rows, odf_target]
X_odf_train = rdf_ydf.loc[train_rows, odf_cols]
y_odf_test = rdf_ydf.loc[test_rows, odf_target]
X_odf_test = rdf_ydf.loc[test_rows, odf_cols]

# RDF Modeling

## Linear Regression

In [6]:
# Ordinary least squares on all RDF features; the cell output is R^2 on the
# test rows.
lr_rdf = LinearRegression().fit(X_rdf_train, y_rdf_train)
lr_rdf.score(X_rdf_test, y_rdf_test)

0.3703784439562483

### Lasso Regularization

In [7]:
# Cross-validated score of an unfitted LassoCV on the RDF training data
# (cross_val_score works on clones, so a throwaway instance is enough).
scores = cross_val_score(LassoCV(), X_rdf_train, y_rdf_train)

# Separate fit on the full training set; its coefficients drive the feature
# selection in the next cell.
lasso_cv = LassoCV().fit(X_rdf_train, y_rdf_train)

# Report the selected alpha and the mean CV score.
print(f'Best Alpha: {lasso_cv.alpha_:.4f}')
print(f'CV Score: {scores.mean():.4f}')

Best Alpha: 0.0004
CV Score: 0.3533


In [8]:
# Keep the RDF features whose Lasso coefficient is non-zero.
rdf_lasso_cols = X_rdf_train.columns[lasso_cv.coef_ != 0].tolist()

In [9]:
# Linear regression restricted to the Lasso-selected RDF columns; this model
# supplies the RDF predictions used in the combined section.
lr_rdf_lasso = LinearRegression().fit(X_rdf_train[rdf_lasso_cols], y_rdf_train)
lr_rdf_lasso.score(X_rdf_test[rdf_lasso_cols], y_rdf_test)

0.37106854317360527

Lasso feature selection improves the model's test score slightly (0.3704 → 0.3711).

## Random Forest

In [10]:
# Random forest with library defaults. No random_state is passed here; the
# only seeding visible in this notebook is np.random.seed(2121) in the
# imports cell.
rf_rdf = RandomForestRegressor().fit(X_rdf_train, y_rdf_train)
rf_rdf.score(X_rdf_test, y_rdf_test)

0.32037187447786997

**Run this only if you want to wait**

In [11]:
# param_grid = {'max_depth': [3,5,10],
#               'min_samples_split': [2,5,10]}

# rf = RandomForestRegressor(random_state=1212)

# sh = HalvingGridSearchCV(rf, param_grid, cv=5,
#                          factor=2, resource='n_estimators',
#                          min_resources=50,
#                          max_resources=1000, verbose=3)

# sh.fit(X_rdf_train, y_rdf_train)

In [12]:
# sh.best_params_

Best params were {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 400}

In [13]:
# Refit the RDF forest with the parameters reported by the halving search.
rf_rdf = RandomForestRegressor(
    max_depth=10,
    min_samples_split=10,
    n_estimators=400,
).fit(X_rdf_train, y_rdf_train)
rf_rdf.score(X_rdf_test, y_rdf_test)

0.33135872990804993

Halving grid search improved the model score!

## SVM

In [14]:
# Support vector regression with library defaults on the RDF features.
svr_rdf = SVR().fit(X_rdf_train, y_rdf_train)
svr_rdf.score(X_rdf_test, y_rdf_test)

0.22743572519738087

**Only run this if you want to wait**

In [15]:
# param_grid = {'epsilon': np.logspace(-2,0,3),
#               'C': np.logspace(-2,0,3),
#               'gamma': ['scale','auto']}

# svr = SVR()

# gs = GridSearchCV(svr, param_grid, cv=5,
#                   verbose=3)

# gs.fit(X_rdf_train, y_rdf_train)

In [16]:
# gs.best_params_

Best Params were {'C': 0.1, 'epsilon': 0.1, 'gamma': 'scale'}

In [17]:
# Refit the RDF SVR with the parameters reported by the grid search.
svr_rdf = SVR(
    C=0.1,
    epsilon=0.1,
    gamma='scale',
).fit(X_rdf_train, y_rdf_train)
svr_rdf.score(X_rdf_test, y_rdf_test)

0.3282935992204479

Grid search improved our model score!

# YDF Modeling

## Linear Regression

In [18]:
# Ordinary least squares on the three YDF features; the cell output is R^2 on
# the test rows.
lr_ydf = LinearRegression().fit(X_ydf_train, y_ydf_train)
lr_ydf.score(X_ydf_test, y_ydf_test)

0.6086594412093911

### Lasso Regularization

In [19]:
# Cross-validated score of an unfitted LassoCV on the YDF training data
# (cross_val_score works on clones, so a throwaway instance is enough).
scores = cross_val_score(LassoCV(), X_ydf_train, y_ydf_train)

# Separate fit on the full training set; its coefficients drive the feature
# selection in the next cell. This rebinds the lasso_cv name used in the RDF
# section.
lasso_cv = LassoCV().fit(X_ydf_train, y_ydf_train)

# Report the selected alpha and the mean CV score.
print(f'Best Alpha: {lasso_cv.alpha_:.4f}')
print(f'CV Score: {scores.mean():.4f}')

Best Alpha: 0.0004
CV Score: 0.6205


In [20]:
# Keep the YDF features whose Lasso coefficient is non-zero.
ydf_lasso_cols = X_ydf_train.columns[lasso_cv.coef_ != 0].tolist()

In [21]:
# Fit a fresh estimator on the Lasso-selected columns instead of refitting
# lr_ydf in place. That keeps the full-feature model from the Linear
# Regression cell intact (so that cell's score can still be reproduced) and
# mirrors how lr_rdf_lasso is built in the RDF section.
lr_ydf_lasso = LinearRegression()
lr_ydf_lasso.fit(X_ydf_train[ydf_lasso_cols], y_ydf_train)

# R^2 on the test rows using the same column subset.
lr_ydf_lasso.score(X_ydf_test[ydf_lasso_cols], y_ydf_test)

0.6086594412093911

Lasso feature selection has no effect because it keeps all of the columns originally in the set

## Random Forest

In [22]:
# Random forest with library defaults on the YDF features. No random_state is
# passed here; the only seeding visible in this notebook is
# np.random.seed(2121) in the imports cell.
rf_ydf = RandomForestRegressor().fit(X_ydf_train, y_ydf_train)
rf_ydf.score(X_ydf_test, y_ydf_test)

0.9603664051660922

**Run this if you want to wait**

In [23]:
# param_grid = {'max_depth': [3,5,10, None],
#               'min_samples_split': [2,5,10],
#               'n_estimators': [50,100,200]}

# rf = RandomForestRegressor(random_state=1212)

# sh = GridSearchCV(rf, param_grid, cv=5,
#                   verbose=3)

# sh.fit(X_ydf_train, y_ydf_train)

In [24]:
# sh.best_params_

Best params were {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 200}

In [25]:
# Refit the YDF forest with the parameters reported by the grid search. This
# is the model whose predictions feed the combined section.
rf_ydf = RandomForestRegressor(
    max_depth=None,
    min_samples_split=10,
    n_estimators=200,
).fit(X_ydf_train, y_ydf_train)
rf_ydf.score(X_ydf_test, y_ydf_test)

0.9592851494756717

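The grid search (a plain `GridSearchCV` here, not a halving search) did not improve the model: the tuned forest scores marginally lower on the test set than the default one (0.9593 vs. 0.9604).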

## SVM

In [26]:
# Support vector regression with library defaults on the YDF features.
svr_ydf = SVR().fit(X_ydf_train, y_ydf_train)
svr_ydf.score(X_ydf_test, y_ydf_test)

0.728590464905441

**Run this if you want to wait**

In [27]:
# param_grid = {'epsilon': np.logspace(-3,1,3),
#               'C': np.logspace(1,3,3),
#               'gamma': ['scale','auto']}

# svr = SVR()

# gs = GridSearchCV(svr, param_grid, cv=5,
#                   verbose=3)

# gs.fit(X_ydf_train, y_ydf_train)

In [28]:
# gs.best_params_

Best params were {'C': 100.0, 'epsilon': 0.001, 'gamma': 'scale'}

In [29]:
# Refit the YDF SVR with the parameters reported by the grid search.
svr_ydf = SVR(
    C=100,
    epsilon=0.001,
    gamma='scale',
).fit(X_ydf_train, y_ydf_train)
svr_ydf.score(X_ydf_test, y_ydf_test)

0.7899773689061964

Grid search improved the model!

# Combined Modeling

## Baseline

In [30]:
# Naive waa_pg prediction: weighted average of the three waa_pg history
# columns with weights 3:2:1 (the 1yr column weighted most).
# NOTE(review): this rebinds `baseline` from the loaded frame to a Series, so
# the cell cannot be run a second time without reloading the CSV.
baseline = (
    baseline['waa_pg_1yr'] * 3
    + baseline['waa_pg_2yr'] * 2
    + baseline['waa_pg_3yr']
) / 6

In [31]:
# r2_score pairs values by position, not by index label, so align the
# baseline Series to the test target's index before scoring. A label missing
# from the baseline becomes NaN, which makes r2_score raise instead of
# silently scoring mismatched rows.
r2_score(y_odf_test, baseline.reindex(y_odf_test.index))

0.3094563793548818

Interestingly, the baseline is higher for this approach (about 31%); the Full Approach's baseline was 29%.

## Modeling

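Now combine the predictions from the best RDF and YDF models with the remaining features to build a model for `waa_pg`. Note that the training matrix uses the actual `runs_total_pg` and `opprpg` values, while the test matrix uses the models' predictions of them.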

In [32]:

# RDF predictions for the test rows from the Lasso-selected linear model,
# wrapped as a Series on the test index. The name matches the column that
# y_rdf_train contributes to the training matrix.
y_rdf_hat = pd.Series(
    data=lr_rdf_lasso.predict(X_rdf_test[rdf_lasso_cols]),
    name='runs_total_pg',
    index=X_rdf_test.index,
)

In [33]:
# YDF predictions for the test rows from the tuned random forest, wrapped as
# a Series on the test index. The name matches the column that y_ydf_train
# contributes to the training matrix.
y_ydf_hat = pd.Series(
    data=rf_ydf.predict(X_ydf_test),
    name='opprpg',
    index=X_ydf_test.index,
)

In [34]:
# Training matrix: the remaining ("odf") features plus the actual RDF and YDF
# target values, inner-joined on the row index.
# NOTE(review): training uses the true runs_total_pg / opprpg values while the
# test matrix uses model predictions of them, so the two columns are not
# distributed the same way in train and test. Consider out-of-fold
# predictions for the training rows.
X_train = (
    X_odf_train
    .join(y_rdf_train, how='inner')
    .join(y_ydf_train, how='inner')
)

In [35]:
# Test matrix: the remaining ("odf") features plus the RDF and YDF model
# predictions, inner-joined on the row index. Column names and order follow
# the training matrix because the prediction Series carry the target names.
X_test = (
    X_odf_test
    .join(y_rdf_hat, how='inner')
    .join(y_ydf_hat, how='inner')
)

In [36]:
# The combined model predicts waa_pg, so reuse the odf targets as-is.
y_train = y_odf_train
y_test = y_odf_test

## Linear Regression

In [37]:
# Ordinary least squares on the combined feature matrix; the cell output is
# R^2 on the test rows.
lr = LinearRegression().fit(X_train, y_train)
lr.score(X_test, y_test)

0.3696392923964582

### Lasso Regularization

In [38]:
# Cross-validated score of an unfitted LassoCV on the combined training data
# (cross_val_score works on clones, so a throwaway instance is enough).
scores = cross_val_score(LassoCV(), X_train, y_train)

# Separate fit on the full training set; its coefficients drive the feature
# selection below. This rebinds the lasso_cv name used in earlier sections.
lasso_cv = LassoCV().fit(X_train, y_train)

# Report the selected alpha and the mean CV score.
print(f'Best Alpha: {lasso_cv.alpha_:.4f}')
print(f'CV Score: {scores.mean():.4f}')

Best Alpha: 0.0000
CV Score: 0.9923


In [39]:
# R^2 of the fitted LassoCV on the test matrix, computed explicitly from its
# predictions.
r2_score(y_test, lasso_cv.predict(X_test))

0.3704104155015663

In [40]:
# Keep the combined features whose Lasso coefficient is non-zero.
lasso_cols = X_train.columns[lasso_cv.coef_ != 0].tolist()

In [41]:
# Fit a fresh estimator on the Lasso-selected columns instead of refitting
# `lr` in place. That keeps the full-feature model from the Linear Regression
# cell intact (so that cell's score can still be reproduced) and mirrors how
# lr_rdf_lasso is built in the RDF section.
lr_lasso = LinearRegression()
lr_lasso.fit(X_train[lasso_cols], y_train)

# R^2 on the test rows using the same column subset.
lr_lasso.score(X_test[lasso_cols], y_test)

0.37049818472316887

Lasso feature selection helped our model!

## Random Forest

In [42]:
# Random forest with library defaults on the combined features. No
# random_state is passed here; the only seeding visible in this notebook is
# np.random.seed(2121) in the imports cell.
rf = RandomForestRegressor().fit(X_train, y_train)
rf.score(X_test, y_test)

0.37353619621445855

**Run if you want to wait**

In [43]:
# param_grid = {'max_depth': [3,5,10, None],
#               'min_samples_split': [2,5,10]}

# rf = RandomForestRegressor(random_state=1212)

# sh = HalvingGridSearchCV(rf, param_grid, cv=5,
#                          factor=2, resource='n_estimators',
#                          min_resources=50,
#                          max_resources=1000, verbose=3)

# sh.fit(X_train, y_train)

In [44]:
# sh.best_params_

best params were {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 400}

In [45]:
# Refit the combined forest with the parameters reported by the halving
# search.
rf = RandomForestRegressor(
    max_depth=None,
    min_samples_split=2,
    n_estimators=400,
).fit(X_train, y_train)
rf.score(X_test, y_test)

0.3729312763020318

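The halving grid search did not improve the model: the tuned forest scores marginally lower on the test set than the default one (0.3729 vs. 0.3735).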

## SVM

In [46]:
# Support vector regression with library defaults on the combined features.
svr = SVR().fit(X_train, y_train)
svr.score(X_test, y_test)

-0.052946710119773366

**Run if you want to wait**

In [47]:
# param_grid = {'epsilon': np.logspace(0,2,3),
#               'C': np.logspace(0,2,3),
#               'gamma': ['scale','auto']}

# svr = SVR()

# gs = GridSearchCV(svr, param_grid, cv=5,
#                   verbose=3)

# gs.fit(X_train, y_train)

In [48]:
# gs.best_params_

Best params were {'C': 1.0, 'epsilon': 10.0, 'gamma': 'scale'}

In [49]:
# Refit the combined SVR with the parameters reported by the grid search.
# NOTE(review): the recorded test R^2 matches the default SVR's to about 13
# decimal places, which suggests both fits may be returning an essentially
# constant prediction. Check whether epsilon (searched over 1-100) is large
# relative to the scale of waa_pg - TODO confirm.
svr = SVR(
    C=1,
    epsilon=10,
    gamma='scale',
).fit(X_train, y_train)
svr.score(X_test, y_test)

-0.05294671011975538

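SVR is terrible no matter how you slice it: the tuned model's test score is the same as the default model's (about -0.053), i.e. worse than predicting the mean.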

# Results

None of the models outperformed the Full Approach model. If they had come closer to outperforming the production model, more tuning and analysis would be warranted; for now, that is out of scope.