## Modelling

Next, we'll find out which model can best predict fantasy value scores. Models will be assessed on RMSE and MAE since this is a regression problem. But ulimately RMSE since extreme errors should be punished more than moderate ones.

In [1]:
import os
import utils
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb

from bayes_opt import BayesianOptimization
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook as tqdm
from xgboost import XGBRegressor

from constants import DATA_DIR
from constants import PUNT_TYPES
warnings.filterwarnings('ignore')

In [2]:
cv = RepeatedKFold(n_splits=2, n_repeats=100)

We'll use some classic regression techniques including ridge and lasso. XGBRegressor will also be tested in its vanilla form as well as with some hyperparameter tuning. Lastly, some neural networks will be trained on the dataset.

### Ridge Regression

In [3]:
def ridge(X, y):
    parameters = {'alpha':[0.1, 1, 10]}
    Ridge_reg= GridSearchCV(Ridge(), parameters, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=-1)
    Ridge_reg.fit(X, y)
    best_ridge = Ridge_reg.best_estimator_
    errors = utils.cross_val(best_ridge, X, y, n_folds=3)
    print (Ridge_reg.best_params_)
    return Ridge_reg.best_params_, np.mean(errors['MAE']['valid']), np.mean(errors['RMSE']['valid'])

### Lasso Regression

In [4]:
def lasso(X, y):
    parameters = {'alpha':[1e-4, 1e-3, 1e-2, 1e-1]}
    lasso_result = GridSearchCV(Lasso(), parameters, scoring='neg_root_mean_squared_error',cv=cv,n_jobs=-1)
    lasso_result.fit(X, y)
    best_model = lasso_result.best_estimator_
    errors = utils.cross_val(best_model, X, y, n_folds=3)
    print(lasso_result.best_params_)
    return lasso_result.best_params_, np.mean(errors['MAE']['valid']), np.mean(errors['RMSE']['valid'])

### Vanilla XGBRegressor

In [5]:
def vanilla_xgb(X,y):
    vanilla_reg = XGBRegressor().fit(X, y)
    errors = utils.cross_val(vanilla_reg, X, y, n_folds=3)
    utils.summarize_errors(errors, verbose=0)
    return None, np.mean(errors['MAE']['valid']), np.mean(errors['RMSE']['valid'])

###  XGBRegressor w/Bayesian Optimized Hyperparameters

In [6]:
def bo_tune_xgb(max_depth, gamma, n_estimators, learning_rate, subsample):
    params = {'gamma': gamma, 
              'learning_rate':learning_rate, 
              'max_depth': int(max_depth), 
              'n_estimators': int(n_estimators), 
              'subsample': subsample, 
              'eta': 0.1, 
              'eval_metric': 'rmse'}
    dtrain = xgb.DMatrix(X, y)
    cv_result = xgb.cv(params, dtrain, num_boost_round=70, nfold=3)
    return -1.0 * cv_result['test-rmse-mean'].iloc[-1]
    

In [7]:
def bo_xgb(X, y):
    params = {'max_depth': (3, 10), 
              'gamma': (0, 1), 
              'n_estimators':(100,120), 
              'learning_rate':(0,1), 
              'subsample': (0.8, 1)}
    xgb_bo = BayesianOptimization(bo_tune_xgb, params, random_state = 1)
    xgb_bo.maximize(n_iter=5, init_points=20, acq='ei')
    params = xgb_bo.max['params']
    params['max_depth']= int(params['max_depth'])
    params['n_estimators']= int(params['n_estimators'])
    print(params)
    clf = XGBRegressor(**params)
    errors = utils.cross_val(clf, X, y, n_folds=3)
    return params, np.mean(errors['MAE']['valid']), np.mean(errors['RMSE']['valid'])

### Neural Network

In [8]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn import preprocessing
from keras import regularizers
from keras.callbacks import ModelCheckpoint
import math

In [9]:
def get_modelcheckpoint_path():
    #Create a file path for a model and save models in hdf5 files with datetime, validation losses and epochs
    parent = DATA_DIR+"/NN/"
    child = pd.Timestamp.now().strftime('%Y%m%d-%Hh%Mm') + "-model-epoch_{epoch:02d}-rmse_{val_loss:.5f}.hdf5"
    return parent + child 

def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(X.shape[1], input_dim=X.shape[1], kernel_initializer='normal', activation='relu'))
    model.add(Dense(15, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

def dropout_model():
    # create model
    model = Sequential()
    model.add(Dense(X.shape[1], input_dim=X.shape[1], kernel_initializer='normal', activation='relu'))
    model.add(Dense(15, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

def create_regressor(model, verbose=0):
    reg = KerasRegressor(build_fn=model, epochs=100, batch_size=5, validation_split=0.2, shuffle=True, verbose=verbose)
    return reg

def plot_learning_process(history):
    # summarize history for loss
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()
    return None

def base_keras_model(X, y):
    base_keras = create_regressor(baseline_model, 0)
    errors = utils.cross_val(base_keras, X, y, n_folds=5)
    return None, np.mean(errors['MAE']['valid']), np.mean(errors['RMSE']['valid'])

def dropout_keras_model(X, y):
    dropout_keras = create_regressor(dropout_model, 0)
    errors = utils.cross_val(dropout_keras, X, y, n_folds=5)
    return None, np.mean(errors['MAE']['valid']), np.mean(errors['RMSE']['valid'])

### Model Selection

In [10]:
rmse_table = pd.DataFrame()
mae_table = pd.DataFrame()
params_table = pd.DataFrame()
models = [ridge, lasso, vanilla_xgb, bo_xgb]#, base_keras_model, dropout_keras_model]
for punt in tqdm(PUNT_TYPES):
    if punt == []:
        punt = 'Base'
    else:
        punt = '+'.join(punt)
    df = pd.read_csv(os.path.join(DATA_DIR,'ABT', punt+'.csv'))
    X = df.loc[:, df.columns != 'VALUE']
    y = df['VALUE'].values.reshape(-1,1).flatten()
    for model in tqdm(models):
        param, mae_value, rmse_value = model(X, y)
        mae_table.loc[punt, model.__name__] = mae_value
        rmse_table.loc[punt, model.__name__] = rmse_value

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

{'alpha': 10}


  0%|          | 0/3 [00:00<?, ?it/s]

{'alpha': 0.01}


  0%|          | 0/3 [00:00<?, ?it/s]


   <--- Validation Errors --->
MAE  | Mean: 2.13428, SD: 0.05147
RMSE | Mean: 2.78775, SD: 0.10978

|   iter    |  target   |   gamma   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------
Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

{'alpha': 10}


  0%|          | 0/3 [00:00<?, ?it/s]

{'alpha': 0.01}


  0%|          | 0/3 [00:00<?, ?it/s]


   <--- Validation Errors --->
MAE  | Mean: 2.20108, SD: 0.07511
RMSE | Mean: 2.87635, SD: 0.14074

|   iter    |  target   |   gamma   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------
Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

{'alpha': 10}


  0%|          | 0/3 [00:00<?, ?it/s]

{'alpha': 0.01}


  0%|          | 0/3 [00:00<?, ?it/s]


   <--- Validation Errors --->
MAE  | Mean: 2.00465, SD: 0.04767
RMSE | Mean: 2.63324, SD: 0.08733

|   iter    |  target   |   gamma   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------
Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

{'alpha': 10}


  0%|          | 0/3 [00:00<?, ?it/s]

{'alpha': 0.01}


  0%|          | 0/3 [00:00<?, ?it/s]


   <--- Validation Errors --->
MAE  | Mean: 2.09554, SD: 0.04893
RMSE | Mean: 2.75659, SD: 0.1074

|   iter    |  target   |   gamma   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------
Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down 

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

{'alpha': 10}


  0%|          | 0/3 [00:00<?, ?it/s]

{'alpha': 0.01}


  0%|          | 0/3 [00:00<?, ?it/s]


   <--- Validation Errors --->
MAE  | Mean: 2.17735, SD: 0.06067
RMSE | Mean: 2.86263, SD: 0.12454

|   iter    |  target   |   gamma   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------
Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

{'alpha': 10}


  0%|          | 0/3 [00:00<?, ?it/s]

{'alpha': 0.01}


  0%|          | 0/3 [00:00<?, ?it/s]


   <--- Validation Errors --->
MAE  | Mean: 2.06351, SD: 0.0603
RMSE | Mean: 2.68933, SD: 0.0986

|   iter    |  target   |   gamma   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------
Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down t

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

{'alpha': 10}


  0%|          | 0/3 [00:00<?, ?it/s]

{'alpha': 0.01}


  0%|          | 0/3 [00:00<?, ?it/s]


   <--- Validation Errors --->
MAE  | Mean: 2.15058, SD: 0.0152
RMSE | Mean: 2.81612, SD: 0.0599

|   iter    |  target   |   gamma   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------
Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down t

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

{'alpha': 10}


  0%|          | 0/3 [00:00<?, ?it/s]

{'alpha': 0.001}


  0%|          | 0/3 [00:00<?, ?it/s]


   <--- Validation Errors --->
MAE  | Mean: 1.80544, SD: 0.03887
RMSE | Mean: 2.37385, SD: 0.10034

|   iter    |  target   |   gamma   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------
Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

{'alpha': 10}


  0%|          | 0/3 [00:00<?, ?it/s]

{'alpha': 0.001}


  0%|          | 0/3 [00:00<?, ?it/s]


   <--- Validation Errors --->
MAE  | Mean: 1.86586, SD: 0.04412
RMSE | Mean: 2.43498, SD: 0.10656

|   iter    |  target   |   gamma   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------
Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

{'alpha': 10}


  0%|          | 0/3 [00:00<?, ?it/s]

{'alpha': 0.01}


  0%|          | 0/3 [00:00<?, ?it/s]


   <--- Validation Errors --->
MAE  | Mean: 1.81256, SD: 0.04817
RMSE | Mean: 2.38255, SD: 0.08419

|   iter    |  target   |   gamma   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------
Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

{'alpha': 10}


  0%|          | 0/3 [00:00<?, ?it/s]

{'alpha': 0.01}


  0%|          | 0/3 [00:00<?, ?it/s]


   <--- Validation Errors --->
MAE  | Mean: 1.78985, SD: 0.02888
RMSE | Mean: 2.35157, SD: 0.07424

|   iter    |  target   |   gamma   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------
Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

{'alpha': 10}


  0%|          | 0/3 [00:00<?, ?it/s]

{'alpha': 0.01}


  0%|          | 0/3 [00:00<?, ?it/s]


   <--- Validation Errors --->
MAE  | Mean: 2.14415, SD: 0.08296
RMSE | Mean: 2.82131, SD: 0.15154

|   iter    |  target   |   gamma   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------
Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down

  0%|          | 0/3 [00:00<?, ?it/s]

Based on all models above, lasso regression seemed to yield the lowest RMSE and MAE using kfold validation to evaluate errors.

In [11]:
print(rmse_table)
print(mae_table)

            ridge     lasso  vanilla_xgb    bo_xgb
FG%      2.548933  2.552028     2.787752  2.627492
FT%      2.617610  2.613262     2.876353  2.688302
PTS      2.396022  2.395225     2.633238  2.460052
TRB      2.524535  2.527306     2.756591  2.569856
AST      2.598587  2.597978     2.862627  2.670657
STL      2.475641  2.477737     2.689329  2.526468
BLK      2.598074  2.600308     2.816119  2.667094
FG%+TRB  2.172072  2.172120     2.373852  2.205968
BLK+FG%  2.237396  2.238453     2.434978  2.285977
AST+STL  2.194266  2.197824     2.382548  2.223555
PTS+FT%  2.135954  2.131221     2.351574  2.194679
Base     2.578168  2.577890     2.821312  2.661052
            ridge     lasso  vanilla_xgb    bo_xgb
FG%      1.928379  1.930587     2.134278  2.003129
FT%      1.981701  1.979266     2.201079  2.060948
PTS      1.789310  1.788886     2.004647  1.851618
TRB      1.896646  1.898283     2.095541  1.945130
AST      1.956711  1.956299     2.177350  2.037029
STL      1.867596  1.869908    

It also seems that for the most punt types, lasso model works the best and so going forward, we'll be going with a lasso model with a 1e-3 alpha.

In [12]:
min_mae = mae_table.idxmin(axis = 1)
print(min_mae)
min_rmse = rmse_table.idxmin(axis = 1)
print(min_rmse)

FG%        ridge
FT%        lasso
PTS        lasso
TRB        ridge
AST        lasso
STL        ridge
BLK        ridge
FG%+TRB    lasso
BLK+FG%    lasso
AST+STL    ridge
PTS+FT%    lasso
Base       ridge
dtype: object
FG%        ridge
FT%        lasso
PTS        lasso
TRB        ridge
AST        lasso
STL        ridge
BLK        ridge
FG%+TRB    ridge
BLK+FG%    ridge
AST+STL    ridge
PTS+FT%    lasso
Base       lasso
dtype: object
