In [1]:
# imports
import sys  
sys.path.insert(0, '..')

import pandas as pd
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
from src import model
import xgboost as xgb

In [2]:
# load data
file_path = '../data/master.csv'

df = pd.read_csv(file_path, index_col=0)

In [3]:
# set X & y
X = df.drop(['date', 'time', 'carry_distance', 'distance_to_pin', 'total_distance'], axis=1)
X = pd.concat([X, pd.get_dummies(X['club_type'])], axis=1)
X.drop('club_type', axis=1, inplace=True)

y = df['carry_distance']

In [4]:
# train, test split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [5]:
# create class to stow models
models = model.Models(X_train, X_test, y_train, y_test)

In [9]:
# establish params for each model
elastic_net_params = {'alpha': [0.005, 0.05, 0.1, 0.5, 0.9],
                      'l1_ratio': [0, 0.2, 0.4, 0.6, 0.8, 1],
                      'fit_intercept': [False, True],
                      'normalize': [False, True]}

pls_params = {'n_components': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
             'tol': [0.000000001, 0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001],
             'scale': [False, True]}

rfr_params = {'n_estimators': [10, 100, 1000, 10000],
              'max_depth': [None, 3, 7, 9, 15],
              'max_features': ['auto', 'sqrt', 'log2'],
              'bootstrap': [False, True]}

xgb_params = {'max_depth': [3, 7, 9, 15],
              'subsample': [0.01, 0.1, 0.5, 0.9, 0.99],
              'colsample_bytree': [0.01, 0.1, 0.5, 0.9, 0.99],
              'n_estimators': [10, 100, 1000, 10000]}

gbr_params = {'criterion': ['mse'],
              'loss': ['ls', 'lad', 'huber', 'quantile'],
              'max_features': ['auto', 'sqrt', 'log2'],
              'subsample': [0.9, 0.91, 0.92, 0.93, 0.94],
              'max_depth': [4, 5, 6, 7, 8],
              'n_estimators': [1000, 5000, 10000, 15000]}

In [10]:
# build, fit, & eval models
models.eval_model(model=ElasticNet(), params=elastic_net_params)
models.eval_model(model=PLSRegression(), params=pls_params)
models.eval_model(model=RandomForestRegressor(), params=rfr_params)
models.eval_model(model=xgb.XGBRegressor(objective='reg:squarederror'), params=xgb_params)
models.eval_model(model=GradientBoostingRegressor(), params=gbr_params)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
Fitting 5 folds for each of 168 candidates, totalling 840 fits


In [11]:
# show results
models.results_df

Unnamed: 0,model,params,r2,mse
0,ElasticNet,"{'alpha': 0.05, 'copy_X': True, 'fit_intercept...",0.912401,354.930742
1,ElasticNet,"{'alpha': 0.05, 'copy_X': True, 'fit_intercept...",0.912401,354.930742
2,PLSRegression,"{'copy': True, 'max_iter': 500, 'n_components'...",0.91247,354.766633
