In [25]:
# imports
import sys  
sys.path.insert(0, '..')

import numpy as np
import pandas as pd
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from src import model
import xgboost as xgb

In [2]:
# load data
file_path = '../data/master.csv'

df = pd.read_csv(file_path, index_col=0)

In [3]:
# set X & y
X = df.drop(['date', 'time', 'carry_distance', 'distance_to_pin', 'total_distance'], axis=1)
X = pd.concat([X, pd.get_dummies(X['club_type'])], axis=1)
X.drop('club_type', axis=1, inplace=True)

y = df['carry_distance']

In [21]:
# establish baseline
az_rad = X['launch_angle'] * np.pi / 180
g = 9.8

est_carry = (X['ball_speed'] ** 2 * np.sin(2*az_rad)) / g

In [26]:
# eval baseline
print(f'r2: {r2_score(est_carry, y)}')
print(f'rmse: {mean_squared_error(est_carry, y)**0.5}')

r2: 0.798360022388313
rmse: 29.348549370324417


In [4]:
# train, test split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [5]:
# create class to stow models
models = model.Models(X_train, X_test, y_train, y_test)

In [6]:
# establish params for each model
elastic_net_params = {'alpha': [0.005, 0.05, 0.1, 0.5, 0.9],
                      'l1_ratio': [0, 0.2, 0.4, 0.6, 0.8, 1],
                      'fit_intercept': [False, True],
                      'normalize': [False, True]}

pls_params = {'n_components': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
             'tol': [0.000000001, 0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001],
             'scale': [False, True]}

rfr_params = {'n_estimators': [10, 100, 1000, 10000],
              'max_depth': [None, 3, 7, 9, 15],
              'max_features': ['auto', 'sqrt', 'log2'],
              'bootstrap': [False, True]}

xgb_params = {'max_depth': [3, 7, 9, 15],
              'subsample': [0.01, 0.1, 0.5, 0.9, 0.99],
              'colsample_bytree': [0.01, 0.1, 0.5, 0.9, 0.99],
              'n_estimators': [10, 100, 1000, 10000]}

gbr_params = {'criterion': ['mse'],
              'loss': ['ls', 'lad', 'huber', 'quantile'],
              'max_features': ['auto', 'sqrt', 'log2'],
              'subsample': [0.9, 0.91, 0.92, 0.93, 0.94],
              'max_depth': [4, 5, 6, 7, 8],
              'n_estimators': [1000, 5000, 10000, 15000]}

In [7]:
# build, fit, & eval models
models.eval_model(model=ElasticNet(), params=elastic_net_params)
models.eval_model(model=PLSRegression(), params=pls_params)
models.eval_model(model=RandomForestRegressor(), params=rfr_params)
models.eval_model(model=xgb.XGBRegressor(objective='reg:squarederror'), params=xgb_params)
models.eval_model(model=GradientBoostingRegressor(), params=gbr_params)

ElasticNet()
Fitting 5 folds for each of 120 candidates, totalling 600 fits


PLSRegression()
Fitting 5 folds for each of 168 candidates, totalling 840 fits


RandomForestRegressor()
Fitting 5 folds for each of 120 candidates, totalling 600 fits






XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None, gamma=None,
             gpu_id=None, importance_type='gain', interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,
             validate_parameters=None, verbosity=None)
Fitting 5 folds for each of 400 candidates, totalling 2000 fits






GradientBoostingRegressor()
Fitting 5 folds for each of 1200 candidates, totalling 6000 fits








In [8]:
# show results
models.results_df

Unnamed: 0,model,params,r2,mse
0,ElasticNet,"{'alpha': 0.05, 'copy_X': True, 'fit_intercept...",0.891089,409.72196
1,PLSRegression,"{'copy': True, 'max_iter': 500, 'n_components'...",0.891074,409.746304
2,RandomForestRegressor,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri...",0.981709,73.207929
3,XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",0.989654,41.942065
4,GradientBoostingRegressor,"{'alpha': 0.9, 'ccp_alpha': 0.0, 'criterion': ...",0.987962,48.483444


In [15]:
# train best model
xgb_reg = xgb.XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.9, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.003, max_delta_step=0, max_depth=3,
              min_child_weight=1, monotone_constraints='()',
              n_estimators=1000000, n_jobs=16, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.9,
              tree_method='exact', validate_parameters=1, verbosity=None)

xgb_reg.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.9, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.003, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000000, n_jobs=16, num_parallel_tree=1,
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             subsample=0.9, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [None]:
# save results
models.results_df.to_csv('../data/model_results.csv')

In [None]:
# get the features of the best model
xgb_reg = models.best_models[3]
feature_importances = xgb_reg.feature_importances_

In [None]:
# create a dictionary of column names and feature importances
feature_dict ={}

for i, col in enumerate(X.columns):
    feature_dict[col] = feature_importances[i]
    
feature_dict = {key: value for key, value in sorted(feature_dict.items(), key=lambda item: item[1])}

feature_dict_wo_clubs = feature_dict.copy()

del_features = ['Fw', 'Irn', 'Hyb', '1w', 'Wdg']

for feature in del_features:
    del feature_dict_wo_clubs[feature]

In [None]:
# plot feature importances
fig = px.bar(x=feature_dict_wo_clubs.values(),
             y=feature_dict_wo_clubs.keys(),
             labels={
                     'y': 'Features',
                     'x': 'Importance'},
             width=1600,
             height=900)

fig.update_layout(title={
    'text': 'Feature Importances',
    'font': dict(size=36)},
    margin=dict(l=200, r=200, t=200, b=200))

fig.write_image(f'../img/feature_importances.png')