In [1]:
# imports
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
import xgboost as xgb

In [2]:
# load data
file_path = '../data/master.csv'

df = pd.read_csv(file_path, index_col=0)

In [10]:
# set X & y
X = df.drop(['date', 'time', 'carry_distance', 'distance_to_pin', 'total_distance'], axis=1)
X = pd.concat([X, pd.get_dummies(X['club_type'])], axis=1)
X.drop('club_type', axis=1, inplace=True)

y = df['carry_distance']

In [11]:
# train, test split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [5]:
# dataframe to stow results
results = pd.DataFrame(columns=['model', 'r2', 'mse'])

In [6]:
# model 1: train linear regression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred = lin_reg.predict(X_test)

In [7]:
# model 1: results
r2 = r2_score(y_pred, y_test)
mse = mean_squared_error(y_pred, y_test)
print(f'r2: {r2}\nmse: {mse}')

r2: 0.9094829970050005
mse: 343.1936701022638


In [8]:
# model 1: add to df
row = ['ols', r2, mse]
results.loc[len(results)] = row

In [9]:
# model 2: train KFold linear regrssion
kf = KFold(n_splits=10)

r2s = []
mses = []

for train_index, test_index in kf.split(X_train):
    X_split_train, X_split_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_split_train, y_split_test = y.iloc[train_index], y.iloc[test_index]
    
    lin_reg = LinearRegression()
    lin_reg.fit(X_split_train, y_split_train)
    y_split_pred = lin_reg.predict(X_split_test)
    
    r2s.append(r2_score(y_split_pred, y_split_test))
    mses.append(mean_squared_error(y_split_pred, y_split_test))

In [10]:
# model 2: evaluation
y_pred = lin_reg.predict(X_test)
r2 = r2_score(y_pred, y_test)
mse = mean_squared_error(y_pred, y_test)
print(f'r2: {r2}\nmse: {mse}')

r2: 0.9096611492602901
mse: 341.0768005066317


In [11]:
# model 2: add to df
row = ['ols w/ KFold', r2, mse]
results.loc[len(results)] = row

In [12]:
# model 3: build GridSearchCV
params = {'copy_X': [False, True], 'fit_intercept': [False, True], 'normalize': [False, True]}

lin_reg = LinearRegression()
gs = GridSearchCV(lin_reg, params, n_jobs=-1, scoring='r2', verbose=True)

In [13]:
# model 3: fit
gs.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    2.9s finished


GridSearchCV(estimator=LinearRegression(), n_jobs=-1,
             param_grid={'copy_X': [False, True],
                         'fit_intercept': [False, True],
                         'normalize': [False, True]},
             scoring='r2', verbose=True)

In [14]:
# model 3: evaluation
best_gs = gs.best_estimator_
y_pred = best_gs.predict(X_test)
r2 = r2_score(y_pred, y_test)
mse = mean_squared_error(y_pred, y_test)
print(f'r2: {r2}\nmse: {mse}')

r2: 0.9094829970049991
mse: 343.19367010226404


In [15]:
# model 3: add to df
row = ['ols w/ gridsearch', r2, mse]
results.loc[len(results)] = row

In [16]:
# model 4: build GridSearchCV Lasso
params = {'alpha': [0.005, 0.05, 0.1, 0.5, 0.9], 'fit_intercept': [False, True], 'normalize': [False, True]}

lasso = Lasso()
gs = GridSearchCV(lasso, params, n_jobs=-1, scoring='r2', verbose=True)

In [17]:
# model 4: fit
gs.fit(X_train, y_train)
print(gs.best_params_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.1s


Fitting 5 folds for each of 20 candidates, totalling 100 fits
{'alpha': 0.1, 'fit_intercept': True, 'normalize': False}


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.4s finished


In [18]:
# model 4: evaluation
best_gs = gs.best_estimator_
y_pred = best_gs.predict(X_test)
r2 = r2_score(y_pred, y_test)
mse = mean_squared_error(y_pred, y_test)
print(f'r2: {r2}\nmse: {mse}')

r2: 0.9095814810363053
mse: 342.4197423967618


In [19]:
# model 4: add to df
row = ['lasso', r2, mse]
results.loc[len(results)] = row

In [20]:
# model 5: build GridSearchCV Ridge
params = {'alpha': [0.005, 0.05, 0.1, 0.5, 0.9], 'fit_intercept': [False, True], 'normalize': [False, True]}

ridge = Ridge()
gs = GridSearchCV(ridge, params, n_jobs=-1, scoring='r2', verbose=True)

In [21]:
# model 5: fit
gs.fit(X_train, y_train)
print(gs.best_params_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.1s


Fitting 5 folds for each of 20 candidates, totalling 100 fits
{'alpha': 0.9, 'fit_intercept': True, 'normalize': False}


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.3s finished


In [22]:
# model 5: evaluation
best_gs = gs.best_estimator_
y_pred = best_gs.predict(X_test)
r2 = r2_score(y_pred, y_test)
mse = mean_squared_error(y_pred, y_test)
print(f'r2: {r2}\nmse: {mse}')

r2: 0.9094832861744216
mse: 343.19023324539035


In [23]:
# model 5: add to df
row = ['ridge', r2, mse]
results.loc[len(results)] = row

In [24]:
# model 6: build GridSearchCV ElasticNet
params = {'alpha': [0.005, 0.05, 0.1, 0.5, 0.9], 'l1_ratio': [0, 0.2, 0.4, 0.6, 0.8, 1], 'fit_intercept': [False, True], 'normalize': [False, True]}

e_net = ElasticNet()
gs = GridSearchCV(e_net, params, n_jobs=-1, scoring='r2', verbose=True)

In [25]:
# model 6: fit
gs.fit(X_train, y_train)
print(gs.best_params_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 320 tasks      | elapsed:    1.8s


{'alpha': 0.1, 'fit_intercept': True, 'l1_ratio': 1, 'normalize': False}


[Parallel(n_jobs=-1)]: Done 569 out of 600 | elapsed:    2.8s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    2.9s finished


In [26]:
# model 6: evaluation
best_gs = gs.best_estimator_
y_pred = best_gs.predict(X_test)
r2 = r2_score(y_pred, y_test)
mse = mean_squared_error(y_pred, y_test)
print(f'r2: {r2}\nmse: {mse}')

r2: 0.9095814810363053
mse: 342.4197423967618


In [27]:
# model 6: add to df
row = ['elastic net', r2, mse]
results.loc[len(results)] = row

In [35]:
# model 7: build GridSearchCV PLSRegression
params = {'n_components': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 'scale': [False, True],
         'tol': [0.000000001, 0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001]}

pls = PLSRegression()
gs = GridSearchCV(pls, params, n_jobs=-1, scoring='r2', verbose=True)

In [36]:
# model 7: fit
gs.fit(X_train, y_train)
print(gs.best_params_)

Fitting 5 folds for each of 168 candidates, totalling 840 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 512 tasks      | elapsed:    1.7s


{'n_components': 11, 'scale': False, 'tol': 1e-09}


[Parallel(n_jobs=-1)]: Done 840 out of 840 | elapsed:    3.3s finished


In [38]:
# model 7: evaluation
best_gs = gs.best_estimator_
y_pred = best_gs.predict(X_test)
r2 = r2_score(y_pred, y_test)
mse = mean_squared_error(y_pred, y_test)
print(f'r2: {r2}\nmse: {mse}')

r2: 0.9095761567121046
mse: 342.9061530682988


In [39]:
# model 7: add to df
row = ['pls regression', r2, mse]
results.loc[len(results)] = row

In [42]:
# model 8: build GridSearchCV RandomForestRegressor
params = {'n_estimators': [10, 100, 1000, 10000], 'max_depth': [None, 3, 7, 9, 15],
         'max_features': ['auto', 'sqrt', 'log2'], 'bootstrap': [False, True]}

rfr = RandomForestRegressor()
gs = GridSearchCV(rfr, params, n_jobs=-1, scoring='r2', verbose=True)

In [43]:
# model 8: fit
gs.fit(X_train, y_train)
print(gs.best_params_)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed: 15.5min
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed: 51.8min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed: 70.4min finished


{'bootstrap': True, 'max_depth': None, 'max_features': 'auto', 'n_estimators': 10000}


In [44]:
# model 8: evaluation
best_gs = gs.best_estimator_
y_pred = best_gs.predict(X_test)
r2 = r2_score(y_pred, y_test)
mse = mean_squared_error(y_pred, y_test)
print(f'r2: {r2}\nmse: {mse}')

r2: 0.9847607586056802
mse: 61.90496394602656


In [47]:
# model 8: add to df
row = ['rfr', r2, mse]
results.loc[len(results)] = row

In [51]:
# model 9: build GridSearchCV GradientBoostingRegressor
params = {'n_estimators': [10, 100, 1000, 10000], 'subsample': [0.01, 0.1, 0.5, 0.9, 0.99],
          'criterion': ['mse'], 'max_depth': [3, 7, 9, 15], 'max_features': ['auto', 'sqrt', 'log2']}

gbr = GradientBoostingRegressor()
gs = GridSearchCV(gbr, params, n_jobs=-1, scoring='r2', verbose=True)

In [52]:
# model 9: fit
gs.fit(X_train, y_train)
print(gs.best_params_)

Fitting 5 folds for each of 180 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   27.5s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:  7.0min finished


{'criterion': 'mse', 'max_depth': 7, 'max_features': 'auto', 'n_estimators': 1000, 'subsample': 0.9}


In [53]:
# model 9: evaluation
best_gs = gs.best_estimator_
y_pred = best_gs.predict(X_test)
r2 = r2_score(y_pred, y_test)
mse = mean_squared_error(y_pred, y_test)
print(f'r2: {r2}\nmse: {mse}')

r2: 0.9897507253879355
mse: 42.27320452072378


In [54]:
# model 9: add to df
row = ['gbr', r2, mse]
results.loc[len(results)] = row

In [71]:
# model 10: build GridSearchCV RandomForestRegressor
params = {'n_estimators': [1000, 5000, 10000], 'max_depth': [None],
         'max_features': ['auto'], 'bootstrap': [True]}

rfr = RandomForestRegressor()
gs = GridSearchCV(rfr, params, n_jobs=-1, scoring='r2', verbose=True)

In [72]:
# model 10: fit
gs.fit(X_train, y_train)
print(gs.best_params_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  6.0min finished


{'bootstrap': True, 'max_depth': None, 'max_features': 'auto', 'n_estimators': 10000}


In [73]:
# model 10: evaluation
best_gs = gs.best_estimator_
y_pred = best_gs.predict(X_test)
r2 = r2_score(y_pred, y_test)
mse = mean_squared_error(y_pred, y_test)
print(f'r2: {r2}\nmse: {mse}')

r2: 0.9847423709805129
mse: 61.96619354147312


In [74]:
# model 10: add to df
row = ['rfr_2', r2, mse]
results.loc[len(results)] = row

In [75]:
# model 11: build GridSearchCV GradientBoostingRegressor
params = {'n_estimators': [1000, 5000, 10000], 'subsample': [0.9],
          'criterion': ['mse'], 'max_depth': [7], 'max_features': ['auto']}

gbr = GradientBoostingRegressor()
gs = GridSearchCV(gbr, params, n_jobs=-1, scoring='r2', verbose=True)

In [76]:
# model 11: fit
gs.fit(X_train, y_train)
print(gs.best_params_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  5.0min finished


{'criterion': 'mse', 'max_depth': 7, 'max_features': 'auto', 'n_estimators': 1000, 'subsample': 0.9}


In [77]:
# model 11: evaluation
best_gs = gs.best_estimator_
y_pred = best_gs.predict(X_test)
r2 = r2_score(y_pred, y_test)
mse = mean_squared_error(y_pred, y_test)
print(f'r2: {r2}\nmse: {mse}')

r2: 0.9906193277319932
mse: 38.65906065945128


In [78]:
# model 11: add to df
row = ['gbr_2', r2, mse]
results.loc[len(results)] = row

In [128]:
best_gs.feature_importances_

array([0.7031684 , 0.00101758, 0.00918651, 0.00658252, 0.001676  ,
       0.01785507, 0.0020862 , 0.0021585 , 0.12693684, 0.11134257,
       0.00587352, 0.00153343, 0.00361449, 0.00100932, 0.00595917],
      dtype=float32)

In [112]:
# model 12: XGBoost
params = {'max_depth': [3, 7, 9, 15], 'subsample': [0.01, 0.1, 0.5, 0.9, 0.99],
          'colsample_bytree': [0.01, 0.1, 0.5, 0.9, 0.99], 'n_estimators': [10, 100, 1000, 10000]}

xgb_reg = xgb.XGBRegressor(objective='reg:squarederror')
gs = GridSearchCV(xgb_reg, params, n_jobs=-1, scoring='r2', verbose=True)

In [113]:
# model 12: fit
gs.fit(X_train, y_train)
print(gs.best_params_)

Fitting 5 folds for each of 400 candidates, totalling 2000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed: 31.6min
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed: 60.0min
[Parallel(n_jobs=-1)]: Done 1218 tasks      | elapsed: 103.8min
[Parallel(n_jobs=-1)]: Done 1768 tasks      | elapsed: 160.2min
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed: 194.2min finished


{'colsample_bytree': 0.99, 'max_depth': 3, 'n_estimators': 10000, 'subsample': 0.9}


In [114]:
# model 12: evaluation
best_gs = gs.best_estimator_
y_pred = best_gs.predict(X_test)
r2 = r2_score(y_pred, y_test)
mse = mean_squared_error(y_pred, y_test)
print(f'r2: {r2}\nmse: {mse}')

r2: 0.987383612538495
mse: 52.11877279159006


In [115]:
# model 12: add to df
row = ['xgb', r2, mse]
results.loc[len(results)] = row

In [181]:
# model 13: build GridSearchCV GradientBoostingRegressor
params = {'criterion': ['mse'], 'max_depth': [4, 5, 6, 7, 8], 'max_features': ['auto'],
          'subsample': [0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95]}

gbr = GradientBoostingRegressor()
gs = GridSearchCV(gbr, params, n_jobs=None, scoring='r2', verbose=True)

In [182]:
# model 13: fit
gs.fit(X_train, y_train)
print(gs.best_params_)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:  4.8min finished


{'criterion': 'mse', 'max_depth': 7, 'max_features': 'auto', 'subsample': 0.9}


In [183]:
# model 13: evaluation
best_gs = gs.best_estimator_
y_pred = best_gs.predict(X_test)
r2 = r2_score(y_pred, y_test)
mse = mean_squared_error(y_pred, y_test)
print(f'r2: {r2}\nmse: {mse}')

r2: 0.9888404934847734
mse: 46.74134688021586


In [195]:
# model 13: add to df
row = ['gbr_3', r2, mse]
results.loc[len(results)] = row

In [188]:
# model 14: build GridSearchCV GradientBoostingRegressor
params = {'criterion': ['mse'], 'n_estimators': list(range(100, 2000, 200)),
          'max_depth': [7], 'max_features': ['auto'],
          'subsample': [0.9, 0.91, 0.92, 0.93, 0.94],
          'loss': ['ls', 'lad', 'huber', 'quantile']}

gbr = GradientBoostingRegressor()
gs = GridSearchCV(gbr, params, n_jobs=None, scoring='r2', verbose=True)

In [189]:
# model 14: fit
gs.fit(X_train, y_train)
print(gs.best_params_)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed: 385.6min finished


{'criterion': 'mse', 'loss': 'huber', 'max_depth': 7, 'max_features': 'auto', 'n_estimators': 1500, 'subsample': 0.91}


In [190]:
# model 14: evaluation
best_gs = gs.best_estimator_
y_pred = best_gs.predict(X_test)
r2 = r2_score(y_pred, y_test)
mse = mean_squared_error(y_pred, y_test)
print(f'r2: {r2}\nmse: {mse}')

r2: 0.9911146708855083
mse: 37.29707348946379


In [193]:
# model 14: add to df
row = ['gbr_4', r2, mse]
results.loc[len(results)] = row

In [199]:
best_gs.feature_importances_

array([8.03099876e-01, 2.62665718e-04, 5.36668580e-03, 4.33017109e-03,
       9.35459579e-04, 7.15477885e-03, 6.23349669e-04, 1.07836508e-03,
       1.03512418e-01, 7.33388512e-02, 4.42889470e-05, 3.77684901e-05,
       1.04765735e-05, 1.71258673e-04, 3.35863464e-05])

In [53]:
# model 15: build GridSearchCV GradientBoostingRegressor
gbr = GradientBoostingRegressor(criterion='mse', loss='huber', max_depth=7,
                                max_features='auto', n_estimators=15000,
                                subsample=0.91, learning_rate=0.01, verbose=True)

In [54]:
# model 15: fit
gbr.fit(X_train, y_train)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1        2061.6874           2.3691          165.48m
         2        2056.6649           3.8688          159.33m
         3        2056.2240           3.8052          153.33m
         4        2023.1835           4.3817          148.42m
         5        2044.1140           3.8609          146.75m
         6        2028.2307           4.0616          144.21m
         7        2037.4067           3.8880          141.12m
         8        2028.5107           3.9263          139.98m
         9        2036.9116           3.6326          139.79m
        10        2019.3707           3.8665          137.88m
        20        1980.6240           3.9269          130.63m
        30        1953.1279           3.5722          129.54m
        40        1901.5437           3.7663          128.11m
        50        1855.1985           3.8392          126.73m
        60        1841.2949           3.4392          126.70m
       

GradientBoostingRegressor(criterion='mse', learning_rate=0.001, loss='huber',
                          max_depth=7, max_features='auto', n_estimators=150000,
                          subsample=0.91, verbose=True)

In [None]:
# model 15: evaluation
y_pred = gbr.predict(X_test)
r2 = r2_score(y_pred, y_test)
mse = mean_squared_error(y_pred, y_test)
print(f'r2: {r2}\nmse: {mse}')

In [None]:
# model 14: add to df
row = ['gbr_5', r2, mse]
results.loc[len(results)] = row

In [None]:
'''
n_estimators=15000,
learning_rate=0.01

r2: 0.9915441206717956
mse: 35.61463404577434
'''


'''
n_estimators=150000,
learning_rate=0.001

r2: 0.9913024144985464
mse: 36.63308990894179
'''

In [196]:
results

Unnamed: 0,model,r2,mse
0,ols,0.909483,343.19367
1,ols w/ KFold,0.909661,341.076801
2,ols w/ gridsearch,0.909483,343.19367
3,lasso,0.909581,342.419742
4,ridge,0.909483,343.190233
5,elastic net,0.909581,342.419742
6,pls regression,0.909576,342.906153
7,rfr,0.984761,61.904964
8,gbr,0.989751,42.273205
9,rfr_2,0.984742,61.966194


In [None]:
'''
	model	r2	mse
0	ols	0.909483	343.193670
1	ols w/ KFold	0.909661	341.076801
2	ols w/ gridsearch	0.909483	343.193670
3	lasso	0.909581	342.419742
4	ridge	0.909483	343.190233
5	elastic net	0.909581	342.419742
6	pls regression	0.909576	342.906153
7	rfr	0.984761	61.904964
8	gbr	0.989751	42.273205
9	rfr_2	0.984742	61.966194
10	gbr_2	0.990619	38.659061
11	xgb	0.987384	52.118773
12	gbr_4	0.991115	37.297073
13	gbr_3	0.988840	46.741347
'''

In [149]:
results.to_csv('model_results.csv')