In [1]:
# imports
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold, GridSearchCV

In [2]:
# load data
file_path = '../data/master.csv'

df = pd.read_csv(file_path, index_col=0)

In [3]:
# set X & y
X = df.drop(['date', 'time', 'total_distance', 'distance_to_pin'], axis=1)
X = pd.concat([X, pd.get_dummies(X['club_type'])], axis=1)
X.drop('club_type', axis=1, inplace=True)

y = df['carry_distance']

In [4]:
# train, test split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [5]:
# dataframe to stow results
results = pd.DataFrame(columns=['model', 'r2', 'mse'])

In [10]:
# model 1: train linear regression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred = lin_reg.predict(X_test)

In [11]:
# model 1: results
r2 = r2_score(y_pred, y_test)
mse = mean_squared_error(y_pred, y_test)
print(f'r2: {r2}\nmse: {mse}')

r2: 1.0
mse: 8.102028042417428e-25


In [8]:
# model 1: add to df
row = ['ols', r2, mse]
results.loc[len(results)] = row

In [9]:
# model 2: train KFold linear regrssion
kf = KFold(n_splits=10)

r2s = []
mses = []

for train_index, test_index in kf.split(X_train):
    X_split_train, X_split_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_split_train, y_split_test = y.iloc[train_index], y.iloc[test_index]
    
    lin_reg = LinearRegression()
    lin_reg.fit(X_split_train, y_split_train)
    y_split_pred = lin_reg.predict(X_split_test)
    
    r2s.append(r2_score(y_split_pred, y_split_test))
    mses.append(mean_squared_error(y_split_pred, y_split_test))

In [10]:
# model 2: evaluation
y_pred = lin_reg.predict(X_test)
r2 = r2_score(y_pred, y_test)
mse = mean_squared_error(y_pred, y_test)
print(f'r2: {r2}\nmse: {mse}')

r2: 1.0
mse: 3.7351701826608365e-24


In [11]:
# model 2: add to df
row = ['ols w/ KFold', r2, mse]
results.loc[len(results)] = row

In [12]:
# model 3: build GridSearchCV
params = {'copy_X': [False, True], 'fit_intercept': [False, True], 'normalize': [False, True]}

lin_reg = LinearRegression()
gs = GridSearchCV(lin_reg, params, n_jobs=-1, scoring='r2', verbose=True)

In [13]:
# model 3: fit
gs.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    2.7s finished


GridSearchCV(estimator=LinearRegression(), n_jobs=-1,
             param_grid={'copy_X': [False, True],
                         'fit_intercept': [False, True],
                         'normalize': [False, True]},
             scoring='r2', verbose=True)

In [14]:
# model 3: evaluation
best_gs = gs.best_estimator_
y_pred = best_gs.predict(X_test)
r2 = r2_score(y_pred, y_test)
mse = mean_squared_error(y_pred, y_test)
print(f'r2: {r2}\nmse: {mse}')

r2: 1.0
mse: 1.1342383450057754e-23


In [15]:
# model 3: add to df
row = ['ols w/ gridsearch', r2, mse]
results.loc[len(results)] = row

In [16]:
# model 4: build GridSearchCV Lasso
params = {'alpha': [0.005, 0.05, 0.1, 0.5, 0.9], 'fit_intercept': [False, True], 'normalize': [False, True]}

lasso = Lasso()
gs = GridSearchCV(lasso, params, n_jobs=-1, scoring='r2', verbose=True)

In [17]:
# model 4: fit
gs.fit(X_train, y_train)
print(gs.best_params_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.1s


{'alpha': 0.005, 'fit_intercept': True, 'normalize': False}


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.4s finished


In [18]:
# model 4: evaluation
best_gs = gs.best_estimator_
y_pred = best_gs.predict(X_test)
r2 = r2_score(y_pred, y_test)
mse = mean_squared_error(y_pred, y_test)
print(f'r2: {r2}\nmse: {mse}')

r2: 0.9999999991156046
mse: 3.6932446403034015e-06


In [19]:
# model 4: add to df
row = ['lasso', r2, mse]
results.loc[len(results)] = row

In [20]:
# model 5: build GridSearchCV Ridge
params = {'alpha': [0.005, 0.05, 0.1, 0.5, 0.9], 'fit_intercept': [False, True], 'normalize': [False, True]}

ridge = Ridge()
gs = GridSearchCV(ridge, params, n_jobs=-1, scoring='r2', verbose=True)

In [21]:
# model 5: fit
gs.fit(X_train, y_train)
print(gs.best_params_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.1s


Fitting 5 folds for each of 20 candidates, totalling 100 fits
{'alpha': 0.005, 'fit_intercept': False, 'normalize': False}


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.3s finished


In [22]:
# model 5: evaluation
best_gs = gs.best_estimator_
y_pred = best_gs.predict(X_test)
r2 = r2_score(y_pred, y_test)
mse = mean_squared_error(y_pred, y_test)
print(f'r2: {r2}\nmse: {mse}')

r2: 1.0
mse: 1.7315157709146682e-15


In [23]:
# model 5: add to df
row = ['ridge', r2, mse]
results.loc[len(results)] = row

In [24]:
# model 6: build GridSearchCV ElasticNet
params = {'alpha': [0.005, 0.05, 0.1, 0.5, 0.9], 'l1_ratio': [0, 0.2, 0.4, 0.6, 0.8, 1], 'fit_intercept': [False, True], 'normalize': [False, True]}

e_net = ElasticNet()
gs = GridSearchCV(e_net, params, n_jobs=-1, scoring='r2', verbose=True)

In [25]:
# model 6: fit
gs.fit(X_train, y_train)
print(gs.best_params_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 524 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 569 out of 600 | elapsed:    1.7s remaining:    0.1s


{'alpha': 0.005, 'fit_intercept': True, 'l1_ratio': 0, 'normalize': False}


[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    1.8s finished


In [26]:
# model 6: evaluation
best_gs = gs.best_estimator_
y_pred = best_gs.predict(X_test)
r2 = r2_score(y_pred, y_test)
mse = mean_squared_error(y_pred, y_test)
print(f'r2: {r2}\nmse: {mse}')

r2: 0.9999999999836848
mse: 6.813103948379958e-08


In [27]:
# model 6: add to df
row = ['elastic net', r2, mse]
results.loc[len(results)] = row

In [28]:
results

Unnamed: 0,model,r2,mse
0,ols,1.0,6.315202e-25
1,ols w/ KFold,1.0,3.73517e-24
2,ols w/ gridsearch,1.0,1.1342380000000001e-23
3,lasso,1.0,3.693245e-06
4,ridge,1.0,1.731516e-15
5,elastic net,1.0,6.813104e-08
