# Setup

In [82]:
import pandas as pd
import seaborn as sns
import re
import os
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import scipy.stats as stats

# from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
#from sklearn import cross_validation
from sklearn.metrics import mean_squared_error
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import ElasticNetCV

%config InlineBackend.figure_format = 'png'
plt.rcParams['figure.dpi']= 300

In [18]:
df = pd.read_csv('scraped_data_apr25.csv')

In [19]:
df.columns

Index(['YrTm', '#Bat', 'BatAge', 'R/G', 'G', 'PA', 'AB', 'R', 'H', '2B', '3B',
       'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS', 'OPS+',
       'TB', 'GDP', 'HBP', 'SH', 'SF', 'IBB', 'LOB', 'Team', 'Year', '#P',
       'PAge', 'RA/G', 'W', 'L', 'W-L%', 'ERA', 'GS', 'GF', 'CG', 'tSho',
       'cSho', 'SV', 'IP', 'ER', 'BK', 'WP', 'BF', 'ERA+', 'FIP', 'WHIP', 'H9',
       'HR9', 'BB9', 'SO9', 'SO/W', '#Fld', 'A', 'Ch', 'DP', 'DefEff', 'E',
       'Fld%', 'Inn', 'PO', 'Rdrs', 'Rdrs/yr', 'Rtot', 'Rtot/yr'],
      dtype='object')

In [20]:
df.sample(5)

Unnamed: 0,YrTm,#Bat,BatAge,R/G,G,PA,AB,R,H,2B,...,DP,DefEff,E,Fld%,Inn,PO,Rdrs,Rdrs/yr,Rtot,Rtot/yr
169,2014BAL,44,28.3,4.35,162,6130,5596,705,1434,264,...,156,0.706,87,0.986,13152.0,4384,57.0,5.0,41,4
204,2016HOU,43,26.4,4.47,162,6204,5545,724,1367,291,...,135,0.681,77,0.987,13212.0,4404,61.0,1.0,-12,-1
30,2004BOS,50,30.4,5.86,162,6515,5720,949,1613,373,...,129,0.693,118,0.981,13062.0,4354,-22.0,-2.0,-2,0
33,2004DET,39,28.1,5.1,162,6285,5623,827,1531,284,...,160,0.681,144,0.977,12957.0,4319,-38.0,-4.0,-19,-2
190,2015KCR,45,29.1,4.47,162,6116,5575,724,1497,300,...,138,0.701,88,0.985,13068.0,4356,52.0,1.0,18,2


# Select Features

Note: Whole bunch of features available, 229 rows of data. Targeting 5-7 features in final model.

In [90]:
X = df[['OPS','ERA','Fld%']]
y = df['W-L%']

# Train-Test Split

In [91]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)

In [92]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 160 entries, 135 to 120
Data columns (total 4 columns):
OPS       160 non-null float64
ERA       160 non-null float64
DefEff    160 non-null float64
Fld%      160 non-null float64
dtypes: float64(4)
memory usage: 6.2 KB


In [93]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 69 entries, 192 to 132
Data columns (total 4 columns):
OPS       69 non-null float64
ERA       69 non-null float64
DefEff    69 non-null float64
Fld%      69 non-null float64
dtypes: float64(4)
memory usage: 2.7 KB


In [94]:
y_train

135    0.457
191    0.525
184    0.500
196    0.494
187    0.503
59     0.481
116    0.500
24     0.574
177    0.432
176    0.605
69     0.537
139    0.500
4      0.457
83     0.512
31     0.512
35     0.568
180    0.537
103    0.401
142    0.525
115    0.426
178    0.519
163    0.525
76     0.580
215    0.574
223    0.562
140    0.574
208    0.519
155    0.599
199    0.549
67     0.377
       ...  
17     0.531
158    0.574
101    0.401
118    0.494
22     0.623
91     0.540
93     0.466
77     0.488
20     0.512
209    0.426
117    0.414
127    0.556
41     0.416
112    0.407
207    0.364
84     0.422
157    0.568
206    0.457
97     0.531
57     0.531
183    0.512
81     0.407
85     0.586
220    0.494
66     0.481
137    0.562
210    0.531
162    0.407
194    0.420
120    0.586
Name: W-L%, Length: 160, dtype: float64

# statsmodel

In [95]:
# Plain OLS on whole dataset
model = sm.OLS(y, sm.add_constant(X))
fit = model.fit()
fit.summary()

0,1,2,3
Dep. Variable:,W-L%,R-squared:,0.848
Model:,OLS,Adj. R-squared:,0.846
Method:,Least Squares,F-statistic:,313.6
Date:,"Thu, 26 Apr 2018",Prob (F-statistic):,1.5699999999999999e-90
Time:,08:16:43,Log-Likelihood:,491.85
No. Observations:,229,AIC:,-973.7
Df Residuals:,224,BIC:,-956.5
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.9740,0.812,-3.662,0.000,-4.574,-1.374
OPS,1.2127,0.050,24.198,0.000,1.114,1.311
ERA,-0.0996,0.005,-20.516,0.000,-0.109,-0.090
DefEff,-0.1576,0.215,-0.734,0.464,-0.581,0.266
Fld%,3.1597,0.839,3.765,0.000,1.506,4.814

0,1,2,3
Omnibus:,1.508,Durbin-Watson:,2.016
Prob(Omnibus):,0.471,Jarque-Bera (JB):,1.222
Skew:,0.165,Prob(JB):,0.543
Kurtosis:,3.14,Cond. No.,2860.0


# sklearn

## test-train split on plain regression

In [106]:
# on whole dataset
model= LinearRegression()
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.8079682273412429

In [97]:
# with test-train split
reg = LinearRegression()
model.fit(X_train,y_train)

train_score = model.score(X_train,y_train)
test_score = model.score(X_test,y_test)

print('Train score: ',train_score)
print('Test score: ',test_score)

Train score:  0.8601056915990077
Test score:  0.8079682273412429


## cross_val_score

Scored with MSE

In [98]:
# 3-fold cross-validation - 229 rows / 3 = 76, so can have 7 features for 10x rule
reg = LinearRegression()
scores = cross_val_score(reg, X, y, cv=3, scoring='neg_mean_squared_error')

# scores output is negative, a sklearn quirk bc mse is used to min. optimization func.
print(-scores)

[0.00092395 0.00071606 0.00089627]


Default scoring

In [99]:
reg = LinearRegression()
scores = cross_val_score(reg, X, y, cv=3) # try with default scoring

print(scores)

[0.87171937 0.84194001 0.77802148]


In [100]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) # still default scoring

Accuracy: 0.83 (+/- 0.08)


## polynomial degree 2

In [101]:
degree = 2
est = make_pipeline(PolynomialFeatures(degree), LinearRegression())
est.fit(X_train, y_train)
train_error = mean_squared_error(y_train, est.predict(X_train))
test_error = mean_squared_error(y_test, est.predict(X_test))

In [102]:
print('Train error: ',train_error)
print('Test error: ',test_error)

Train error:  0.0007160507806108331
Test error:  0.0010118004252139565


## Elastic Net

In [103]:
model = ElasticNet(1.0, l1_ratio = 0.5) # ratio splits equally between L1 (lasso) and L2 (ridge)
model.fit(X,y)

ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [104]:
model = ElasticNet(1.0, l1_ratio = 0.2) # ratio heavier on Ridge - to address collinearity
model.fit(X,y)

ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.2,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

## Elastic Net CV

Straight Linear Regression - Degree 1

In [105]:
degree = 1
l1rats = [0, .01, .05, .1, .15, .2, .3, .4, .5, .7, .9, .95, 1] # leaning toward heavier L2 weights b/c multicollin
λ = [1e-4,1e-3,1e-2,1e-1,1e0,1e1,1e2,1e3]
est = make_pipeline(PolynomialFeatures(degree), ElasticNetCV(l1_ratio=l1rats, alphas=λ))
est.fit(X,y)

  tol, rng, random, positive)


Pipeline(memory=None,
     steps=[('polynomialfeatures', PolynomialFeatures(degree=1, include_bias=True, interaction_only=False)), ('elasticnetcv', ElasticNetCV(alphas=[0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
       copy_X=True, cv=None, eps=0.001, fit_intercept=True,
       l1_ratio=[0, 0.01, 0.05, 0.1, 0.15, 0....ive=False, precompute='auto', random_state=None,
       selection='cyclic', tol=0.0001, verbose=0))])

Polynomial - Degree 2

In [89]:
degree = 2
l1ratios = [0, .05, .1, .15, .2, .3, .4, .5, .7, .9, .95, .99, 1]
α = [1e-5,1e-4,1e-3,1e-2,1e-1,1e0,1e1,1e2]
est = make_pipeline(PolynomialFeatures(degree), ElasticNetCV(l1_ratio=l1ratios, alphas=α))
est.fit(X,y)

  tol, rng, random, positive)


Pipeline(memory=None,
     steps=[('polynomialfeatures', PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)), ('elasticnetcv', ElasticNetCV(alphas=[1e-05, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
       copy_X=True, cv=None, eps=0.001, fit_intercept=True,
       l1_ratio=[0, 0.05, 0.1, 0.15, 0.2, 0.3,...ive=False, precompute='auto', random_state=None,
       selection='cyclic', tol=0.0001, verbose=0))])

Polynomial - Degree 3

In [None]:
degree = 3
l1ratio = [0, .05, .1, .15, .2, .3, .4, .5, .7, .9, .95, .99, 1]
α = [1e-5,1e-4,1e-3,1e-2,1e-1,1e0,1e1,1e2]
est = make_pipeline(PolynomialFeatures(degree), ElasticNetCV(l1_ratio=l1rat, alphas=α))
est.fit(X,y)