In [4]:
import os
import pandas as pd
import numpy as np
from pybaseball import batting_stats

In [5]:
start = 2002
end = 2022

In [6]:
# initial import
#batting = batting_stats(start, end, qual=200)

In [7]:
batting = pd.read_csv('batting.csv')

In [8]:
# remove players we only have 1 season of data for
batting = batting.groupby('IDfg', group_keys=False).filter(lambda x: x.shape[0] > 1)

In [9]:
batting

Unnamed: 0.1,Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,...,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA
0,0,1109,2002,Barry Bonds,SFG,37,143,403,612,149,...,,,,,0,0.127,0.191,,,
1,1,1109,2004,Barry Bonds,SFG,39,147,373,617,135,...,,,,,0,0.124,0.164,,,
2,8,15640,2022,Aaron Judge,NYY,30,157,570,696,177,...,0.262,118.4,246.0,0.609,404,0.169,0.287,,,
3,15,13611,2018,Mookie Betts,BOS,25,136,520,614,180,...,0.131,110.6,217.0,0.500,434,0.220,0.270,,,
4,2,1109,2003,Barry Bonds,SFG,38,130,390,550,133,...,,,,,0,0.135,0.223,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7108,6885,1698,2010,Gerald Laird,DET,30,89,270,299,56,...,,,0.0,,0,0.166,0.252,,,
7110,7042,9272,2018,Chris Davis,BAL,32,128,470,522,79,...,0.096,111.8,113.0,0.401,282,0.174,0.316,,,
7111,6673,319,2011,Adam Dunn,CHW,31,122,415,496,66,...,,,0.0,,0,0.169,0.295,,,
7112,6988,620,2002,Neifi Perez,KCR,29,145,554,585,131,...,,,,,0,0.130,0.187,,,


In [10]:
# set up target we are trying to predict
def next_season_war(player):
    player = player.sort_values('Season')
    player['nextWAR'] = player['WAR'].shift(-1)
    return player

batting = batting.groupby('IDfg', group_keys=False).apply(next_season_war)

In [11]:
batting[["Name", "Season", "WAR", "nextWAR"]]

Unnamed: 0,Name,Season,WAR,nextWAR
3952,Alfredo Amezaga,2006,1.1,2.0
2614,Alfredo Amezaga,2007,2.0,1.2
3780,Alfredo Amezaga,2008,1.2,
1034,Garret Anderson,2002,3.7,5.1
431,Garret Anderson,2003,5.1,0.8
...,...,...,...,...
4852,Owen Miller,2022,0.6,
6212,Andrew Vaughn,2021,-0.3,-0.4
6319,Andrew Vaughn,2022,-0.4,
5048,Ha-seong Kim,2021,0.5,3.7


In [12]:
null_count = batting.isnull().sum()

In [13]:
null_count

Unnamed: 0       0
IDfg             0
Season           0
Name             0
Team             0
              ... 
CSW%             0
xBA           6754
xSLG          6754
xwOBA         6754
nextWAR       1179
Length: 321, dtype: int64

In [14]:
complete_cols = list(batting.columns[null_count == 0])

In [15]:
batting = batting[complete_cols + ["nextWAR"]].copy()

In [16]:
batting

Unnamed: 0.1,Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,...,Pull%+,Cent%+,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,nextWAR
3952,5562,1,2006,Alfredo Amezaga,FLA,28,132,334,378,87,...,86,107,113,143,109,63,0,0.188,0.256,2.0
2614,5006,1,2007,Alfredo Amezaga,FLA,29,133,400,448,105,...,92,101,112,109,113,75,0,0.175,0.227,1.2
3780,5252,1,2008,Alfredo Amezaga,FLA,30,125,311,337,82,...,99,101,101,123,111,64,0,0.178,0.244,
1034,1169,2,2002,Garret Anderson,ANA,30,158,638,678,195,...,118,91,80,65,97,129,0,0.137,0.232,5.1
431,864,2,2003,Garret Anderson,ANA,31,159,638,673,201,...,112,101,80,90,99,109,0,0.164,0.252,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4852,6002,24655,2022,Owen Miller,CLE,25,130,424,472,103,...,92,111,97,131,100,83,340,0.188,0.266,
6212,4881,26197,2021,Andrew Vaughn,CHW,23,127,417,469,98,...,87,104,116,84,99,110,321,0.185,0.285,-0.4
6319,3377,26197,2022,Andrew Vaughn,CHW,24,134,510,555,138,...,88,106,111,94,100,104,419,0.201,0.291,
5048,6620,27506,2021,Ha-seong Kim,SDP,25,117,267,298,54,...,126,99,59,137,96,88,201,0.216,0.303,3.7


In [17]:
batting.dtypes[batting.dtypes == 'object']

Name       object
Team       object
Dol        object
Age Rng    object
dtype: object

In [18]:
batting['Dol']

3952      $5.5
2614     $11.2
3780      $7.2
1034     $14.6
431      $22.0
         ...  
4852      $4.8
6212    ($2.6)
6319    ($3.5)
5048      $4.0
1076     $29.3
Name: Dol, Length: 6754, dtype: object

In [19]:
del batting['Dol']

In [20]:
del batting['Age Rng']

In [21]:
batting['team_code'] = batting['Team'].astype('category').cat.codes

In [25]:
batting_full = batting.copy()
batting = batting.dropna().copy()

In [26]:
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

rr = Ridge(alpha=1)

split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(rr, n_features_to_select=20, direction='forward', cv=split, n_jobs=4)

In [27]:
removed_cols = ['Name', 'Season', 'Team', 'nextWAR', 'IDfg']
selected_cols = batting.columns[~batting.columns.isin(removed_cols)]

In [29]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
batting.loc[:, selected_cols] = scaler.fit_transform(batting.loc[:, selected_cols])

In [31]:
batting.describe()

Unnamed: 0.1,Unnamed: 0,IDfg,Season,Age,G,AB,PA,H,1B,2B,...,Cent%+,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,nextWAR,team_code
count,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,...,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0
mean,0.451547,5366.78583,2011.163229,0.3606,0.652755,0.478666,0.480943,0.365973,0.290481,0.399279,...,0.457544,0.403164,0.410923,0.511026,0.478646,0.172991,0.498932,0.545898,1.794816,0.474128
std,0.27945,5133.255295,5.612014,0.147476,0.255929,0.242481,0.26229,0.182585,0.138786,0.171732,...,0.113985,0.131213,0.121082,0.130359,0.133992,0.273858,0.13718,0.120701,1.996478,0.305105
min,0.0,1.0,2002.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.4,0.0
25%,0.209265,1131.5,2006.0,0.269231,0.478632,0.27518,0.257785,0.211207,0.179245,0.258621,...,0.382022,0.315789,0.331461,0.42029,0.387755,0.0,0.408511,0.46696,0.3,0.205882
50%,0.431885,3531.0,2011.0,0.346154,0.709402,0.505396,0.508651,0.37069,0.283019,0.37931,...,0.460674,0.398496,0.404494,0.507246,0.489796,0.0,0.493617,0.546256,1.5,0.470588
75%,0.681358,9015.0,2016.0,0.461538,0.871795,0.688849,0.710208,0.508621,0.391509,0.517241,...,0.52809,0.488722,0.483146,0.594203,0.564626,0.346411,0.591489,0.625551,2.9,0.735294
max,1.0,27506.0,2021.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,11.9,1.0


In [32]:
sfs.fit(batting[selected_cols], batting['nextWAR'])

In [34]:
predictors = list(selected_cols[sfs.get_support()])

In [35]:
# create backtest function to generate predictions
def backtest(data, model, predictors, start=5, step=1):
    all_predictions = []
    years = sorted(data['Season'].unique())
    for i in range(start, len(years), step):
        train = data[data['Season'] < years[i]]
        test = data[data['Season'] == years[i]]
        model.fit(train[predictors], train['nextWAR'])
        predictions = model.predict(test[predictors])
        predictions = pd.Series(predictions, index=test.index)
        combined = pd.concat([test['nextWAR'], predictions], axis=1)
        combined.columns = ['actual', 'predicted']

        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [36]:
predictions = backtest(batting, rr, predictors)

In [37]:
predictions

Unnamed: 0,actual,predicted
2614,1.2,1.486905
3387,1.4,0.821891
4567,-0.1,0.596370
4661,0.6,0.885452
1749,4.8,2.321248
...,...,...
2068,2.3,2.710649
4640,0.9,1.834531
6881,0.6,1.503899
6212,-0.4,1.719220


In [38]:
from sklearn.metrics import mean_squared_error

mean_squared_error(predictions['actual'], predictions['predicted'])

2.8011794216119004

In [39]:
batting['nextWAR'].describe()

count    5575.000000
mean        1.794816
std         1.996478
min        -3.400000
25%         0.300000
50%         1.500000
75%         2.900000
max        11.900000
Name: nextWAR, dtype: float64

In [40]:
2.8011794216119004 ** .5

1.673672435577494

In [54]:
# create player history function
def player_history(df):
    df = df.sort_values('Season')

    df['player_season'] = range(0, df.shape[0])
    df['war_corr'] = list(df[['player_season', 'WAR']].expanding().corr().loc[(slice(None), 'player_season'), 'WAR'])
    df['war_corr'] = df['war_corr'].fillna(1)

    df['war_diff'] = df['WAR'] / df['WAR'].shift(1)
    df['war_diff'] = df['war_diff'].fillna(1)

    df['war_diff'][df['war_diff'] == np.inf] = 1

    return df

batting = batting.groupby('IDfg', group_keys=False).apply(player_history)

In [55]:
# define group averages function
def group_averages(df):
    return df['WAR'] / df['WAR'].mean()

In [56]:
batting['war_season'] = batting.groupby('Season', group_keys=False).apply(group_averages)

In [57]:
new_predictors = predictors + ['player_season', 'war_corr','war_season', 'war_diff']

In [58]:
predictions = backtest(batting, rr, new_predictors)

In [59]:
mean_squared_error(predictions['actual'], predictions['predicted'])

2.709942658548639

In [61]:
pd.Series(rr.coef_, index=new_predictors).sort_values()

Age             -2.658698
BABIP           -1.752406
WAR             -1.742232
SLG+            -1.318259
Soft%+          -1.208705
BU              -0.950315
SO              -0.709608
PH              -0.706024
war_diff        -0.586725
wGDP            -0.363992
wCH             -0.277163
CH%             -0.273385
CB%             -0.272780
Pull%+          -0.158075
war_corr        -0.128438
player_season   -0.007356
O-Contact%       0.228793
OBP+             0.533119
Oppo%            0.680364
Spd              0.799780
SB               1.061534
IBB              1.630398
Hard%+           2.346631
war_season       3.443354
dtype: float64

In [62]:
diff = predictions['actual'] - predictions['predicted']

In [63]:
diff

2614   -0.300209
3387    0.913096
4567   -0.499657
4661   -0.292583
1749    2.739979
          ...   
2068   -0.349892
4640   -0.781653
6881   -0.606054
6212   -1.830561
5048    2.673661
Length: 4127, dtype: float64

In [64]:
merged = predictions.merge(batting, left_index=True, right_index=True)

In [65]:
merged['diff'] = (merged['actual'] - merged['predicted']).abs()

In [68]:
merged[['IDfg', 'Season', 'Name', 'WAR', 'nextWAR', 'diff']].sort_values('diff', ascending=False)

Unnamed: 0,IDfg,Season,Name,WAR,nextWAR,diff
324,15640,2021,Aaron Judge,0.552795,11.4,7.490491
3127,11579,2014,Bryce Harper,0.310559,9.3,7.486271
856,9166,2010,Buster Posey,0.459627,10.1,6.624996
3575,1875,2009,Josh Hamilton,0.291925,8.4,6.355356
3264,4810,2007,Brian McCann,0.304348,8.6,6.342718
...,...,...,...,...,...,...
2251,826,2010,Derek Jeter,0.354037,2.2,0.004464
1364,731,2007,Torii Hunter,0.409938,2.4,0.003953
2854,1825,2012,David DeJesus,0.322981,2.0,0.002951
2878,5227,2013,Jon Jay,0.322981,1.7,0.001808
