In [1]:
import os
import pandas as pd
import numpy as np
from pybaseball import batting_stats

In [2]:
START = 2002
END = 2022

In [3]:
if os.path.exists("batting.csv"):
    batting = pd.read_csv("batting.csv", index_col=0)
else:
    batting = batting_stats(START, END, qual=200)
    batting.to_csv("batting.csv")

In [4]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA
0,1109,2002,Barry Bonds,SFG,37,143,403,612,149,70,...,,,,,0,0.127,0.191,,,
1,1109,2004,Barry Bonds,SFG,39,147,373,617,135,60,...,,,,,0,0.124,0.164,,,
8,15640,2022,Aaron Judge,NYY,30,157,570,696,177,87,...,0.262,118.4,246.0,0.609,404,0.169,0.287,,,
15,13611,2018,Mookie Betts,BOS,25,136,520,614,180,96,...,0.131,110.6,217.0,0.500,434,0.220,0.270,,,
2,1109,2003,Barry Bonds,SFG,38,130,390,550,133,65,...,,,,,0,0.135,0.223,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7084,9362,2010,Adam Moore,SEA,26,60,205,218,40,30,...,,,0.0,,0,0.181,0.325,,,
7042,9272,2018,Chris Davis,BAL,32,128,470,522,79,51,...,0.096,111.8,113.0,0.401,282,0.174,0.316,,,
6673,319,2011,Adam Dunn,CHW,31,122,415,496,66,39,...,,,0.0,,0,0.169,0.295,,,
6988,620,2002,Neifi Perez,KCR,29,145,554,585,131,104,...,,,,,0,0.130,0.187,,,


In [16]:
# IDfg is unique ID for each player, split the data based on IDfg and each player has its own group.
# filter function is saying loop thruough each player group, only keep the players who played
# more than 1 seasons. Becouse only 1 season doesn't help with the prediction.
batting = batting.groupby("IDfg", group_keys=False).filter(lambda x: x.shape[0]>1)

In [6]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA
0,1109,2002,Barry Bonds,SFG,37,143,403,612,149,70,...,,,,,0,0.127,0.191,,,
1,1109,2004,Barry Bonds,SFG,39,147,373,617,135,60,...,,,,,0,0.124,0.164,,,
8,15640,2022,Aaron Judge,NYY,30,157,570,696,177,87,...,0.262,118.4,246.0,0.609,404,0.169,0.287,,,
15,13611,2018,Mookie Betts,BOS,25,136,520,614,180,96,...,0.131,110.6,217.0,0.500,434,0.220,0.270,,,
2,1109,2003,Barry Bonds,SFG,38,130,390,550,133,65,...,,,,,0,0.135,0.223,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6885,1698,2010,Gerald Laird,DET,30,89,270,299,56,40,...,,,0.0,,0,0.166,0.252,,,
7042,9272,2018,Chris Davis,BAL,32,128,470,522,79,51,...,0.096,111.8,113.0,0.401,282,0.174,0.316,,,
6673,319,2011,Adam Dunn,CHW,31,122,415,496,66,39,...,,,0.0,,0,0.169,0.295,,,
6988,620,2002,Neifi Perez,KCR,29,145,554,585,131,104,...,,,,,0,0.130,0.187,,,


In [7]:
def next_season(player):
    player = player.sort_values("Season")
    player['Next_WAR'] = player['WAR'].shift(-1)
    return player
batting = batting.groupby('IDfg', group_keys=False).apply(next_season)
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,Next_WAR
5562,1,2006,Alfredo Amezaga,FLA,28,132,334,378,87,72,...,,,,0,0.188,0.256,,,,2.0
5006,1,2007,Alfredo Amezaga,FLA,29,133,400,448,105,80,...,,0.0,,0,0.175,0.227,,,,1.2
5252,1,2008,Alfredo Amezaga,FLA,30,125,311,337,82,61,...,,0.0,,0,0.178,0.244,,,,
1169,2,2002,Garret Anderson,ANA,30,158,638,678,195,107,...,,,,0,0.137,0.232,,,,5.1
864,2,2003,Garret Anderson,ANA,31,159,638,673,201,119,...,,,,0,0.164,0.252,,,,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6002,24655,2022,Owen Miller,CLE,25,130,424,472,103,70,...,109.1,106.0,0.312,340,0.188,0.266,,,,
4881,26197,2021,Andrew Vaughn,CHW,23,127,417,469,98,61,...,115.0,149.0,0.464,321,0.185,0.285,,,,-0.4
3377,26197,2022,Andrew Vaughn,CHW,24,134,510,555,138,92,...,112.1,203.0,0.484,419,0.201,0.291,,,,
6620,27506,2021,Ha-seong Kim,SDP,25,117,267,298,54,32,...,112.3,64.0,0.318,201,0.216,0.303,,,,3.7


In [18]:
null_count = batting.isnull().sum()

complete_cols = list(batting.columns[null_count==0])
batting = batting[complete_cols + ["Next_WAR"]].copy()
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,Cent%+,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,team_code,Next_WAR
5562,1,2006,Alfredo Amezaga,FLA,28,132,334,378,87,72,...,107,113,143,109,63,0,0.188,0.256,12,2.0
5006,1,2007,Alfredo Amezaga,FLA,29,133,400,448,105,80,...,101,112,109,113,75,0,0.175,0.227,12,1.2
5252,1,2008,Alfredo Amezaga,FLA,30,125,311,337,82,61,...,101,101,123,111,64,0,0.178,0.244,12,
1169,2,2002,Garret Anderson,ANA,30,158,638,678,195,107,...,91,80,65,97,129,0,0.137,0.232,1,5.1
864,2,2003,Garret Anderson,ANA,31,159,638,673,201,119,...,101,80,90,99,109,0,0.164,0.252,1,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6002,24655,2022,Owen Miller,CLE,25,130,424,472,103,70,...,111,97,131,100,83,340,0.188,0.266,9,
4881,26197,2021,Andrew Vaughn,CHW,23,127,417,469,98,61,...,104,116,84,99,110,321,0.185,0.285,7,-0.4
3377,26197,2022,Andrew Vaughn,CHW,24,134,510,555,138,92,...,106,111,94,100,104,419,0.201,0.291,7,
6620,27506,2021,Ha-seong Kim,SDP,25,117,267,298,54,32,...,99,59,137,96,88,201,0.216,0.303,26,3.7


In [8]:
batting.dtypes[batting.dtypes=="object"]

Name       object
Team       object
Dol        object
Age Rng    object
dtype: object

In [9]:
del batting['Age Rng']
del batting['Dol']

In [19]:
batting['team_code'] = batting['Team'].astype("category").cat.codes

In [20]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,Cent%+,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,team_code,Next_WAR
5562,1,2006,Alfredo Amezaga,FLA,28,132,334,378,87,72,...,107,113,143,109,63,0,0.188,0.256,12,2.0
5006,1,2007,Alfredo Amezaga,FLA,29,133,400,448,105,80,...,101,112,109,113,75,0,0.175,0.227,12,1.2
5252,1,2008,Alfredo Amezaga,FLA,30,125,311,337,82,61,...,101,101,123,111,64,0,0.178,0.244,12,
1169,2,2002,Garret Anderson,ANA,30,158,638,678,195,107,...,91,80,65,97,129,0,0.137,0.232,1,5.1
864,2,2003,Garret Anderson,ANA,31,159,638,673,201,119,...,101,80,90,99,109,0,0.164,0.252,1,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6002,24655,2022,Owen Miller,CLE,25,130,424,472,103,70,...,111,97,131,100,83,340,0.188,0.266,9,
4881,26197,2021,Andrew Vaughn,CHW,23,127,417,469,98,61,...,104,116,84,99,110,321,0.185,0.285,7,-0.4
3377,26197,2022,Andrew Vaughn,CHW,24,134,510,555,138,92,...,106,111,94,100,104,419,0.201,0.291,7,
6620,27506,2021,Ha-seong Kim,SDP,25,117,267,298,54,32,...,99,59,137,96,88,201,0.216,0.303,26,3.7


In [22]:
batting_full = batting.copy()
batting = batting.dropna().copy()

In [23]:
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

In [24]:
rr = Ridge(alpha=1)

split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(rr, n_features_to_select=20, 
                               direction = "forward",
                               cv=split,
                               n_jobs=8)

In [25]:
removed_columns = ['Next_WAR', "Name","Team","IDfg","Season"]
selected_columns = batting.columns[~batting.columns.isin(removed_columns)]

In [26]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
batting.loc[:,selected_columns] = scaler.fit_transform(batting[selected_columns])

In [27]:
sfs.fit(batting[selected_columns], batting['Next_WAR'])

In [28]:
predictors = list(selected_columns[sfs.get_support()])
predictors

['Age',
 'IBB',
 'SO',
 'SB',
 'BU',
 'BABIP',
 'WAR',
 'Spd',
 'PH',
 'CB%',
 'CH%',
 'wCH',
 'O-Contact%',
 'wGDP',
 'Oppo%',
 'OBP+',
 'SLG+',
 'Pull%+',
 'Soft%+',
 'Hard%+']

In [31]:
def backtest(data, model, predictors, start=5,step=1):
    all_predictions = []
    
    years = sorted(data['Season'].unique())
    
    for i in range(start, len(years), step):
        current_year = years[i]
        train = data[data['Season'] < current_year]
        test = data[data['Season']==current_year]
        
        model.fit(train[predictors],train['Next_WAR'])
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        
        combined = pd.concat([test['Next_WAR'], preds], axis=1)
        combined.columns = ["actual","prediction"]
        
        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [32]:
predictions = backtest(batting, rr, predictors)

In [33]:
predictions.shape

(4127, 2)

In [36]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(predictions["actual"], predictions["prediction"])
mse

2.8011794216119004

In [39]:
def player_history(df):
    df = df.sort_values("Season")
    
    df['player_season'] = range(0, df.shape[0])
    df['war_corr'] = list(df[['player_season','WAR']].expanding().corr().loc[(slice(None),'player_season'),'WAR'])
    df['war_corr'].fillna(0,inplace=True)
    
    df['war_diff'] = df['WAR'] / df['WAR'].shift(1)
    df['war_diff'].fillna(1, inplace=True)
    
    df['war_diff'][df['war_diff']== np.inf]=1
    return df


In [40]:
batting = batting.groupby('IDfg', group_keys=False).apply(player_history)

In [41]:
def group_averages(df):
    return df['WAR']/df['WAR'].mean()

In [43]:
batting['war_season'] = batting.groupby('Season', group_keys=False).apply(group_averages)

In [44]:
new_predictors = predictors + ['player_season','war_corr','war_season','war_diff']

In [45]:
predictions = backtest(batting, rr, new_predictors)

In [46]:
mean_squared_error(predictions["actual"], predictions["prediction"])

2.7141271122632338

In [47]:
pd.Series(rr.coef_, index=new_predictors).sort_values()

Age             -2.613361
WAR             -1.873384
BABIP           -1.784208
SLG+            -1.347609
Soft%+          -1.200109
BU              -0.975275
PH              -0.702824
SO              -0.668198
war_diff        -0.588878
wGDP            -0.363427
CB%             -0.320678
wCH             -0.282596
CH%             -0.245154
Pull%+          -0.161653
war_corr        -0.092850
player_season    0.000959
O-Contact%       0.249812
OBP+             0.531483
Oppo%            0.676553
Spd              0.792787
SB               1.089041
IBB              1.681935
Hard%+           2.349468
war_season       3.478724
dtype: float64

In [49]:
diff = predictions["actual"] - predictions["prediction"]

In [50]:
diff

5006   -0.226571
1925    0.789842
3102   -0.607188
5797   -0.357194
1109    2.654835
          ...   
1914   -0.384865
5875   -0.857594
7032   -0.681476
4881   -1.905877
6620    2.585811
Length: 4127, dtype: float64

In [51]:
merged = predictions.merge(batting,left_index=True, right_index=True)

In [52]:
merged

Unnamed: 0,actual,prediction,IDfg,Season,Name,Team,Age,G,AB,PA,...,Hard%+,Events,CStr%,CSW%,team_code,Next_WAR,player_season,war_corr,war_diff,war_season
5006,1.2,1.426571,1,2007,Alfredo Amezaga,FLA,0.384615,0.743590,0.431655,0.429066,...,0.292517,0.000000,0.527660,0.396476,0.352941,1.2,1,1.000000,1.200000,0.998355
1925,1.4,0.610158,2,2007,Garret Anderson,LAA,0.615385,0.529915,0.462230,0.432526,...,0.523810,0.000000,0.442553,0.480176,0.441176,1.4,5,-0.692192,1.371429,0.887427
3102,-0.1,0.507188,10,2007,David Eckstein,STL,0.500000,0.606838,0.492806,0.491349,...,0.265306,0.000000,0.676596,0.436123,0.852941,-0.1,5,-0.694330,0.836735,0.758010
5797,0.6,0.957194,11,2007,Darin Erstad,CHW,0.538462,0.350427,0.269784,0.254325,...,0.380952,0.000000,0.765957,0.691630,0.205882,0.6,4,-0.828562,0.803922,0.758010
1109,4.8,2.145165,15,2007,Troy Glaus,TOR,0.423077,0.589744,0.404676,0.442907,...,0.680272,0.000000,0.634043,0.704846,0.970588,4.8,5,0.231396,0.897059,1.127772
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1914,2.3,2.684865,23667,2021,Wander Franco,TBR,0.038462,0.205128,0.217626,0.186851,...,0.394558,0.409015,0.391489,0.352423,0.911765,2.3,0,0.000000,1.000000,1.060031
5875,0.9,1.757594,24618,2021,Ryan Jeffers,MIN,0.192308,0.333333,0.192446,0.160900,...,0.619048,0.265442,0.514894,0.788546,0.558824,0.9,0,0.000000,1.000000,0.749333
7032,0.6,1.281476,24655,2021,Owen Miller,CLE,0.192308,0.119658,0.055755,0.003460,...,0.394558,0.230384,0.548936,0.700441,0.264706,0.6,0,0.000000,1.000000,0.438634
4881,-0.4,1.505877,26197,2021,Andrew Vaughn,CHW,0.153846,0.692308,0.462230,0.465398,...,0.530612,0.535893,0.570213,0.651982,0.205882,-0.4,0,0.000000,1.000000,0.566569


In [53]:
merged['diff'] = (predictions['actual'] - predictions['prediction']).abs()

In [54]:
merged[['IDfg','Season','Name','WAR','Next_WAR','diff']].sort_values(['diff'])

Unnamed: 0,IDfg,Season,Name,WAR,Next_WAR,diff
4115,5352,2014,Yangervis Solarte,0.316770,1.8,0.000725
567,3442,2010,Dan Uggla,0.496894,2.3,0.000737
3902,1388,2007,Greg Norton,0.192547,0.4,0.002451
4389,6589,2013,Sean Rodriguez,0.267081,0.2,0.002630
4179,12510,2019,Curt Casali,0.273292,0.5,0.003488
...,...,...,...,...,...,...
3161,4810,2007,Brian McCann,0.304348,8.6,6.373020
3823,1875,2009,Josh Hamilton,0.291925,8.4,6.392358
871,9166,2010,Buster Posey,0.459627,10.1,6.580159
2517,11579,2014,Bryce Harper,0.310559,9.3,7.518072
