In [1]:
import os
import pandas as pd
import numpy as np
from pybaseball import batting_stats

In [2]:
start = 2002
end = 2023

In [3]:
batting = batting_stats(start, end, qual=200)

In [4]:
batting.to_csv("batting.csv")

In [5]:
batting = batting.groupby("IDfg", group_keys=False).filter(lambda x : x.shape[0] > 1)
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
0,1109,2002,Barry Bonds,SFG,37,143,403,612,149,70,...,,,,0,0.127,0.191,,,,12.7
1,1109,2004,Barry Bonds,SFG,39,147,373,617,135,60,...,,,,0,0.124,0.164,,,,11.9
8,15640,2022,Aaron Judge,NYY,30,157,570,696,177,87,...,118.4,246.0,0.609,404,0.169,0.287,,,,11.2
15,13611,2018,Mookie Betts,BOS,25,136,520,614,180,96,...,110.6,217.0,0.500,434,0.220,0.270,,,,10.4
2,1109,2003,Barry Bonds,SFG,38,130,390,550,133,65,...,,,,0,0.135,0.223,,,,10.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6976,1698,2010,Gerald Laird,DET,30,89,270,299,56,40,...,,0.0,,0,0.166,0.252,,,,-2.4
7133,9272,2018,Chris Davis,BAL,32,128,470,522,79,51,...,111.8,113.0,0.401,282,0.174,0.316,,,,-3.1
6764,319,2011,Adam Dunn,CHW,31,122,415,496,66,39,...,,0.0,,0,0.169,0.295,,,,-2.9
7079,620,2002,Neifi Perez,KCR,29,145,554,585,131,104,...,,,,0,0.130,0.187,,,,-2.9


In [6]:
def next_season(player):
    player = player.sort_values("Season")
    player["Next_WAR"] = player["WAR"].shift(-1)
    return player

batting = batting.groupby("IDfg", group_keys=False).apply(next_season)

In [7]:
batting[["Name","Season","WAR","Next_WAR"]]

Unnamed: 0,Name,Season,WAR,Next_WAR
5644,Alfredo Amezaga,2006,1.1,2.0
5081,Alfredo Amezaga,2007,2.0,1.2
5330,Alfredo Amezaga,2008,1.2,
1187,Garret Anderson,2002,3.7,5.1
875,Garret Anderson,2003,5.1,0.8
...,...,...,...,...
5492,Spencer Torkelson,2023,-0.2,
6711,Ha-seong Kim,2021,0.5,3.7
4464,Ha-seong Kim,2022,3.7,
1122,Vinnie Pasquantino,2022,1.5,0.5


In [8]:
batting.keys()

Index(['IDfg', 'Season', 'Name', 'Team', 'Age', 'G', 'AB', 'PA', 'H', '1B',
       ...
       'HardHit', 'HardHit%', 'Events', 'CStr%', 'CSW%', 'xBA', 'xSLG',
       'xwOBA', 'L-WAR', 'Next_WAR'],
      dtype='object', length=321)

In [9]:
null_count = batting.isnull().sum()
null_count

IDfg           0
Season         0
Name           0
Team           0
Age            0
            ... 
xBA         6856
xSLG        6856
xwOBA       6856
L-WAR          0
Next_WAR    1194
Length: 321, dtype: int64

In [10]:
complete_cols = list(batting.columns[null_count == 0])
batting = batting[complete_cols + ["Next_WAR"]].copy()

In [11]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,Cent%+,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,L-WAR,Next_WAR
5644,1,2006,Alfredo Amezaga,FLA,28,132,334,378,87,72,...,107,113,143,109,63,0,0.188,0.256,1.1,2.0
5081,1,2007,Alfredo Amezaga,FLA,29,133,400,448,105,80,...,101,112,109,113,75,0,0.175,0.227,2.0,1.2
5330,1,2008,Alfredo Amezaga,FLA,30,125,311,337,82,61,...,101,101,123,111,64,0,0.178,0.244,1.2,
1187,2,2002,Garret Anderson,ANA,30,158,638,678,195,107,...,91,80,65,97,129,0,0.137,0.232,3.7,5.1
875,2,2003,Garret Anderson,ANA,31,159,638,673,201,119,...,101,80,90,99,109,0,0.164,0.252,5.1,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5492,27465,2023,Spencer Torkelson,DET,23,49,185,209,44,28,...,79,109,97,94,111,146,0.181,0.266,-0.2,
6711,27506,2021,Ha-seong Kim,SDP,25,117,267,298,54,32,...,99,59,137,96,88,201,0.216,0.303,0.6,3.7
4464,27506,2022,Ha-seong Kim,SDP,26,150,517,582,130,87,...,104,71,117,100,90,424,0.205,0.276,4.0,
1122,27676,2022,Vinnie Pasquantino,KCR,24,72,258,298,76,56,...,104,87,79,93,123,226,0.162,0.228,1.5,0.5


In [12]:
batting.dtypes

IDfg          int64
Season        int64
Name         object
Team         object
Age           int64
             ...   
Events        int64
CStr%       float64
CSW%        float64
L-WAR       float64
Next_WAR    float64
Length: 133, dtype: object

In [13]:
batting.dtypes[batting.dtypes == "object"]

Name       object
Team       object
Dol        object
Age Rng    object
dtype: object

In [14]:
batting["Dol"]

5644      $5.5
5081     $11.2
5330      $7.2
1187     $14.6
875      $22.0
         ...  
5492    ($1.4)
6711      $3.9
4464     $29.4
1122     $11.9
2001      $3.9
Name: Dol, Length: 6856, dtype: object

In [15]:
del batting["Dol"]

In [16]:
batting["Age Rng"]

5644    28 - 28
5081    29 - 29
5330    30 - 30
1187    30 - 30
875     31 - 31
         ...   
5492    23 - 23
6711    25 - 25
4464    26 - 26
1122    24 - 24
2001    25 - 25
Name: Age Rng, Length: 6856, dtype: object

In [17]:
del batting["Age Rng"]

In [18]:
batting["team_code"] = batting["Team"].astype("category").cat.codes

In [19]:
batting_full = batting.copy()
batting = batting.dropna()

In [20]:
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

rr = Ridge(alpha=1)

split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(rr , 
                                n_features_to_select=20, 
                                direction="forward", cv=split, 
                                n_jobs=8)

In [21]:
removed_columns = ["Next_WAR", "Season", "Name", "Team", "IDfg"]
selected_cloumns = batting.columns[~ batting.columns.isin(removed_columns)]

In [22]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
batting.loc[:, selected_cloumns] = scaler.fit_transform(batting[selected_cloumns])

In [23]:
batting.describe()

Unnamed: 0,IDfg,Season,Age,G,AB,PA,H,1B,2B,3B,...,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,L-WAR,Next_WAR,team_code
count,5662.0,5662.0,5662.0,5662.0,5662.0,5662.0,5662.0,5662.0,5662.0,5662.0,...,5662.0,5662.0,5662.0,5662.0,5662.0,5662.0,5662.0,5662.0,5662.0,5662.0
mean,5540.199576,2011.329036,0.360011,0.654743,0.481034,0.483512,0.367272,0.291152,0.400516,0.103244,...,0.403193,0.41064,0.510438,0.479323,0.180586,0.498221,0.545953,0.33581,1.782321,0.474074
std,5317.675311,5.724802,0.147294,0.255709,0.242331,0.262136,0.182276,0.138524,0.171705,0.105536,...,0.131126,0.120856,0.130216,0.133694,0.279013,0.137213,0.120666,0.120318,1.986874,0.305195
min,1.0,2002.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.4,0.0
25%,1152.0,2006.0,0.269231,0.478632,0.278777,0.261246,0.215517,0.179245,0.258621,0.043478,...,0.315789,0.331461,0.42029,0.387755,0.0,0.408511,0.46696,0.248447,0.3,0.205882
50%,3711.0,2011.0,0.346154,0.717949,0.510791,0.512976,0.37069,0.287736,0.37931,0.086957,...,0.398496,0.404494,0.507246,0.489796,0.0,0.493617,0.546256,0.31677,1.4,0.470588
75%,9218.0,2016.0,0.461538,0.871795,0.690647,0.712803,0.512931,0.391509,0.517241,0.130435,...,0.488722,0.483146,0.594203,0.571429,0.37187,0.591489,0.625551,0.403727,2.9,0.735294
max,27676.0,2022.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,11.9,1.0


In [24]:
sfs.fit(batting[selected_cloumns], batting["Next_WAR"])

In [25]:
sfs.get_support()

array([ True, False, False, False, False, False, False, False, False,
       False, False, False,  True,  True, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False,  True, False, False, False,  True,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True,  True, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False,  True, False,
        True, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False,  True, False, False, False, False, False,
       False,  True,  True, False,  True, False, False, False,  True,
       False])

In [26]:
predictors = list(selected_cloumns[sfs.get_support()])

In [27]:
predictors

['Age',
 'IBB',
 'SO',
 'SB',
 'BU',
 'BB/K',
 'OBP',
 'WAR',
 'Spd',
 'PH',
 'CB%',
 'CH%',
 'wCH',
 'Off',
 'SLG+',
 'BABIP+',
 'Oppo%+',
 'Soft%+',
 'Hard%+',
 'L-WAR']

In [28]:
def backtest(data, model, predictors, start=5, step=1):
    all_predictions = []
    predictor2 = predictors
    
    years = sorted(data["Season"].unique())
    
    for i in range(start, len(years), step):
        current_year = years[i]
        train = data[data["Season"] < current_year]
        test = data[data["Season"] == current_year]
        
        model.fit(train[predictors], train["Next_WAR"])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["Next_WAR"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        
        all_predictions.append(combined)
    return pd.concat(all_predictions)



In [29]:
predictions = backtest(batting, rr, predictors)


In [30]:
predictions.shape

(4214, 2)

In [31]:
from sklearn.metrics import mean_squared_error
mean_squared_error(predictions["actual"],predictions["prediction"])

2.805356256317713

In [32]:
def player_history(df):
    df = df.sort_values("Season")

    df["player_season"] = range(0, df.shape[0])
    df["war_corr"] = list(df[["player_season", "WAR"]].expanding().corr().loc[(slice(None), "player_season"),"WAR"])
    
    df["war_corr"].fillna(0, inplace=True)

    df["war_diff"] = df["WAR"] / df["WAR"].shift(1)
    df["war_diff"].fillna(1, inplace=True)
    df["war_diff"][df["war_diff"]== np.inf] = 1

    return df

batting = batting.groupby("IDfg", group_keys=False).apply(player_history)

In [33]:
def group_averages(df):
    return df["WAR"] / df["WAR"].mean()

In [34]:
batting["war_season"] =  batting.groupby("Season", group_keys=False).apply(group_averages)

In [35]:
new_predictors = predictors + ["player_season", "war_corr", "war_diff", "war_season"]

In [36]:
predictions = backtest(batting, rr, new_predictors)
predictions

Unnamed: 0,actual,prediction
5081,1.2,1.499834
1962,1.4,0.655483
3153,-0.1,0.462730
5883,0.6,1.035929
1124,4.8,2.262742
...,...,...
3430,-0.1,0.428981
1640,1.5,2.643216
5978,0.8,1.580127
6643,-0.2,0.925871


In [37]:
from sklearn.metrics import mean_squared_error
mean_squared_error(predictions["actual"],predictions["prediction"])

2.689545726239883

In [41]:
pd.Series(rr.coef_, index=new_predictors).sort_values()

Age             -2.706506
BABIP+          -1.826422
WAR             -1.605663
Soft%+          -1.219779
BU              -1.158990
SLG+            -1.105167
Off             -0.956173
SO              -0.767053
PH              -0.755707
war_diff        -0.579505
CB%             -0.315083
CH%             -0.280426
wCH             -0.221231
war_corr        -0.105409
L-WAR           -0.011950
player_season    0.008234
BB/K             0.189362
Oppo%+           0.706887
Spd              0.737909
OBP              0.890412
SB               1.142129
IBB              1.557587
Hard%+           2.193653
war_season       3.547601
dtype: float64

In [43]:
diff = predictions["actual"] - predictions["prediction"]
diff

5081   -0.299834
1962    0.744517
3153   -0.562730
5883   -0.435929
1124    2.537258
          ...   
3430   -0.528981
1640   -1.143216
5978   -0.780127
6643   -1.125871
1122   -0.978795
Length: 4214, dtype: float64

In [44]:
merged = predictions.merge(batting, left_index=True, right_index=True)
merged

Unnamed: 0,actual,prediction,IDfg,Season,Name,Team,Age,G,AB,PA,...,Events,CStr%,CSW%,L-WAR,Next_WAR,team_code,player_season,war_corr,war_diff,war_season
5081,1.2,1.499834,1,2007,Alfredo Amezaga,FLA,0.384615,0.743590,0.431655,0.429066,...,0.000000,0.527660,0.396476,0.335404,1.2,0.352941,1,1.000000,1.200000,0.998355
1962,1.4,0.655483,2,2007,Garret Anderson,LAA,0.615385,0.529915,0.462230,0.432526,...,0.000000,0.442553,0.480176,0.298137,1.4,0.441176,5,-0.692192,1.371429,0.887427
3153,-0.1,0.462730,10,2007,David Eckstein,STL,0.500000,0.606838,0.492806,0.491349,...,0.000000,0.676596,0.436123,0.254658,-0.1,0.852941,5,-0.694330,0.836735,0.758010
5883,0.6,1.035929,11,2007,Darin Erstad,CHW,0.538462,0.350427,0.269784,0.254325,...,0.000000,0.765957,0.691630,0.254658,0.6,0.205882,4,-0.828562,0.803922,0.758010
1124,4.8,2.262742,15,2007,Troy Glaus,TOR,0.423077,0.589744,0.404676,0.442907,...,0.000000,0.634043,0.704846,0.378882,4.8,0.970588,5,0.231396,0.897059,1.127772
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3430,-0.1,0.428981,26197,2022,Andrew Vaughn,CHW,0.192308,0.752137,0.629496,0.614187,...,0.699499,0.638298,0.678414,0.236025,-0.1,0.205882,1,-1.000000,0.935484,0.433303
1640,1.5,2.643216,26288,2022,Adley Rutschman,BAL,0.192308,0.572650,0.428058,0.467128,...,0.525876,0.617021,0.537445,0.540373,1.5,0.117647,0,0.000000,1.000000,1.299910
5978,0.8,1.580127,26294,2022,Bryson Stott,PHI,0.192308,0.692308,0.480216,0.460208,...,0.567613,0.651064,0.585903,0.322981,0.8,0.705882,0,0.000000,1.000000,0.717192
6643,-0.2,0.925871,27465,2022,Spencer Torkelson,DET,0.115385,0.547009,0.359712,0.352941,...,0.439065,0.506383,0.607930,0.155280,-0.2,0.323529,0,0.000000,1.000000,0.358596


In [46]:
merged["diff"] = predictions["actual"] - predictions["prediction"].abs()
merged

Unnamed: 0,actual,prediction,IDfg,Season,Name,Team,Age,G,AB,PA,...,CStr%,CSW%,L-WAR,Next_WAR,team_code,player_season,war_corr,war_diff,war_season,diff
5081,1.2,1.499834,1,2007,Alfredo Amezaga,FLA,0.384615,0.743590,0.431655,0.429066,...,0.527660,0.396476,0.335404,1.2,0.352941,1,1.000000,1.200000,0.998355,-0.299834
1962,1.4,0.655483,2,2007,Garret Anderson,LAA,0.615385,0.529915,0.462230,0.432526,...,0.442553,0.480176,0.298137,1.4,0.441176,5,-0.692192,1.371429,0.887427,0.744517
3153,-0.1,0.462730,10,2007,David Eckstein,STL,0.500000,0.606838,0.492806,0.491349,...,0.676596,0.436123,0.254658,-0.1,0.852941,5,-0.694330,0.836735,0.758010,-0.562730
5883,0.6,1.035929,11,2007,Darin Erstad,CHW,0.538462,0.350427,0.269784,0.254325,...,0.765957,0.691630,0.254658,0.6,0.205882,4,-0.828562,0.803922,0.758010,-0.435929
1124,4.8,2.262742,15,2007,Troy Glaus,TOR,0.423077,0.589744,0.404676,0.442907,...,0.634043,0.704846,0.378882,4.8,0.970588,5,0.231396,0.897059,1.127772,2.537258
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3430,-0.1,0.428981,26197,2022,Andrew Vaughn,CHW,0.192308,0.752137,0.629496,0.614187,...,0.638298,0.678414,0.236025,-0.1,0.205882,1,-1.000000,0.935484,0.433303,-0.528981
1640,1.5,2.643216,26288,2022,Adley Rutschman,BAL,0.192308,0.572650,0.428058,0.467128,...,0.617021,0.537445,0.540373,1.5,0.117647,0,0.000000,1.000000,1.299910,-1.143216
5978,0.8,1.580127,26294,2022,Bryson Stott,PHI,0.192308,0.692308,0.480216,0.460208,...,0.651064,0.585903,0.322981,0.8,0.705882,0,0.000000,1.000000,0.717192,-0.780127
6643,-0.2,0.925871,27465,2022,Spencer Torkelson,DET,0.115385,0.547009,0.359712,0.352941,...,0.506383,0.607930,0.155280,-0.2,0.323529,0,0.000000,1.000000,0.358596,-1.125871


In [49]:
merged[["IDfg","Season","Name","WAR","Next_WAR","diff"]].sort_values(["diff"])

Unnamed: 0,IDfg,Season,Name,WAR,Next_WAR,diff
1803,2113,2007,Ryan Doumit,0.341615,-3.4,-4.929469
996,1201,2010,Carl Crawford,0.689441,0.0,-4.924563
156,393,2014,Victor Martinez,0.490683,-2.0,-4.646744
596,319,2010,Adam Dunn,0.397516,-2.9,-4.643894
2092,2090,2008,Alex Rios,0.552795,0.0,-4.418760
...,...,...,...,...,...,...
3212,4810,2007,Brian McCann,0.304348,8.6,6.441163
3296,5631,2010,Matt Kemp,0.211180,8.3,6.512663
882,9166,2010,Buster Posey,0.459627,10.1,6.592030
2561,11579,2014,Bryce Harper,0.310559,9.3,7.487125


In [50]:
from sklearn.metrics import mean_squared_error
mean_squared_error(predictions["actual"],predictions["prediction"])

2.689545726239883