In [2]:
import os
import pandas as pd
import numpy as np
from pybaseball import batting_stats

In [3]:
START = 2002
END = 2023

In [4]:
if os.path.exists("batting.csv"):
    batting = pd.read_csv("batting.csv", index_col=0)
else:
    batting = batting_stats(START, END, qual=200)
    batting.to_csv("batting.csv")

In [5]:
working_batting = pd.read_csv('batting.csv')

In [6]:
working_batting = working_batting.groupby('IDfg', group_keys=False).filter(lambda x: x.shape[0] > 1)

In [7]:
working_batting

Unnamed: 0.1,Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,...,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
0,0,1109,2002,Barry Bonds,SFG,37,143,403,612,149,...,,,,0,0.127,0.191,,,,12.7
1,1,1109,2004,Barry Bonds,SFG,39,147,373,617,135,...,,,,0,0.124,0.164,,,,11.9
2,8,15640,2022,Aaron Judge,NYY,30,157,570,696,177,...,118.4,246.0,0.609,404,0.169,0.287,,,,11.6
3,2,1109,2003,Barry Bonds,SFG,38,130,390,550,133,...,,,,0,0.135,0.223,,,,10.2
4,15,13611,2018,Mookie Betts,BOS,25,136,520,614,180,...,110.6,217.0,0.500,434,0.220,0.270,,,,10.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7471,7400,9272,2018,Chris Davis,BAL,32,128,470,522,79,...,111.8,113.0,0.401,282,0.174,0.316,,,,-2.6
7472,6861,45,2012,Rod Barajas,PIT,36,104,321,361,66,...,,0.0,,0,0.147,0.258,,,,-2.6
7473,7008,319,2011,Adam Dunn,CHW,31,122,415,496,66,...,,0.0,,0,0.169,0.295,,,,-2.9
7474,7344,620,2002,Neifi Perez,KCR,29,145,554,585,131,...,,,,0,0.130,0.187,,,,-2.9


In [8]:
def next_season(player):
    player = player.sort_values("Season")
    player['Next_WAR'] = player['WAR'].shift(-1)
    return player

working_batting = working_batting.groupby("IDfg", group_keys=False).apply(next_season, include_groups=True)

  working_batting = working_batting.groupby("IDfg", group_keys=False).apply(next_season, include_groups=True)


In [9]:
working_batting[['IDfg', 'Name', 'Season', 'WAR', 'Next_WAR']]

Unnamed: 0,IDfg,Name,Season,WAR,Next_WAR
4144,1,Alfredo Amezaga,2006,1.1,2.0
2739,1,Alfredo Amezaga,2007,2.0,1.2
3975,1,Alfredo Amezaga,2008,1.2,
1072,2,Garret Anderson,2002,3.7,5.1
439,2,Garret Anderson,2003,5.1,0.8
...,...,...,...,...,...
771,27506,Ha-seong Kim,2023,4.3,
3559,27676,Vinnie Pasquantino,2022,1.4,0.1
5903,27676,Vinnie Pasquantino,2023,0.1,
2974,30116,Seiya Suzuki,2022,1.8,3.0


In [10]:
null_count = working_batting.isnull().sum()

In [11]:
null_count

Unnamed: 0       0
IDfg             0
Season           0
Name             0
Team             0
              ... 
xBA           7092
xSLG          7092
xwOBA         7092
L-WAR            0
Next_WAR      1235
Length: 322, dtype: int64

In [12]:
complete_cols = list(working_batting.columns[null_count == 0])

In [13]:
working_batting = working_batting[complete_cols + ['Next_WAR']].copy()

In [14]:
working_batting.dtypes[working_batting.dtypes == "object"]

Name       object
Team       object
Dol        object
Age Rng    object
dtype: object

In [15]:
del working_batting['Dol']
del working_batting['Age Rng']

In [16]:
working_batting['team_code'] = working_batting['Team'].astype('category').cat.codes

In [17]:
batting_full = working_batting.copy()
working_batting = working_batting.dropna()

In [18]:
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

rr = Ridge(alpha=1)

split = TimeSeriesSplit(n_splits = 3)

sfs = SequentialFeatureSelector(rr, n_features_to_select=20, direction='forward', cv=split, n_jobs=4)



In [19]:
removed_columns = ['Next_WAR', 'Name', 'Team', 'IDfg', 'Season']
selected_columns = working_batting.columns[~working_batting.columns.isin(removed_columns)]

In [20]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
working_batting.loc[:, selected_columns] = scaler.fit_transform(working_batting[selected_columns])

  working_batting.loc[:, selected_columns] = scaler.fit_transform(working_batting[selected_columns])
  working_batting.loc[:, selected_columns] = scaler.fit_transform(working_batting[selected_columns])
  working_batting.loc[:, selected_columns] = scaler.fit_transform(working_batting[selected_columns])
  working_batting.loc[:, selected_columns] = scaler.fit_transform(working_batting[selected_columns])
  working_batting.loc[:, selected_columns] = scaler.fit_transform(working_batting[selected_columns])
  working_batting.loc[:, selected_columns] = scaler.fit_transform(working_batting[selected_columns])
  working_batting.loc[:, selected_columns] = scaler.fit_transform(working_batting[selected_columns])
  working_batting.loc[:, selected_columns] = scaler.fit_transform(working_batting[selected_columns])
  working_batting.loc[:, selected_columns] = scaler.fit_transform(working_batting[selected_columns])
  working_batting.loc[:, selected_columns] = scaler.fit_transform(working_batting[selected_

In [21]:
sfs.fit(working_batting[selected_columns], working_batting['Next_WAR'])

In [22]:
batting_backup = working_batting.copy()

In [23]:
batting_backup

Unnamed: 0.1,Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,...,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,L-WAR,Next_WAR,team_code
4144,0.779666,1,2006,Alfredo Amezaga,FLA,0.346154,0.735043,0.312950,0.307958,0.245690,...,0.503759,0.662921,0.652174,0.210884,0.000000,0.582979,0.524229,0.265823,2.0,0.352941
2739,0.700870,1,2007,Alfredo Amezaga,FLA,0.384615,0.743590,0.431655,0.429066,0.323276,...,0.496241,0.471910,0.710145,0.292517,0.000000,0.527660,0.396476,0.322785,1.2,0.352941
1072,0.160535,2,2002,Garret Anderson,ANA,0.423077,0.957265,0.859712,0.826990,0.711207,...,0.255639,0.224719,0.478261,0.659864,0.000000,0.365957,0.418502,0.430380,5.1,0.029412
439,0.117993,2,2003,Garret Anderson,ANA,0.461538,0.965812,0.859712,0.818339,0.737069,...,0.255639,0.365169,0.507246,0.523810,0.000000,0.480851,0.506608,0.518987,0.8,0.029412
4588,0.358796,2,2004,Garret Anderson,ANA,0.500000,0.564103,0.507194,0.475779,0.443966,...,0.218045,0.297753,0.608696,0.448980,0.000000,0.531915,0.585903,0.246835,-0.2,0.029412
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7176,0.920134,27465,2022,Spencer Torkelson,DET,0.115385,0.547009,0.359712,0.352941,0.185345,...,0.436090,0.337079,0.550725,0.496599,0.439065,0.506383,0.607930,0.132911,1.3,0.323529
5510,0.930033,27506,2021,Ha-seong Kim,SDP,0.230769,0.606838,0.192446,0.169550,0.103448,...,0.097744,0.629213,0.463768,0.380952,0.335559,0.702128,0.731278,0.227848,3.7,0.764706
1120,0.614582,27506,2022,Ha-seong Kim,SDP,0.269231,0.888889,0.642086,0.660900,0.431034,...,0.187970,0.516854,0.521739,0.394558,0.707846,0.655319,0.612335,0.430380,4.3,0.764706
3559,0.152241,27676,2022,Vinnie Pasquantino,KCR,0.192308,0.222222,0.176259,0.169550,0.198276,...,0.308271,0.303371,0.420290,0.619048,0.377295,0.472340,0.400881,0.291139,0.1,0.411765


In [24]:
predictors = list(selected_columns[sfs.get_support()])

In [25]:
def backtest(data, model, predictors, start=5, step=1):
    all_predictions = []

    years = sorted(data['Season'].unique())

    for i in range(start, len(years), step):
        current_year = years[i]

        train = data[data['Season'] < current_year]
        test = data[data['Season'] == current_year]

        model.fit(train[predictors], train['Next_WAR'])

        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index = test.index)
        combined = pd.concat([test["Next_WAR"], preds], axis = 1)
        combined.columns = ['actual', 'predictions']

        all_predictions.append(combined)

    return pd.concat(all_predictions)



In [26]:
predictions = backtest(working_batting, rr, predictors)

In [27]:
predictions

Unnamed: 0,actual,predictions
2739,1.2,1.384595
3564,1.4,0.738473
4810,-0.1,0.515400
4898,0.6,0.983679
1841,4.8,2.256597
...,...,...
6360,0.0,1.714227
7176,1.3,1.652519
1120,4.3,2.789176
3559,0.1,2.078297


In [28]:
from sklearn.metrics import mean_squared_error

mean_squared_error(predictions['actual'], predictions['predictions'])

2.6787306746900406

In [29]:
working_batting['Next_WAR'].describe()

count    5857.000000
mean        1.791668
std         1.976187
min        -3.100000
25%         0.400000
50%         1.500000
75%         2.900000
max        11.900000
Name: Next_WAR, dtype: float64

In [30]:
2.693236744398038 ** .5

1.641108388985334

In [31]:
working_batting = working_batting.copy()

In [32]:
def player_history(df):
    df = df.sort_values("Season")

    df['player_season'] = range(0, df.shape[0])
    df['war_corr'] = list(df[['player_season', 'WAR']].expanding().corr().loc[(slice(None), 'player_season'), 'WAR'])
    df['war_corr'].fillna(1, inplace=True)

    df['war_diff'] = df['WAR'] / df['WAR'].shift(1)
    df['war_diff'].fillna(1, inplace=True)

    df['war_diff'][df['war_diff'] == np.inf] = 1

    return df

working_batting = working_batting.groupby("IDfg", group_keys=False).apply(player_history, include_groups=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['war_corr'].fillna(1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['war_diff'].fillna(1, inplace=True)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default beha

In [33]:
pd.options.mode.chained_assignment = None  # default='warn'

In [34]:
def group_averages(df):
    return df['WAR'] / df['WAR'].mean()

In [35]:
working_batting['war_season'] = batting.groupby('Season', group_keys=False).apply(group_averages)

  working_batting['war_season'] = batting.groupby('Season', group_keys=False).apply(group_averages)


In [36]:
new_predictors = predictors + ['player_season', 'war_corr', 'war_season', 'war_diff']

In [37]:
predictions = backtest(working_batting, rr, new_predictors)

In [38]:
mean_squared_error(predictions['actual'], predictions['predictions'])

2.6588928003409205

In [40]:
pd.Series(rr.coef_, index=new_predictors).sort_values()

Age             -2.758980
BABIP           -1.647857
G               -1.547315
SO              -1.350573
BU              -1.152043
Soft%           -0.953515
ISO             -0.634823
CH%             -0.387220
war_diff        -0.266007
CB%             -0.250442
war_corr        -0.141254
player_season    0.000073
war_season       0.035900
PH               0.077532
OBP+             0.394698
Unnamed: 0       0.561338
Oppo%+           0.620277
Spd              0.812610
SB               0.867284
Balls            0.928048
Strikes          1.498028
IBB              1.935834
Hard%+           2.499669
WAR              7.418849
dtype: float64

In [42]:
diff = predictions['actual'] - predictions['predictions']

In [43]:
merged = predictions.merge(working_batting, left_index=True, right_index=True)

In [44]:
merged['diff'] = (predictions['actual'] - predictions['predictions']).abs()

In [45]:
merged[['IDfg', 'Season', 'Name', 'WAR', 'Next_WAR', 'diff']].sort_values(['diff'])

Unnamed: 0,IDfg,Season,Name,WAR,Next_WAR,diff
6924,14109,2019,Albert Almora Jr.,0.158228,0.6,0.000251
7063,4314,2022,Joey Votto,0.145570,0.1,0.000439
5141,9393,2017,Matt Adams,0.227848,0.5,0.000796
1202,7287,2011,Carlos Gonzalez,0.417722,2.7,0.001039
364,791,2011,Brandon Phillips,0.537975,3.3,0.001198
...,...,...,...,...,...,...
3780,1875,2009,Josh Hamilton,0.278481,8.4,6.555312
6048,5631,2010,Matt Kemp,0.196203,8.3,6.568964
948,9166,2010,Buster Posey,0.443038,9.8,6.661731
328,15640,2021,Aaron Judge,0.544304,11.2,7.297012
