In [1]:
import os
import pandas as pd
import numpy as np
from pybaseball import batting_stats


In [2]:
START = 2002
END = 2022

In [3]:
batting = batting_stats(START, END, qual = 200)

In [4]:
batting.to_csv('batting.csv')

In [5]:
batting = batting.groupby("IDfg", group_keys=False).filter(lambda x: x.shape[0] > 1)

In [6]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
0,1109,2002,Barry Bonds,SFG,37,143,403,612,149,70,...,,,,0,0.127,0.191,,,,12.7
1,1109,2004,Barry Bonds,SFG,39,147,373,617,135,60,...,,,,0,0.124,0.164,,,,11.9
8,15640,2022,Aaron Judge,NYY,30,157,570,696,177,87,...,118.4,246.0,0.609,404,0.169,0.287,,,,11.6
2,1109,2003,Barry Bonds,SFG,38,130,390,550,133,65,...,,,,0,0.135,0.223,,,,10.2
15,13611,2018,Mookie Betts,BOS,25,136,520,614,180,96,...,110.6,217.0,0.500,434,0.220,0.270,,,,10.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7042,9272,2018,Chris Davis,BAL,32,128,470,522,79,51,...,111.8,113.0,0.401,282,0.174,0.316,,,,-2.6
6535,45,2012,Rod Barajas,PIT,36,104,321,361,66,44,...,,0.0,,0,0.147,0.258,,,,-2.6
6673,319,2011,Adam Dunn,CHW,31,122,415,496,66,39,...,,0.0,,0,0.169,0.295,,,,-2.9
6988,620,2002,Neifi Perez,KCR,29,145,554,585,131,104,...,,,,0,0.130,0.187,,,,-2.9


In [7]:
def next_season(player):
    player = player.sort_values("Season")
    player["Next_WAR"] = player["WAR"].shift(-1)
    return player

batting = batting.groupby("IDfg", group_keys=False).apply(next_season)

  batting = batting.groupby("IDfg", group_keys=False).apply(next_season)


In [8]:
batting[["Name", "Season", "WAR", "Next_WAR"]]

Unnamed: 0,Name,Season,WAR,Next_WAR
5562,Alfredo Amezaga,2006,1.1,2.0
5006,Alfredo Amezaga,2007,2.0,1.2
5252,Alfredo Amezaga,2008,1.2,
1169,Garret Anderson,2002,3.7,5.1
864,Garret Anderson,2003,5.1,0.8
...,...,...,...,...
6002,Owen Miller,2022,0.8,
4881,Andrew Vaughn,2021,-0.2,-0.5
3377,Andrew Vaughn,2022,-0.5,
6620,Ha-seong Kim,2021,0.4,3.7


In [9]:
null_count = batting.isnull().sum()

In [10]:
complete_cols = list(batting.columns[null_count == 0])

In [11]:
batting = batting[complete_cols + ["Next_WAR"]].copy()

In [12]:
batting.dtypes

IDfg          int64
Season        int64
Name         object
Team         object
Age           int64
             ...   
Events        int64
CStr%       float64
CSW%        float64
L-WAR       float64
Next_WAR    float64
Length: 133, dtype: object

In [13]:
batting.dtypes[batting.dtypes == "object"]

Name       object
Team       object
Dol        object
Age Rng    object
dtype: object

In [14]:
del batting["Dol"]

In [15]:
del batting["Age Rng"]

In [16]:
batting["team_code"] = batting["Team"].astype("category").cat.codes

In [17]:
batting_full = batting.copy()
batting = batting.dropna().copy()

In [18]:
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

rr = Ridge(alpha = 1)

split = TimeSeriesSplit(n_splits = 3)

sfs = SequentialFeatureSelector(rr, n_features_to_select = 20, direction = "forward", cv = split, n_jobs = 4)

In [19]:
removed_columns = ["Next_WAR", "Name", "Team", "IDfg", "Season"]
selected_columns = batting.columns[~batting.columns.isin(removed_columns)]

In [20]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])

  batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])
  batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])
  batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])
  batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])
  batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])
  batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])
  batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])
  batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])
  batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])
  batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])
  batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])
  batting.loc[:, selected_columns] = scaler.fit_transform(batting

In [21]:
batting.describe()

Unnamed: 0,IDfg,Season,Age,G,AB,PA,H,1B,2B,3B,...,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,L-WAR,Next_WAR,team_code
count,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,...,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0
mean,5366.78583,2011.163229,0.3606,0.652755,0.478666,0.480943,0.365973,0.290481,0.399279,0.103459,...,0.403164,0.410923,0.511026,0.478646,0.172991,0.498932,0.545898,0.322038,1.793238,0.474128
std,5133.255295,5.612014,0.147476,0.255929,0.242481,0.26229,0.182585,0.138786,0.171732,0.105891,...,0.131213,0.121082,0.130359,0.133992,0.273858,0.13718,0.120701,0.122152,1.981221,0.305105
min,1.0,2002.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.1,0.0
25%,1131.5,2006.0,0.269231,0.478632,0.27518,0.257785,0.211207,0.179245,0.258621,0.043478,...,0.315789,0.331461,0.42029,0.387755,0.0,0.408511,0.46696,0.234177,0.4,0.205882
50%,3531.0,2011.0,0.346154,0.709402,0.505396,0.508651,0.37069,0.283019,0.37931,0.086957,...,0.398496,0.404494,0.507246,0.489796,0.0,0.493617,0.546256,0.303797,1.5,0.470588
75%,9015.0,2016.0,0.461538,0.871795,0.688849,0.710208,0.508621,0.391509,0.517241,0.130435,...,0.488722,0.483146,0.594203,0.564626,0.346411,0.591489,0.625551,0.392405,2.9,0.735294
max,27506.0,2021.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,11.9,1.0


In [22]:
sfs.fit(batting[selected_columns], batting["Next_WAR"])

In [46]:
predictors = list(selected_columns[sfs.get_support()])
predictors
predictors.remove('Pull%+')

In [47]:
def backtest(data, model, predictors, start = 5, step = 1):
    all_predictions = []
    years = sorted(batting["Season"].unique())

    for i in range(start, len(years), step):
        current_year = years[i]

        train = data[data["Season"] < current_year]
        test = data[data["Season"] == current_year]

        model.fit(train[predictors], train["Next_WAR"])

        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["Next_WAR"], preds], axis = 1)
        combined.columns = ["actual", "prediction"]

        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [48]:
predictions = backtest(batting, rr, predictors)

In [49]:
predictions

Unnamed: 0,actual,prediction
5006,1.2,1.402890
1925,1.4,0.717662
3102,-0.1,0.466504
5797,0.6,0.969669
1109,4.8,2.206954
...,...,...
1914,2.3,2.716903
5875,0.8,2.082882
7032,0.8,1.558004
4881,-0.5,1.807690


In [50]:
from sklearn.metrics import mean_squared_error
    
mean_squared_error(predictions["actual"], predictions["prediction"])

2.7362441237789197

In [51]:
batting["Next_WAR"].describe()

count    5575.000000
mean        1.793238
std         1.981221
min        -3.100000
25%         0.400000
50%         1.500000
75%         2.900000
max        11.900000
Name: Next_WAR, dtype: float64

In [29]:
2.7366985598430023 ** .5

1.654296998680407

In [None]:
def player_history(df):
    df = df.sort_values("Season")
    
    df["player_season"] = range(0, df.shape[0])
    df["war_corr"] = list(df[["player_season", "WAR"]].expanding().corr().loc[(slice(None), "player_season"), "WAR"])
    df["war_corr"].fillna(1, inplace = True)

    df["war_diff"] = df["WAR"] / df["WAR"].shift(1)
    df["war_diff"].fillna(1, inplace = True)

    df["war_diff"][df["war_diff"] == np.inf] = 1

    return df
batting = batting.groupby("IDfg", group_keys=False).apply(player_history)

In [31]:
def group_averages(df):
    return df["WAR"] / df["WAR"].mean()

In [32]:
batting["war_season"] = batting.groupby("Season", group_keys=False).apply(group_averages)

  batting["war_season"] = batting.groupby("Season", group_keys=False).apply(group_averages)


In [33]:
new_predictors = predictors + ["player_season", "war_corr", "war_season", "war_diff"]

In [34]:
predictions = backtest(batting, rr, new_predictors)

In [35]:
mean_squared_error(predictions["actual"], predictions["prediction"])

2.679428763278086

In [36]:
2.679428763278086 ** .5

1.636896075894278

In [43]:
pd.Series(rr.coef_, index = new_predictors).sort_values()

Age             -2.713989
BABIP           -1.945977
WAR             -1.842012
SLG+            -1.381045
Soft%+          -1.320271
BU              -1.129082
SO              -0.897458
PH              -0.743798
WPA             -0.552260
CH%             -0.289706
wCH             -0.285358
war_diff        -0.284241
CB%             -0.273339
Pull%+          -0.223272
war_corr        -0.137474
player_season    0.000193
IFH              0.633765
Oppo%            0.690392
Spd              0.768238
OBP+             0.837998
SB               0.965537
IBB              2.053877
Hard%+           2.440750
war_season       3.189380
dtype: float64

In [38]:
diff = predictions["actual"] - predictions["prediction"]

In [39]:
diff

5006   -0.222652
1925    0.909449
3102   -0.346150
5797   -0.386085
1109    2.842180
          ...   
1914   -0.349220
5875   -1.162907
7032   -0.566905
4881   -2.064679
6620    2.764165
Length: 4127, dtype: float64

In [40]:
merged = predictions.merge(batting, left_index = True, right_index=True)

In [41]:
merged["diff"] = (predictions["actual"] - predictions["prediction"]).abs()

In [42]:
merged[["IDfg", "Season", "Name", "WAR", "Next_WAR", "diff"]].sort_values(["diff"])

Unnamed: 0,IDfg,Season,Name,WAR,Next_WAR,diff
6023,4403,2013,Erik Kratz,0.246835,1.1,0.001375
1190,15172,2019,Tim Anderson,0.481013,2.3,0.002581
3266,1286,2008,Michael Young,0.348101,2.6,0.003986
2082,5887,2013,John Jaso,0.234177,0.6,0.004329
2159,1702,2008,Reed Johnson,0.284810,0.3,0.004360
...,...,...,...,...,...,...
3823,1875,2009,Josh Hamilton,0.278481,8.4,6.457327
871,9166,2010,Buster Posey,0.443038,9.8,6.526769
3245,5631,2010,Matt Kemp,0.196203,8.3,6.526948
451,15640,2021,Aaron Judge,0.544304,11.2,7.417743
