# Install

In [1]:
pip install pybaseball

Collecting pybaseball
  Downloading pybaseball-2.2.7-py3-none-any.whl.metadata (11 kB)
Collecting pygithub>=1.51 (from pybaseball)
  Downloading PyGithub-2.6.1-py3-none-any.whl.metadata (3.9 kB)
Collecting pynacl>=1.4.0 (from pygithub>=1.51->pybaseball)
  Downloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (8.6 kB)
Downloading pybaseball-2.2.7-py3-none-any.whl (426 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m426.1/426.1 kB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyGithub-2.6.1-py3-none-any.whl (410 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (856 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m856.7/856.7 kB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected

# Imports

In [29]:
import os
import pandas as pd
import numpy as np
import pybaseball as pyb
from pybaseball import batting_stats

# Pull Data and Save Data

In [30]:
START = 2002
END = 2022

In [31]:
batting = batting_stats(START, END, qual=200)

In [32]:
batting.to_csv('batting.csv')

In [33]:
# we want players with at least two seasons -- remove groups with one season of data
# IDfg is unique player ID
batting = batting.groupby("IDfg", group_keys=False).filter(lambda x: x.shape[0] > 1)

In [34]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
0,1109,2002,Barry Bonds,SFG,37,143,403,612,149,70,...,,,,0,0.127,0.191,,,,12.7
1,1109,2004,Barry Bonds,SFG,39,147,373,617,135,60,...,,,,0,0.124,0.164,,,,11.9
8,15640,2022,Aaron Judge,NYY,30,157,570,696,177,87,...,118.4,246.0,0.609,404,0.169,0.287,,,,11.4
2,1109,2003,Barry Bonds,SFG,38,130,390,550,133,65,...,,,,0,0.135,0.223,,,,10.2
15,13611,2018,Mookie Betts,BOS,25,136,520,614,180,96,...,110.6,217.0,0.500,434,0.220,0.270,,,,10.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7042,9272,2018,Chris Davis,BAL,32,128,470,522,79,51,...,111.8,113.0,0.401,282,0.174,0.316,,,,-2.6
6535,45,2012,Rod Barajas,PIT,36,104,321,361,66,44,...,,0.0,,0,0.147,0.258,,,,-2.6
6673,319,2011,Adam Dunn,CHW,31,122,415,496,66,39,...,,0.0,,0,0.169,0.295,,,,-2.9
6988,620,2002,Neifi Perez,KCR,29,145,554,585,131,104,...,,,,0,0.130,0.187,,,,-2.9


# Prep the target variable

In [42]:
# set up target variable - WAR for next season
def next_season(player):
  """take data for single player and backfill WAR value for next season"""
  player = player.sort_values("Season")
  player["Next_WAR"] = player["WAR"].shift(-1)
  return player

# split dataframe by playerId and compute next season WAR for each player
batting = batting.groupby("IDfg", group_keys=False).apply(next_season)

  batting = batting.groupby("IDfg", group_keys=False).apply(next_season)


In [43]:
batting[["Name","Season","WAR","Next_WAR"]]

Unnamed: 0,Name,Season,WAR,Next_WAR
5562,Alfredo Amezaga,2006,1.1,2.0
5006,Alfredo Amezaga,2007,2.0,1.2
5252,Alfredo Amezaga,2008,1.2,
1169,Garret Anderson,2002,3.7,5.1
864,Garret Anderson,2003,5.1,0.8
...,...,...,...,...
6002,Owen Miller,2022,0.7,
4881,Andrew Vaughn,2021,-0.2,-0.5
3377,Andrew Vaughn,2022,-0.5,
6620,Ha-seong Kim,2021,0.4,3.6


# Clean the data

In [44]:
# we can also impute these values
# we are counting missing values in each column
null_count = batting.isnull().sum()
null_count

Unnamed: 0,0
IDfg,0
Season,0
Name,0
Team,0
Age,0
...,...
xBA,6754
xSLG,6754
xwOBA,6754
L-WAR,0


In [45]:
complete_cols = list(batting.columns[null_count==0])
complete_cols

['IDfg',
 'Season',
 'Name',
 'Team',
 'Age',
 'G',
 'AB',
 'PA',
 'H',
 '1B',
 '2B',
 '3B',
 'HR',
 'R',
 'RBI',
 'BB',
 'IBB',
 'SO',
 'HBP',
 'SF',
 'SH',
 'GDP',
 'SB',
 'CS',
 'AVG',
 'GB',
 'FB',
 'LD',
 'IFFB',
 'Pitches',
 'Balls',
 'Strikes',
 'IFH',
 'BU',
 'BUH',
 'BB%',
 'K%',
 'BB/K',
 'OBP',
 'SLG',
 'OPS',
 'ISO',
 'BABIP',
 'GB/FB',
 'LD%',
 'GB%',
 'FB%',
 'IFFB%',
 'HR/FB',
 'IFH%',
 'BUH%',
 'wOBA',
 'wRAA',
 'wRC',
 'Bat',
 'Rep',
 'Pos',
 'RAR',
 'WAR',
 'Dol',
 'Spd',
 'wRC+',
 'WPA',
 '-WPA',
 '+WPA',
 'RE24',
 'REW',
 'pLI',
 'PH',
 'WPA/LI',
 'Clutch',
 'FB% (Pitch)',
 'FBv',
 'SL%',
 'SLv',
 'CB%',
 'CBv',
 'CH%',
 'CHv',
 'wFB',
 'wSL',
 'wCB',
 'wCH',
 'wFB/C',
 'wSL/C',
 'wCB/C',
 'wCH/C',
 'O-Swing%',
 'Z-Swing%',
 'Swing%',
 'O-Contact%',
 'Z-Contact%',
 'Contact%',
 'Zone%',
 'F-Strike%',
 'SwStr%',
 'BsR',
 'Def',
 'wSB',
 'UBR',
 'Age Rng',
 'Off',
 'Lg',
 'wGDP',
 'Pull%',
 'Cent%',
 'Oppo%',
 'Soft%',
 'Med%',
 'Hard%',
 'TTO%',
 'AVG+',
 'BB%+',
 'K

In [46]:
# avoid copy warnings later
batting = batting[complete_cols + ["Next_WAR"]].copy()

In [47]:
batting.dtypes

Unnamed: 0,0
IDfg,int64
Season,int64
Name,object
Team,object
Age,int64
...,...
Events,int64
CStr%,float64
CSW%,float64
L-WAR,float64


In [48]:
# handle string values
batting.dtypes[batting.dtypes == "object"]

Unnamed: 0,0
Name,object
Team,object
Dol,object
Age Rng,object


In [49]:
# for now delete these columns -- could use age in the future
del batting['Dol']
del batting['Age Rng']
batting['team_code'] = batting['Team'].astype("category").cat.codes # ordinal encoding ... careful here

In [50]:
batting_full = batting.copy()
batting = batting.dropna().copy() # drop where next war is null

# Model

In [51]:
# run feature selector to help model optimize accuracy
# define model (ridge regression model)

from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit # used in feature selector

In [52]:
# play around with lambda / alpha parameter, the higher it is the more it penalizes Ridge Coeff and avoids overfitting
rr = Ridge(alpha=1)
split = TimeSeriesSplit(n_splits=3) # make predictions for three parts because we have time series data
# go through all the features, find the best one, so on -- direction forward, remove features for direction = backward
sfs = SequentialFeatureSelector(rr, n_features_to_select=20, direction="forward", cv=split, n_jobs=4)

In [53]:
removed_cols = ['Next_WAR', 'Name', 'Team','IDfg', 'Season']
selected_cols = batting.columns[~batting.columns.isin(removed_cols)]

In [54]:
# with ridge regression we have to scale data so mean is 1 and regreesion is 1
from sklearn.preprocessing import MinMaxScaler # puts all vallus betweeen 0-1
scaler = MinMaxScaler()
batting.loc[:, selected_cols] = scaler.fit_transform(batting[selected_cols])

  batting.loc[:, selected_cols] = scaler.fit_transform(batting[selected_cols])
  batting.loc[:, selected_cols] = scaler.fit_transform(batting[selected_cols])
  batting.loc[:, selected_cols] = scaler.fit_transform(batting[selected_cols])
  batting.loc[:, selected_cols] = scaler.fit_transform(batting[selected_cols])
  batting.loc[:, selected_cols] = scaler.fit_transform(batting[selected_cols])
  batting.loc[:, selected_cols] = scaler.fit_transform(batting[selected_cols])
  batting.loc[:, selected_cols] = scaler.fit_transform(batting[selected_cols])
  batting.loc[:, selected_cols] = scaler.fit_transform(batting[selected_cols])
  batting.loc[:, selected_cols] = scaler.fit_transform(batting[selected_cols])
  batting.loc[:, selected_cols] = scaler.fit_transform(batting[selected_cols])
  batting.loc[:, selected_cols] = scaler.fit_transform(batting[selected_cols])
  batting.loc[:, selected_cols] = scaler.fit_transform(batting[selected_cols])
  batting.loc[:, selected_cols] = scaler.fit_transfo

In [56]:
batting.describe()
# columns now have been scaled between 0 and 1

Unnamed: 0,IDfg,Season,Age,G,AB,PA,H,1B,2B,3B,...,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,L-WAR,Next_WAR,team_code
count,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,...,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0
mean,5366.78583,2011.163229,0.3606,0.652755,0.478666,0.480943,0.365973,0.290481,0.399279,0.103459,...,0.403164,0.410923,0.511026,0.478646,0.172991,0.498932,0.545898,0.322039,1.793112,0.474128
std,5133.255295,5.612014,0.147476,0.255929,0.242481,0.26229,0.182585,0.138786,0.171732,0.105891,...,0.131213,0.121082,0.130359,0.133992,0.273858,0.13718,0.120701,0.122147,1.9811,0.305105
min,1.0,2002.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.1,0.0
25%,1131.5,2006.0,0.269231,0.478632,0.27518,0.257785,0.211207,0.179245,0.258621,0.043478,...,0.315789,0.331461,0.42029,0.387755,0.0,0.408511,0.46696,0.234177,0.4,0.205882
50%,3531.0,2011.0,0.346154,0.709402,0.505396,0.508651,0.37069,0.283019,0.37931,0.086957,...,0.398496,0.404494,0.507246,0.489796,0.0,0.493617,0.546256,0.303797,1.5,0.470588
75%,9015.0,2016.0,0.461538,0.871795,0.688849,0.710208,0.508621,0.391509,0.517241,0.130435,...,0.488722,0.483146,0.594203,0.564626,0.346411,0.591489,0.625551,0.392405,2.9,0.735294
max,27506.0,2021.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,11.9,1.0


In [57]:
# fit selector to data -- fits best 20 features to give us best results for ridge regression
# then extract list of predictors from feature selector
sfs.fit(batting[selected_cols], batting["Next_WAR"])

In [60]:
predictors = list(selected_cols[sfs.get_support()])
# trues are columns we want to select

In [62]:
def backtest(data, model,predictors, start=5, step=1):
  """what we use to validate data in cross validation -- in timeseries
  we don't want to use data from a future season to predict past season.
  we only want past data to predict future data for timeseries cross validation
  ie. use 2002 to predict 2003, then use 2002, 2003 to predict 2004, etc.
  """

  all_predictions = []
  # get sorted years / seasons
  years = sorted(data["Season"].unique())

  for i in range(start, len(years), step):
    current_year = years[i]
    # split data
    train = data[data["Season"] < current_year]
    test = data[data["Season"] == current_year]
    # define label and data
    x_train = train[predictors]
    x_test = test[predictors]
    y_train = train["Next_WAR"]
    y_test = test["Next_WAR"]
    # fit and predict
    model.fit(x_train, y_train)
    preds = model.predict(x_test)
    # convert to series so easier to work with (column of data)
    preds = pd.Series(preds, index=y_test.index)
    # combine prediction with actuals
    combined = pd.concat([y_test, preds], axis=1)
    # assign column names
    combined.columns = ["actual", "prediction"]
    # append to all predictions - list of dataframes each is a predictions for a different season
    all_predictions.append(combined)
    # concat on axis=0 (vertically one long dataframe)
  return pd.concat(all_predictions)

In [64]:
predictions = backtest(batting, rr, predictors)
predictions

Unnamed: 0,actual,prediction
5006,1.2,1.405835
1925,1.4,0.716105
3102,-0.1,0.457908
5797,0.6,0.979155
1109,4.8,2.214873
...,...,...
1914,2.2,2.752929
5875,0.8,2.083841
7032,0.7,1.583827
4881,-0.5,1.819488


# Evaluate

In [65]:
# use summary stats to evaluate performance
from sklearn.metrics import mean_squared_error
mean_squared_error(predictions["actual"], predictions["prediction"])

2.736105953760221

In [66]:
batting["Next_WAR"].describe()

Unnamed: 0,Next_WAR
count,5575.0
mean,1.793112
std,1.9811
min,-3.1
25%,0.4
50%,1.5
75%,2.9
max,11.9


We probably want MSE to be lower than the standard dev ideally (typical rule of thumb) -- indicates our model is doing better than randomly guessing.

In [67]:
2.736105953760221 ** 0.5
# take away square root and we see the value is slightly lower which tells us our model is ok.

1.654117877831027

# Improving Accuracy

Right now we are only telling the model what happened in the current season, we might want to tell the model what happened in past seasons -- might be useful to see if the player is in decline

In [74]:
# example:
ga = batting[batting["IDfg"] == 2]
ga["player_season"] = range(0,ga.shape[0])
# defines groups and finds correlation within each group that expanding creates
# two index value to select one number
ga[["player_season", "WAR"]].expanding().corr().loc[(slice(None), "player_season"),"WAR"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ga["player_season"] = range(0,ga.shape[0])


Unnamed: 0,Unnamed: 1,WAR
1169,player_season,
864,player_season,1.0
2569,player_season,-0.661143
4187,player_season,-0.836562
3964,player_season,-0.836312
1925,player_season,-0.692192
3346,player_season,-0.595013


In [75]:
def group_averages(df):
  """did our player play better than an average player"""
  return df["WAR"] / df["WAR"].mean()

In [78]:
def player_history(df):
  df = df.sort_values("Season")
  # indicates which season it is for the player
  df["player_season"] = range(0,df.shape[0])
  df["war_corr"] = list(df[["player_season", "WAR"]].expanding().corr().loc[(slice(None), "player_season"), "WAR"])
  # 1 implies 1:1 correlation
  df["war_corr"].fillna(1, inplace=True)
  # brings prevous season up to current
  df["war_diff"] = df["WAR"] / df["WAR"].shift(1)
  df["war_diff"].fillna(1,inplace=True)
  df["war_diff"][df["war_diff"] == np.inf] = 1
  return df

batting = batting.groupby("IDfg", group_keys = False).apply(player_history)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["war_corr"].fillna(1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["war_diff"].fillna(1,inplace=True)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the int

In [76]:
# how average player performed
batting["war_season"] = batting.groupby("Season", group_keys=False).apply(group_averages)

  batting["war_season"] = batting.groupby("Season", group_keys=False).apply(group_averages)


In [79]:
new_predictors = predictors + ["player_season","war_corr", "war_season", "war_diff"]

In [82]:
predictons = backtest(batting, rr, new_predictors)
mean_squared_error(predictions["actual"], predictions["prediction"])

2.736105953760221

# Evaluate Predictors

In [88]:
pd.Series(rr.coef_, index=new_predictors).sort_values()
# use this to diagnose which predictors are best

Unnamed: 0,0
Age,-2.71256
BABIP,-1.937445
WAR,-1.831011
SLG+,-1.381371
Soft%+,-1.322751
BU,-1.125847
SO,-0.896876
PH,-0.74679
WPA,-0.552493
CH%,-0.289703


In [90]:
diff = predictions["actual"] - predictions["prediction"]

In [92]:
merged = predictions.merge(batting,left_index=True, right_index=True)
merged["diff"] = predictions["actual"] - predictions["prediction"].abs()
# merged[['IDfg', "Season",'Name', 'WAR', 'Next_WAR']]]

Unnamed: 0,actual,prediction,IDfg,Season,Name,Team,Age,G,AB,PA,...,CStr%,CSW%,L-WAR,Next_WAR,team_code,war_season,player_season,war_corr,war_diff,diff
5006,1.2,1.405835,1,2007,Alfredo Amezaga,FLA,0.384615,0.743590,0.431655,0.429066,...,0.527660,0.396476,0.322785,1.2,0.352941,0.998259,1,1.000000,1.214286,-0.205835
1925,1.4,0.716105,2,2007,Garret Anderson,LAA,0.615385,0.529915,0.462230,0.432526,...,0.442553,0.480176,0.284810,1.4,0.441176,0.880816,5,-0.692192,1.406250,0.683895
3102,-0.1,0.457908,10,2007,David Eckstein,STL,0.500000,0.606838,0.492806,0.491349,...,0.676596,0.436123,0.240506,-0.1,0.852941,0.743801,5,-0.694330,0.826087,-0.557908
5797,0.6,0.979155,11,2007,Darin Erstad,CHW,0.538462,0.350427,0.269784,0.254325,...,0.765957,0.691630,0.240506,0.6,0.205882,0.743801,4,-0.828562,0.791667,-0.379155
1109,4.8,2.214873,15,2007,Troy Glaus,TOR,0.423077,0.589744,0.404676,0.442907,...,0.634043,0.704846,0.367089,4.8,0.970588,1.135274,5,0.231396,0.892308,2.585127
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1914,2.2,2.752929,23667,2021,Wander Franco,TBR,0.038462,0.205128,0.217626,0.186851,...,0.391489,0.352423,0.348101,2.2,0.911765,1.062206,0,1.000000,1.000000,-0.552929
5875,0.8,2.083841,24618,2021,Ryan Jeffers,MIN,0.192308,0.333333,0.192446,0.160900,...,0.514894,0.788546,0.240506,0.8,0.558824,0.830452,0,1.000000,1.000000,-1.283841
7032,0.7,1.583827,24655,2021,Owen Miller,CLE,0.192308,0.119658,0.055755,0.003460,...,0.548936,0.700441,0.139241,0.7,0.264706,0.463508,0,1.000000,1.000000,-0.883827
4881,-0.5,1.819488,26197,2021,Andrew Vaughn,CHW,0.153846,0.692308,0.462230,0.465398,...,0.570213,0.651982,0.170886,-0.5,0.205882,0.560072,0,1.000000,1.000000,-2.319488


# Next steps

- Use better data (ie. minor league data, etc.)
- Use a different model
- Use different selection strategies
- Feature engineering