In [2]:
import os
import pandas as pd
import numpy as np
from pybaseball import batting_stats

In [3]:
START = 2002
END = 2023

In [32]:
batting = batting_stats(START, END, qual=200)

In [33]:
batting.to_csv('batting.csv')

In [34]:
working_batting = pd.read_csv('batting.csv')

In [35]:
working_batting = working_batting.groupby('IDfg', group_keys=False).filter(lambda x: x.shape[0] > 1)

In [36]:
working_batting

Unnamed: 0.1,Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,...,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
0,0,1109,2002,Barry Bonds,SFG,37,143,403,612,149,...,,,,0,0.127,0.191,,,,12.7
1,1,1109,2004,Barry Bonds,SFG,39,147,373,617,135,...,,,,0,0.124,0.164,,,,11.9
2,8,15640,2022,Aaron Judge,NYY,30,157,570,696,177,...,118.4,246.0,0.609,404,0.169,0.287,,,,11.6
3,2,1109,2003,Barry Bonds,SFG,38,130,390,550,133,...,,,,0,0.135,0.223,,,,10.2
4,15,13611,2018,Mookie Betts,BOS,25,136,520,614,180,...,110.6,217.0,0.500,434,0.220,0.270,,,,10.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7471,7400,9272,2018,Chris Davis,BAL,32,128,470,522,79,...,111.8,113.0,0.401,282,0.174,0.316,,,,-2.6
7472,6861,45,2012,Rod Barajas,PIT,36,104,321,361,66,...,,0.0,,0,0.147,0.258,,,,-2.6
7473,7008,319,2011,Adam Dunn,CHW,31,122,415,496,66,...,,0.0,,0,0.169,0.295,,,,-2.9
7474,7344,620,2002,Neifi Perez,KCR,29,145,554,585,131,...,,,,0,0.130,0.187,,,,-2.9


In [37]:
def next_season(player):
    player = player.sort_values("Season")
    player['Next_WAR'] = player['WAR'].shift(-1)
    return player

working_batting = working_batting.groupby("IDfg", group_keys=False).apply(next_season, include_groups=False)

In [38]:
working_batting[['Name', 'Season', 'WAR', 'Next_WAR']]

Unnamed: 0,Name,Season,WAR,Next_WAR
4147,Alfredo Amezaga,2006,1.1,2.0
2739,Alfredo Amezaga,2007,2.0,1.2
3975,Alfredo Amezaga,2008,1.2,
1072,Garret Anderson,2002,3.7,5.1
438,Garret Anderson,2003,5.1,0.8
...,...,...,...,...
770,Ha-seong Kim,2023,4.3,
3559,Vinnie Pasquantino,2022,1.4,0.1
5902,Vinnie Pasquantino,2023,0.1,
2974,Seiya Suzuki,2022,1.8,3.0


In [40]:
null_count = working_batting.isnull().sum()

In [41]:
null_count

Unnamed: 0       0
Season           0
Name             0
Team             0
Age              0
              ... 
xBA           7092
xSLG          7092
xwOBA         7092
L-WAR            0
Next_WAR      1235
Length: 321, dtype: int64

In [42]:
complete_cols = list(working_batting.columns[null_count == 0])

In [43]:
working_batting = working_batting[complete_cols + ['Next_WAR']].copy()

In [44]:
working_batting

Unnamed: 0.1,Unnamed: 0,Season,Name,Team,Age,G,AB,PA,H,1B,...,Cent%+,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,L-WAR,Next_WAR
4147,5828,2006,Alfredo Amezaga,FLA,28,132,334,378,87,72,...,107,113,143,109,63,0,0.188,0.256,1.1,2.0
2739,5239,2007,Alfredo Amezaga,FLA,29,133,400,448,105,80,...,101,112,109,113,75,0,0.175,0.227,2.0,1.2
3975,5497,2008,Alfredo Amezaga,FLA,30,125,311,337,82,61,...,101,101,123,111,64,0,0.178,0.244,1.2,
1072,1200,2002,Garret Anderson,ANA,30,158,638,678,195,107,...,91,80,65,97,129,0,0.137,0.232,3.7,5.1
438,882,2003,Garret Anderson,ANA,31,159,638,673,201,119,...,101,80,90,99,109,0,0.164,0.252,5.1,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
770,3254,2023,Ha-seong Kim,SDP,27,152,538,626,140,100,...,99,85,117,106,82,424,0.233,0.293,4.4,
3559,1138,2022,Vinnie Pasquantino,KCR,24,72,258,298,76,56,...,104,87,79,93,123,226,0.162,0.228,1.5,0.1
5902,3504,2023,Vinnie Pasquantino,KCR,25,61,231,260,57,31,...,102,88,87,91,121,202,0.179,0.252,0.0,
2974,2946,2022,Seiya Suzuki,CHC,27,111,397,446,104,66,...,105,130,106,95,106,290,0.232,0.312,2.1,3.0


In [45]:
working_batting.dtypes

Unnamed: 0      int64
Season          int64
Name           object
Team           object
Age             int64
               ...   
Events          int64
CStr%         float64
CSW%          float64
L-WAR         float64
Next_WAR      float64
Length: 133, dtype: object

In [46]:
working_batting.dtypes[batting.dtypes == "object"]

Name       object
Team       object
Dol        object
Age Rng    object
dtype: object

In [47]:
del working_batting['Dol']
del working_batting['Age Rng']

In [49]:
working_batting['team_code'] = working_batting['Team'].astype('category').cat.codes

In [50]:
batting_full = working_batting.copy()
working_batting = working_batting.dropna()