In [62]:
import pandas as pd

Retrosheet Game Logs

Columns of interest:

- (Field number)
- (0) Date in the form "yyyymmdd"
- (1) Number of game : '0' indicates only one game played that day
- (3) Visiting team
- (5) Visiting team game number
- (6) Home team
- (8) Home team game number
- (9-10) Visiting and home team score
- (13) completion information, field should be empty for games that were completed same day
- (21-37) Visiting team offensive stats:
    - at-bats
    - hits
    - doubles
    - triples
    - homeruns
    - RBI
    - sacrifice hits.  This may include sacrifice flies for years prior to 1954 when sacrifice flies were allowed.
    - sacrifice flies (since 1954)
    - hit-by-pitch
    - walks
    - intentional walks
    - strikeouts
    - stolen bases
    - caught stealing
    - grounded into double plays
    - awarded first on catcher's interference
    - left on base
- (49-65) Home team offensive stats
- (101-102) Visiting starting pitcher ID and name
- (103-104) Home starting pitcher ID and name
- (105-131) Visiting starting players ID, name and defensive position, listed in the order (1-9) they appeared in the batting order
- (132-158) Home starting players ID, name and defensive position listed in the order (1-9) they appeared in the batting order

In [66]:
bs_2018 = pd.read_csv('data/GL2018.TXT', header=None)
bs_2018.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,151,152,153,154,155,156,157,158,159,160
0,20180329,0,Thu,COL,NL,1,ARI,NL,1,2,...,Nick Ahmed,6,dysoj001,Jarrod Dyson,9,corbp001,Patrick Corbin,1,,Y
1,20180329,0,Thu,PHI,NL,1,ATL,NL,1,5,...,Dansby Swanson,6,flahr001,Ryan Flaherty,5,tehej001,Julio Teheran,1,,Y
2,20180329,0,Thu,SFN,NL,1,LAN,NL,1,1,...,Yasmani Grandal,2,forsl001,Logan Forsythe,5,kersc001,Clayton Kershaw,1,,Y
3,20180329,0,Thu,CHN,NL,1,MIA,NL,1,8,...,Miguel Rojas,6,wallc001,Chad Wallach,2,urenj001,Jose Urena,1,,Y
4,20180329,0,Thu,SLN,NL,1,NYN,NL,1,4,...,Kevin Plawecki,2,syndn001,Noah Syndergaard,1,rosaa003,Amed Rosario,6,,Y


In [67]:
field_nos = [0,1,3,5,6,8,9,10,13]+list(range(21,38))+list(range(49,66))+[101,102,103,104]
bs_2018_cut = bs_2018.iloc[:,field_nos] # only columns of interest
bs_2018_cut = bs_2018_cut.loc[bs_2018_cut[13].isna(),:] # keep games that were completed same day

In [68]:
# rename columns
col_rename_dict = {0 : 'date',
                   1 : 'game_num',
                   3 : 'v_team',
                   5 : 'v_game_num',
                   6 : 'h_team',
                   8 : 'h_game_num',
                   9 : 'v_score',
                   10 : 'h_score',
                   13 : 'completion_info',
                   21 : 'v_ab',
                   22 : 'v_h',
                   23 : 'v_2b',
                   24 : 'v_3b',
                   25 : 'v_hr',
                   26 : 'v_rbi',
                   27 : 'v_sac_bunt',
                   28 : 'v_sac_fly',
                   29 : 'v_hbp',
                   30 : 'v_bb',
                   31 : 'v_ibb',
                   32 : 'v_k',
                   33 : 'v_sb',
                   34 : 'v_cs',
                   35 : 'v_gidp',
                   36 : 'v_catch_int',
                   37 : 'v_lob',
                   49 : 'h_ab',
                   50 : 'h_h',
                   51 : 'h_2b',
                   52 : 'h_3b',
                   53 : 'h_hr',
                   54 : 'h_rbi',
                   55 : 'h_sac_bunt',
                   56 : 'h_sac_fly',
                   57 : 'h_hbp',
                   58 : 'h_bb',
                   59 : 'h_ibb',
                   60 : 'h_k',
                   61 : 'h_sb',
                   62 : 'h_cs',
                   63 : 'h_gidp',
                   64 : 'h_catch_int',
                   65 : 'h_lob',
                   101 : 'v_sp_id',
                   102 : 'v_sp_name',
                   103 : 'h_sp_id',
                   104 : 'h_sp_name'}

bs_2018_cut.rename(columns=col_rename_dict, inplace=True)

team hitting stats over last 10 games

In [87]:
cutoff_day = 20180501 # keep games at least one month into season
look_back = 10 # how many games to look back

keep_2018 = bs_2018_cut.loc[bs_2018_cut['date']>=cutoff_day]

# columns for hitting dataframe
hitting_col = ['h_team','h_obp','h_slg','h_k_rate','h_bb_rate','v_team','v_obp','v_slg','v_k_rate','v_bb_rate']
y_col = ['home_win']
hitting_2018 = pd.DataFrame(columns=(hitting_col+y_col))

# list of offensive stats (removing v_ from column names)
hit_stats = list(keep_2018.columns.values[9:26])
for i in range(len(hit_stats)):
    hit_stats[i] = hit_stats[i][2:]

#for i in range(keep_2018.shape[0]):
for i in range(3):
    game = keep_2018.iloc[i,:]
    teams = ['v_', 'h_']
    
    # to be used to add new row to hitting dataframe
    new_row_dict = {}
    new_row_dict['home_win'] = int(game['h_score']>game['v_score'])
    new_row_dict['h_team'] = game['h_team']
    new_row_dict['v_team'] = game['v_team']
    
    for t in teams:
        t_name = game[t+'team']
        t_game_num = game[t+'game_num']
        
        # boolean for filtering last n games
        # games when team was away
        away = (bs_2018_cut['v_team']==t_name) & (bs_2018_cut['v_game_num']>=(t_game_num-10)) & (bs_2018_cut['v_game_num']<t_game_num)
        # when team was home
        home = (bs_2018_cut['h_team']==t_name) & (bs_2018_cut['h_game_num']>=(t_game_num-10)) & (bs_2018_cut['h_game_num']<t_game_num)
         
        last_away = bs_2018_cut.loc[away,:]
        last_home = bs_2018_cut.loc[home,:]
        
        last_n_dict = {}
        for stat in hit_stats:
            last_n_dict[stat] = list(last_away['v_'+stat].values)+list(last_home['h_'+stat].values)
        
        # last n games for given team
        last_n = pd.DataFrame.from_dict(last_n_dict)
        totals = last_n.sum()
        
        # team on base percentage
        obp = (totals['h']+totals['bb']+totals['ibb']+totals['hbp'])/(totals['ab']+totals['bb']+totals['ibb']+totals['hbp']+totals['sac_fly'])
        new_row_dict[t+'obp'] = round(obp,3)
        
        # team slugging percentage
        n_1b = totals['h']-(totals['2b']+totals['3b']+totals['hr'])
        slg = (n_1b + 2*totals['2b'] + 3*totals['3b'] + 4*totals['hr'])/totals['ab']
        new_row_dict[t+'slg'] = round(slg,3)
        
        # team strikeout rate
        # NOTE : PA calculation excludes reaching on fielding error
        pa = totals['ab']+totals['bb']+totals['ibb']+totals['sac_bunt']+totals['sac_fly']+totals['hbp']+totals['catch_int']
        k_rate = totals['k']/pa
        new_row_dict[t+'k_rate'] = round(k_rate,3)
        
        # team walk rate
        bb_rate = (totals['bb']+totals['ibb'])/pa
        new_row_dict[t+'bb_rate'] = round(bb_rate,3)
    
    hitting_2018 = hitting_2018.append(new_row_dict, ignore_index=True)

In [88]:
hitting_2018.head()

Unnamed: 0,h_team,h_obp,h_slg,h_k_rate,h_bb_rate,v_team,v_obp,v_slg,v_k_rate,v_bb_rate,home_win
0,ARI,0.334,0.443,0.236,0.119,LAN,0.335,0.428,0.228,0.103,1
1,CHN,0.322,0.411,0.207,0.065,COL,0.336,0.366,0.249,0.11,0
2,CIN,0.369,0.434,0.181,0.117,MIL,0.291,0.33,0.209,0.106,0


checkpoint : basic team hitting stats done

next : starting pitchers recent performance

In [100]:
from pybaseball import playerid_lookup
from pybaseball import pitching_stats_range
from datetime import datetime, timedelta

In [91]:
data = pitching_stats_range('2018-03-29', '2018-04-30')

In [93]:
data.columns

Index(['Name', 'Age', '#days', 'Lev', 'Tm', 'G', 'GS', 'W', 'L', 'SV', 'IP',
       'H', 'R', 'ER', 'BB', 'SO', 'HR', 'HBP', 'ERA', 'AB', '2B', '3B', 'IBB',
       'GDP', 'SF', 'SB', 'CS', 'PO', 'BF', 'Pit', 'Str', 'StL', 'StS',
       'GB/FB', 'LD', 'PU', 'WHIP', 'BAbip', 'SO9', 'SO/W'],
      dtype='object')

In [94]:
data.head()

Unnamed: 0,Name,Age,#days,Lev,Tm,G,GS,W,L,SV,...,Str,StL,StS,GB/FB,LD,PU,WHIP,BAbip,SO9,SO/W
1,Austin Adams,27,548,MLB-NL,Washington,2,0,,,,...,0.46,0.25,0.04,0.5,0.25,0.0,4.0,0.25,0.0,0.0
2,Matt Albers,35,547,MLB-NL,Milwaukee,12,0,2.0,1.0,1.0,...,0.64,0.18,0.12,0.45,0.21,0.16,0.9,0.263,7.4,11.0
3,Scott Alexander,28,548,MLB-NL,Los Angeles,11,0,1.0,,,...,0.57,0.16,0.1,0.61,0.19,0.03,1.853,0.324,5.6,0.78
4,Cody Allen,29,546,MLB-AL,Cleveland,12,0,2.0,,5.0,...,0.63,0.16,0.15,0.33,0.2,0.03,1.026,0.267,9.9,3.5
5,Dan Altavilla,25,547,MLB-AL,Seattle,13,0,2.0,2.0,,...,0.61,0.18,0.13,0.32,0.2,0.12,1.378,0.25,11.7,1.78


In [130]:
data.loc[data['SO/W'].isna(),:]

Unnamed: 0,Name,Age,#days,Lev,Tm,G,GS,W,L,SV,...,Str,StL,StS,GB/FB,LD,PU,WHIP,BAbip,SO9,SO/W
20,Johnny Barbato,25,549,MLB-AL,Detroit,1,0,,,,...,0.63,0.06,0.16,0.3,0.3,0.0,2.0,0.333,0.0,
22,Scott Barlow,25,546,MLB-AL,Kansas City,1,0,,,,...,0.73,0.2,0.09,0.57,0.21,0.07,1.667,0.357,3.0,
33,Chris Beck,27,548,MLB-AL,Chicago,2,0,,,1.0,...,0.73,0.15,0.11,0.39,0.22,0.11,1.6,0.412,5.4,
57,John Brebbia,28,551,MLB-NL,St. Louis,3,0,,,1.0,...,0.67,0.25,0.11,0.5,0.2,0.0,0.4,0.2,12.6,
60,Parker Bridwell,26,570,MLB-AL,Los Angeles,1,1,,,,...,0.58,0.17,0.06,0.5,0.17,0.0,4.2,0.444,0.0,
77,Brett Cecil,31,578,MLB-NL,St. Louis,1,0,,,,...,0.5,0.1,0.1,0.5,0.0,0.5,3.0,0.5,0.0,
107,Zac Curtis,25,546,MLB-NL,Philadelphia,1,0,,,,...,0.73,0.18,0.0,0.5,0.0,0.25,1.0,0.25,0.0,
108,John Curtiss,25,546,MLB-AL,Minnesota,1,0,,,,...,0.57,0.14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
134,Danny Farquhar,31,556,MLB-AL,Chicago,8,0,1.0,1.0,,...,0.66,0.12,0.15,0.35,0.25,0.1,0.75,0.176,10.1,
135,Luke Farrell,27,546,MLB-NL,Chicago,3,0,1.0,,,...,0.68,0.11,0.28,0.17,0.67,0.0,0.6,0.4,18.9,


In [128]:
sp_look_back = 18
pitch_pull_cols = ['H','BB','SO','HR','HBP','AB','2B','3B','IBB','SF','SB','GB/FB','LD','SO/W']

# columns for pitching dataframe
pitching_col = ['h_sp_name','h_sp_obp','h_sp_slg','h_sp_gb/fb','h_sp_ld','h_sp_k/bb','v_sp_name','v_sp_obp','v_sp_slg','v_sp_gb/fb','v_sp_ld','v_sp_k/bb']
pitching_2018 = pd.DataFrame(columns=pitching_col)

dates = keep_2018['date'].unique()
for date in dates[0:2]:
    day = keep_2018.loc[keep_2018['date']==date,:]
    start_dt = (datetime.strptime(str(date), '%Y%m%d') - timedelta(days=sp_look_back)).strftime('%Y-%m-%d')
    end_dt = (datetime.strptime(str(date), '%Y%m%d') - timedelta(days=1)).strftime('%Y-%m-%d')
    
    # league-wide pitching stats during look back window
    pitch_stats = pitching_stats_range(start_dt, end_dt)
    
    #for i in range(len(day.shape[0])):
    for i in range(3):
        new_row_dict = {}
        game = day.iloc[i,:]
        teams = ['v_', 'h_']
        for t in teams:
            t_sp_name = game[t+'sp_name']
            sp_stats = pitch_stats.loc[pitch_stats['Name']==t_sp_name,:][pitch_pull_cols].reset_index(drop=True).iloc[0,:]
            new_row_dict[t+'sp_name'] = t_sp_name
            
            # opposing on base percentage
            obp = (sp_stats['H']+sp_stats['BB']+sp_stats['IBB']+sp_stats['HBP'])/(sp_stats['AB']+sp_stats['BB']+sp_stats['IBB']+sp_stats['HBP']+sp_stats['SF'])
            new_row_dict[t+'sp_obp'] = round(obp, 3)
            
            # opposing slugging
            n_1b = sp_stats['H']-(sp_stats['2B']+sp_stats['3B']+sp_stats['HR'])
            slg = (sp_stats['H'] + 2*sp_stats['2B'] + 3*sp_stats['3B'] + 4*sp_stats['HR'])/sp_stats['AB']
            new_row_dict[t+'sp_slg'] = round(slg, 3)
            
            # line drive rate
            new_row_dict[t+'sp_ld'] = sp_stats['LD']
            # groundball to flyball ratio
            new_row_dict[t+'sp_gb/fb'] = sp_stats['GB/FB']
            # strikeout to walk ratio
            new_row_dict[t+'sp_k/bb'] = sp_stats['SO/W']
        
        pitching_2018 = pitching_2018.append(new_row_dict, ignore_index=True)
                       

(1, 14)
(1, 14)
(1, 14)
(1, 14)
(1, 14)
(1, 14)
(1, 14)
(1, 14)
(1, 14)
(1, 14)
(0, 14)


IndexError: single positional indexer is out-of-bounds

left to handle

- K/BB is nan when pitcher issued no walks during time period
- no data for pitcher making season debut