# Pitching Data Prep

In [11]:
import pandas as pd

# Earliest game date kept for each season; compared as integers against the
# 'date' column (values look like YYYYMMDD).
cutoff_16 = 20160501
cutoff_17 = 20170501
cutoff_18 = 20180501

# Number of previous games summed when building team hitting features.
lookback = 10 # games

# Per-team offensive fields, in the order they appear in the game-log row.
# The visitor block starts at field 21 and the home block at field 49.
_offense_stats = ['ab', 'h', '2b', '3b', 'hr', 'rbi', 'sac_bunt', 'sac_fly',
                  'hbp', 'bb', 'ibb', 'k', 'sb', 'cs', 'gidp', 'catch_int', 'lob']

# Maps positional field number in the header-less game-log file to a column name.
col_rename_dict = {
    0: 'date',
    1: 'game_num',
    3: 'v_team',
    5: 'v_game_num',
    6: 'h_team',
    8: 'h_game_num',
    9: 'v_score',
    10: 'h_score',
    13: 'completion_info',
    **{21 + offset: 'v_' + stat for offset, stat in enumerate(_offense_stats)},
    **{49 + offset: 'h_' + stat for offset, stat in enumerate(_offense_stats)},
    101: 'v_sp_id',
    102: 'v_sp_name',
    103: 'h_sp_id',
    104: 'h_sp_name',
}

# Positional indices of the game-log fields to keep (same fields as the keys above).
field_nos = [0, 1, 3, 5, 6, 8, 9, 10, 13, *range(21, 38), *range(49, 66), *range(101, 105)]

# Columns read from each daily pitcher-stats file.
pitch_pull_cols = ['H', 'BB', 'SO', 'HR', 'HBP', 'AB', '2B', '3B', 'IBB', 'SF', 'SB', 'GB/FB', 'LD', 'SO/W']

# Starting-pitcher feature columns, shared by the home and visiting sides.
_sp_features = ['sp_name', 'sp_obp', 'sp_slg', 'sp_gb/fb', 'sp_ld', 'sp_k/bb']

# columns for pitching dataframe
pitching_col = (['date']
                + ['h_team'] + ['h_' + feature for feature in _sp_features]
                + ['v_team'] + ['v_' + feature for feature in _sp_features])

# Team hitting feature columns, shared by the home and visiting sides.
_team_features = ['obp', 'slg', 'k_rate', 'bb_rate']

# columns for hitting dataframe
hitting_col = (['date']
               + ['h_team'] + ['h_' + feature for feature in _team_features]
               + ['v_team'] + ['v_' + feature for feature in _team_features])

# target column
y_col = ['home_win']

## 2018 Pitching Data

In [3]:
# data import & cleansing
bs_2018 = pd.read_csv('data/GL2018.TXT', header=None)
bs_2018_cut = bs_2018.iloc[:,field_nos] # only columns of interest
bs_2018_cut = bs_2018_cut.loc[bs_2018_cut[13].isna(),:] # keep rows whose completion_info (field 13) is empty
# rename returns a new frame, so re-running this cell never mutates a slice in place
bs_2018_cut = bs_2018_cut.rename(columns=col_rename_dict)
keep_2018 = bs_2018_cut.loc[bs_2018_cut['date'] >= cutoff_18]

# start compiling pitching data
# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on every call.
pitching_rows = []
dates = keep_2018['date'].unique()

for date in dates:
    day = keep_2018.loc[keep_2018['date']==date,:] # all games on one day
    pitch_stats = pd.read_csv('pitch/{}.csv'.format(date)) # pitcher stats file for that date

    for _, game in day.iterrows(): # for each game on that day
        new_row_dict = {'date': date}

        for t in ('v_', 'h_'): # visiting side, then home side
            t_sp_name = game[t+'sp_name'] # starting pitcher
            new_row_dict[t+'team'] = game[t+'team']
            new_row_dict[t+'sp_name'] = t_sp_name

            is_sp = pitch_stats['Name']==t_sp_name
            if not is_sp.any():
                # starter not present in that day's stats file: leave features missing
                for attr in ['sp_obp', 'sp_slg', 'sp_ld', 'sp_gb/fb', 'sp_k/bb']:
                    new_row_dict[t+attr] = float('nan')
                continue

            # first matching row if the name appears more than once
            sp_stats = pitch_stats.loc[is_sp, pitch_pull_cols].iloc[0]

            # opposing on base percentage
            # NOTE(review): if the 'BB' column already includes intentional walks,
            # adding 'IBB' double counts them -- confirm against the stats source.
            times_on_base = sp_stats['H']+sp_stats['BB']+sp_stats['IBB']+sp_stats['HBP']
            obp_chances = sp_stats['AB']+sp_stats['BB']+sp_stats['IBB']+sp_stats['HBP']+sp_stats['SF']
            new_row_dict[t+'sp_obp'] = round(times_on_base/obp_chances, 3)

            # opposing slugging: total bases / at bats.
            # Singles must be used here rather than all hits, otherwise every
            # extra-base hit is counted twice (once in H and again with its weight).
            n_1b = sp_stats['H']-(sp_stats['2B']+sp_stats['3B']+sp_stats['HR'])
            total_bases = n_1b + 2*sp_stats['2B'] + 3*sp_stats['3B'] + 4*sp_stats['HR']
            new_row_dict[t+'sp_slg'] = round(total_bases/sp_stats['AB'], 3)

            # line drive rate
            new_row_dict[t+'sp_ld'] = sp_stats['LD']
            # groundball to flyball ratio
            new_row_dict[t+'sp_gb/fb'] = sp_stats['GB/FB']
            # strikeout to walk ratio
            new_row_dict[t+'sp_k/bb'] = sp_stats['SO/W']

        pitching_rows.append(new_row_dict)

# columns= fixes the column order and keeps the header even when no rows were collected
pitching_2018 = pd.DataFrame(pitching_rows, columns=pitching_col)
pitching_2018.to_csv(path_or_buf = 'data/pitching_2018.csv', index=False)

## 2017 Pitching Data

In [5]:
# data import & cleansing
bs_2017 = pd.read_csv('data/GL2017.TXT', header=None)
bs_2017_cut = bs_2017.iloc[:,field_nos] # only columns of interest
bs_2017_cut = bs_2017_cut.loc[bs_2017_cut[13].isna(),:] # keep rows whose completion_info (field 13) is empty
# rename returns a new frame, so re-running this cell never mutates a slice in place
bs_2017_cut = bs_2017_cut.rename(columns=col_rename_dict)
keep_2017 = bs_2017_cut.loc[bs_2017_cut['date'] >= cutoff_17]

# start compiling pitching data
# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on every call.
pitching_rows = []
dates = keep_2017['date'].unique()

for date in dates:
    day = keep_2017.loc[keep_2017['date']==date,:] # all games on one day
    pitch_stats = pd.read_csv('pitch/{}.csv'.format(date)) # pitcher stats file for that date

    for _, game in day.iterrows(): # for each game on that day
        new_row_dict = {'date': date}

        for t in ('v_', 'h_'): # visiting side, then home side
            t_sp_name = game[t+'sp_name'] # starting pitcher
            new_row_dict[t+'team'] = game[t+'team']
            new_row_dict[t+'sp_name'] = t_sp_name

            is_sp = pitch_stats['Name']==t_sp_name
            if not is_sp.any():
                # starter not present in that day's stats file: leave features missing
                for attr in ['sp_obp', 'sp_slg', 'sp_ld', 'sp_gb/fb', 'sp_k/bb']:
                    new_row_dict[t+attr] = float('nan')
                continue

            # first matching row if the name appears more than once
            sp_stats = pitch_stats.loc[is_sp, pitch_pull_cols].iloc[0]

            # opposing on base percentage
            # NOTE(review): if the 'BB' column already includes intentional walks,
            # adding 'IBB' double counts them -- confirm against the stats source.
            times_on_base = sp_stats['H']+sp_stats['BB']+sp_stats['IBB']+sp_stats['HBP']
            obp_chances = sp_stats['AB']+sp_stats['BB']+sp_stats['IBB']+sp_stats['HBP']+sp_stats['SF']
            new_row_dict[t+'sp_obp'] = round(times_on_base/obp_chances, 3)

            # opposing slugging: total bases / at bats.
            # Singles must be used here rather than all hits, otherwise every
            # extra-base hit is counted twice (once in H and again with its weight).
            n_1b = sp_stats['H']-(sp_stats['2B']+sp_stats['3B']+sp_stats['HR'])
            total_bases = n_1b + 2*sp_stats['2B'] + 3*sp_stats['3B'] + 4*sp_stats['HR']
            new_row_dict[t+'sp_slg'] = round(total_bases/sp_stats['AB'], 3)

            # line drive rate
            new_row_dict[t+'sp_ld'] = sp_stats['LD']
            # groundball to flyball ratio
            new_row_dict[t+'sp_gb/fb'] = sp_stats['GB/FB']
            # strikeout to walk ratio
            new_row_dict[t+'sp_k/bb'] = sp_stats['SO/W']

        pitching_rows.append(new_row_dict)

# columns= fixes the column order and keeps the header even when no rows were collected
pitching_2017 = pd.DataFrame(pitching_rows, columns=pitching_col)
pitching_2017.to_csv(path_or_buf = 'data/pitching_2017.csv', index=False)

## 2016 Pitching Data

In [7]:
# data import & cleansing
bs_2016 = pd.read_csv('data/GL2016.TXT', header=None)
bs_2016_cut = bs_2016.iloc[:,field_nos] # only columns of interest
bs_2016_cut = bs_2016_cut.loc[bs_2016_cut[13].isna(),:] # keep rows whose completion_info (field 13) is empty
# rename returns a new frame, so re-running this cell never mutates a slice in place
bs_2016_cut = bs_2016_cut.rename(columns=col_rename_dict)
keep_2016 = bs_2016_cut.loc[bs_2016_cut['date'] >= cutoff_16]

# start compiling pitching data
# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on every call.
pitching_rows = []
dates = keep_2016['date'].unique()

for date in dates:
    day = keep_2016.loc[keep_2016['date']==date,:] # all games on one day
    pitch_stats = pd.read_csv('pitch/{}.csv'.format(date)) # pitcher stats file for that date

    for _, game in day.iterrows(): # for each game on that day
        new_row_dict = {'date': date}

        for t in ('v_', 'h_'): # visiting side, then home side
            t_sp_name = game[t+'sp_name'] # starting pitcher
            new_row_dict[t+'team'] = game[t+'team']
            new_row_dict[t+'sp_name'] = t_sp_name

            is_sp = pitch_stats['Name']==t_sp_name
            if not is_sp.any():
                # starter not present in that day's stats file: leave features missing
                for attr in ['sp_obp', 'sp_slg', 'sp_ld', 'sp_gb/fb', 'sp_k/bb']:
                    new_row_dict[t+attr] = float('nan')
                continue

            # first matching row if the name appears more than once
            sp_stats = pitch_stats.loc[is_sp, pitch_pull_cols].iloc[0]

            # opposing on base percentage
            # NOTE(review): if the 'BB' column already includes intentional walks,
            # adding 'IBB' double counts them -- confirm against the stats source.
            times_on_base = sp_stats['H']+sp_stats['BB']+sp_stats['IBB']+sp_stats['HBP']
            obp_chances = sp_stats['AB']+sp_stats['BB']+sp_stats['IBB']+sp_stats['HBP']+sp_stats['SF']
            new_row_dict[t+'sp_obp'] = round(times_on_base/obp_chances, 3)

            # opposing slugging: total bases / at bats.
            # Singles must be used here rather than all hits, otherwise every
            # extra-base hit is counted twice (once in H and again with its weight).
            n_1b = sp_stats['H']-(sp_stats['2B']+sp_stats['3B']+sp_stats['HR'])
            total_bases = n_1b + 2*sp_stats['2B'] + 3*sp_stats['3B'] + 4*sp_stats['HR']
            new_row_dict[t+'sp_slg'] = round(total_bases/sp_stats['AB'], 3)

            # line drive rate
            new_row_dict[t+'sp_ld'] = sp_stats['LD']
            # groundball to flyball ratio
            new_row_dict[t+'sp_gb/fb'] = sp_stats['GB/FB']
            # strikeout to walk ratio
            new_row_dict[t+'sp_k/bb'] = sp_stats['SO/W']

        pitching_rows.append(new_row_dict)

# columns= fixes the column order and keeps the header even when no rows were collected
pitching_2016 = pd.DataFrame(pitching_rows, columns=pitching_col)
pitching_2016.to_csv(path_or_buf = 'data/pitching_2016.csv', index=False)

# Hitting Data Prep

## 2018 Hitting Data

In [12]:
# Team hitting features for every kept 2018 game: each side's batting totals
# over its previous `lookback` games, plus the home_win target.

# list of offensive stats (removing v_ from column names);
# positions 9-25 of the kept columns are the visitor batting fields
hit_stats = [col[2:] for col in keep_2018.columns.values[9:26]]

# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on every call.
hitting_rows = []

for _, game in keep_2018.iterrows():
    # to be used to add new row to hitting dataframe
    new_row_dict = {
        'date': game['date'],
        'home_win': int(game['h_score']>game['v_score']), # 0 covers both a loss and a tie
        'h_team': game['h_team'],
        'v_team': game['v_team'],
    }

    for t in ('v_', 'h_'):
        t_name = game[t+'team']
        t_game_num = game[t+'game_num']

        # boolean for filtering last n games
        # The search runs over bs_2018_cut (not keep_2018) so games before the
        # date cutoff still count towards the window.
        # games when team was away
        away = (bs_2018_cut['v_team']==t_name) & (bs_2018_cut['v_game_num']>=(t_game_num-lookback)) & (bs_2018_cut['v_game_num']<t_game_num)
        # when team was home
        home = (bs_2018_cut['h_team']==t_name) & (bs_2018_cut['h_game_num']>=(t_game_num-lookback)) & (bs_2018_cut['h_game_num']<t_game_num)

        last_away = bs_2018_cut.loc[away,:]
        last_home = bs_2018_cut.loc[home,:]

        # totals over the last n games for given team, combining its away and home appearances
        totals = pd.Series({stat: last_away['v_'+stat].sum() + last_home['h_'+stat].sum()
                            for stat in hit_stats})

        # team on base percentage
        obp = (totals['h']+totals['bb']+totals['ibb']+totals['hbp'])/(totals['ab']+totals['bb']+totals['ibb']+totals['hbp']+totals['sac_fly'])
        new_row_dict[t+'obp'] = round(obp,3)

        # team slugging percentage
        n_1b = totals['h']-(totals['2b']+totals['3b']+totals['hr'])
        slg = (n_1b + 2*totals['2b'] + 3*totals['3b'] + 4*totals['hr'])/totals['ab']
        new_row_dict[t+'slg'] = round(slg,3)

        # team strikeout rate
        # NOTE : PA calculation excludes reaching on fielding error
        pa = totals['ab']+totals['bb']+totals['ibb']+totals['sac_bunt']+totals['sac_fly']+totals['hbp']+totals['catch_int']
        k_rate = totals['k']/pa
        new_row_dict[t+'k_rate'] = round(k_rate,3)

        # team walk rate
        bb_rate = (totals['bb']+totals['ibb'])/pa
        new_row_dict[t+'bb_rate'] = round(bb_rate,3)

    hitting_rows.append(new_row_dict)

# columns= fixes the column order and keeps the header even when no rows were collected
hitting_2018 = pd.DataFrame(hitting_rows, columns=(hitting_col+y_col))
hitting_2018.to_csv(path_or_buf = 'data/hitting_2018.csv', index=False)

## 2017 Hitting Data

In [13]:
# Team hitting features for every kept 2017 game: each side's batting totals
# over its previous `lookback` games, plus the home_win target.

# list of offensive stats (removing v_ from column names);
# positions 9-25 of the kept columns are the visitor batting fields
hit_stats = [col[2:] for col in keep_2017.columns.values[9:26]]

# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on every call.
hitting_rows = []

for _, game in keep_2017.iterrows():
    # to be used to add new row to hitting dataframe
    new_row_dict = {
        'date': game['date'],
        'home_win': int(game['h_score']>game['v_score']), # 0 covers both a loss and a tie
        'h_team': game['h_team'],
        'v_team': game['v_team'],
    }

    for t in ('v_', 'h_'):
        t_name = game[t+'team']
        t_game_num = game[t+'game_num']

        # boolean for filtering last n games
        # The search runs over bs_2017_cut (not keep_2017) so games before the
        # date cutoff still count towards the window.
        # games when team was away
        away = (bs_2017_cut['v_team']==t_name) & (bs_2017_cut['v_game_num']>=(t_game_num-lookback)) & (bs_2017_cut['v_game_num']<t_game_num)
        # when team was home
        home = (bs_2017_cut['h_team']==t_name) & (bs_2017_cut['h_game_num']>=(t_game_num-lookback)) & (bs_2017_cut['h_game_num']<t_game_num)

        last_away = bs_2017_cut.loc[away,:]
        last_home = bs_2017_cut.loc[home,:]

        # totals over the last n games for given team, combining its away and home appearances
        totals = pd.Series({stat: last_away['v_'+stat].sum() + last_home['h_'+stat].sum()
                            for stat in hit_stats})

        # team on base percentage
        obp = (totals['h']+totals['bb']+totals['ibb']+totals['hbp'])/(totals['ab']+totals['bb']+totals['ibb']+totals['hbp']+totals['sac_fly'])
        new_row_dict[t+'obp'] = round(obp,3)

        # team slugging percentage
        n_1b = totals['h']-(totals['2b']+totals['3b']+totals['hr'])
        slg = (n_1b + 2*totals['2b'] + 3*totals['3b'] + 4*totals['hr'])/totals['ab']
        new_row_dict[t+'slg'] = round(slg,3)

        # team strikeout rate
        # NOTE : PA calculation excludes reaching on fielding error
        pa = totals['ab']+totals['bb']+totals['ibb']+totals['sac_bunt']+totals['sac_fly']+totals['hbp']+totals['catch_int']
        k_rate = totals['k']/pa
        new_row_dict[t+'k_rate'] = round(k_rate,3)

        # team walk rate
        bb_rate = (totals['bb']+totals['ibb'])/pa
        new_row_dict[t+'bb_rate'] = round(bb_rate,3)

    hitting_rows.append(new_row_dict)

# columns= fixes the column order and keeps the header even when no rows were collected
hitting_2017 = pd.DataFrame(hitting_rows, columns=(hitting_col+y_col))
hitting_2017.to_csv(path_or_buf = 'data/hitting_2017.csv', index=False)

## 2016 Hitting Data

In [14]:
# Team hitting features for every kept 2016 game: each side's batting totals
# over its previous `lookback` games, plus the home_win target.

# list of offensive stats (removing v_ from column names);
# positions 9-25 of the kept columns are the visitor batting fields
hit_stats = [col[2:] for col in keep_2016.columns.values[9:26]]

# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on every call.
hitting_rows = []

for _, game in keep_2016.iterrows():
    # to be used to add new row to hitting dataframe
    new_row_dict = {
        'date': game['date'],
        'home_win': int(game['h_score']>game['v_score']), # 0 covers both a loss and a tie
        'h_team': game['h_team'],
        'v_team': game['v_team'],
    }

    for t in ('v_', 'h_'):
        t_name = game[t+'team']
        t_game_num = game[t+'game_num']

        # boolean for filtering last n games
        # The search runs over bs_2016_cut (not keep_2016) so games before the
        # date cutoff still count towards the window.
        # games when team was away
        away = (bs_2016_cut['v_team']==t_name) & (bs_2016_cut['v_game_num']>=(t_game_num-lookback)) & (bs_2016_cut['v_game_num']<t_game_num)
        # when team was home
        home = (bs_2016_cut['h_team']==t_name) & (bs_2016_cut['h_game_num']>=(t_game_num-lookback)) & (bs_2016_cut['h_game_num']<t_game_num)

        last_away = bs_2016_cut.loc[away,:]
        last_home = bs_2016_cut.loc[home,:]

        # totals over the last n games for given team, combining its away and home appearances
        totals = pd.Series({stat: last_away['v_'+stat].sum() + last_home['h_'+stat].sum()
                            for stat in hit_stats})

        # team on base percentage
        obp = (totals['h']+totals['bb']+totals['ibb']+totals['hbp'])/(totals['ab']+totals['bb']+totals['ibb']+totals['hbp']+totals['sac_fly'])
        new_row_dict[t+'obp'] = round(obp,3)

        # team slugging percentage
        n_1b = totals['h']-(totals['2b']+totals['3b']+totals['hr'])
        slg = (n_1b + 2*totals['2b'] + 3*totals['3b'] + 4*totals['hr'])/totals['ab']
        new_row_dict[t+'slg'] = round(slg,3)

        # team strikeout rate
        # NOTE : PA calculation excludes reaching on fielding error
        pa = totals['ab']+totals['bb']+totals['ibb']+totals['sac_bunt']+totals['sac_fly']+totals['hbp']+totals['catch_int']
        k_rate = totals['k']/pa
        new_row_dict[t+'k_rate'] = round(k_rate,3)

        # team walk rate
        bb_rate = (totals['bb']+totals['ibb'])/pa
        new_row_dict[t+'bb_rate'] = round(bb_rate,3)

    hitting_rows.append(new_row_dict)

# columns= fixes the column order and keeps the header even when no rows were collected
hitting_2016 = pd.DataFrame(hitting_rows, columns=(hitting_col+y_col))
hitting_2016.to_csv(path_or_buf = 'data/hitting_2016.csv', index=False)