In [None]:
def make_game_batting_order(game_df, pitchers=None):
    """Add batting-order features to the pitches of a single game.

    The rows are sorted by at-bat and pitch number, ``at_bat_number`` is
    renumbered 1..N in that order, and two columns are added:

    * ``batting_order_slot`` -- 1-9 for the batters seen in the first nine
      (renumbered) at-bats, in order of first appearance; ``'PH'`` for any
      batter who first appears later; ``'pitcher'`` whenever the batter is in
      ``pitchers`` (this overrides the other two values).
    * ``pitcher_AB`` -- True when the batter is in ``pitchers``.

    Args:
        game_df: DataFrame with one game's pitches; must contain the
            ``at_bat_number``, ``pitch_number`` and ``batter`` columns.
        pitchers: Optional collection of batter ids that are pitchers. When
            omitted, the module-level ``pitcher_list`` is used, which is what
            this function always did before the parameter existed.

    Returns:
        A new, sorted DataFrame with the columns described above, or ``None``
        when a batter id cannot be found in the batting-order map
        (presumably only possible for missing/NaN batter ids -- TODO confirm).
    """
    if pitchers is None:
        # Backward compatible fallback to the notebook-level global.
        pitchers = pitcher_list

    game_df = game_df.sort_values(by=['at_bat_number', 'pitch_number'])

    # Renumber the at-bats so they run 1..N in sorted order.
    at_bat_keys = game_df['at_bat_number'].unique().tolist()
    at_bat_map = dict(zip(at_bat_keys, range(1, len(at_bat_keys) + 1)))
    game_df['at_bat_number'] = game_df['at_bat_number'].map(at_bat_map)

    # The batters of the first nine at-bats define batting-order slots 1-9.
    in_first_nine = game_df['at_bat_number'] < 10
    first_9_batters = game_df.loc[in_first_nine, 'batter'].unique().tolist()
    batting_order_map = dict(zip(first_9_batters, range(1, 10)))

    # Anyone who first bats later in the game is labelled 'PH' (pinch hitter).
    for batter in game_df['batter'].unique().tolist():
        batting_order_map.setdefault(batter, 'PH')

    try:
        slots = [batting_order_map[batter] for batter in game_df['batter']]
    except KeyError:
        # Callers treat a missing result as "skip this game".
        return None
    game_df['batting_order_slot'] = slots

    game_df['pitcher_AB'] = game_df['batter'].apply(lambda batter: batter in pitchers)
    # At-bats taken by a pitcher get their own label regardless of lineup slot.
    game_df['batting_order_slot'] = game_df['batting_order_slot'].mask(
        game_df['pitcher_AB'].astype(bool), 'pitcher')
    return game_df

In [None]:
def _last_pitches_summary(pitch_labels, pitch_results, start, stop):
    """Summarise the pitches at positions ``start`` (inclusive) to ``stop`` (exclusive).

    Returns a ``(shares, strike_perc)`` tuple: ``shares`` maps every pitch label
    present in the window to its fraction of the window, and ``strike_perc`` is
    the percentage of results in the window equal to ``'S'`` (0 if there are none).
    """
    shares = pitch_labels.iloc[start:stop].value_counts(normalize=True).to_dict()
    strike_share = pitch_results.iloc[start:stop].value_counts(normalize=True).get('S', 0)
    return shares, strike_share * 100


def make_game_pitchcount_and_trailing_pitch_features(pitcher_df, pitcher_list):
    """Add per-game pitch count, previous-pitch and trailing-5-pitch features.

    For every ``game_pk`` in ``pitcher_df`` (in order of first appearance):

    * ``pitch_count`` numbers the game's rows 1..N in their existing order.
    * ``L1_pitch_type`` / ``L1_pitch_result`` / ``L1_pitch_zone`` hold the
      previous row's ``pitch_type`` / ``type`` / ``zone``. For the first pitch
      of a game the type is drawn at random using the overall tendencies as
      weights, the result is ``'first pitch'`` and the zone is -1.
    * ``L5_<pitch>_perc`` and ``L5_strike_perc`` are percentages over the five
      previous pitches; the first five pitches of a game use the overall
      tendencies and the overall strike percentage instead.
    * ``make_game_batting_order`` is applied; games for which it returns
      ``None`` or an empty frame are reported and left out.

    Args:
        pitcher_df: DataFrame of pitches; must contain ``game_pk``,
            ``pitch_type``, ``type`` and ``zone`` plus whatever
            ``get_pitch_tendencies`` and ``make_game_batting_order`` read.
        pitcher_list: Accepted for interface compatibility; it is not
            referenced in this function's body.

    Returns:
        The processed games concatenated in game order. If every game was
        skipped, an empty frame with the input's columns is returned.

    Raises:
        KeyError: if no row of ``pitcher_df`` has ``type == 'S'``.
    """
    df = pitcher_df.copy()

    print('#pitches in df before: ' + str(len(df)))

    # NOTE(review): assumes the first return value maps pitch type -> overall
    # share as a fraction (it is multiplied by 100 below) -- confirm.
    pitcher_tendencies_overall, _ = get_pitch_tendencies(df)
    pitch_types = list(pitcher_tendencies_overall.keys())
    pitch_weights = list(pitcher_tendencies_overall.values())

    # Overall strike % -- stands in for L5_strike_perc on a game's first 5 pitches.
    overall_strike_perc = df['type'].value_counts(normalize=True)['S'] * 100

    processed_games = []
    for game in df['game_pk'].unique().tolist():
        game_df = df[df['game_pk'] == game].copy()
        n_pitches = game_df.shape[0]
        if n_pitches == 0:
            # e.g. a NaN game_pk matches no rows
            continue
        game_df['pitch_count'] = range(1, n_pitches + 1)

        # Previous-pitch features; every game uses the same sentinels.
        game_df['L1_pitch_type'] = game_df['pitch_type'].shift(periods=1)
        game_df['L1_pitch_result'] = game_df['type'].shift(periods=1).replace({np.nan: 'first pitch'})
        game_df['L1_pitch_zone'] = game_df['zone'].shift(periods=1).fillna(-1)

        # The first pitch has no predecessor: draw one from the overall tendencies.
        first_pitch_guess = random.choices(population=pitch_types, weights=pitch_weights, k=1)[0]
        game_df.iloc[0, game_df.columns.get_loc('L1_pitch_type')] = first_pitch_guess

        feature_values = {}
        for pitch in pitch_types:
            feature_values['L5_' + pitch + '_perc'] = []
            # Registered right after the first pitch column to keep the
            # column order this function has always produced.
            feature_values.setdefault('L5_strike_perc', [])

        pitch_labels = game_df['pitch_type']
        pitch_results = game_df['type']
        for position in range(n_pitches):
            if position < 5:
                shares, strike_perc = pitcher_tendencies_overall, overall_strike_perc
            else:
                # pitch_count is 1..N in row order, so the previous five
                # pitches are simply the five preceding positions.
                shares, strike_perc = _last_pitches_summary(
                    pitch_labels, pitch_results, position - 5, position)
            feature_values['L5_strike_perc'].append(strike_perc)
            for pitch in pitch_types:
                feature_values['L5_' + pitch + '_perc'].append(shares.get(pitch, 0) * 100)

        for column, values in feature_values.items():
            game_df[column] = values

        # apply the batting order features to the game
        game_df = make_game_batting_order(game_df)
        if game_df is None or game_df.empty:
            print('skipping game because of bat data: ' + str(game))
            continue

        processed_games.append(game_df)

    if processed_games:
        new_df = pd.concat(processed_games)
    else:
        new_df = df.iloc[0:0].copy()

    print('# pitches in df after: ' + str(len(new_df)))

    return new_df

In [None]:
# Batter-level columns whose NaNs fill_batting_nans replaces with
# per-batting-order-slot values, grouped by pitch category.
batter_cols = (
    # fastballs
    ['fastball_perc_faced', 'fastball_chase_perc', 'fastball_bip_swung_perc',
     'fastball_taken_strike_perc', 'fastball_est_woba', 'fastball_babip',
     'fastball_iso_value']
    # breaking balls
    + ['breaking_perc_faced', 'breaking_chase_perc', 'breaking_bip_swung_perc',
       'breaking_taken_strike_perc', 'breaking_est_woba', 'breaking_babip',
       'breaking_iso_value']
    # offspeed pitches
    + ['offspeed_perc_faced', 'offspeed_chase_perc', 'offspeed_bip_swung_perc',
       'offspeed_taken_strike_perc', 'offspeed_est_woba', 'offspeed_babip',
       'offspeed_iso_value']
    # pitchouts
    + ['pitchout_perc_faced']
)

def fill_batting_nans(pitcher_df, batting_order_slot_map, cols=None):
    """Fill missing batter statistics with per-batting-order-slot values.

    For every distinct ``batting_order_slot`` value, NaNs in each column of
    ``cols`` are replaced by ``batting_order_slot_map[slot][col]``.

    Rows are selected by position rather than by index label, so frames with a
    non-unique index keep all of their rows.

    Args:
        pitcher_df: DataFrame with a ``batting_order_slot`` column and the
            columns listed in ``cols``. It is not modified.
        batting_order_slot_map: ``{slot: {column: fill_value}}``.
        cols: Columns to fill. Defaults to the module-level ``batter_cols``.

    Returns:
        A new DataFrame whose rows are grouped by slot, in order of each
        slot's first appearance (rows with a missing slot, if any, come first
        and are left unfilled). The original index labels are kept.

    Raises:
        KeyError: if a slot or column is absent from ``batting_order_slot_map``.
    """
    df = pitcher_df.copy()
    fill_cols = batter_cols if cols is None else cols
    slot_values = df['batting_order_slot']

    # Rows without a slot match no slot value below; keep them, unfilled, in front.
    pieces = [df[slot_values.isna().values]]
    for slot in slot_values.unique().tolist():
        # .values -> positional boolean mask, safe with duplicate index labels
        piece = df[(slot_values == slot).values].copy()
        for col in fill_cols:
            piece[col] = piece[col].fillna(batting_order_slot_map[slot][col])
        pieces.append(piece)
        print('finished w/ slot: ' + str(slot))

    pieces = [piece for piece in pieces if not piece.empty]
    if not pieces:
        return df
    return pd.concat(pieces)

In [None]:
def _trailing_window_summary(pitch_labels, pitch_results, position, window):
    """Summarise up to ``window`` pitches immediately before ``position``.

    ``position`` is the 0-based row position of the current pitch. Returns a
    ``(shares, strike_perc)`` tuple: ``shares`` maps every pitch label present
    in the window to its fraction of the window, and ``strike_perc`` is the
    percentage of results in the window equal to ``'S'`` (0 if there are none).
    """
    start = max(0, position - window)
    shares = pitch_labels.iloc[start:position].value_counts(normalize=True).to_dict()
    strike_share = pitch_results.iloc[start:position].value_counts(normalize=True).get('S', 0)
    return shares, strike_share * 100


def make_game_pitchcount_and_trailing_pitch_features_and_batting_order(pitcher_df, pitcher_list):
    """Add per-game pitch count, previous-pitch, trailing-window and batting-order features.

    For every ``game_pk`` in ``pitcher_df`` (in order of first appearance):

    * ``pitch_count`` numbers the game's rows 1..N in their existing order.
    * ``L1_pitch_type`` / ``L1_pitch_result`` hold the previous row's
      ``pitch_cat`` / ``type``; ``L1_pitch_zone`` and ``L1_ball_high`` /
      ``_low`` / ``_left`` / ``_right`` hold the previous row's ``zone`` and
      ``ball_*`` values. For the first pitch of a game the type is drawn at
      random using the overall tendencies as weights, the result is
      ``'first pitch'`` and the other L1 columns are -1.
    * ``L5_<pitch>_perc`` / ``L5_strike_perc`` cover the five previous pitches
      and ``L15_<pitch>_perc`` / ``L15_strike_perc`` up to fifteen previous
      pitches (all previous pitches while fewer than fifteen exist). The first
      five pitches of a game use the overall tendencies and overall strike
      percentage for both windows.
    * ``make_game_batting_order`` is applied; games for which it returns
      ``None`` or an empty frame are reported and left out.

    Args:
        pitcher_df: DataFrame of pitches; must contain ``game_pk``,
            ``game_date``, ``pitch_cat``, ``type``, ``zone`` and the four
            ``ball_*`` columns, plus whatever ``get_pitch_tendencies`` and
            ``make_game_batting_order`` read.
        pitcher_list: Accepted for interface compatibility; it is not
            referenced in this function's body.

    Returns:
        All processed games, sorted by ``game_date``, ``game_pk``,
        ``at_bat_number`` and ``pitch_number``. If no game could be processed,
        an empty frame with the input's columns is returned.

    Raises:
        KeyError: if no row of ``pitcher_df`` has ``type == 'S'``.
    """
    df = pitcher_df.copy()
    all_games = []

    print('#pitches in df before: ' + str(len(df)))

    # NOTE(review): assumes the first return value maps pitch category ->
    # overall share as a fraction (it is multiplied by 100 below) -- confirm.
    pitcher_tendencies_overall, _ = get_pitch_tendencies(df)
    pitch_types = list(pitcher_tendencies_overall.keys())
    pitch_weights = list(pitcher_tendencies_overall.values())

    # Overall strike % -- stands in for the trailing strike % on a game's
    # first 5 pitches. It does not depend on the game, so compute it once.
    overall_strike_perc = df['type'].value_counts(normalize=True)['S'] * 100

    l1_numeric_cols = ['L1_pitch_zone', 'L1_ball_high', 'L1_ball_low', 'L1_ball_left', 'L1_ball_right']

    for game in df['game_pk'].unique().tolist():
        game_df = df[df['game_pk'] == game].copy()
        n_pitches = game_df.shape[0]
        if n_pitches == 0:
            # e.g. a NaN game_pk matches no rows
            continue
        game_df['pitch_count'] = range(1, n_pitches + 1)

        # Previous-pitch features.
        game_df['L1_pitch_type'] = game_df['pitch_cat'].shift(periods=1)
        game_df['L1_pitch_result'] = game_df['type'].shift(periods=1).replace({np.nan: 'first pitch'})
        game_df['L1_pitch_zone'] = game_df['zone'].shift(periods=1)
        game_df['L1_ball_high'] = game_df['ball_high'].shift(periods=1)
        game_df['L1_ball_low'] = game_df['ball_low'].shift(periods=1)
        game_df['L1_ball_left'] = game_df['ball_left'].shift(periods=1)
        game_df['L1_ball_right'] = game_df['ball_right'].shift(periods=1)
        game_df[l1_numeric_cols] = game_df[l1_numeric_cols].fillna(-1)

        # The first pitch has no predecessor: draw one from the overall tendencies.
        first_pitch_guess = random.choices(population=pitch_types, weights=pitch_weights, k=1)[0]
        game_df.iloc[0, game_df.columns.get_loc('L1_pitch_type')] = first_pitch_guess

        # Registered in the column order this function has always produced.
        feature_values = {}
        for pitch in pitch_types:
            feature_values['L5_' + pitch + '_perc'] = []
            feature_values['L15_' + pitch + '_perc'] = []
        feature_values['L5_strike_perc'] = []
        feature_values['L15_strike_perc'] = []

        pitch_labels = game_df['pitch_cat']
        pitch_results = game_df['type']
        for position in range(n_pitches):
            for window in (5, 15):
                prefix = 'L' + str(window) + '_'
                if position < 5:
                    shares, strike_perc = pitcher_tendencies_overall, overall_strike_perc
                else:
                    # pitch_count is 1..N in row order, so the previous
                    # pitches are simply the preceding positions.
                    shares, strike_perc = _trailing_window_summary(
                        pitch_labels, pitch_results, position, window)
                feature_values[prefix + 'strike_perc'].append(strike_perc)
                for pitch in pitch_types:
                    feature_values[prefix + pitch + '_perc'].append(shares.get(pitch, 0) * 100)

        for column, values in feature_values.items():
            game_df[column] = values

        # apply the batting order features to the game
        game_df = make_game_batting_order(game_df)
        if game_df is None or game_df.empty:
            print('skipping game because of bat data: ' + str(game))
            continue

        all_games.append(game_df)

    if all_games:
        new_df = pd.concat(all_games).sort_values(by=['game_date', 'game_pk', 'at_bat_number', 'pitch_number'])
    else:
        new_df = df.iloc[0:0].copy()

    print('# pitches in df after: ' + str(len(new_df)))

    return new_df

In [None]:
def make_prev_ab_walk_basehit_run_and_homerun_features(pitcher_df):
    """Flag, for every pitch, what happened in the previous at-bat of the same game.

    Five boolean columns are added (all False for a game's first at-bat):

    * ``prev_ab_walk`` -- previous at-bat ended in ``walk`` or ``hit_by_pitch``.
    * ``prev_ab_basehit`` -- it ended in ``single``/``double``/``triple``/``home_run``.
    * ``prev_ab_strikeout`` -- it ended in ``strikeout``.
    * ``prev_ab_run_scored`` -- ``bat_score`` at the start of this at-bat is
      higher than at the start of the previous one.
    * ``prev_ab_homerun`` -- a run scored and the previous at-bat ended in
      ``home_run``.

    "Previous at-bat" is the next-lower ``at_bat_number`` present in the game,
    and its outcome is the ``events`` value on its highest ``pitch_number``.

    Args:
        pitcher_df: DataFrame with ``game_pk``, ``game_date``,
            ``at_bat_number``, ``pitch_number``, ``pitch_count``, ``events``
            and ``bat_score`` columns. It is not modified.

    Returns:
        A new DataFrame with the five columns added, sorted by ``game_date``,
        ``game_pk`` and ``pitch_count``. An empty input yields an empty copy
        with the five columns added.
    """
    walk_events = ('walk', 'hit_by_pitch')
    basehit_events = ('single', 'double', 'triple', 'home_run')
    flag_columns = ['prev_ab_run_scored', 'prev_ab_homerun', 'prev_ab_walk',
                    'prev_ab_basehit', 'prev_ab_strikeout']

    all_games = []
    for game in pitcher_df['game_pk'].unique():
        game_df = pitcher_df[pitcher_df['game_pk'] == game].copy()
        for column in flag_columns:
            game_df[column] = False

        at_bat_numbers = game_df['at_bat_number']
        at_bats = at_bat_numbers.dropna().sort_values().unique()

        # Walk consecutive (previous, current) at-bat pairs, starting with the
        # game's second at-bat.
        for prev_ab, ab in zip(at_bats[:-1], at_bats[1:]):
            # Positional masks: safe even when index labels repeat.
            prev_rows = game_df[(at_bat_numbers == prev_ab).values]
            in_current_ab = (at_bat_numbers == ab).values

            # Outcome of the previous at-bat = event on its last pitch.
            last_pitch_pos = prev_rows['pitch_number'].values.argmax()
            prev_event = prev_rows['events'].values[last_pitch_pos]

            if prev_event in walk_events:
                game_df.loc[in_current_ab, 'prev_ab_walk'] = True
            elif prev_event in basehit_events:
                game_df.loc[in_current_ab, 'prev_ab_basehit'] = True
            elif prev_event == 'strikeout':
                game_df.loc[in_current_ab, 'prev_ab_strikeout'] = True

            # A run scored in the previous at-bat if the batting team's score
            # went up between the starts of the two at-bats.
            prev_score = prev_rows['bat_score'].values[0]
            current_score = game_df.loc[in_current_ab, 'bat_score'].values[0]
            if current_score > prev_score:
                game_df.loc[in_current_ab, 'prev_ab_run_scored'] = True
                if prev_event == 'home_run':
                    game_df.loc[in_current_ab, 'prev_ab_homerun'] = True

        all_games.append(game_df)

    if not all_games:
        # pd.concat refuses an empty list; hand back an empty frame instead.
        empty_result = pitcher_df.copy()
        for column in flag_columns:
            empty_result[column] = False
        return empty_result

    return pd.concat(all_games).sort_values(by=['game_date', 'game_pk', 'pitch_count'])