In [None]:
import pandas as pd
import numpy as np
import random
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [None]:
%%time
# Load one pickled pitch-level frame per season and stack them.
df14 = pd.read_pickle('pitches_2014.pkl', compression='zip')
df15 = pd.read_pickle('pitches_2015.pkl', compression='zip')
df16 = pd.read_pickle('pitches_2016.pkl', compression='zip')
df17 = pd.read_pickle('pitches_2017.pkl', compression='zip')
df18 = pd.read_pickle('pitches_2018.pkl', compression='zip')
combined = pd.concat([df14, df15, df16, df17, df18])

#dropping a game w/ faulty data:
# Filter by value rather than by index label: pd.concat keeps each seasonal
# frame's own index, so a label-based drop would also remove every row from the
# other frames that happens to share a label with the faulty game.
combined = combined[combined['game_pk'] != 530207.0]
# NOTE(review): reset_index() keeps the old index as a column (drop=True would discard it).
combined = combined.sort_values(by = ['game_date', 'game_pk', 'at_bat_number', 'pitch_number']).reset_index()

drop_cols = ['pitcher.1','fielder_2.1', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6', 'fielder_7', 'fielder_8','fielder_9']
combined = combined.drop(columns=drop_cols)

#fix some bad data for pitch_type
# Any non-null pitch_type outside the known codes is replaced by NaN.
pitch_types = ['FF','SL', 'FT', 'CH', 'SI', 'CU', 'FC', 'KC', 'FS', 'KN', 'IN', 'FO', 'PO','EP','SC','UN', 'FA']
bad_data = combined[~(combined.pitch_type.isin(pitch_types))].pitch_type.value_counts().index.tolist()
combined['pitch_type'] = combined.pitch_type.replace(to_replace=bad_data, value=np.nan)

CPU times: user 37 s, sys: 10.7 s, total: 47.6 s
Wall time: 42.4 s


In [None]:
combined.shape

(3616851, 74)

In [None]:
combined.info()

In [None]:
#Functions
def downcast_dtypes(df):
    """Return a copy of ``df`` with smaller dtypes.

    Integer columns are downcast with ``downcast='unsigned'``, float columns
    with ``downcast='float'``, and object columns whose number of distinct
    values is below half the row count become ``category``. ``pitch_type`` is
    never converted to a category.

    The returned frame lists the untouched columns first, followed by the
    integer, float and category columns.
    """
    frame = df.copy()
    integer_names = list(frame.select_dtypes('int').columns)
    float_names = list(frame.select_dtypes('float').columns)

    # Object columns with few enough distinct values are worth categorising.
    half_rows = len(frame) / 2
    category_names = [
        name
        for name in frame.select_dtypes('object').columns
        if name != 'pitch_type' and frame[name].nunique(dropna=False) < half_rows
    ]

    shrunk_ints = frame[integer_names].apply(pd.to_numeric, downcast='unsigned')
    shrunk_floats = frame[float_names].apply(pd.to_numeric, downcast='float')
    as_categories = frame[category_names].astype('category')

    untouched = frame.drop(columns=integer_names + float_names + category_names)
    return pd.concat([untouched, shrunk_ints, shrunk_floats, as_categories], axis=1)

def make_game_features(df):
    """Return a copy of ``df`` with count, score and baserunner features.

    Changes made to the copy:
      * ``balls`` / ``strikes`` are cast to int and then to str.
      * ``_count`` is the concatenation of balls and strikes (e.g. ``'32'``).
      * ``count_cat`` maps each count to ``'neutral'``, ``'behind'`` or ``'ahead'``.
      * ``score_diff`` is ``fld_score - bat_score``.
      * ``on_1b_id`` keeps the original ``on_1b`` value; ``on_1b`` / ``on_2b`` /
        ``on_3b`` become 1 where a value was present and 0 where it was missing.
      * ``bases_loaded`` is 1 when all three base flags are 1, else 0.
    """
    features = df.copy()

    # Balls and strikes become strings so they can be concatenated into a count.
    for tally in ('balls', 'strikes'):
        features[tally] = features[tally].astype('int').astype('str')
    features['_count'] = features['balls'] + features['strikes']

    # Group the twelve possible counts by category.
    counts_by_category = (
        ('neutral', ('00', '21', '32')),
        ('behind', ('10', '20', '30', '31')),
        ('ahead', ('01', '02', '11', '12', '22')),
    )
    count_map = {count: label for label, counts in counts_by_category for count in counts}
    features['count_cat'] = features['_count'].replace(count_map)

    # balls and strikes are deliberately kept for now; revisit after checking
    # feature importances.

    features['score_diff'] = features['fld_score'] - features['bat_score']

    # Keep the runner id on first in case a speedy-runner feature is added later.
    features['on_1b_id'] = features['on_1b']

    # value * 0 + 1 is 1 where a runner id exists and NaN where it does not.
    for base in ('on_1b', 'on_2b', 'on_3b'):
        occupied = features[base] * 0 + 1
        features[base] = occupied.fillna(0)

    runners_on = features['on_1b'] + features['on_2b'] + features['on_3b']
    features['bases_loaded'] = runners_on.apply(lambda total: 1 if total == 3 else 0)

    return features

def gen_pitcher_percentages(df):
    """Build each pitcher's pitch-type distribution.

    Parameters
    ----------
    df : DataFrame with ``pitcher`` and ``pitch_type`` columns. It is only
        read, never modified, so no copy is taken.

    Returns
    -------
    dict
        ``{pitcher: {pitch_type: share}}`` where the shares are the normalized
        ``value_counts`` of that pitcher's non-null pitch types. Every value in
        ``df.pitcher.unique()`` is a key; a pitcher without any recorded pitch
        type maps to an empty dict.
    """
    # Seed every pitcher with an empty distribution so the key set and order
    # match df.pitcher.unique() even for pitchers the groupby cannot reach.
    pitcher_dict = {pitcher: {} for pitcher in df['pitcher'].unique().tolist()}

    # One pass over the frame instead of one full boolean scan per pitcher.
    for pitcher, pitches in df.groupby('pitcher', sort=False)['pitch_type']:
        pitcher_dict[pitcher] = pitches.value_counts(normalize=True).to_dict()

    # Hard-coded override for this pitcher id.
    # NOTE(review): presumably this pitcher has no usable pitch_type rows - confirm.
    if 664068.0 in pitcher_dict:
        pitcher_dict[664068.0] = {'FF': 1.0}
    return pitcher_dict

def fill_pitch_type_nans(df, pitcher_dict, seed=None):
    """Impute missing ``pitch_type`` values by sampling each pitcher's own mix.

    Parameters
    ----------
    df : DataFrame with ``pitcher`` and ``pitch_type`` columns.
    pitcher_dict : dict
        ``{pitcher: {pitch_type: weight}}``.
    seed : optional
        When given, sampling uses a private ``random.Random(seed)`` so the
        imputation is reproducible. When omitted, the module-level ``random``
        generator is used.

    Returns
    -------
    DataFrame
        A copy of ``df`` with missing pitch types filled where a distribution
        is available. Rows whose pitcher has no usable distribution are left
        as NaN, and their index and pitcher are printed.
    """
    df = df.copy()
    chooser = random.Random(seed) if seed is not None else random

    # Only the pitcher id is needed per row, so iterate over that column alone.
    missing = df['pitch_type'].isna()
    for index, pitcher in df.loc[missing, 'pitcher'].items():
        try:
            pitch_mix = pitcher_dict[pitcher]
        except KeyError:
            # Pitcher absent from the lookup: derive the mix from the frame itself.
            pitch_mix = (df.loc[df['pitcher'] == pitcher, 'pitch_type']
                         .value_counts(normalize=True)
                         .to_dict())

        # random.choices cannot sample from an empty or all-zero distribution.
        candidates = [(pitch, weight) for pitch, weight in pitch_mix.items() if weight > 0]
        if not candidates:
            print(index)
            print(pitcher)
            continue

        population, weights = zip(*candidates)
        #fill the NaN value with the randomly generated pitch
        df.at[index, 'pitch_type'] = chooser.choices(population=population,
                                                     weights=weights,
                                                     k=1)[0]
    return df

def make_pitch_type_cat(df):
    """Return a copy of ``df`` with a ``pitch_cat`` column.

    ``pitch_cat`` is ``pitch_type`` with each known code replaced by its
    category (``fastball``, ``breaking``, ``offspeed`` or ``pitchout``). Codes
    that are not listed below are carried over unchanged.
    """
    # NOTE(review): 'IN' is not listed, so it passes through as its own category.
    codes_by_category = (
        ('fastball', ('FA', 'FF', 'FT', 'FC', 'FS', 'SI', 'SF')),
        ('breaking', ('SL', 'CB', 'CU', 'SC', 'KC', 'FO')),
        ('offspeed', ('CH', 'KN', 'EP')),
        ('pitchout', ('PO',)),
    )
    category_of = {code: label for label, codes in codes_by_category for code in codes}

    categorised = df.copy()
    categorised['pitch_cat'] = categorised['pitch_type'].replace(category_of)
    return categorised

def make_strikezone_swung_and_chase_features(df):
    """Return a copy of ``df`` with swing, strike-zone and chase features.

    Columns added:
      * ``batter_swung`` - 1 when ``description`` is one of the swing outcomes.
      * ``in_strikezone`` - 0 when the pitch is above ``sz_top``, below
        ``sz_bot`` or more than 0.73 from the centre of the plate; 1 when all
        four location values are present and none of those holds; otherwise
        0 if ``type`` is ``'B'`` and NaN if the location is unknown.
      * ``chased`` - 1 when the batter swung at a pitch with ``in_strikezone == 0``.
      * ``ball_high`` / ``ball_low`` / ``ball_left`` / ``ball_right`` - the
        individual boolean out-of-zone flags.
    """
    df = df.copy()

    #create swung column
    swing_descriptions = ['foul','hit_into_play','swinging_strike','hit_into_play_no_out',
                          'hit_into_play_score','foul_tip','swinging_strike_blocked',
                          'foul_bunt','missed_bunt']
    df['batter_swung'] = df['description'].isin(swing_descriptions).astype('int64')

    # Comparisons against NaN are False, so a missing location never sets a flag.
    ball_high = df['plate_z'] > df['sz_top']
    ball_low = df['plate_z'] < df['sz_bot']
    ball_left = df['plate_x'] < -0.73
    ball_right = df['plate_x'] > 0.73
    # Combine with | rather than + (adding booleans is not a supported numexpr op).
    outside = ball_high | ball_low | ball_left | ball_right

    # A pitch is only known to be in the zone when every coordinate is present.
    located = df[['plate_x', 'plate_z', 'sz_top', 'sz_bot']].notna().all(axis=1)
    if 'type' in df.columns:
        called_ball = df['type'] == 'B'
    else:
        called_ball = pd.Series(False, index=df.index)

    # First matching rule wins: outside -> 0, fully located -> 1,
    # unlocated but recorded as a ball -> 0, anything else stays unknown.
    df['in_strikezone'] = np.select(
        [outside.values, located.values, called_ball.values],
        [0.0, 1.0, 0.0],
        default=np.nan,
    )
    df['chased'] = ((df['batter_swung'] == 1) & (df['in_strikezone'] == 0)).astype('int64')

    df['ball_high'] = ball_high
    df['ball_low'] = ball_low
    df['ball_left'] = ball_left
    df['ball_right'] = ball_right
    return df

def _safe_percentage(numerator, denominator):
    """Return ``numerator / denominator * 100``, or NaN for a zero/missing denominator."""
    if pd.isna(denominator) or denominator == 0:
        return np.nan
    return (numerator / denominator) * 100


def _pitch_category_stats(cat_df):
    """Compute one batter's stats for the pitches of a single category."""
    in_zone = cat_df['in_strikezone']
    swung = cat_df['batter_swung']

    out_of_zone_count = (in_zone == 0).sum()
    chased_count = (cat_df['chased'] == 1).sum()
    ball_in_play_count = (cat_df['type'] == 'X').sum()  #type X means ball hit into play
    swung_count = swung.sum()
    taken_strike_count = ((in_zone == 1) & (swung == 0)).sum()
    pitches_in_zone_count = in_zone.sum()

    return {
        'chase_perc': _safe_percentage(chased_count, out_of_zone_count),
        'bip_swung_perc': _safe_percentage(ball_in_play_count, swung_count),
        'taken_strike_perc': _safe_percentage(taken_strike_count, pitches_in_zone_count),
        # Series.mean() already skips NaN, so no explicit dropna is needed.
        'est_woba': cat_df['estimated_woba_using_speedangle'].mean(),
        'babip': cat_df['babip_value'].mean(),
        'iso_value': cat_df['iso_value'].mean(),
    }


def make_batters_scouting_report(df, min_pitches=100):
    """Attach per-batter, per-pitch-category tendencies to every row of ``df``.

    For each batter and each pitch category he has faced, the following
    columns are produced (``<cat>`` is the category name):

      * ``<cat>_perc_faced`` - share of his categorised pitches, in percent.
      * ``<cat>_chase_perc`` - chased pitches / pitches with ``in_strikezone == 0``.
      * ``<cat>_bip_swung_perc`` - balls in play (``type == 'X'``) / swings.
      * ``<cat>_taken_strike_perc`` - unswung in-zone pitches / in-zone pitches.
      * ``<cat>_est_woba`` / ``<cat>_babip`` / ``<cat>_iso_value`` - means of the
        corresponding source columns.

    ``pitchout`` only gets ``pitchout_perc_faced`` (0 when never faced). A
    category with fewer than ``min_pitches`` pitches keeps its ``perc_faced``
    value but has NaN for the other stats. Percentages with a zero denominator
    are NaN. Rows with a missing ``pitch_cat`` are ignored when building the
    report.

    Parameters
    ----------
    df : DataFrame with ``batter``, ``pitch_cat``, ``in_strikezone``,
        ``chased``, ``batter_swung``, ``type`` and the three stat columns.
    min_pitches : int, default 100
        Minimum pitches of a category needed before its stats are computed.

    Returns
    -------
    DataFrame
        ``df`` left-merged with the per-batter report on ``batter``. Rows with
        a missing ``batter`` receive NaN in every report column.
    """
    df = df.copy()
    standard_cats = ['fastball', 'breaking', 'offspeed', 'pitchout']
    stat_suffixes = ['chase_perc', 'bip_swung_perc', 'taken_strike_perc',
                     'est_woba', 'babip', 'iso_value']

    batters_dict = {}
    # One groupby pass instead of a full boolean scan of the frame per batter.
    for batter, batter_df in df.groupby('batter', sort=False):
        # Defaults: an unfaced standard category is NaN, an unfaced pitchout is 0.
        batter_dict = {}
        for cat in standard_cats:
            if cat == 'pitchout':
                batter_dict[cat + '_perc_faced'] = 0
            else:
                batter_dict[cat + '_perc_faced'] = np.nan
                for suffix in stat_suffixes:
                    batter_dict[cat + '_' + suffix] = np.nan

        # value_counts drops NaN categories; zero-count entries are removed too.
        faced_share = batter_df['pitch_cat'].value_counts(normalize=True)
        faced_share = faced_share[faced_share > 0]

        for cat, share in faced_share.items():
            batter_dict[cat + '_perc_faced'] = share * 100

            # Pitchouts get no further stats.
            if cat == 'pitchout':
                continue

            cat_df = batter_df[batter_df['pitch_cat'] == cat]
            if len(cat_df) < min_pitches:
                # Too few pitches: leave this category's stats as NaN.
                stats = dict.fromkeys(stat_suffixes, np.nan)
            else:
                stats = _pitch_category_stats(cat_df)
            for suffix, value in stats.items():
                batter_dict[cat + '_' + suffix] = value

        #assign the batter dictionary to the main dictionary of all batters
        batters_dict[batter] = batter_dict
    print('iteration completed successfully')

    #make df from the batters dict
    batters_df = pd.DataFrame.from_dict(batters_dict, orient='index')
    batters_df = batters_df.reset_index().rename(columns={'index':'batter'})

    #merge df and batters df on batter col
    df = pd.merge(df, batters_df, how='left', on='batter')

    return df

def pre_process_step1(df_combined_raw):
    """Run the first preprocessing stage on the raw combined frame.

    Works on a new frame, leaving the input untouched:
      1. ``pitch_type`` values of ``'UN'`` become NaN.
      2. ``balls`` values of 4 are replaced by 3.
      3. ``make_game_features`` and then
         ``make_strikezone_swung_and_chase_features`` are applied.
    """
    # assign() returns a new frame, so the caller's data is not modified.
    cleaned = df_combined_raw.assign(
        pitch_type=df_combined_raw['pitch_type'].replace({'UN': np.nan}),
        balls=df_combined_raw['balls'].replace({4.0: 3.0}),
    )

    with_game_features = make_game_features(cleaned)
    return make_strikezone_swung_and_chase_features(with_game_features)

In [None]:
%%time
combined = downcast_dtypes(combined)

CPU times: user 18.5 s, sys: 5.09 s, total: 23.6 s
Wall time: 20.4 s


In [None]:
%%time
combined = pre_process_step1(combined)

  .format(op=op_str, alt_op=unsupported[op_str]))


CPU times: user 20.9 s, sys: 4.53 s, total: 25.4 s
Wall time: 20.7 s


In [None]:
%%time
pitcher_dict = gen_pitcher_percentages(combined)
combined = fill_pitch_type_nans(combined, pitcher_dict)
combined = make_pitch_type_cat(combined)

CPU times: user 24.1 s, sys: 2.22 s, total: 26.3 s
Wall time: 24 s


In [None]:
combined.pitch_cat.value_counts()

fastball    2280263
breaking     938880
offspeed     387091
IN             9658
pitchout        959
Name: pitch_cat, dtype: int64

In [None]:
%%time
combined = make_batters_scouting_report(combined)



iteration completed successfully
CPU times: user 2min 11s, sys: 3.81 s, total: 2min 15s
Wall time: 2min 13s


In [None]:
combined.head()

Unnamed: 0,pitch_type,game_date,sv_id,level_0,index,release_speed,release_pos_x,release_pos_z,batter,pitcher,zone,hit_location,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,hc_x,hc_y,fielder_2,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,player_name,events,description,des,stand,p_throws,home_team,away_team,type,bb_type,inning_topbot,pitch_name,if_fielding_alignment,of_fielding_alignment,_count,count_cat,score_diff,on_1b_id,bases_loaded,batter_swung,in_strikezone,chased,ball_high,ball_low,ball_left,ball_right,pitch_cat,fastball_perc_faced,fastball_chase_perc,fastball_bip_swung_perc,fastball_taken_strike_perc,fastball_est_woba,fastball_babip,fastball_iso_value,breaking_perc_faced,breaking_chase_perc,breaking_bip_swung_perc,breaking_taken_strike_perc,breaking_est_woba,breaking_babip,breaking_iso_value,offspeed_perc_faced,offspeed_chase_perc,offspeed_bip_swung_perc,offspeed_taken_strike_perc,offspeed_est_woba,offspeed_babip,offspeed_iso_value,pitchout_perc_faced,IN_perc_faced,IN_chase_perc,IN_bip_swung_perc,IN_taken_strike_perc,IN_est_woba,IN_babip,IN_iso_value
0,FF,2014-03-22,,714304,620,,,,624577.0,489119.0,,,0,0,2014.0,,,,,0.0,0.0,0.0,0.0,1.0,,,,,,,,,,,,,,,,,,380537.0,,,,,,,,,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Wade Miley,,called_strike,,R,L,ARI,LAD,S,,Top,,,,0,neutral,0.0,,0,0,1,0,False,False,False,False,fastball,66.139955,26.753898,38.268156,25.693089,0.354743,0.214332,0.142027,25.013437,40.230621,40.392478,22.32376,0.406318,0.196581,0.225071,8.534881,41.98895,37.850467,20.318725,0.425526,0.164502,0.238095,0.064495,0.247232,,,,,,
1,CU,2014-03-22,,714303,619,,,,624577.0,489119.0,,,0,1,2014.0,,,,,0.0,0.0,0.0,0.0,1.0,,,,,,,,,,,,,,,,,,380537.0,,,,,,,,,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Wade Miley,,foul,,R,L,ARI,LAD,S,,Top,,,,1,ahead,0.0,,0,1,1,0,False,False,False,False,breaking,66.139955,26.753898,38.268156,25.693089,0.354743,0.214332,0.142027,25.013437,40.230621,40.392478,22.32376,0.406318,0.196581,0.225071,8.534881,41.98895,37.850467,20.318725,0.425526,0.164502,0.238095,0.064495,0.247232,,,,,,
2,FF,2014-03-22,,714302,618,,,,624577.0,489119.0,,2.0,0,2,2014.0,,,,,0.0,0.0,0.0,0.0,1.0,,,,,,,,,,,,,,,,,,380537.0,,,,0.0,,0.0,0.0,,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Wade Miley,strikeout,swinging_strike_blocked,"Yasiel Puig strikes out swinging, catcher Migu...",R,L,ARI,LAD,S,,Top,,,,2,ahead,0.0,,0,1,1,0,False,False,False,False,fastball,66.139955,26.753898,38.268156,25.693089,0.354743,0.214332,0.142027,25.013437,40.230621,40.392478,22.32376,0.406318,0.196581,0.225071,8.534881,41.98895,37.850467,20.318725,0.425526,0.164502,0.238095,0.064495,0.247232,,,,,,
3,CU,2014-03-22,,714301,617,,,,457759.0,489119.0,,,0,0,2014.0,,,,,0.0,0.0,0.0,1.0,1.0,,,,,,,,,,,,,,,,,,380537.0,,,,,,,,,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Wade Miley,,ball,,R,L,ARI,LAD,B,,Top,,,,0,neutral,0.0,,0,0,1,0,False,False,False,False,breaking,66.758648,25.485902,38.245843,31.560397,0.412909,0.221014,0.169686,23.9184,27.70847,49.159248,34.845133,0.385697,0.253943,0.18612,9.175126,32.615894,46.222222,22.629969,0.395377,0.233083,0.172932,0.0,0.147827,,,,,,
4,FC,2014-03-22,,714300,616,,,,457759.0,489119.0,,,1,0,2014.0,,,,,0.0,0.0,0.0,1.0,1.0,,,,,,,,,,,,,,,,,,380537.0,,,,,,,,,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Wade Miley,,ball,,R,L,ARI,LAD,B,,Top,,,,10,behind,0.0,,0,0,1,0,False,False,False,False,fastball,66.758648,25.485902,38.245843,31.560397,0.412909,0.221014,0.169686,23.9184,27.70847,49.159248,34.845133,0.385697,0.253943,0.18612,9.175126,32.615894,46.222222,22.629969,0.395377,0.233083,0.172932,0.0,0.147827,,,,,,


In [None]:
def make_game_batting_order(game_df):
    """Number one team's plate appearances and label each batter's lineup slot.

    ``game_df`` is sorted by at-bat and pitch number. Two columns are added to
    the sorted copy, which is returned:

      * ``team_at_bat_number`` - ``at_bat_number`` renumbered to run 1, 2, 3...
        in order of appearance.
      * ``batting_order_slot`` - 1-9 for the batters seen in the first nine
        renumbered at-bats, ``'PH'`` for every other batter.
    """
    ordered = game_df.sort_values(by=['at_bat_number', 'pitch_number'])

    # Renumber this team's at-bats consecutively from 1.
    appearance_ids = ordered['at_bat_number'].unique().tolist()
    renumbering = {
        original: position
        for position, original in enumerate(appearance_ids, start=1)
    }
    ordered['team_at_bat_number'] = ordered['at_bat_number'].replace(renumbering)

    # Batters appearing in the first nine at-bats define slots 1-9.
    in_first_nine = ordered['team_at_bat_number'] < 10
    starters = ordered.loc[in_first_nine, 'batter'].unique().tolist()
    slot_by_batter = {batter: slot for batter, slot in zip(starters, range(1, 10))}

    # Anyone who bats later without holding a slot is labelled a pinch hitter.
    for batter in ordered['batter'].unique().tolist():
        slot_by_batter.setdefault(batter, 'PH')

    def lookup_slot(batter):
        return slot_by_batter[batter]

    ordered['batting_order_slot'] = ordered['batter'].apply(lookup_slot)
    return ordered

def make_batting_order_slot_feature(df):
    """Add ``team_at_bat_number`` and ``batting_order_slot`` for every game.

    Each game is split into its ``'Top'`` and ``'Bot'`` halves, each half is
    passed to ``make_game_batting_order``, and any batter who also appears as a
    pitcher in that game has his slot replaced by ``'pitcher'``.

    Processing is best-effort: a game that raises is skipped and reported
    (game id and error), and the total number of skipped games is printed.
    Rows whose ``inning_topbot`` is neither ``'Top'`` nor ``'Bot'`` are not
    included in the result.

    Returns
    -------
    DataFrame
        The processed games concatenated in order of first appearance. If no
        game could be processed, an empty frame with the two extra columns.
    """
    games_list = []
    skipped_games = []

    # groupby splits the frame once instead of scanning it again for every game.
    for game, game_rows in df.groupby('game_pk', sort=False):
        try:
            game_df = game_rows.copy()
            pitcher_ids = game_df['pitcher'].dropna().unique().tolist()

            halves = [
                make_game_batting_order(game_df[game_df['inning_topbot'] == half])
                for half in ('Top', 'Bot')
            ]
            new_game_df = pd.concat(halves)

            # A batter who also pitched in this game is labelled 'pitcher'.
            is_pitcher = new_game_df['batter'].isin(pitcher_ids)
            new_game_df['batting_order_slot'] = new_game_df['batting_order_slot'].mask(is_pitcher, 'pitcher')
            games_list.append(new_game_df)
        except Exception as error:
            # Catch Exception only, so KeyboardInterrupt still stops the run.
            skipped_games.append((game, repr(error)))

    print('skipped games count: ' + str(len(skipped_games)))
    for game, reason in skipped_games:
        print('skipped game ' + str(game) + ': ' + reason)

    if not games_list:
        # pd.concat refuses an empty list; return an empty frame with the new columns.
        empty = df.iloc[0:0].copy()
        empty['team_at_bat_number'] = pd.Series(dtype='float64')
        empty['batting_order_slot'] = pd.Series(dtype='object')
        return empty
    return pd.concat(games_list)

In [None]:
first20 = combined.game_pk.unique().tolist()[:20]
test_games = combined[combined.game_pk.isin(first20)]

In [None]:
%%time
test = make_batting_order_slot_feature(test_games)

skipped games count: 0
CPU times: user 1min 35s, sys: 6.31 s, total: 1min 42s
Wall time: 1min 42s


In [None]:
test[['game_pk', 'batter', 'at_bat_number', 'team_at_bat_number', 'batting_order_slot']]

Unnamed: 0,game_pk,batter,at_bat_number,team_at_bat_number,batting_order_slot
0,380537.0,624577.0,1.0,1.0,1
1,380537.0,624577.0,1.0,1.0,1
2,380537.0,624577.0,1.0,1.0,1
3,380537.0,457759.0,2.0,2.0,2
4,380537.0,457759.0,2.0,2.0,2
5,380537.0,457759.0,2.0,2.0,2
6,380537.0,457759.0,2.0,2.0,2
7,380537.0,457759.0,2.0,2.0,2
8,380537.0,434670.0,3.0,3.0,3
9,380537.0,434670.0,3.0,3.0,3


In [None]:
all_games = combined['game_pk'].unique().tolist()

# Twelve slices of 1000 game ids each, plus a final slice with everything from 12000 on.
chunk_bounds = [(start, start + 1000) for start in range(0, 12000, 1000)] + [(12000, None)]
(chunk1, chunk2, chunk3, chunk4, chunk5, chunk6, chunk7,
 chunk8, chunk9, chunk10, chunk11, chunk12, chunk13) = [
    all_games[start:stop] for start, stop in chunk_bounds
]

# One independent copy of the rows belonging to each slice of games. Unpacking
# straight from a generator avoids keeping a second reference to every frame.
(chunk1_df, chunk2_df, chunk3_df, chunk4_df, chunk5_df, chunk6_df, chunk7_df,
 chunk8_df, chunk9_df, chunk10_df, chunk11_df, chunk12_df, chunk13_df) = (
    combined[combined['game_pk'].isin(game_ids)].copy()
    for game_ids in (chunk1, chunk2, chunk3, chunk4, chunk5, chunk6, chunk7,
                     chunk8, chunk9, chunk10, chunk11, chunk12, chunk13)
)

In [None]:
%%time
chunk1_df = make_batting_order_slot_feature(chunk1_df)

In [None]:
%%time
chunk2_df = make_batting_order_slot_feature(chunk2_df)

In [None]:
%%time
chunk3_df = make_batting_order_slot_feature(chunk3_df)

In [None]:
%%time
chunk4_df = make_batting_order_slot_feature(chunk4_df)

In [None]:
%%time
chunk5_df = make_batting_order_slot_feature(chunk5_df)

In [None]:
%%time
chunk6_df = make_batting_order_slot_feature(chunk6_df)

In [None]:
%%time
chunk7_df = make_batting_order_slot_feature(chunk7_df)

In [None]:
%%time
chunk8_df = make_batting_order_slot_feature(chunk8_df)

In [None]:
%%time
chunk9_df = make_batting_order_slot_feature(chunk9_df)

In [None]:
%%time
chunk10_df = make_batting_order_slot_feature(chunk10_df)

In [None]:
%%time
chunk11_df = make_batting_order_slot_feature(chunk11_df)

In [None]:
%%time
chunk12_df = make_batting_order_slot_feature(chunk12_df)

In [None]:
%%time
chunk13_df = make_batting_order_slot_feature(chunk13_df)

In [None]:
# Per-batter scouting-report columns: seven stats for each of the three main
# pitch categories, plus the share of pitchouts faced.
batter_cols = [
    category + '_' + stat
    for category in ('fastball', 'breaking', 'offspeed')
    for stat in ('perc_faced', 'chase_perc', 'bip_swung_perc', 'taken_strike_perc',
                 'est_woba', 'babip', 'iso_value')
] + ['pitchout_perc_faced']

def make_batting_order_slot_map(df, cols=None):
    """Average each scouting column within every batting-order slot.

    Parameters
    ----------
    df : DataFrame with a ``batting_order_slot`` column and the columns in ``cols``.
    cols : list of str, optional
        Columns to average. Defaults to the module-level ``batter_cols``.

    Returns
    -------
    dict
        ``{slot: {column: mean}}`` with one entry per distinct slot, in order
        of first appearance. Means skip missing values.
    """
    if cols is None:
        cols = batter_cols

    batting_order_slot_map = {}
    for slot in df['batting_order_slot'].unique().tolist():
        slot_df = df[df['batting_order_slot'] == slot]
        batting_order_slot_map[slot] = {col: slot_df[col].mean() for col in cols}
    return batting_order_slot_map

def fill_batting_nans(df, batting_order_slot_map, cols=None):
    """Fill missing scouting values with the average for the row's lineup slot.

    Parameters
    ----------
    df : DataFrame with a ``batting_order_slot`` column and the columns in ``cols``.
    batting_order_slot_map : dict
        ``{slot: {column: fill_value}}``, as built by ``make_batting_order_slot_map``.
    cols : list of str, optional
        Columns to fill. Defaults to the module-level ``batter_cols``.

    Returns
    -------
    DataFrame
        A copy of ``df`` in its original row order with the gaps filled; the
        input frame is not modified.

    Raises
    ------
    KeyError
        If a slot present in ``df`` (or one of ``cols``) is missing from the map.
    """
    if cols is None:
        cols = batter_cols

    filled = df.copy()
    for slot in filled['batting_order_slot'].unique().tolist():
        in_slot = filled['batting_order_slot'] == slot
        # A missing slot label never equals itself, so there is nothing to fill.
        if not in_slot.any():
            continue
        for col in cols:
            filled.loc[in_slot, col] = filled.loc[in_slot, col].fillna(batting_order_slot_map[slot][col])
        print('finished w/ slot: ' + str(slot))
    return filled

In [None]:
%%time
test_map = make_batting_order_slot_map(test)

CPU times: user 75.1 ms, sys: 270 µs, total: 75.4 ms
Wall time: 72.7 ms


In [None]:
# Stack the processed per-chunk frames. The chunkN names hold plain lists of
# game ids, which pd.concat cannot combine; the frames live in chunkN_df.
all_chunks_df = pd.concat([chunk1_df, chunk2_df, chunk3_df, chunk4_df, chunk5_df, chunk6_df, chunk7_df,
                           chunk8_df, chunk9_df, chunk10_df, chunk11_df, chunk12_df, chunk13_df])

In [None]:
%%time
batting_order_slot_map = make_batting_order_slot_map(all_chunks_df)

In [None]:
import pickle

# Persist the slot averages; the context manager closes the file handle once
# the dump finishes, even if pickling raises.
with open("batting_order_slot_map.pkl", "wb") as slot_map_file:
    pickle.dump(batting_order_slot_map, slot_map_file)