In [12]:
import pandas as pd
import numpy as np
import random
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

## Main Pre-processing functions:

In [13]:
def downcast_dtypes(df):
    df = df.copy()
    int_cols = df.select_dtypes('int').columns.tolist()
    float_cols = df.select_dtypes('float').columns.tolist()
    obj_cols = df.select_dtypes('object').columns.tolist()
    cat_cols = []
    for col in obj_cols:
        if col == 'pitch_type':
            continue
        if len(df[col].unique()) < len(df)/2:
            cat_cols.append(col)
      
    ints = df[int_cols].apply(pd.to_numeric,downcast='unsigned')
    floats = df[float_cols].apply(pd.to_numeric,downcast='float')
    cats = df[cat_cols].astype('category')
  
    df = df.drop(columns=int_cols + float_cols + cat_cols)
    for d in [ints, floats, cats]:
        df = pd.concat([df, d], axis=1)
    return df

def make_game_features(df):
    df = df.copy()
    
    #convert balls and strikes columns to strings (first to int)
    df['balls'] = df['balls'].astype('int').astype('str')
    df['strikes'] = df['strikes'].astype('int').astype('str')

    #create count feature:
    df['_count'] = df['balls'] + df['strikes']

    #create count category feature:
    count_map = {'00':'neutral', '21':'neutral', '32':'neutral', '10':'behind', '20':'behind',
             '30':'behind', '31':'behind', '01':'ahead', '02':'ahead', '11':'ahead',
             '12':'ahead', '22':'ahead'}
    df['count_cat'] = df['_count'].replace(count_map)
    
    #cant decide whether to drop the balls and strikes feature, keeping for now- check feature importances later
    #df = df.drop(columns=['balls', 'strikes'])
    
    #make score_differential feature
    df['score_diff'] = df['fld_score'] - df['bat_score']
    
    #make baserunners features 1/0 instead of baserunner id:
    df['on_1b_id'] = df['on_1b'] #save the runners id in case we later add feature for speedy runners
    df['on_1b'] = df['on_1b'] * 0 + 1
    df['on_1b'] = df['on_1b'].fillna(0)
    df['on_2b'] = df['on_2b'] * 0 + 1
    df['on_2b'] = df['on_2b'].fillna(0)
    df['on_3b'] = df['on_3b'] * 0 + 1
    df['on_3b'] = df['on_3b'].fillna(0)
    
    #make bases_loaded feature:
    df['bases_loaded'] = df['on_1b'] + df['on_2b'] + df['on_3b']
    df['bases_loaded'] = df['bases_loaded'].apply(lambda x: 1 if x == 3 else 0)

    return df

def gen_pitcher_percentages(df):
    df = df.copy()
    #get all the unique pitcher names in the df
    pitcher_list = df.pitcher.unique().tolist()
    #initialize empty dictionary to store each pitcher and their pitches and percentages for each pitch
    pitcher_dict = {}
    #iterate over each pitcher:
    for pitcher in pitcher_list:
        #assign the normalized value_counts to a variable
        pitch_percentages = df[df.pitcher == pitcher].pitch_type.value_counts(normalize=True)
        #convert that Series object to a dict and assign it as the value to the pitcher dictionary 
        #(pitcher name as key)
        pitcher_dict[pitcher] = pitch_percentages.to_dict()
    return pitcher_dict

def fill_pitch_type_nans(df, pitcher_dict):
    df = df.copy()
    #grab the rows where pitch_type is null:
    nulls = df[df.pitch_type.isna()]
    #For pitchers w/ only NaNs for pitch type, just delete them
    delete_idx_list = []
    
    #iterate over each null row
    for index, row in nulls.iterrows():
        #use the % for that pitcher for each pitch type he throws to generate a random pitch type with 
        #that % as weight
        try:
            pitch = random.choices(population=list(pitcher_dict[row.pitcher].keys()), 
                               weights=list(pitcher_dict[row.pitcher].values()), 
                               k=1)[0]
            #fill the NaN value with the randomly generated pitch
            df.at[index, 'pitch_type'] = pitch
        except KeyError:
            try:
                pvc = df[df['pitcher']==row.pitcher]['pitch_type'].value_counts(normalize=True)
                pitch_types = pvc.index.tolist()
                percentages = pvc.values.tolist()
                pitch = random.choices(population=pitch_types, weights=percentages, k=1)[0]
            except IndexError:
                print('IndexError: deleting ' + str(row.pitcher))
                delete_idx_list.append(index)
                
        except IndexError:
            print('IndexError: ' + str(row.pitcher))
    
    #drop the pitchers w/ all NaNs before returning       
    df = df.drop(delete_idx_list)
    return df

def make_pitch_type_cat(df):
    df = df.copy()
    
    #create map for pitch type into categories:
    pitch_type_map = {'FA':'fastball', 'FF':'fastball', 'FT':'fastball', 'FC':'fastball',
                      'FS':'fastball', 'SI':'fastball', 'SF':'fastball', 'SL':'breaking',
                      'CB':'breaking', 'CU':'breaking', 'SC':'breaking', 'KC':'breaking',
                      'CH':'offspeed', 'KN':'offspeed', 'EP':'offspeed', 'FO':'breaking', 
                      'PO':'pitchout'}

    #create pitch cateogory feature
    df['pitch_cat'] = df['pitch_type']
    df['pitch_cat'] = df['pitch_cat'].replace(pitch_type_map)
    return df

def make_strikezone_swung_and_chase_features(df):
    df = df.copy()
    
    #create swung column
    def swung(x):
        swung = ['foul','hit_into_play','swinging_strike','hit_into_play_no_out',
                 'hit_into_play_score','foul_tip','swinging_strike_blocked',
                 'foul_bunt','missed_bunt']
        return 1 if x in swung else 0
    df['batter_swung'] = df['description'].apply(swung)
    
    #initialize in_strikezone and chased features:
    df['in_strikezone'] = 1
    df['chased'] = 0
    
    df['ball_high'] = df['plate_z'] > df['sz_top']
    df['ball_low'] = df['plate_z'] < df['sz_bot']
    df['ball_left'] = df['plate_x'].apply(lambda x: x < -0.73)
    df['ball_right'] = df['plate_x'].apply(lambda x: x > 0.73)
    df['in_strikezone'] = df['ball_high'] + df['ball_low'] + df['ball_left'] + df['ball_right']
    df['in_strikezone'] = df['in_strikezone'].apply(lambda x: 0 if x > 0 else 1)
    
    nulls_subset = df[df['in_strikezone'].isna()]
    for index, row in nulls_subset.iterrows():
        if row.type == 'B':
            df.at[index, 'in_strike_zone'] = False
    
    df['chased'] = df['batter_swung'] - df['in_strikezone']
    df['chased'] = df['chased'].apply(lambda x: 1 if x == 1 else 0)
    return df

#change this to return the batters_df (and pass in prior)
def make_batters_df(prior_df):
    df = prior_df.copy()
    #make list of the unique batter ids
    batters = list(df['batter'].unique())
    #initialize empty dictionary to store the batter stats
    batters_dict = {}
    #set a break flag to False for error-checking
    brk = False
    #iterate thru each unique batter
    for batter in batters:
        if brk:
            break
        #make subset of the df for that batter and assign to variable batter_df    
        batter_df = df[df['batter'] == batter]
        
        #assign all pitch categories to list:
        all_pitch_cats = ['fastball', 'breaking', 'offspeed', 'pitchout']
        
        #assign the pitch categories to a list
        try:
            pitch_cats = batter_df['pitch_cat'].unique().tolist()
        except KeyError:
            print(batter)
            brk = True
        #get the normalized value counts of pitches by category that batter has faced
        vc = batter_df.pitch_cat.value_counts(normalize=True)
        #initialize empty dict for each batter
        batter_dict = {}
        
        #if there are any pitch categories the batter has not faced, 
        unfaced_cats = list(set(all_pitch_cats) - set(pitch_cats))
        
        for cat in pitch_cats:
            if brk:
                break
        
            #assign the % of pitches faced by the batter for that category to his batter dict
            try:
                batter_dict[cat + '_perc_faced'] = vc[cat] * 100
            except TypeError:
                print(batter)
                return 1
                
            #continue out of the loop for pitchout category since ball in play stats are NaN
            if cat == 'pitchout':
                continue
        
            #grab subset of batter df for the pitch category
            cat_df = batter_df[batter_df['pitch_cat'] == cat]
            
            #if he has faced less than 100 pitches of that type, add it to unfaced_category and fill w NaN
            if len(cat_df) < 100:
                unfaced_cats.append(cat)
                continue
        
            #calculate batters chase % for pitch type category on balls outside the strikezone
            out_of_strikezone = len(cat_df) - cat_df['in_strikezone'].sum() #num of times ball was out of zone
            chased_count = cat_df['chased'].sum() #num of times batter chased
            try:
                chase_perc = (chased_count / out_of_strikezone) * 100
            except ZeroDivisionError:
                chase_perc = np.nan
            #assign the chase perc to the batter dict
            batter_dict[cat + '_chase_perc'] = chase_perc
        
            #calc ball in play % for each swing for each pitch cat:
            ball_in_play_count = len(cat_df[cat_df['type'] == 'X']) #type X means ball hit into play
            swung_count = cat_df['batter_swung'].sum() #counts all the 1s in the swung column
            #assign the ball in play % per swing to the batter dict
            batter_dict[cat + '_bip_swung_perc'] = (ball_in_play_count / swung_count) * 100
        
            #calculate taken strike %
            taken_strike_count = len(cat_df[(cat_df['in_strikezone'] == 1) & (cat_df['batter_swung'] == 0)])
            pitches_in_zone_count = cat_df['in_strikezone'].sum() #counts the 1s in the in zone col
            #assign to batter_dict
            batter_dict[cat + '_taken_strike_perc'] = (taken_strike_count / pitches_in_zone_count) * 100
        
            #for each pitch type category, get the batters stats on balls hit in play
            stats = ['estimated_woba_using_speedangle', 'babip_value', 'iso_value']
            for stat in stats:
                #drop Nans from the stat column and assign to new subset, for each stat
                stat_cat_df = cat_df.dropna(subset=[stat])
                if stat == 'estimated_woba_using_speedangle':
                    #get the mean avg_est_woba
                    avg_est_woba = stat_cat_df['estimated_woba_using_speedangle'].mean()
                    #assign that value to the batters dictionary
                    batter_dict[cat + '_est_woba'] = avg_est_woba
                    if avg_est_woba == np.nan:
                        print(batter)
                        brk = True
                        break
                elif stat == 'babip_value':
                    avg_babip = stat_cat_df['babip_value'].mean()
                    batter_dict[cat + '_babip'] = avg_babip
                else:
                    avg_iso_value = stat_cat_df['iso_value'].mean()
                    batter_dict[cat + '_iso_value'] = avg_iso_value
                    
        #for unfaced or small sample pitch_types: assign NaNs to his dictionary for that category
        for cat in unfaced_cats:
            if cat == 'pitchout':
                batter_dict[cat + '_perc_faced'] = 0
            else:
                batter_dict[cat + '_perc_faced'] = np.nan
                batter_dict[cat + '_chase_perc'] = np.nan
                batter_dict[cat + '_bip_swung_perc'] = np.nan
                batter_dict[cat + '_taken_strike_perc'] = np.nan
                batter_dict[cat + '_est_woba'] = np.nan
                batter_dict[cat + '_babip'] = np.nan
                batter_dict[cat + '_iso_value'] = np.nan
        
        #assign the batter dictionary to the main dictionary of all batters
        batters_dict[batter] = batter_dict
    if not brk:
        print('iteration completed successfully')
        
    #make df from the batters dict
    batters_df = pd.DataFrame.from_dict(batters_dict, orient='index')
    batters_df = batters_df.reset_index().rename(columns={'index':'batter'})
    return batters_df

## Read in the data and downcast the dtypes

In [14]:
drop_cols = ['pitcher.1','fielder_2.1', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6', 'fielder_7', 'fielder_8','fielder_9']

# df_16 = pd.read_pickle('pitches_2016.pkl', compression='zip').drop(columns=drop_cols)
# df_16 = downcast_dtypes(df_16)
# df_16.head()

In [15]:
df_17 = pd.read_pickle('pitches_2017.pkl', compression='zip').drop(columns=drop_cols)
df_17 = downcast_dtypes(df_17)
df_17.head()

Unnamed: 0,pitch_type,game_date,index,release_speed,release_pos_x,release_pos_z,batter,pitcher,zone,hit_location,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,hc_x,hc_y,fielder_2,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,player_name,events,description,des,stand,p_throws,home_team,away_team,type,bb_type,inning_topbot,sv_id,pitch_name,if_fielding_alignment,of_fielding_alignment
731854,FT,2017-04-02,18600,90.900002,-1.5453,6.1623,452655.0,425844.0,4.0,,0.0,0.0,2017.0,-1.0694,1.5,-0.6365,2.1534,,,,0.0,1.0,,,425772.0,4.4417,-131.7883,-7.1356,-13.156,27.995001,-13.7729,3.22,1.55,,,,89.726997,2284.0,5.764,490110.0,54.737099,,,,,,,,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Zack Greinke,,called_strike,,L,R,ARI,SF,S,,Top,170402_201037,2-Seam Fastball,Standard,Standard
732138,FF,2017-04-02,18601,92.099998,-1.4858,6.3335,458731.0,502042.0,14.0,,0.0,0.0,2017.0,-0.5572,1.2666,1.0026,1.2965,,,,0.0,1.0,,,519083.0,7.4457,-133.322403,-9.5222,-8.2198,29.3731,-15.351,3.0,1.42,,,,91.261002,2106.0,6.119,490106.0,54.381199,,,,,,,,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Chris Archer,,ball,,L,R,TB,NYY,B,,Top,170402_171214,4-Seam Fastball,Standard,Standard
732475,FF,2017-04-02,18599,94.900002,-1.3276,5.5862,656941.0,593372.0,12.0,,0.0,0.0,2017.0,-0.7759,1.1009,0.2816,4.3344,,,,0.0,1.0,,,425877.0,5.7734,-137.729797,0.008,-10.848,29.457899,-18.7847,3.3157,1.6822,,,,93.314003,2146.0,5.49,490099.0,55.069,,,,,,,,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Carlos Martinez,,ball,,L,R,STL,CHC,B,,Top,170403_003853,4-Seam Fastball,Standard,Standard
731853,FT,2017-04-02,18597,90.5,-1.5921,6.0637,452655.0,425844.0,12.0,,0.0,1.0,2017.0,-1.1172,1.0645,0.5402,3.5,,,,0.0,1.0,,,425772.0,7.6422,-131.444199,-2.7317,-14.4423,25.370001,-19.494101,3.22,1.55,,,,90.516998,2272.0,6.198,490110.0,54.303299,,,,,,,,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Zack Greinke,,ball,,L,R,ARI,SF,B,,Top,170402_201103,2-Seam Fastball,Standard,Standard
732137,FF,2017-04-02,18598,92.400002,-1.629,6.32,458731.0,502042.0,2.0,,1.0,0.0,2017.0,-0.5129,1.3955,0.1398,2.9266,,,,0.0,1.0,,,519083.0,5.5672,-133.954605,-5.6951,-7.411,32.118198,-14.5277,3.0,1.42,,,,91.377998,2119.0,6.119,490106.0,54.3806,,,,,,,,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Chris Archer,,called_strike,,L,R,TB,NYY,S,,Top,170402_171240,4-Seam Fastball,Standard,Standard


In [16]:
df_18 = pd.read_pickle('pitches_2018.pkl', compression='zip').drop(columns=drop_cols)

#drop a particular pitcher w/ only 16 pitches (all NaNs)
drop_idx = df_18[df_18['pitcher'] == 664068.0].index
df_18 = df_18.drop(drop_idx)

df_18 = downcast_dtypes(df_18)
df_18.head()

Unnamed: 0,pitch_type,game_date,sv_id,index,release_speed,release_pos_x,release_pos_z,batter,pitcher,zone,hit_location,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,hc_x,hc_y,fielder_2,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,player_name,events,description,des,stand,p_throws,home_team,away_team,type,bb_type,inning_topbot,pitch_name,if_fielding_alignment,of_fielding_alignment
727504,SI,2018-03-29,180329_171659,22954,98.699997,-0.6368,6.4662,451594.0,592789.0,4.0,,0.0,0.0,2018.0,-1.3476,0.9089,-0.6062,2.7052,,,,0.0,1.0,,,608700.0,3.1848,-143.609406,-6.8584,-19.236799,33.880901,-18.3169,3.9092,1.8803,223.0,78.800003,53.0,98.362999,2134.0,6.148,529419.0,54.351501,,,,,,,,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Noah Syndergaard,,foul,,L,R,NYM,STL,S,,Top,Sinker,Standard,Standard
727789,FF,2018-03-29,180329_231108,22955,91.199997,1.3504,6.2721,457706.0,477132.0,1.0,,0.0,0.0,2018.0,-0.2986,1.5682,-0.6425,3.2651,,,,0.0,1.0,,,518735.0,-4.3868,-132.764206,-5.1104,-2.6582,27.403,-12.6002,3.4237,1.5418,,,,91.795998,2390.0,6.547,529418.0,53.954899,,,,,,,,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Clayton Kershaw,,called_strike,,R,L,LAD,SF,S,,Top,4-Seam Fastball,Standard,Standard
728097,FF,2018-03-29,180329_201258,22959,89.900002,-2.1275,5.153,514917.0,527054.0,4.0,,0.0,0.0,2018.0,-0.667,1.204,-0.4357,2.2506,,,,0.0,1.0,,,452095.0,5.5448,-130.854294,-3.7592,-8.6641,25.0471,-17.944901,3.0886,1.3618,,,,89.684998,2274.0,5.935,529416.0,54.564899,,,,,,,,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Julio Teheran,,called_strike,,L,R,ATL,PHI,S,,Top,4-Seam Fastball,Standard,Standard
728405,FF,2018-03-29,180329_200915,22965,92.199997,1.5791,6.2811,458731.0,457918.0,5.0,,0.0,0.0,2018.0,0.318,1.5706,-0.2548,2.5514,,,,0.0,1.0,,,431145.0,-5.298,-133.791595,-6.8374,5.0621,33.853199,-12.3381,3.0851,1.6099,,,,90.789001,2372.0,5.983,529415.0,54.516998,,,,,,,,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,J.A. Happ,,called_strike,,L,L,TOR,NYY,S,,Top,4-Seam Fastball,Standard,Standard
728723,FF,2018-03-29,180329_190843,22958,91.300003,-1.4971,6.037,572821.0,605164.0,5.0,,0.0,0.0,2018.0,-0.7111,1.6393,-0.2528,2.126,,,,0.0,1.0,,,543376.0,4.6064,-132.519104,-7.2677,-9.3163,32.796299,-11.7962,2.949,1.2867,273.0,,,90.0,2432.0,5.996,529414.0,54.5047,,,,,,,,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Dylan Bundy,,foul,,R,R,BAL,MIN,S,,Top,4-Seam Fastball,Infield shift,Standard


In [18]:
df_19 = pd.read_pickle('pitches_2019.pkl', compression='zip').drop(columns=drop_cols)
df_19.head()

Unnamed: 0,pitch_type,game_date,sv_id,index,release_speed,release_pos_x,release_pos_z,batter,pitcher,zone,hit_location,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,hc_x,hc_y,fielder_2,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,player_name,events,description,des,stand,p_throws,home_team,away_team,type,bb_type,inning_topbot,pitch_name,if_fielding_alignment,of_fielding_alignment
612415,FF,2019-03-28,190328_201138,22529,96.0,-2.9039,5.2153,624428.0,622491.0,7.0,3.0,0.0,0.0,2019.0,-0.859,0.8596,-0.3394,1.8839,,,,0.0,1.0,152.350006,165.759995,571466.0,8.4903,-139.424896,-5.0628,-12.5679,26.538799,-20.4739,3.301,1.9909,5.0,89.699997,-23.0,94.838997,2013.0,5.381,565220.0,55.118599,0.116,0.112,0.0,1.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Luis Castillo,field_out,hit_into_play,"Adam Frazier grounds out, first baseman Joey V...",L,R,CIN,PIT,X,ground_ball,Top,4-Seam Fastball,Standard,Standard
612414,FF,2019-03-28,190328_201218,22515,95.800003,-2.7622,5.1903,466320.0,622491.0,12.0,,0.0,0.0,2019.0,-1.0101,0.9756,0.8866,2.7563,,,,1.0,1.0,,,571466.0,11.7184,-138.959503,-2.9584,-15.2994,28.0977,-19.306801,3.316,1.5126,,,,95.033997,2130.0,5.804,565220.0,54.6959,,,,,,,,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Luis Castillo,,called_strike,,L,R,CIN,PIT,S,,Top,4-Seam Fastball,Strategic,Standard
612413,CH,2019-03-28,190328_201232,22511,88.599998,-2.8765,5.0955,466320.0,622491.0,14.0,,0.0,1.0,2019.0,-1.1744,0.0629,0.1219,0.4152,,,,1.0,1.0,,,571466.0,9.4729,-128.585297,-5.4328,-14.3972,23.2579,-30.745899,3.3,1.5,,,,87.273003,2240.0,5.314,565220.0,55.185699,,,,,,,,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Luis Castillo,,swinging_strike,,L,R,CIN,PIT,S,,Top,Changeup,Strategic,Standard
612412,FF,2019-03-28,190328_201256,22488,96.400002,-2.7735,5.244,466320.0,622491.0,1.0,,0.0,2.0,2019.0,-1.0959,0.9177,-0.7734,3.4057,,,,1.0,1.0,,,571466.0,7.4728,-140.043594,-1.2863,-15.3898,29.5305,-20.602301,3.301,1.504,257.0,82.300003,44.0,94.236,2135.0,4.966,565220.0,55.5331,,,,,,,,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Luis Castillo,,foul,,L,R,CIN,PIT,S,,Top,4-Seam Fastball,Strategic,Standard
612411,CH,2019-03-28,190328_201311,22468,88.0,-2.8687,4.9348,466320.0,622491.0,13.0,2.0,0.0,2.0,2019.0,-1.1851,0.263,-0.3882,1.1706,,,,1.0,1.0,,,571466.0,8.2811,-127.775597,-3.5437,-14.2871,23.6924,-28.914101,3.3,1.5,,,,87.117996,2248.0,5.628,565220.0,54.8713,,,0.0,1.0,0.0,0.0,,2.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Luis Castillo,strikeout,swinging_strike,Melky Cabrera strikes out swinging.,L,R,CIN,PIT,S,,Top,Changeup,Strategic,Standard


#### Combine 2017 and 2018 and 2019 into one df and clean up some faulty data

In [19]:
combined = pd.concat([df_17, df_18, df_19], sort=False)
combined = combined.sort_values(by = ['game_date', 'game_pk', 'at_bat_number', 'pitch_number']).reset_index(drop=True)

In [20]:
%%time
#iterate over every game and find the ones w/ all Nans for pitch_type and delete them from combined df
games = combined.game_pk.unique().tolist()
for game in games:
    game_df = combined[combined.game_pk == game]
    #if the game is all Nans for pitch_type
    if len(game_df.pitch_type.value_counts()) == 0:
        #delete it from combined
        combined = combined.drop(game_df.index)
        
#reset index again:
combined = combined.reset_index(drop=True)

CPU times: user 42 s, sys: 4.25 s, total: 46.3 s
Wall time: 40.3 s


In [21]:
combined.head()

Unnamed: 0,pitch_type,game_date,index,release_speed,release_pos_x,release_pos_z,batter,pitcher,zone,hit_location,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,hc_x,hc_y,fielder_2,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,player_name,events,description,des,stand,p_throws,home_team,away_team,type,bb_type,inning_topbot,sv_id,pitch_name,if_fielding_alignment,of_fielding_alignment
0,FF,2017-04-02,18599,94.900002,-1.3276,5.5862,656941.0,593372.0,12.0,,0.0,0.0,2017.0,-0.7759,1.1009,0.2816,4.3344,,,,0.0,1.0,,,425877.0,5.7734,-137.729797,0.008,-10.848,29.457899,-18.7847,3.3157,1.6822,,,,93.314003,2146.0,5.49,490099.0,55.069,,,,,,,,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Carlos Martinez,,ball,,L,R,STL,CHC,B,,Top,170403_003853,4-Seam Fastball,Standard,Standard
1,FT,2017-04-02,18596,95.900002,-1.3091,5.4787,656941.0,593372.0,7.0,,1.0,0.0,2017.0,-1.5891,0.8433,-0.5788,2.3241,,,,0.0,1.0,,,425877.0,5.2887,-139.217804,-4.4869,-21.2332,30.9744,-20.7889,3.6986,2.0002,,,,94.237,2144.0,5.465,490099.0,55.067799,,,,,,,,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Carlos Martinez,,foul,,L,R,STL,CHC,S,,Top,170403_003907,2-Seam Fastball,Standard,Standard
2,FT,2017-04-02,18593,97.300003,-1.3152,5.1777,656941.0,593372.0,13.0,,1.0,1.0,2017.0,-1.3374,0.5487,-0.0143,1.3261,,,,0.0,1.0,,,425877.0,6.3532,-141.192307,-5.9444,-18.8818,30.681299,-23.9426,3.3593,1.5809,,,,96.186996,2015.0,5.62,490099.0,54.799,,,,,,,,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Carlos Martinez,,ball,,L,R,STL,CHC,B,,Top,170403_003927,2-Seam Fastball,Standard,Standard
3,SL,2017-04-02,18590,84.599998,-1.2537,5.3882,656941.0,593372.0,8.0,9.0,2.0,1.0,2017.0,1.1019,-0.1662,0.1358,1.9434,,,,0.0,1.0,243.080002,171.740005,425877.0,1.1491,-122.821503,-1.3416,10.3319,24.074301,-33.798302,3.7489,1.8682,256.0,96.099998,18.0,82.258003,2194.0,4.722,490099.0,55.676899,0.381,0.465,0.9,1.0,1.0,0.0,4.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Carlos Martinez,single,hit_into_play_no_out,Kyle Schwarber singles on a line drive to righ...,L,R,STL,CHC,X,line_drive,Top,170403_003943,Slider,Infield shift,Standard
4,FT,2017-04-02,18587,98.400002,-1.2265,5.149,592178.0,593372.0,9.0,,0.0,0.0,2017.0,-1.278,0.6793,0.4214,1.8336,,,656941.0,0.0,1.0,,,425877.0,7.1795,-142.5811,-4.9189,-18.5739,33.201401,-22.3067,3.2699,1.6918,,,,96.696999,2066.0,5.481,490099.0,54.922001,,,,,,,,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Carlos Martinez,,called_strike,,R,R,STL,CHC,S,,Top,170403_004031,2-Seam Fastball,Standard,Standard


In [None]:
#If using 2016 data:

#fix some bad data for pitch_type

# pitch_types = ['FF','SL', 'FT', 'CH', 'SI', 'CU', 'FC', 'KC', 'FS', 'KN', 'IN', 'FO', 'PO','EP','SC','UN', 'FA']
# bad_data = combined[~(combined.pitch_type.isin(pitch_types))].pitch_type.value_counts().index.tolist()
# combined['pitch_type'] = combined.pitch_type.replace(to_replace=bad_data, value=np.nan)

In [22]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2071739 entries, 0 to 2071738
Data columns (total 73 columns):
pitch_type                         object
game_date                          datetime64[ns]
index                              uint16
release_speed                      float32
release_pos_x                      float32
release_pos_z                      float32
batter                             float32
pitcher                            float32
zone                               float32
hit_location                       float32
balls                              float32
strikes                            float32
game_year                          float32
pfx_x                              float32
pfx_z                              float32
plate_x                            float32
plate_z                            float32
on_3b                              float32
on_2b                              float32
on_1b                              float32
outs_when_up          

In [23]:
#clear some memory:
import gc

#del df_16
del df_17
del df_18
del df_19

gc.collect()

15

## 2-step pre-processing functions:
Step 1 features for all rows:

Step 2 features that iterate over each month of 2017 and 2018 so that aggregates use prior data only (preventing leakage of future data into the training set)

In [24]:
def pre_process_step1(combined):
    df = combined.copy()
    #convert the pitch type for UN (unknown) to np.nan
    df['pitch_type'] = df['pitch_type'].replace({'UN':np.nan})
    #fix some faulty data that has number of balls listed as 4:
    df['balls'] = df['balls'].replace({4.0: 3.0})
    
    #count, count_cat, score_diff, on_base 1/0, bases_loaded
    df = make_game_features(df)
    
    #batter_swung, in_strikezone, chased
    df = make_strikezone_swung_and_chase_features(df)
    
    #get aggregate pitcher %s dict from prior data:
    pitcher_dict = gen_pitcher_percentages(df)
    
    #fil the NaNs for pitch_type using randomized guess from pitcher tendencies
    df = fill_pitch_type_nans(df, pitcher_dict)
        
    #pitch_type category feature
    df = make_pitch_type_cat(df)
    
    return df

#pass in list of periods to update the data (and fill NaNs) using prior aggregates:
def pre_process_step2(pre_processed_step1, start_dates, end_dates):
    df = pre_processed_step1.copy()
    
    #initialize empty list to store dfs (concat them together later)
    df_list = []
    
    #iterate over each period
    for i in range(len(start_dates)):
        #make the prior and current dfs:
        prior_df = df[df['game_date'] < start_dates[i]]
        current_df = df[(df['game_date'] >= start_dates[i]) & (df['game_date'] <= end_dates[i])]
        
        #add the batter scouting report
        batters_df = make_batters_df(prior_df)
        current_df = pd.merge(current_df, batters_df, how='left', on='batter')
        
        #append the df to the list
        df_list.append(current_df)
    
    step2_df = pd.concat(df_list, sort=False)
    return step2_df

In [25]:
#2017 season: '2017-04-02', '2017-10-02'
#2018 season: '2018-03-29' - '2018-10-01'
#2019 season: '2019-03-28' - '2018-08-31'

#if we decide to use 2.5 seasons of training instead of 1.5, here the dates for 2017:
#2017_start_dates = ['2017-04-02', '2017-05-01', '2017-06-01', '2017-07-01', '2017-08-01', '2017-09-01']
#2017_end_dates = ['2017-04-30', '2017-05-31', '2017-06-30', '2017-07-31', '2017-08-31', '2017-10-02']

start_dates = ['2018-03-29', '2018-05-01', '2018-06-01', '2018-07-01', '2018-08-01', '2018-09-01', '2019-03-28', '2019-05-01', '2019-06-01', '2019-07-01', '2019-08-01']
end_dates =  ['2018-04-30', '2018-05-31', '2018-06-30', '2018-07-31', '2018-08-31', '2018-10-01', '2019-04-30', '2019-05-31', '2019-06-30', '2019-07-31', '2019-08-31']

In [26]:
%%time
step1_df = pre_process_step1(combined)

  .format(op=op_str, alt_op=unsupported[op_str]))


CPU times: user 28.6 s, sys: 4.29 s, total: 32.9 s
Wall time: 27.2 s


In [None]:
#clear some memory
#del combined
#gc.collect()

In [27]:
step1_df.head()

Unnamed: 0,pitch_type,game_date,index,release_speed,release_pos_x,release_pos_z,batter,pitcher,zone,hit_location,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,hc_x,hc_y,fielder_2,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,player_name,events,description,des,stand,p_throws,home_team,away_team,type,bb_type,inning_topbot,sv_id,pitch_name,if_fielding_alignment,of_fielding_alignment,_count,count_cat,score_diff,on_1b_id,bases_loaded,batter_swung,in_strikezone,chased,ball_high,ball_low,ball_left,ball_right,pitch_cat
0,FF,2017-04-02,18599,94.900002,-1.3276,5.5862,656941.0,593372.0,12.0,,0,0,2017.0,-0.7759,1.1009,0.2816,4.3344,0.0,0.0,0.0,0.0,1.0,,,425877.0,5.7734,-137.729797,0.008,-10.848,29.457899,-18.7847,3.3157,1.6822,,,,93.314003,2146.0,5.49,490099.0,55.069,,,,,,,,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Carlos Martinez,,ball,,L,R,STL,CHC,B,,Top,170403_003853,4-Seam Fastball,Standard,Standard,0,neutral,0.0,,0,0,0,0,True,False,False,False,fastball
1,FT,2017-04-02,18596,95.900002,-1.3091,5.4787,656941.0,593372.0,7.0,,1,0,2017.0,-1.5891,0.8433,-0.5788,2.3241,0.0,0.0,0.0,0.0,1.0,,,425877.0,5.2887,-139.217804,-4.4869,-21.2332,30.9744,-20.7889,3.6986,2.0002,,,,94.237,2144.0,5.465,490099.0,55.067799,,,,,,,,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Carlos Martinez,,foul,,L,R,STL,CHC,S,,Top,170403_003907,2-Seam Fastball,Standard,Standard,10,behind,0.0,,0,1,1,0,False,False,False,False,fastball
2,FT,2017-04-02,18593,97.300003,-1.3152,5.1777,656941.0,593372.0,13.0,,1,1,2017.0,-1.3374,0.5487,-0.0143,1.3261,0.0,0.0,0.0,0.0,1.0,,,425877.0,6.3532,-141.192307,-5.9444,-18.8818,30.681299,-23.9426,3.3593,1.5809,,,,96.186996,2015.0,5.62,490099.0,54.799,,,,,,,,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Carlos Martinez,,ball,,L,R,STL,CHC,B,,Top,170403_003927,2-Seam Fastball,Standard,Standard,11,ahead,0.0,,0,0,0,0,False,True,False,False,fastball
3,SL,2017-04-02,18590,84.599998,-1.2537,5.3882,656941.0,593372.0,8.0,9.0,2,1,2017.0,1.1019,-0.1662,0.1358,1.9434,0.0,0.0,0.0,0.0,1.0,243.080002,171.740005,425877.0,1.1491,-122.821503,-1.3416,10.3319,24.074301,-33.798302,3.7489,1.8682,256.0,96.099998,18.0,82.258003,2194.0,4.722,490099.0,55.676899,0.381,0.465,0.9,1.0,1.0,0.0,4.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Carlos Martinez,single,hit_into_play_no_out,Kyle Schwarber singles on a line drive to righ...,L,R,STL,CHC,X,line_drive,Top,170403_003943,Slider,Infield shift,Standard,21,neutral,0.0,,0,1,1,0,False,False,False,False,breaking
4,FT,2017-04-02,18587,98.400002,-1.2265,5.149,592178.0,593372.0,9.0,,0,0,2017.0,-1.278,0.6793,0.4214,1.8336,0.0,0.0,1.0,0.0,1.0,,,425877.0,7.1795,-142.5811,-4.9189,-18.5739,33.201401,-22.3067,3.2699,1.6918,,,,96.696999,2066.0,5.481,490099.0,54.922001,,,,,,,,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Carlos Martinez,,called_strike,,R,R,STL,CHC,S,,Top,170403_004031,2-Seam Fastball,Standard,Standard,0,neutral,0.0,656941.0,0,0,1,0,False,False,False,False,fastball


In [28]:
len(step1_df)

2071739

In [29]:
%%time
step2_df = pre_process_step2(step1_df, start_dates, end_dates)

iteration completed successfully
iteration completed successfully
iteration completed successfully
iteration completed successfully
iteration completed successfully
iteration completed successfully
iteration completed successfully
iteration completed successfully
iteration completed successfully
iteration completed successfully
iteration completed successfully
CPU times: user 8min 31s, sys: 7.27 s, total: 8min 38s
Wall time: 8min 31s


In [30]:
step2_df.head()

Unnamed: 0,pitch_type,game_date,index,release_speed,release_pos_x,release_pos_z,batter,pitcher,zone,hit_location,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,hc_x,hc_y,fielder_2,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,player_name,events,description,des,stand,p_throws,home_team,away_team,type,bb_type,inning_topbot,sv_id,pitch_name,if_fielding_alignment,of_fielding_alignment,_count,count_cat,score_diff,on_1b_id,bases_loaded,batter_swung,in_strikezone,chased,ball_high,ball_low,ball_left,ball_right,pitch_cat,fastball_perc_faced,fastball_chase_perc,fastball_bip_swung_perc,fastball_taken_strike_perc,fastball_est_woba,fastball_babip,fastball_iso_value,breaking_perc_faced,breaking_chase_perc,breaking_bip_swung_perc,breaking_taken_strike_perc,breaking_est_woba,breaking_babip,breaking_iso_value,offspeed_perc_faced,offspeed_chase_perc,offspeed_bip_swung_perc,offspeed_taken_strike_perc,offspeed_est_woba,offspeed_babip,offspeed_iso_value,pitchout_perc_faced
0,FF,2018-03-29,22964,93.099998,-2.2252,6.4152,605141.0,502042.0,1.0,8.0,0,0,2018.0,-0.431,1.4322,-0.2809,2.6616,0.0,0.0,0.0,0.0,1.0,102.809998,27.52,467092.0,5.843,-135.164597,-6.8271,-6.4579,29.7174,-13.7278,3.0589,1.2038,406.0,104.5,32.0,91.989998,2046.0,5.769,529406.0,54.729198,0.842,1.701,0.0,1.0,0.0,0.0,6.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Chris Archer,field_out,hit_into_play,Mookie Betts flies out sharply to center field...,R,R,TB,BOS,X,fly_ball,Top,180329_200052,4-Seam Fastball,Standard,Standard,0,neutral,0.0,,0,1,1,0,False,False,False,False,fastball,61.837088,21.648461,57.456829,46.074646,0.369519,0.2,0.193407,30.121317,26.220615,50.0,49.050633,0.305425,0.203883,0.140777,8.041594,25.342466,47.959184,29.069767,0.293979,0.233333,0.116667,0.0
1,FF,2018-03-29,22950,93.900002,-2.2895,6.4137,643217.0,502042.0,4.0,,0,0,2018.0,-0.3673,1.5538,-0.653,2.2512,0.0,0.0,0.0,1.0,1.0,,,467092.0,4.9796,-136.315994,-8.2866,-5.6329,31.5082,-11.552,3.5175,1.6148,,,,92.737999,2160.0,5.844,529406.0,54.653801,,,,,,,,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Chris Archer,,called_strike,,L,R,TB,BOS,S,,Top,180329_200139,4-Seam Fastball,Standard,Standard,0,neutral,0.0,,0,0,1,0,False,False,False,False,fastball,64.343164,27.072403,39.142091,32.874828,0.360633,0.204938,0.153086,23.286097,29.820051,43.442623,41.552511,0.394717,0.217391,0.136646,12.370739,37.053571,52.147239,19.191919,0.319059,0.196078,0.068627,0.0
2,FF,2018-03-29,22931,94.800003,-2.3369,6.3277,643217.0,502042.0,1.0,,0,1,2018.0,-0.6497,1.4661,-0.7479,3.0937,0.0,0.0,0.0,1.0,1.0,,,467092.0,5.5442,-137.7314,-5.9192,-9.4397,32.793499,-12.6904,3.4211,1.5121,325.0,90.300003,36.0,93.860001,2213.0,5.986,529406.0,54.512001,,,,,,,,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Chris Archer,,foul,,L,R,TB,BOS,S,,Top,180329_200201,4-Seam Fastball,Standard,Standard,1,ahead,0.0,,0,1,0,1,False,False,True,False,fastball,64.343164,27.072403,39.142091,32.874828,0.360633,0.204938,0.153086,23.286097,29.820051,43.442623,41.552511,0.394717,0.217391,0.136646,12.370739,37.053571,52.147239,19.191919,0.319059,0.196078,0.068627,0.0
3,CH,2018-03-29,22926,87.900002,-2.4264,6.271,643217.0,502042.0,4.0,4.0,0,2,2018.0,-1.0422,1.2354,-0.7209,2.6909,0.0,0.0,0.0,1.0,1.0,149.050003,155.059998,467092.0,6.2163,-127.734001,-5.0041,-12.5646,27.7714,-18.041599,3.4226,1.5135,4.0,71.5,-32.0,87.088997,1832.0,5.975,529406.0,54.523201,0.06,0.048,0.0,1.0,0.0,0.0,2.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Chris Archer,field_out,hit_into_play,"Andrew Benintendi grounds out, second baseman ...",L,R,TB,BOS,X,ground_ball,Top,180329_200217,Changeup,Standard,Standard,2,ahead,0.0,,0,1,1,0,False,False,False,False,offspeed,64.343164,27.072403,39.142091,32.874828,0.360633,0.204938,0.153086,23.286097,29.820051,43.442623,41.552511,0.394717,0.217391,0.136646,12.370739,37.053571,52.147239,19.191919,0.319059,0.196078,0.068627,0.0
4,FF,2018-03-29,22911,94.099998,-2.0053,6.4265,434670.0,502042.0,5.0,,0,0,2018.0,-0.5091,1.6018,0.127,2.8871,0.0,0.0,0.0,2.0,1.0,,,467092.0,6.5712,-136.691605,-6.8497,-7.758,31.332399,-11.186,3.5534,1.5638,191.0,72.300003,56.0,93.032997,2255.0,5.841,529406.0,54.656502,,,,,,,,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Chris Archer,,foul,,R,R,TB,BOS,S,,Top,180329_200305,4-Seam Fastball,Standard,Strategic,0,neutral,0.0,,0,1,1,0,False,False,False,False,fastball,64.202153,32.461356,40.24961,30.696798,0.390663,0.207493,0.15562,27.749181,36.458333,36.789298,23.923445,0.449845,0.175439,0.204678,8.048666,38.181818,25.806452,17.741935,0.564292,0.071429,0.142857,0.0


In [31]:
len(step2_df)

1329790

In [32]:
step2_df.isna().sum()

pitch_type                               0
game_date                                0
index                                    0
release_speed                         2404
release_pos_x                         2581
release_pos_z                         2581
batter                                   0
pitcher                                  0
zone                                  2581
hit_location                       1033408
balls                                    0
strikes                                  0
game_year                                0
pfx_x                                 2581
pfx_z                                 2581
plate_x                               2581
plate_z                               2581
on_3b                                    0
on_2b                                    0
on_1b                                    0
outs_when_up                             0
inning                                   0
hc_x                               1098945
hc_y       

In [35]:
#sort and reset index
step1_df = step1_df.sort_values(by = ['game_date', 'game_pk', 'at_bat_number', 'pitch_number']).reset_index(drop=True)
step2_df = step2_df.sort_values(by = ['game_date', 'game_pk', 'at_bat_number', 'pitch_number']).reset_index(drop=True)

index_middle = len(step2_df) // 2
step2_df1 = step2_df.loc[:index_middle]
step2_df2 = step2_df.loc[index_middle:]


In [36]:
step2_df1.tail()

Unnamed: 0,pitch_type,game_date,index,release_speed,release_pos_x,release_pos_z,batter,pitcher,zone,hit_location,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,hc_x,hc_y,fielder_2,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,stand,p_throws,home_team,away_team,type,bb_type,inning_topbot,sv_id,if_fielding_alignment,score_diff,on_1b_id,ball_high,ball_low,ball_left,ball_right,bases_loaded,batter_swung,in_strikezone,chased,fastball_perc_faced,fastball_chase_perc,fastball_bip_swung_perc,fastball_taken_strike_perc,fastball_est_woba,fastball_babip,fastball_iso_value,breaking_perc_faced,breaking_chase_perc,breaking_bip_swung_perc,breaking_taken_strike_perc,breaking_est_woba,breaking_babip,breaking_iso_value,offspeed_perc_faced,offspeed_chase_perc,offspeed_bip_swung_perc,offspeed_taken_strike_perc,offspeed_est_woba,offspeed_babip,offspeed_iso_value,pitchout_perc_faced,balls,strikes,player_name,events,description,des,pitch_name,of_fielding_alignment,_count,count_cat,pitch_cat
664891,FT,2018-09-17,4652,90.599998,-1.6078,4.7808,516416.0,502748.0,11.0,,2018.0,-1.6188,0.6118,-1.793,2.7491,0.0,0.0,0.0,0.0,9.0,,,545358.0,2.9674,-131.815903,-0.4154,-19.355101,29.294201,-25.248699,3.2082,1.4742,,,,90.25,2088.0,6.19,531656.0,54.312401,,,,,,,,66.0,2.0,1.0,4.0,4.0,1.0,4.0,1.0,4.0,1.0,R,R,HOU,SEA,B,,Top,180918_030034,Standard,-3.0,,False,False,True,False,0,0,0,0,65.229111,31.343283,41.659885,35.43771,0.332371,0.273585,0.116352,26.243568,45.427727,49.91708,24.936386,0.329508,0.259259,0.12963,8.453811,42.672413,40.217392,24.778761,0.301014,0.266667,0.055556,0.073511,0,1,Brad Peacock,,ball,,2-Seam Fastball,Standard,1,ahead,fastball
664892,SL,2018-09-17,4650,81.5,-1.5381,4.8734,516416.0,502748.0,12.0,6.0,2018.0,1.1438,0.6222,1.1783,2.4537,0.0,0.0,0.0,0.0,9.0,118.449997,148.160004,545358.0,3.9082,-118.417397,0.0511,9.6243,24.2679,-26.7857,3.301,1.504,22.0,93.400002,-6.0,80.126999,2731.0,5.596,531656.0,54.906799,0.218,0.198,0.0,1.0,0.0,0.0,2.0,66.0,3.0,1.0,4.0,4.0,1.0,4.0,1.0,4.0,1.0,R,R,HOU,SEA,X,ground_ball,Top,180918_030059,Standard,-3.0,,False,False,False,True,0,1,0,1,65.229111,31.343283,41.659885,35.43771,0.332371,0.273585,0.116352,26.243568,45.427727,49.91708,24.936386,0.329508,0.259259,0.12963,8.453811,42.672413,40.217392,24.778761,0.301014,0.266667,0.055556,0.073511,1,1,Brad Peacock,field_out,hit_into_play,"Jean Segura grounds out, shortstop Carlos Corr...",Slider,Standard,11,ahead,breaking
664893,FF,2018-09-17,4638,92.699997,-2.2522,5.0368,592325.0,502748.0,11.0,,2018.0,-0.9713,1.336,-0.1524,3.6571,0.0,0.0,0.0,1.0,9.0,,,545358.0,7.4932,-134.888,-0.6564,-13.4687,28.827801,-15.9207,3.3413,1.6754,,,,92.859001,2210.0,6.349,531656.0,54.153,,,,,,,,67.0,1.0,1.0,4.0,4.0,1.0,4.0,1.0,4.0,1.0,L,R,HOU,SEA,B,,Top,180918_030132,Standard,-3.0,,True,False,False,False,0,0,0,0,61.586704,24.270073,41.240875,35.944702,0.382867,0.276458,0.114471,27.939793,35.660378,34.005039,42.382271,0.305556,0.144681,0.110638,10.473503,41.981133,47.027027,21.311476,0.283483,0.27619,0.133333,0.0,0,0,Brad Peacock,,ball,,4-Seam Fastball,Standard,0,neutral,fastball
664894,FF,2018-09-17,4634,91.699997,-1.8901,4.8707,592325.0,502748.0,11.0,,2018.0,-1.1409,1.0853,-2.0666,3.2466,0.0,0.0,0.0,1.0,9.0,,,545358.0,1.9931,-133.496796,-0.5684,-13.9521,29.215599,-19.4223,3.2304,1.5636,,,,91.411003,2254.0,6.13,531656.0,54.371899,,,,,,,,67.0,2.0,1.0,4.0,4.0,1.0,4.0,1.0,4.0,1.0,L,R,HOU,SEA,B,,Top,180918_030208,Standard,-3.0,,True,False,True,False,0,0,0,0,61.586704,24.270073,41.240875,35.944702,0.382867,0.276458,0.114471,27.939793,35.660378,34.005039,42.382271,0.305556,0.144681,0.110638,10.473503,41.981133,47.027027,21.311476,0.283483,0.27619,0.133333,0.0,1,0,Brad Peacock,,ball,,4-Seam Fastball,Standard,10,behind,fastball
664895,FF,2018-09-17,4620,89.900002,-1.8928,4.7787,592325.0,502748.0,14.0,,2018.0,-1.003,1.2042,0.9505,2.1177,0.0,0.0,0.0,1.0,9.0,,,545358.0,9.1558,-130.530502,-3.1341,-13.3871,27.573799,-17.9781,3.2304,1.586,,,,89.760002,2159.0,6.336,531656.0,54.166599,,,,,,,,67.0,3.0,1.0,4.0,4.0,1.0,4.0,1.0,4.0,1.0,L,R,HOU,SEA,S,,Top,180918_030229,Infield shift,-3.0,,False,False,False,True,0,0,0,0,61.586704,24.270073,41.240875,35.944702,0.382867,0.276458,0.114471,27.939793,35.660378,34.005039,42.382271,0.305556,0.144681,0.110638,10.473503,41.981133,47.027027,21.311476,0.283483,0.27619,0.133333,0.0,2,0,Brad Peacock,,called_strike,,4-Seam Fastball,Standard,20,behind,fastball


In [37]:
step2_df2.head()

Unnamed: 0,pitch_type,game_date,index,release_speed,release_pos_x,release_pos_z,batter,pitcher,zone,hit_location,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,hc_x,hc_y,fielder_2,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,stand,p_throws,home_team,away_team,type,bb_type,inning_topbot,sv_id,if_fielding_alignment,score_diff,on_1b_id,ball_high,ball_low,ball_left,ball_right,bases_loaded,batter_swung,in_strikezone,chased,fastball_perc_faced,fastball_chase_perc,fastball_bip_swung_perc,fastball_taken_strike_perc,fastball_est_woba,fastball_babip,fastball_iso_value,breaking_perc_faced,breaking_chase_perc,breaking_bip_swung_perc,breaking_taken_strike_perc,breaking_est_woba,breaking_babip,breaking_iso_value,offspeed_perc_faced,offspeed_chase_perc,offspeed_bip_swung_perc,offspeed_taken_strike_perc,offspeed_est_woba,offspeed_babip,offspeed_iso_value,pitchout_perc_faced,balls,strikes,player_name,events,description,des,pitch_name,of_fielding_alignment,_count,count_cat,pitch_cat
664895,FF,2018-09-17,4620,89.900002,-1.8928,4.7787,592325.0,502748.0,14.0,,2018.0,-1.003,1.2042,0.9505,2.1177,0.0,0.0,0.0,1.0,9.0,,,545358.0,9.1558,-130.530502,-3.1341,-13.3871,27.573799,-17.9781,3.2304,1.586,,,,89.760002,2159.0,6.336,531656.0,54.166599,,,,,,,,67.0,3.0,1.0,4.0,4.0,1.0,4.0,1.0,4.0,1.0,L,R,HOU,SEA,S,,Top,180918_030229,Infield shift,-3.0,,False,False,False,True,0,0,0,0,61.586704,24.270073,41.240875,35.944702,0.382867,0.276458,0.114471,27.939793,35.660378,34.005039,42.382271,0.305556,0.144681,0.110638,10.473503,41.981133,47.027027,21.311476,0.283483,0.27619,0.133333,0.0,2,0,Brad Peacock,,called_strike,,4-Seam Fastball,Standard,20,behind,fastball
664896,SL,2018-09-17,4618,81.300003,-1.9925,4.9306,592325.0,502748.0,9.0,5.0,2018.0,0.7133,0.9364,0.5299,1.7909,0.0,0.0,0.0,1.0,9.0,112.860001,166.440002,545358.0,4.2779,-118.266098,-2.2037,5.8336,21.5194,-23.319099,3.319,1.513,17.0,65.199997,-4.0,80.863998,2844.0,5.91,531656.0,54.5928,0.079,0.058,0.0,1.0,0.0,0.0,2.0,67.0,4.0,1.0,4.0,4.0,1.0,4.0,1.0,4.0,1.0,L,R,HOU,SEA,X,ground_ball,Top,180918_030256,Infield shift,-3.0,,False,False,False,False,0,1,1,0,61.586704,24.270073,41.240875,35.944702,0.382867,0.276458,0.114471,27.939793,35.660378,34.005039,42.382271,0.305556,0.144681,0.110638,10.473503,41.981133,47.027027,21.311476,0.283483,0.27619,0.133333,0.0,2,1,Brad Peacock,field_out,hit_into_play,"Ben Gamel grounds out, third baseman Alex Breg...",Slider,Standard,21,neutral,breaking
664897,FT,2018-09-17,4605,91.800003,-1.4662,4.8081,592387.0,502748.0,6.0,,2018.0,-1.5582,0.766,0.2827,2.7514,0.0,0.0,0.0,2.0,9.0,,,545358.0,7.8017,-133.407806,-0.9759,-20.303499,29.6931,-23.049801,3.6515,1.6977,,,,91.689003,2201.0,6.376,531656.0,54.126301,,,,,,,,68.0,1.0,1.0,4.0,4.0,1.0,4.0,1.0,4.0,1.0,R,R,HOU,SEA,S,,Top,180918_030340,Infield shift,-3.0,,False,False,False,False,0,0,1,0,57.416019,34.208496,43.774704,38.619202,0.434406,0.208696,0.215652,34.005169,40.617577,36.209553,35.232067,0.358647,0.185792,0.117486,8.552972,47.706421,43.684212,23.893805,0.367578,0.219048,0.209524,0.02584,0,0,Brad Peacock,,called_strike,,2-Seam Fastball,Standard,0,neutral,fastball
664898,SL,2018-09-17,4601,79.699997,-1.6469,4.7313,592387.0,502748.0,14.0,9.0,2018.0,1.622,0.5466,1.1768,1.9661,0.0,0.0,0.0,2.0,9.0,154.229996,121.120003,545358.0,3.1899,-115.801102,-0.2343,13.4147,24.3874,-27.662901,3.575,1.681,138.0,66.5,15.0,78.198997,2787.0,5.607,531656.0,54.894901,0.382,0.313,0.9,1.0,1.0,0.0,2.0,68.0,2.0,1.0,4.0,4.0,1.0,4.0,1.0,4.0,1.0,R,R,HOU,SEA,X,ground_ball,Top,180918_030410,Infield shift,-3.0,,False,False,False,True,0,1,0,1,57.416019,34.208496,43.774704,38.619202,0.434406,0.208696,0.215652,34.005169,40.617577,36.209553,35.232067,0.358647,0.185792,0.117486,8.552972,47.706421,43.684212,23.893805,0.367578,0.219048,0.209524,0.02584,0,1,Brad Peacock,single,hit_into_play_no_out,Ryon Healy singles on a ground ball to right f...,Slider,Standard,1,ahead,breaking
664899,FT,2018-09-17,4587,91.900002,-2.4106,4.81,572122.0,502748.0,13.0,7.0,2018.0,-1.608,0.8578,-1.1009,2.153,0.0,0.0,1.0,2.0,9.0,100.019997,93.510002,545358.0,6.771,-133.628296,-2.6945,-20.635,30.1406,-21.604799,3.371,1.535,138.0,82.900002,9.0,91.546997,2307.0,6.226,531656.0,54.275902,0.45,0.448,0.9,1.0,1.0,0.0,4.0,69.0,1.0,1.0,4.0,4.0,1.0,4.0,1.0,4.0,1.0,L,R,HOU,SEA,X,ground_ball,Top,180918_030454,Infield shift,-3.0,592387.0,False,False,True,False,0,1,0,1,57.378807,30.36989,44.272949,32.716049,0.398369,0.181435,0.178622,30.030029,25.786924,35.0,36.062717,0.331256,0.139752,0.145963,12.591163,29.691877,43.382355,27.826086,0.373526,0.215686,0.215686,0.0,0,0,Brad Peacock,single,hit_into_play_no_out,Kyle Seager singles on a ground ball to left f...,2-Seam Fastball,Standard,0,neutral,fastball


In [47]:
#export pre_processed_df to pkl
#step1_df.to_pickle(path=("step1_pre_processed_df.pkl"),compression='zip')
step2_df1.to_pickle(path=("pre_pitcher_pre_processed_df_part1.pkl"),compression='zip')
step2_df2.to_pickle(path=("pre_pitcher_pre_processed_df_part2.pkl"),compression='zip')