In [1]:
import pandas as pd
import numpy as np
import random
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

## Blueprint:
Step 1: Read in the Data

Step 2: Make Game Features: Count (count & count cat), score_diff, and baserunners features

Step 3: Generate pitcher overall percentages, fill the NaNs for missing pitch types, and make pitch_type_cat feature

Step 4: Make strikezone, swung, and chase features, + Batter Scouting Report and Merge those features into the df (percentage faced, est_woba, est_babip, iso_value, and chase %, by pitch_type_category)

Step 5: Make a python Class for Pitcher (filter out a specific pitcher from the df)
Make Pitcher Scouting Report and Merge those Features into the df (pitch_type tendencies, overall and by count category)

Step 6: Make Game pitch count column, and make trailing pitch type features (L1, L5, L10): figure out strategy for 1st 5 and first 15 pitches of each game (maybe use overall tendencies, maybe use count tendencies, or maybe get historical first5/first 10 tendencies)

Step 7: Pitcher/Batter matchup history

Step 8: Write code that does steps 1-7 iteratively, updating each month of 2018 and into the 2019 season (to prevent leakage of future information into the training set)

Step 9: Make list of features we will use for model input vector + (merge w/ umpire data if we can get it)

Step 10: Train Models
    
    
    
    
    
#potential other features to engineer in future: 

-either on base or not- replace the baserunner id w/ 1 or 0.. potentially down the road can use stolen base stats to categorize the baserunner into 2 or 3 tiers maybe (since if a speedy player is on first base, pitcher may be less likely to throw as many breaking balls for example).. not a huge priority feature tho

-under the trailing pitches section, add a feature for when the prev pitch:
    - was a base hit
    - was a home run
    - gave up a run (or runs): check whether the post_bat_score > bat_score

-pitcher feature for batting stats on balls in play:
    -est_woba/babip/iso_value/chase% for diff pitches in this game, prev game, prev 5 games, prev season, and/or historical 

## Step 1: Read in the Data

In [2]:
#load the 2017 pitch-level data from a zip-compressed pickle and drop the
#'pitcher.1' and 'fielder_2.1' columns (presumably duplicates of 'pitcher' and
#'fielder_2' -- TODO confirm)
#NOTE(review): pd.read_pickle can execute arbitrary code while unpickling, so only
#load a file from a trusted source
fname = 'pitches_2017.pkl'
df_17 = pd.read_pickle(fname, compression='zip').drop(columns=['pitcher.1', 'fielder_2.1'])

#convert the pitch type for UN (unknown) to np.nan
#(these NaNs are filled later by fill_pitch_type_nans)
df_17['pitch_type'] = df_17['pitch_type'].replace({'UN':np.nan})

#fix some faulty data that has number of balls listed as 4:
#(a value of 4 is rewritten to 3 so every ball count stays in the 0-3 range)
df_17['balls'] = df_17['balls'].replace({4.0: 3.0})

In [3]:
def downcast_dtypes(df):
    """Return a copy of df with smaller numeric dtypes and categorical text columns.

    Columns selected as 'int' are downcast with downcast='unsigned', columns
    selected as 'float' are downcast with downcast='float', and object columns
    (except 'pitch_type') whose number of unique values is below half the row
    count are converted to 'category'.

    The returned frame lists the untouched columns first, followed by the
    integer, float and category columns, so column order may differ from the input.
    """
    frame = df.copy()
    n_rows = len(frame)

    integer_names = list(frame.select_dtypes('int').columns)
    floating_names = list(frame.select_dtypes('float').columns)
    #pitch_type is deliberately left as a plain object column
    categorical_names = [
        name for name in frame.select_dtypes('object').columns
        if name != 'pitch_type' and len(frame[name].unique()) < n_rows / 2
    ]

    converted = integer_names + floating_names + categorical_names
    pieces = [
        frame.drop(columns=converted),
        frame[integer_names].apply(pd.to_numeric, downcast='unsigned'),
        frame[floating_names].apply(pd.to_numeric, downcast='float'),
        frame[categorical_names].astype('category'),
    ]
    return pd.concat(pieces, axis=1)

In [4]:
df_17 = downcast_dtypes(df_17)
df_17.head().T

Unnamed: 0,0,1,2,3,4
pitch_type,FC,FC,FF,CH,FT
game_date,2017-10-01 00:00:00,2017-10-01 00:00:00,2017-10-01 00:00:00,2017-10-01 00:00:00,2017-10-01 00:00:00
index,593,614,618,636,650
release_speed,91.9,91.8,94.7,82.8,95.9
release_pos_x,-1.1948,-1.0777,-0.6208,-0.9901,-0.8903
release_pos_z,5.9921,6.1772,6.3387,6.0626,6.1341
batter,595885,595885,595885,595885,595885
pitcher,532077,532077,532077,532077,532077
zone,1,3,12,13,13
hit_location,6,,,,


## Step 2: Make Game Features: Count (count & count cat), score_diff, and baserunners features:

In [5]:
def make_game_features(df):
    """Return a copy of df with count, score and baserunner features added.

    Adds/overwrites:
      balls, strikes   -- converted to digit strings (via int)
      _count           -- balls + strikes concatenated, e.g. '31'
      count_cat        -- 'neutral' / 'behind' / 'ahead' looked up from _count
      score_diff       -- fld_score - bat_score
      on_1b_id         -- the original on_1b value (runner id)
      on_1b/on_2b/on_3b -- 1 where a runner id was present, 0 where it was NaN
      bases_loaded     -- 1 when all three bases are occupied, else 0
    """
    df = df.copy()

    #balls and strikes go through int first so that 1.0 becomes '1', not '1.0'
    for col in ('balls', 'strikes'):
        df[col] = df[col].astype('int').astype('str')

    #two-character count string, balls first
    df['_count'] = df['balls'] + df['strikes']

    #count category lookup, grouped by label
    count_groups = {
        'neutral': ('00', '21', '32'),
        'behind': ('10', '20', '30', '31'),
        'ahead': ('01', '02', '11', '12', '22'),
    }
    count_map = {count: label
                 for label, counts in count_groups.items()
                 for count in counts}
    df['count_cat'] = df['_count'].replace(count_map)

    #balls and strikes are kept for now -- check feature importances later

    df['score_diff'] = df['fld_score'] - df['bat_score']

    #keep the runner id on first in case a speedy-runner feature is added later
    df['on_1b_id'] = df['on_1b']

    #id * 0 + 1 is 1 for any runner id and stays NaN for an empty base
    for base in ('on_1b', 'on_2b', 'on_3b'):
        df[base] = (df[base] * 0 + 1).fillna(0)

    runners_on = df['on_1b'] + df['on_2b'] + df['on_3b']
    df['bases_loaded'] = (runners_on == 3).astype('int64')

    return df

df_17 = make_game_features(df_17)

In [6]:
df_17[['on_1b', 'on_2b', 'on_3b', 'bases_loaded']].describe()

Unnamed: 0,on_1b,on_2b,on_3b,bases_loaded
count,721243.0,721243.0,721243.0,721243.0
mean,0.305566,0.184063,0.092969,0.022856
std,0.460647,0.387536,0.290389,0.149446
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0


## Step 3: Generate pitcher overall percentages, fill the NaNs for missing pitch types, and make pitch_type_cat feature

In [7]:
def gen_pitcher_percentages(df):
    """Build a {pitcher: {pitch_type: share}} lookup of each pitcher's pitch mix.

    Parameters
    ----------
    df : DataFrame with 'pitcher' and 'pitch_type' columns.

    Returns
    -------
    dict
        One entry per pitcher, in order of first appearance in df. Each value
        maps pitch type to its normalized frequency for that pitcher. Rows
        with a NaN pitch_type are ignored, so a pitcher whose pitch types are
        all NaN gets an empty dict. Rows with a NaN pitcher are skipped.
    """
    pitcher_dict = {}
    #one groupby pass replaces a full-frame boolean filter per pitcher;
    #sort=False keeps the pitchers in order of first appearance
    for pitcher, pitch_types in df.groupby('pitcher', sort=False)['pitch_type']:
        #store plain Python numbers as keys rather than numpy scalars
        key = pitcher.item() if isinstance(pitcher, np.generic) else pitcher
        #normalized value_counts -> {pitch_type: share}
        pitcher_dict[key] = pitch_types.value_counts(normalize=True).to_dict()
    return pitcher_dict

#generate the dictionary of pitch type % by pitcher
#(built before the NaN pitch types are filled; value_counts skips NaN, so the
#percentages come from the labelled pitches only)
pitcher_dict = gen_pitcher_percentages(df_17)

def fill_pitch_type_nans(df, pitcher_dict, seed=None):
    """Fill missing pitch types by sampling from each pitcher's own pitch mix.

    Parameters
    ----------
    df : DataFrame with 'pitcher' and 'pitch_type' columns.
    pitcher_dict : dict
        {pitcher: {pitch_type: share}} as built by gen_pitcher_percentages.
    seed : optional
        When given, draws come from a private random.Random(seed) so the fill
        is reproducible. When None (the default) the module-level random
        generator is used, exactly as before.

    Returns
    -------
    DataFrame
        A copy of df with the NaN pitch types replaced by a weighted random
        draw. A row stays NaN when its pitcher is absent from pitcher_dict or
        has an empty distribution (every one of his pitches was unlabelled),
        because there is nothing to sample from.
    """
    df = df.copy()
    rng = random.Random(seed) if seed is not None else random

    #only the rows where pitch_type is null need a draw
    missing = df['pitch_type'].isna()

    for index, pitcher in df.loc[missing, 'pitcher'].items():
        distribution = pitcher_dict.get(pitcher)
        #random.choices raises on an empty population, so skip instead of crashing
        if not distribution:
            continue
        #use the pitcher's own percentages as weights for the random pitch type
        pitch = rng.choices(population=list(distribution.keys()),
                            weights=list(distribution.values()),
                            k=1)[0]
        #fill the NaN value with the randomly generated pitch
        df.at[index, 'pitch_type'] = pitch
    return df

#NOTE: the fill draws pitch types with the random module, so the filled values differ
#between runs unless the random generator is seeded before this call
df_17 = fill_pitch_type_nans(df_17, pitcher_dict)

def make_pitch_type_cat(df):
    """Return a copy of df with a 'pitch_cat' column derived from 'pitch_type'.

    Each pitch type code is replaced by its category ('fastball', 'breaking',
    'offspeed' or 'pitchout'). Codes that are not listed below, and NaN,
    pass through unchanged.
    """
    df = df.copy()

    #pitch type codes grouped by the category they belong to
    codes_by_category = {
        'fastball': ['FA', 'FF', 'FT', 'FC', 'FS', 'SI', 'SF'],
        'breaking': ['SL', 'CB', 'CU', 'SC', 'KC', 'FO'],
        'offspeed': ['CH', 'KN', 'EP'],
        'pitchout': ['PO'],
    }
    #invert into a code -> category lookup
    pitch_type_map = {code: category
                      for category, codes in codes_by_category.items()
                      for code in codes}

    #create pitch category feature
    df['pitch_cat'] = df['pitch_type'].replace(pitch_type_map)

    return df

df_17 = make_pitch_type_cat(df_17)

## Step 4: Make strikezone, swung, and chase features, + Batter Scouting Report and Merge those features into the df (percentage faced, est_woba, est_babip, iso_value, and chase %, by pitch_type_category)

In [8]:
#refactored to vectorize (80x faster)

def make_strikezone_swung_and_chase_features(df, plate_half_width=0.73):
    """Return a copy of df with swing, strike-zone and chase indicators added.

    Parameters
    ----------
    df : DataFrame with 'description', 'plate_x', 'plate_z', 'sz_top' and
        'sz_bot' columns. A 'type' column is used when present (see below).
    plate_half_width : float, default 0.73
        A pitch counts as left/right of the zone when plate_x is below
        -plate_half_width or above +plate_half_width.

    Adds
    ----
    batter_swung  -- 1 when 'description' is one of the swing outcomes, else 0
    in_strikezone -- 0 when the pitch is high, low, left or right of the zone, else 1
    chased        -- 1 when the batter swung at a pitch outside the zone, else 0
    ball_high, ball_low, ball_left, ball_right -- boolean helper columns

    Missing location
    ----------------
    Comparisons against NaN are False, so a pitch with no recorded location
    is treated as in the zone. When such a pitch has type 'B' it is marked
    out of the zone instead, which is what the original null-handling loop
    intended but never did (it tested for NaN after the 0/1 conversion and
    wrote to a misspelled 'in_strike_zone' column). Other missing-location
    pitches keep in_strikezone = 1.
    """
    df = df.copy()

    #descriptions that mean the batter offered at the pitch
    swing_descriptions = ['foul', 'hit_into_play', 'swinging_strike', 'hit_into_play_no_out',
                          'hit_into_play_score', 'foul_tip', 'swinging_strike_blocked',
                          'foul_bunt', 'missed_bunt']
    df['batter_swung'] = df['description'].isin(swing_descriptions).astype(int)

    #initialize in_strikezone and chased features (keeps the output column order)
    df['in_strikezone'] = 1
    df['chased'] = 0

    df['ball_high'] = df['plate_z'] > df['sz_top']
    df['ball_low'] = df['plate_z'] < df['sz_bot']
    df['ball_left'] = df['plate_x'] < -plate_half_width
    df['ball_right'] = df['plate_x'] > plate_half_width

    #combine with | rather than +, which is the supported operator for boolean columns
    outside = df['ball_high'] | df['ball_low'] | df['ball_left'] | df['ball_right']

    if 'type' in df.columns:
        #no usable location: fall back on the recorded result for balls
        location_missing = df[['plate_x', 'plate_z', 'sz_top', 'sz_bot']].isna().any(axis=1)
        outside = outside | (location_missing & (df['type'] == 'B'))

    df['in_strikezone'] = (~outside).astype(int)
    #a chase is a swing at a pitch outside the zone
    df['chased'] = ((df['batter_swung'] == 1) & outside).astype(int)
    return df

In [9]:
%%time
df_17 = make_strikezone_swung_and_chase_features(df_17)

  .format(op=op_str, alt_op=unsupported[op_str]))


CPU times: user 1.76 s, sys: 160 ms, total: 1.92 s
Wall time: 2.06 s


In [10]:

'''def make_batters_dict(df):
    df = df.copy()
    #make list of the unique batter ids
    batters = list(df['batter'].unique())
    #initialize empty dictionary to store the batter stats
    batters_dict = {}
    #set a break flag to False for error-checking
    brk = False
    #iterate thru each unique batter
    for batter in batters:
        if brk:
            break
        #make subset of the df for that batter and assign to variable batter_df    
        batter_df = df[df['batter'] == batter]
        
        #assign all pitch categories to list:
        all_pitch_cats = ['fastball', 'breaking', 'offspeed', 'pitchout']
        #assign the pitch categories to a list
        pitch_cats = batter_df['pitch_cat'].unique().tolist()
        #get the normalized value counts of pitches by category that batter has faced
        vc = batter_df.pitch_cat.value_counts(normalize=True)
        #initialize empty dict for each batter
        batter_dict = {}
        
        #if there are any pitch categories the batter has not faced, 
        unfaced_cats = list(set(all_pitch_cats) - set(pitch_cats))
        
        #assign NaNs to his dictionary for that category
        for cat in unfaced_cats:
            if cat == 'pitchout':
                batter_dict[cat + '_perc_faced'] = 0
            else:
                batter_dict[cat + '_perc_faced'] = np.nan
                batter_dict[cat + '_chase_perc'] = np.nan
                batter_dict[cat + '_bip_swung_perc'] = np.nan
                batter_dict[cat + '_taken_strike_perc'] = np.nan
                batter_dict[cat + '_est_woba'] = np.nan
                batter_dict[cat + '_babip'] = np.nan
                batter_dict[cat + '_iso_value'] = np.nan
        
        for cat in pitch_cats:
            if brk:
                break
        
            #assign the % of pitches faced by the batter for that category to his batter dict
            
            batter_dict[cat + '_perc_faced'] = vc[cat] * 100
        
            #continue out of the loop for pitchout category since ball in play stats are NaN
            if cat == 'pitchout':
                continue
        
            #grab subset of batter df for the pitch category
            cat_df = batter_df[batter_df['pitch_cat'] == cat]
        
            #calculate batters chase % for pitch type category on balls outside the strikezone
            out_of_strikezone = len(cat_df[cat_df['in_strikezone'] == 0]) #num of times ball was out of zone
            chased_count = len(cat_df[cat_df['chased'] == 1]) #num of times batter chased
            try:
                chase_perc = (chased_count / out_of_strikezone) * 100
            except ZeroDivisionError:
                chase_perc = np.nan
            #assign the chase perc to the batter dict
            batter_dict[cat + '_chase_perc'] = chase_perc
        
            #calc ball in play % for each swing for each pitch cat:
            ball_in_play_count = len(cat_df[cat_df['type'] == 'X']) #type X means ball hit into play
            swung_count = cat_df['batter_swung'].sum() #counts all the 1s in the swung column
            #assign the ball in play % per swing to the batter dict
            batter_dict[cat + '_bip_swung_perc'] = (ball_in_play_count / swung_count) * 100
        
            #calculate taken strike %
            taken_strike_count = len(cat_df[(cat_df['in_strikezone'] == 1) & (cat_df['batter_swung'] == 0)])
            pitches_in_zone_count = cat_df['in_strikezone'].sum() #counts the 1s in the in zone col
            #assign to batter_dict
            batter_dict[cat + '_taken_strike_perc'] = (taken_strike_count / pitches_in_zone_count) * 100
        
            #for each pitch type category, get the batters stats on balls hit in play
            stats = ['estimated_woba_using_speedangle', 'babip_value', 'iso_value']
            for stat in stats:
                #drop Nans from the stat column and assign to new subset, for each stat
                stat_cat_df = cat_df.dropna(subset=[stat])
                if stat == 'estimated_woba_using_speedangle':
                    #get the mean avg_est_woba
                    avg_est_woba = stat_cat_df['estimated_woba_using_speedangle'].mean()
                    #assign that value to the batters dictionary
                    batter_dict[cat + '_est_woba'] = avg_est_woba
                    if avg_est_woba == np.nan:
                        print(batter)
                        brk = True
                        break
                elif stat == 'babip_value':
                    avg_babip = stat_cat_df['babip_value'].mean()
                    batter_dict[cat + '_babip'] = avg_babip
                else:
                    avg_iso_value = stat_cat_df['iso_value'].mean()
                    batter_dict[cat + '_iso_value'] = avg_iso_value
            
        #assign the batter dictionary to the main dictionary of all batters
        batters_dict[batter] = batter_dict
    if not brk:
        print('iteration completed successfully')
    return batters_dict

batters_dict = make_batters_dict(df_17)

batters_df = pd.DataFrame.from_dict(batters_dict, orient='index')
batters_df = batters_df.reset_index().rename(columns={'index':'batter'})'''

"def make_batters_dict(df):\n    df = df.copy()\n    #make list of the unique batter ids\n    batters = list(df['batter'].unique())\n    #initialize empty dictionary to store the batter stats\n    batters_dict = {}\n    #set a break flag to False for error-checking\n    brk = False\n    #iterate thru each unique batter\n    for batter in batters:\n        if brk:\n            break\n        #make subset of the df for that batter and assign to variable batter_df    \n        batter_df = df[df['batter'] == batter]\n        \n        #assign all pitch categories to list:\n        all_pitch_cats = ['fastball', 'breaking', 'offspeed', 'pitchout']\n        #assign the pitch categories to a list\n        pitch_cats = batter_df['pitch_cat'].unique().tolist()\n        #get the normalized value counts of pitches by category that batter has faced\n        vc = batter_df.pitch_cat.value_counts(normalize=True)\n        #initialize empty dict for each batter\n        batter_dict = {}\n        \n    

In [11]:
#share of batters who saw fewer than MIN_PITCHES pitches in df_17
MIN_PITCHES = 150

#one value_counts pass gives the pitch total for every batter, instead of
#filtering the whole frame once per batter
pitches_per_batter = df_17['batter'].value_counts()
batters = pitches_per_batter.index.tolist()
count = int((pitches_per_batter < MIN_PITCHES).sum())
print(count / len(batters))

0.40229885057471265


In [12]:
print(count / len(batters))

0.40229885057471265


In [13]:
def _safe_percent(numerator, denominator):
    """Return numerator / denominator * 100, or NaN when the denominator is zero."""
    return (numerator / denominator) * 100 if denominator else np.nan


def add_batters_scouting_report(df):
    """Merge each batter's per-pitch-category scouting stats onto every row of df.

    Requires the columns 'batter', 'pitch_cat', 'in_strikezone', 'chased',
    'batter_swung', 'type', 'estimated_woba_using_speedangle', 'babip_value'
    and 'iso_value'.

    For every batter and every pitch category <cat> he faced this adds:
      <cat>_perc_faced        -- % of the batter's pitches in that category
      <cat>_chase_perc        -- chased pitches / out-of-zone pitches * 100
      <cat>_bip_swung_perc    -- type 'X' (ball in play) pitches / swings * 100
      <cat>_taken_strike_perc -- in-zone pitches not swung at / in-zone pitches * 100
      <cat>_est_woba, <cat>_babip, <cat>_iso_value
                              -- mean of estimated_woba_using_speedangle,
                                 babip_value and iso_value (NaNs ignored)
    'pitchout' only gets <cat>_perc_faced. Any of fastball/breaking/offspeed a
    batter never faced is filled with NaN; an unfaced pitchout gets
    pitchout_perc_faced = 0. Ratios with a zero denominator and means over no
    observations are NaN -- that is expected for small samples and is not
    treated as an error.

    Returns a new DataFrame (left merge on 'batter'); df itself is not
    modified. Scouting columns already present in df are replaced, so the
    function can be re-run on its own output without creating _x/_y duplicates.
    Rows with a NaN batter receive NaN for every scouting column.
    """
    df = df.copy()

    all_pitch_cats = ['fastball', 'breaking', 'offspeed', 'pitchout']
    #feature suffixes written for every non-pitchout category
    nan_suffixes = ['_perc_faced', '_chase_perc', '_bip_swung_perc',
                    '_taken_strike_perc', '_est_woba', '_babip', '_iso_value']
    #(source column, feature suffix) for the ball-in-play averages
    bip_stats = [('estimated_woba_using_speedangle', '_est_woba'),
                 ('babip_value', '_babip'),
                 ('iso_value', '_iso_value')]

    #one stats dictionary per batter
    batters_dict = {}

    #a single groupby pass replaces one full-frame filter per batter;
    #sort=False keeps batters in order of first appearance
    for batter, batter_df in df.groupby('batter', sort=False):
        #categories this batter actually faced (a NaN category cannot be named, skip it)
        pitch_cats = [cat for cat in batter_df['pitch_cat'].unique().tolist()
                      if pd.notna(cat)]
        #normalized value counts of pitches by category that batter has faced
        vc = batter_df['pitch_cat'].value_counts(normalize=True)
        batter_dict = {}

        #categories the batter has not faced get placeholder values
        for cat in set(all_pitch_cats) - set(pitch_cats):
            if cat == 'pitchout':
                batter_dict[cat + '_perc_faced'] = 0
            else:
                for suffix in nan_suffixes:
                    batter_dict[cat + suffix] = np.nan

        for cat in pitch_cats:
            #% of pitches faced by the batter for that category
            batter_dict[cat + '_perc_faced'] = vc[cat] * 100

            #pitchouts get no swing / ball-in-play stats
            if cat == 'pitchout':
                continue

            cat_df = batter_df[batter_df['pitch_cat'] == cat]
            in_zone = cat_df['in_strikezone'] == 1
            no_swing = cat_df['batter_swung'] == 0

            #chase %: chased pitches over pitches outside the strikezone
            out_of_strikezone = int((cat_df['in_strikezone'] == 0).sum())
            chased_count = int((cat_df['chased'] == 1).sum())
            batter_dict[cat + '_chase_perc'] = _safe_percent(chased_count, out_of_strikezone)

            #ball in play % per swing (type X means ball hit into play)
            ball_in_play_count = int((cat_df['type'] == 'X').sum())
            swung_count = int(cat_df['batter_swung'].sum())
            batter_dict[cat + '_bip_swung_perc'] = _safe_percent(ball_in_play_count, swung_count)

            #taken strike %: in-zone pitches the batter did not swing at
            taken_strike_count = int((in_zone & no_swing).sum())
            pitches_in_zone_count = int(cat_df['in_strikezone'].sum())
            batter_dict[cat + '_taken_strike_perc'] = _safe_percent(taken_strike_count,
                                                                    pitches_in_zone_count)

            #averages on balls hit in play; mean() skips NaN and is NaN when
            #there are no observations
            for stat_col, suffix in bip_stats:
                batter_dict[cat + suffix] = cat_df[stat_col].mean()

        batters_dict[batter] = batter_dict

    print('iteration completed successfully')

    #make df from the batters dict
    batters_df = pd.DataFrame.from_dict(batters_dict, orient='index')
    batters_df = batters_df.reset_index().rename(columns={'index': 'batter'})

    #drop scouting columns left over from an earlier run so the merge cannot
    #produce _x/_y duplicates
    stale = [col for col in batters_df.columns if col != 'batter' and col in df.columns]
    df = df.drop(columns=stale)

    #merge df and batters df on batter col
    df = pd.merge(df, batters_df, how='left', on='batter')

    return df

In [14]:
%%time
df_17 = add_batters_scouting_report(df_17)



iteration completed successfully
CPU times: user 2min 40s, sys: 1.28 s, total: 2min 41s
Wall time: 2min 52s


In [15]:
df_17.isna().sum()

pitch_type                              0
game_date                               0
index                                   0
release_speed                        2317
release_pos_x                        2388
release_pos_z                        2388
batter                                  0
pitcher                                 0
zone                                 2432
hit_location                       559834
balls                                   0
strikes                                 0
game_year                               0
pfx_x                                2388
pfx_z                                2388
plate_x                              2432
plate_z                              2432
on_3b                                   0
on_2b                                   0
on_1b                                   0
outs_when_up                            0
inning                                  0
hc_x                               594017
hc_y                              

In [16]:
df_17.head().T

Unnamed: 0,0,1,2,3,4
pitch_type,FC,FC,FF,CH,FT
game_date,2017-10-01 00:00:00,2017-10-01 00:00:00,2017-10-01 00:00:00,2017-10-01 00:00:00,2017-10-01 00:00:00
index,593,614,618,636,650
release_speed,91.9,91.8,94.7,82.8,95.9
release_pos_x,-1.1948,-1.0777,-0.6208,-0.9901,-0.8903
release_pos_z,5.9921,6.1772,6.3387,6.0626,6.1341
batter,595885,595885,595885,595885,595885
pitcher,532077,532077,532077,532077,532077
zone,1,3,12,13,13
hit_location,6,,,,


## Step 5: Make Pitcher Scouting Report and Merge those Features into the df (pitch_type tendencies, overall and by count category)

Choose a pitcher to model: Justin Verlander example:

In [17]:
#select every pitch thrown by pitcher id 434378 (the Justin Verlander example) and order
#the rows chronologically: by game date, then at-bat, then pitch within the at-bat
pitcher_df = df_17[df_17.pitcher == 434378.0].sort_values(by = ['game_date', 'at_bat_number', 'pitch_number'])

In [18]:
pitcher_df.isna().sum()

pitch_type                            0
game_date                             0
index                                 0
release_speed                         0
release_pos_x                         0
release_pos_z                         0
batter                                0
pitcher                               0
zone                                  0
hit_location                       2786
balls                                 0
strikes                               0
game_year                             0
pfx_x                                 0
pfx_z                                 0
plate_x                               0
plate_z                               0
on_3b                                 0
on_2b                                 0
on_1b                                 0
outs_when_up                          0
inning                                0
hc_x                               2977
hc_y                               2977
fielder_2                             0


In [19]:
def get_pitch_tendencies(pitcher_df):
    """Summarise how often a pitcher throws each pitch type.

    Returns a tuple (overall, by_count):
      overall  -- {pitch_type: share} over all rows of pitcher_df
      by_count -- {count_cat: {pitch_type: share}} computed separately for
                  each value found in the 'count_cat' column
    Shares are normalized frequencies (they sum to 1 within each dict).
    """
    pitch_types = pitcher_df['pitch_type']
    overall = pitch_types.value_counts(normalize=True).to_dict()

    #same breakdown, restricted to the rows of each count category
    by_count = {
        cat: pitcher_df.loc[pitcher_df['count_cat'] == cat, 'pitch_type']
                       .value_counts(normalize=True)
                       .to_dict()
        for cat in pitcher_df['count_cat'].unique().tolist()
    }
    return overall, by_count

In [20]:
overall, by_count = get_pitch_tendencies(pitcher_df)

In [21]:
overall

{'FF': 0.5766071934296233,
 'SL': 0.21467006513735487,
 'CU': 0.1600113282356273,
 'CH': 0.041348060039648825,
 'FC': 0.003964882469555367,
 'FT': 0.0033984706881903144}

In [22]:
#by_count['neutral']['FF']

In [23]:
#pitcher_df.head(2)

In [24]:
def make_tendency_features(pitcher_df):
    """Return a copy of pitcher_df with the pitcher's pitch-mix tendencies as columns.

    For every pitch type found in pitcher_df two columns are added:
      overall_<type>_perc   -- the pitcher's overall share of that pitch type
                               (the same value on every row)
      count_cat_<type>_perc -- his share of that pitch type within the row's
                               count category ('count_cat')
    Shares come from get_pitch_tendencies. A pitch type that was never thrown
    in a given count category gets a share of 0.0 there (previously this
    raised KeyError).
    """
    df = pitcher_df.copy()
    #get the overall and by_count pitch_type tendencies
    pitcher_tendencies_overall, pitcher_tendencies_by_count = get_pitch_tendencies(df)

    for pitch_type in df.pitch_type.value_counts().index.tolist():
        #overall share is a constant for this pitcher
        df['overall_' + pitch_type + '_perc'] = pitcher_tendencies_overall[pitch_type]

        #share of this pitch type in each count category; absent means never thrown
        share_by_count_cat = {cat: tendencies.get(pitch_type, 0.0)
                              for cat, tendencies in pitcher_tendencies_by_count.items()}
        #astype(object) so that mapping a categorical column still yields plain floats
        df['count_cat_' + pitch_type + '_perc'] = (
            df['count_cat'].astype(object).map(share_by_count_cat)
        )
    return df

pitcher_df = make_tendency_features(pitcher_df)
pitcher_df.head()

Unnamed: 0,pitch_type,game_date,index,release_speed,release_pos_x,release_pos_z,batter,pitcher,zone,hit_location,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,hc_x,hc_y,fielder_2,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,fielder_3,fielder_4,fielder_5,fielder_6,fielder_7,fielder_8,fielder_9,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,player_name,events,description,des,stand,p_throws,home_team,away_team,type,bb_type,inning_topbot,sv_id,pitch_name,if_fielding_alignment,of_fielding_alignment,_count,count_cat,score_diff,on_1b_id,bases_loaded,pitch_cat,batter_swung,in_strikezone,chased,ball_high,ball_low,ball_left,ball_right,pitchout_perc_faced,fastball_perc_faced,fastball_chase_perc,fastball_bip_swung_perc,fastball_taken_strike_perc,fastball_est_woba,fastball_babip,fastball_iso_value,offspeed_perc_faced,offspeed_chase_perc,offspeed_bip_swung_perc,offspeed_taken_strike_perc,offspeed_est_woba,offspeed_babip,offspeed_iso_value,breaking_perc_faced,breaking_chase_perc,breaking_bip_swung_perc,breaking_taken_strike_perc,breaking_est_woba,breaking_babip,breaking_iso_value,overall_FF_perc,count_cat_FF_perc,overall_SL_perc,count_cat_SL_perc,overall_CU_perc,count_cat_CU_perc,overall_CH_perc,count_cat_CH_perc,overall_FC_perc,count_cat_FC_perc,overall_FT_perc,count_cat_FT_perc
717221,FF,2017-04-04,14433,94.0,-2.5011,6.3799,573135.0,434378.0,6.0,,0,0,2017.0,-1.638,1.3832,0.4384,2.4509,0.0,0.0,0.0,0.0,1.0,,,543510.0,10.9326,-135.909302,-7.256,-22.2188,29.3508,-14.0833,3.25,1.44,,,,92.447998,2587.0,5.68,490102.0,408234.0,435079.0,592206.0,578428.0,457708.0,592444.0,543484.0,54.8186,,,,,,,,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Justin Verlander,,called_strike,,R,R,CWS,DET,S,,Bot,170404_182235,4-Seam Fastball,Strategic,Standard,0,neutral,0.0,,0,fastball,0,1,0,False,False,False,False,0.08673,63.746748,29.951691,37.735849,39.563863,0.278183,0.168539,0.039326,9.80052,27.868852,40.0,17.307692,0.220375,0.235294,0.088235,26.366002,22.222222,39.63964,40.0,0.200636,0.097222,0.041667,0.576607,0.671927,0.21467,0.172757,0.160011,0.11794,0.041348,0.033223,0.003965,0.001661,0.003398,0.002492
717220,FF,2017-04-04,14423,95.099998,-2.3831,6.313,573135.0,434378.0,14.0,,0,1,2017.0,-1.4071,1.666,1.8448,1.8521,0.0,0.0,0.0,0.0,1.0,,,543510.0,13.8935,-137.156998,-9.4298,-20.4625,29.761999,-9.7076,3.25,1.44,,,,93.507004,2534.0,5.795,490102.0,408234.0,435079.0,592206.0,578428.0,457708.0,592444.0,543484.0,54.703201,,,,,,,,5.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Justin Verlander,,ball,,R,R,CWS,DET,B,,Bot,170404_182256,4-Seam Fastball,Standard,Standard,1,ahead,0.0,,0,fastball,0,0,0,False,False,False,True,0.08673,63.746748,29.951691,37.735849,39.563863,0.278183,0.168539,0.039326,9.80052,27.868852,40.0,17.307692,0.220375,0.235294,0.088235,26.366002,22.222222,39.63964,40.0,0.200636,0.097222,0.041667,0.576607,0.490929,0.21467,0.249588,0.160011,0.213854,0.041348,0.038483,0.003965,0.003299,0.003398,0.003848
717219,CU,2017-04-04,14412,79.199997,-2.6276,6.1709,573135.0,434378.0,1.0,7.0,1,1,2017.0,0.4102,-0.9814,-0.3723,2.7628,0.0,0.0,0.0,0.0,1.0,66.059998,135.050003,543510.0,4.0855,-114.870598,1.0709,2.6878,21.7332,-40.947102,3.25,1.44,,90.300003,-17.0,77.067001,2842.0,4.989,490102.0,408234.0,435079.0,592206.0,578428.0,457708.0,592444.0,543484.0,55.509602,0.163,0.164,0.9,1.0,1.0,0.0,2.0,5.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Justin Verlander,single,hit_into_play_no_out,Tyler Saladino singles on a ground ball to lef...,R,R,CWS,DET,X,ground_ball,Bot,170404_182315,Curveball,Standard,Standard,11,ahead,0.0,,0,breaking,1,1,0,False,False,False,False,0.08673,63.746748,29.951691,37.735849,39.563863,0.278183,0.168539,0.039326,9.80052,27.868852,40.0,17.307692,0.220375,0.235294,0.088235,26.366002,22.222222,39.63964,40.0,0.200636,0.097222,0.041667,0.576607,0.490929,0.21467,0.249588,0.160011,0.213854,0.041348,0.038483,0.003965,0.003299,0.003398,0.003848
717218,SL,2017-04-04,14402,86.800003,-2.5506,6.3337,641313.0,434378.0,3.0,,0,0,2017.0,-0.148,0.6201,0.6356,2.9052,0.0,0.0,1.0,0.0,1.0,,,543510.0,7.7677,-125.945602,-3.2419,-2.8956,21.923599,-25.368099,3.32,1.51,,,,85.759003,2573.0,5.454,490102.0,408234.0,435079.0,592206.0,578428.0,457708.0,592444.0,543484.0,55.044899,,,,,,,,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Justin Verlander,,swinging_strike,,R,R,CWS,DET,S,,Bot,170404_182445,Slider,Standard,Standard,0,neutral,0.0,573135.0,0,breaking,1,1,0,False,False,False,False,0.0,55.64478,40.143885,37.242472,27.122153,0.416532,0.24,0.193846,7.605102,46.808511,37.373737,17.910448,0.373757,0.34,0.12,36.750118,49.010989,35.779817,34.055728,0.273192,0.166667,0.068376,0.576607,0.671927,0.21467,0.172757,0.160011,0.11794,0.041348,0.033223,0.003965,0.001661,0.003398,0.002492
717217,SL,2017-04-04,14395,89.300003,-2.5144,6.3712,641313.0,434378.0,3.0,,0,1,2017.0,-0.2297,0.923,0.596,2.8399,0.0,0.0,1.0,0.0,1.0,,,543510.0,7.9815,-129.521896,-4.4926,-4.072,25.070101,-21.385799,3.32,1.51,130.0,75.300003,13.0,88.084,2488.0,5.537,490102.0,408234.0,435079.0,592206.0,578428.0,457708.0,592444.0,543484.0,54.9613,,,,,,,,6.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Justin Verlander,,foul,,R,R,CWS,DET,S,,Bot,170404_182522,Slider,Standard,Standard,1,ahead,0.0,573135.0,0,breaking,1,1,0,False,False,False,False,0.0,55.64478,40.143885,37.242472,27.122153,0.416532,0.24,0.193846,7.605102,46.808511,37.373737,17.910448,0.373757,0.34,0.12,36.750118,49.010989,35.779817,34.055728,0.273192,0.166667,0.068376,0.576607,0.490929,0.21467,0.249588,0.160011,0.213854,0.041348,0.038483,0.003965,0.003299,0.003398,0.003848


In [25]:
pitcher_df.dtypes

pitch_type                                 object
game_date                          datetime64[ns]
index                                      uint16
release_speed                             float32
release_pos_x                             float32
release_pos_z                             float32
batter                                    float32
pitcher                                   float32
zone                                      float32
hit_location                              float32
balls                                      object
strikes                                    object
game_year                                 float32
pfx_x                                     float32
pfx_z                                     float32
plate_x                                   float32
plate_z                                   float32
on_3b                                     float32
on_2b                                     float32
on_1b                                     float32


In [26]:
#potential other pitcher features to engineer in future: 

#for each pitch type a pitcher throws:
    #by count category?
    #aggregate prior ball in play % per pitch type, strike %, and chase %
    

#est_woba/babip/isovalue/chase% for diff pitches in this game, prev game, prev 5 games, prev season, and/or historical -

## Step 6: Make Game pitch count column, and make trailing pitch type features (L1, L5, L10): figure out strategy for 1st 5 and first 10 pitches of each game (maybe use overall tendencies, maybe use count tendencies, or maybe get historical first5/first 10 tendencies)

In [36]:
#horrible function name but whatever- still need to add code to get trailing 15 pitch percentages
def make_game_batting_order(game_df):
    """Renumber a game's at-bats from 1 and tag every pitch with a batting-order slot.

    Parameters
    ----------
    game_df : pandas.DataFrame
        Pitches from a single game. Must contain the columns
        'at_bat_number', 'pitch_number' and 'batter'.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) sorted by
        ('at_bat_number', 'pitch_number') in which:

        * 'at_bat_number' is renumbered 1, 2, 3, ... in sorted order;
        * 'batting_order_slot' is 1-9 for the first nine distinct batters in
          order of appearance and 'PH' (pinch hitter) for any batter who first
          appears after them.
    """
    game_df = game_df.sort_values(by=['at_bat_number', 'pitch_number'])

    # re-set the at_bat_number for the game to be sequential starting at 1
    at_bat_keys = game_df['at_bat_number'].unique().tolist()
    at_bat_map = dict(zip(at_bat_keys, range(1, len(at_bat_keys) + 1)))
    if at_bat_map:
        game_df['at_bat_number'] = game_df['at_bat_number'].replace(at_bat_map)

    # Slots are assigned from the first nine *distinct batters*, not from the
    # batters seen in the first nine at-bat numbers: when one batter owns two of
    # the first nine at-bats, the at-bat based rule only found eight batters and
    # labelled the real ninth hitter 'PH'.
    batters_in_order = game_df['batter'].unique().tolist()
    batter_order_map = {
        batter: slot for slot, batter in enumerate(batters_in_order[:9], start=1)
    }
    # anyone who first bats after the ninth distinct batter is a pinch hitter
    batter_order_map.update({batter: 'PH' for batter in batters_in_order[9:]})

    game_df['batting_order_slot'] = game_df['batter'].map(batter_order_map)
    return game_df

def _add_trailing_features_to_game(game_df, pitcher_tendencies_overall, overall_strike_perc,
                                   window=5, no_prev_zone=-1):
    """Add pitch-count, previous-pitch and trailing-window columns to one game's pitches.

    Parameters
    ----------
    game_df : pandas.DataFrame
        Pitches of a single game, in the order they should be counted. Must
        contain 'pitch_type', 'type' and 'zone' plus the columns needed by
        make_game_batting_order.
    pitcher_tendencies_overall : dict
        Maps each pitch type to its overall weight; used as sampling weights
        for the first pitch and (times 100) as the warm-up percentages.
    overall_strike_perc : float
        Value written to the trailing strike column during the warm-up rows.
    window : int
        Number of previous pitches in the trailing window (positive integer).
    no_prev_zone : int
        Filler for 'L1_pitch_zone' when the shifted zone is missing.

    Returns
    -------
    pandas.DataFrame
        A copy of game_df with 'pitch_count', 'L1_pitch_type',
        'L1_pitch_result', 'L1_pitch_zone', 'L<window>_<pitch>_perc',
        'L<window>_strike_perc' added, passed through make_game_batting_order.
    """
    game_df = game_df.copy()
    n_pitches = game_df.shape[0]
    game_df['pitch_count'] = range(1, n_pitches + 1)

    # previous-pitch (L1) features: shift by one row inside this game only
    game_df['L1_pitch_type'] = game_df['pitch_type'].shift(periods=1)
    game_df['L1_pitch_result'] = game_df['type'].shift(periods=1).replace({np.nan: 'first pitch'})
    game_df['L1_pitch_zone'] = game_df['zone'].shift(periods=1).fillna(no_prev_zone)

    pitch_names = list(pitcher_tendencies_overall.keys())

    # the first pitch of a game has no predecessor: sample one from the overall
    # tendencies (same method as when pitch_type was missing)
    if n_pitches > 0:
        random_pitch = random.choices(population=pitch_names,
                                      weights=list(pitcher_tendencies_overall.values()),
                                      k=1)[0]
        game_df.iloc[0, game_df.columns.get_loc('L1_pitch_type')] = random_pitch

    # Walk plain Python lists instead of filtering the frame once per row.
    thrown = game_df['pitch_type'].tolist()
    outcomes = game_df['type'].tolist()
    pitch_percs = {pitch: [] for pitch in pitch_names}
    strike_percs = []
    for i in range(n_pitches):
        if i < window:
            # warm-up: fewer than `window` previous pitches, use overall tendencies
            for pitch in pitch_names:
                pitch_percs[pitch].append(pitcher_tendencies_overall[pitch] * 100)
            strike_percs.append(overall_strike_perc)
            continue

        # missing values are left out of the denominator, as value_counts does
        prev_types = [p for p in thrown[i - window:i] if not pd.isna(p)]
        prev_outcomes = [o for o in outcomes[i - window:i] if not pd.isna(o)]
        for pitch in pitch_names:
            # 0 when he has not thrown that type in the window
            share = prev_types.count(pitch) / len(prev_types) * 100 if prev_types else 0
            pitch_percs[pitch].append(share)
        strike_share = prev_outcomes.count('S') / len(prev_outcomes) * 100 if prev_outcomes else 0
        strike_percs.append(strike_share)

    prefix = 'L' + str(window) + '_'
    new_columns = {}
    for pitch in pitch_names:
        new_columns[prefix + pitch + '_perc'] = pitch_percs[pitch]
        # keep the strike column right after the first pitch-type column so the
        # column layout matches what this function produced before
        new_columns.setdefault(prefix + 'strike_perc', strike_percs)
    for column, values in new_columns.items():
        game_df[column] = values

    # apply the batting order features to the game
    return make_game_batting_order(game_df)


def make_game_pitchcount_and_trailing_pitch_features(pitcher_df, window=5):
    """Add per-game pitch counts, previous-pitch and trailing-window features for one pitcher.

    Each game (rows sharing a 'game_pk') is processed independently, in order
    of first appearance in pitcher_df, and the results are concatenated once.

    Changes from the earlier implementation:

    * a pitcher with a single game no longer raises UnboundLocalError;
    * 'L1_pitch_zone' uses -1 for "no previous pitch" in every game (it used to
      be -1 for the first game and 0 for all later games);
    * a pitcher with no 'S' in 'type' gets an overall strike percentage of 0
      instead of a KeyError;
    * rows whose 'game_pk' is missing are not included in the result.

    Parameters
    ----------
    pitcher_df : pandas.DataFrame
        All pitches of one pitcher. Must contain 'game_pk', 'pitch_type',
        'type', 'zone', 'at_bat_number', 'pitch_number' and 'batter'.
    window : int, default 5
        Size of the trailing window; columns are named 'L<window>_...'.

    Returns
    -------
    pandas.DataFrame
        New frame with the added feature columns; pitcher_df is not modified
        by this function. An empty input is returned as an empty copy.
    """
    df = pitcher_df.copy()
    if df.empty:
        return df

    # only the overall tendencies are used here; the by-count ones are ignored
    pitcher_tendencies_overall, _ = get_pitch_tendencies(df)

    # overall strike % (fills the trailing strike column during the warm-up rows)
    overall_strike_perc = df['type'].value_counts(normalize=True).get('S', 0) * 100

    # sort=False keeps games in their order of appearance in df
    game_frames = [
        _add_trailing_features_to_game(game_df, pitcher_tendencies_overall,
                                       overall_strike_perc, window=window)
        for _, game_df in df.groupby('game_pk', sort=False)
    ]
    if not game_frames:
        return df
    # concatenate once instead of growing the frame inside the loop
    return pd.concat(game_frames)

In [37]:
%%time
# NOTE: this rebinds pitcher_df to the feature-augmented frame, so re-running this cell
# passes the already-augmented frame back into the function.
pitcher_df = make_game_pitchcount_and_trailing_pitch_features(pitcher_df)

CPU times: user 1min 13s, sys: 1.88 s, total: 1min 15s
Wall time: 1min 17s


In [39]:
# Preview the first rows, including the newly added pitch_count, L1_*, L5_* and
# batting_order_slot columns.
pitcher_df.head()

Unnamed: 0,pitch_type,game_date,index,release_speed,release_pos_x,release_pos_z,batter,pitcher,zone,hit_location,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,hc_x,hc_y,fielder_2,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,fielder_3,fielder_4,fielder_5,fielder_6,fielder_7,fielder_8,fielder_9,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,player_name,events,description,des,stand,p_throws,home_team,away_team,type,bb_type,inning_topbot,sv_id,pitch_name,if_fielding_alignment,of_fielding_alignment,_count,count_cat,score_diff,on_1b_id,bases_loaded,pitch_cat,batter_swung,in_strikezone,chased,ball_high,ball_low,ball_left,ball_right,pitchout_perc_faced,fastball_perc_faced,fastball_chase_perc,fastball_bip_swung_perc,fastball_taken_strike_perc,fastball_est_woba,fastball_babip,fastball_iso_value,offspeed_perc_faced,offspeed_chase_perc,offspeed_bip_swung_perc,offspeed_taken_strike_perc,offspeed_est_woba,offspeed_babip,offspeed_iso_value,breaking_perc_faced,breaking_chase_perc,breaking_bip_swung_perc,breaking_taken_strike_perc,breaking_est_woba,breaking_babip,breaking_iso_value,overall_FF_perc,count_cat_FF_perc,overall_SL_perc,count_cat_SL_perc,overall_CU_perc,count_cat_CU_perc,overall_CH_perc,count_cat_CH_perc,overall_FC_perc,count_cat_FC_perc,overall_FT_perc,count_cat_FT_perc,pitch_count,L1_pitch_type,L1_pitch_result,L1_pitch_zone,L5_FF_perc,L5_strike_perc,L5_SL_perc,L5_CU_perc,L5_CH_perc,L5_FC_perc,L5_FT_perc,batting_order_slot
717221,FF,2017-04-04,14433,94.0,-2.5011,6.3799,573135.0,434378.0,6.0,,0,0,2017.0,-1.638,1.3832,0.4384,2.4509,0.0,0.0,0.0,0.0,1.0,,,543510.0,10.9326,-135.909302,-7.256,-22.2188,29.3508,-14.0833,3.25,1.44,,,,92.447998,2587.0,5.68,490102.0,408234.0,435079.0,592206.0,578428.0,457708.0,592444.0,543484.0,54.8186,,,,,,,,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Justin Verlander,,called_strike,,R,R,CWS,DET,S,,Bot,170404_182235,4-Seam Fastball,Strategic,Standard,0,neutral,0.0,,0,fastball,0,1,0,False,False,False,False,0.08673,63.746748,29.951691,37.735849,39.563863,0.278183,0.168539,0.039326,9.80052,27.868852,40.0,17.307692,0.220375,0.235294,0.088235,26.366002,22.222222,39.63964,40.0,0.200636,0.097222,0.041667,0.576607,0.671927,0.21467,0.172757,0.160011,0.11794,0.041348,0.033223,0.003965,0.001661,0.003398,0.002492,1,FF,first pitch,-1.0,57.660719,49.815916,21.467007,16.001133,4.134806,0.396488,0.339847,1
717220,FF,2017-04-04,14423,95.099998,-2.3831,6.313,573135.0,434378.0,14.0,,0,1,2017.0,-1.4071,1.666,1.8448,1.8521,0.0,0.0,0.0,0.0,1.0,,,543510.0,13.8935,-137.156998,-9.4298,-20.4625,29.761999,-9.7076,3.25,1.44,,,,93.507004,2534.0,5.795,490102.0,408234.0,435079.0,592206.0,578428.0,457708.0,592444.0,543484.0,54.703201,,,,,,,,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Justin Verlander,,ball,,R,R,CWS,DET,B,,Bot,170404_182256,4-Seam Fastball,Standard,Standard,1,ahead,0.0,,0,fastball,0,0,0,False,False,False,True,0.08673,63.746748,29.951691,37.735849,39.563863,0.278183,0.168539,0.039326,9.80052,27.868852,40.0,17.307692,0.220375,0.235294,0.088235,26.366002,22.222222,39.63964,40.0,0.200636,0.097222,0.041667,0.576607,0.490929,0.21467,0.249588,0.160011,0.213854,0.041348,0.038483,0.003965,0.003299,0.003398,0.003848,2,FF,S,6.0,57.660719,49.815916,21.467007,16.001133,4.134806,0.396488,0.339847,1
717219,CU,2017-04-04,14412,79.199997,-2.6276,6.1709,573135.0,434378.0,1.0,7.0,1,1,2017.0,0.4102,-0.9814,-0.3723,2.7628,0.0,0.0,0.0,0.0,1.0,66.059998,135.050003,543510.0,4.0855,-114.870598,1.0709,2.6878,21.7332,-40.947102,3.25,1.44,,90.300003,-17.0,77.067001,2842.0,4.989,490102.0,408234.0,435079.0,592206.0,578428.0,457708.0,592444.0,543484.0,55.509602,0.163,0.164,0.9,1.0,1.0,0.0,2.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Justin Verlander,single,hit_into_play_no_out,Tyler Saladino singles on a ground ball to lef...,R,R,CWS,DET,X,ground_ball,Bot,170404_182315,Curveball,Standard,Standard,11,ahead,0.0,,0,breaking,1,1,0,False,False,False,False,0.08673,63.746748,29.951691,37.735849,39.563863,0.278183,0.168539,0.039326,9.80052,27.868852,40.0,17.307692,0.220375,0.235294,0.088235,26.366002,22.222222,39.63964,40.0,0.200636,0.097222,0.041667,0.576607,0.490929,0.21467,0.249588,0.160011,0.213854,0.041348,0.038483,0.003965,0.003299,0.003398,0.003848,3,FF,B,14.0,57.660719,49.815916,21.467007,16.001133,4.134806,0.396488,0.339847,1
717218,SL,2017-04-04,14402,86.800003,-2.5506,6.3337,641313.0,434378.0,3.0,,0,0,2017.0,-0.148,0.6201,0.6356,2.9052,0.0,0.0,1.0,0.0,1.0,,,543510.0,7.7677,-125.945602,-3.2419,-2.8956,21.923599,-25.368099,3.32,1.51,,,,85.759003,2573.0,5.454,490102.0,408234.0,435079.0,592206.0,578428.0,457708.0,592444.0,543484.0,55.044899,,,,,,,,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Justin Verlander,,swinging_strike,,R,R,CWS,DET,S,,Bot,170404_182445,Slider,Standard,Standard,0,neutral,0.0,573135.0,0,breaking,1,1,0,False,False,False,False,0.0,55.64478,40.143885,37.242472,27.122153,0.416532,0.24,0.193846,7.605102,46.808511,37.373737,17.910448,0.373757,0.34,0.12,36.750118,49.010989,35.779817,34.055728,0.273192,0.166667,0.068376,0.576607,0.671927,0.21467,0.172757,0.160011,0.11794,0.041348,0.033223,0.003965,0.001661,0.003398,0.002492,4,CU,X,1.0,57.660719,49.815916,21.467007,16.001133,4.134806,0.396488,0.339847,2
717217,SL,2017-04-04,14395,89.300003,-2.5144,6.3712,641313.0,434378.0,3.0,,0,1,2017.0,-0.2297,0.923,0.596,2.8399,0.0,0.0,1.0,0.0,1.0,,,543510.0,7.9815,-129.521896,-4.4926,-4.072,25.070101,-21.385799,3.32,1.51,130.0,75.300003,13.0,88.084,2488.0,5.537,490102.0,408234.0,435079.0,592206.0,578428.0,457708.0,592444.0,543484.0,54.9613,,,,,,,,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Justin Verlander,,foul,,R,R,CWS,DET,S,,Bot,170404_182522,Slider,Standard,Standard,1,ahead,0.0,573135.0,0,breaking,1,1,0,False,False,False,False,0.0,55.64478,40.143885,37.242472,27.122153,0.416532,0.24,0.193846,7.605102,46.808511,37.373737,17.910448,0.373757,0.34,0.12,36.750118,49.010989,35.779817,34.055728,0.273192,0.166667,0.068376,0.576607,0.490929,0.21467,0.249588,0.160011,0.213854,0.041348,0.038483,0.003965,0.003299,0.003398,0.003848,5,SL,S,3.0,57.660719,49.815916,21.467007,16.001133,4.134806,0.396488,0.339847,2


## Step 7: Pitcher/Batter matchup history

In [30]:
#Todo

## Step 8: Write code that does steps 1-7 iteratively, updating each month of 2018 and into the 2019 season (to prevent leakage of future information into the training set)

In [31]:
#Todo 