In [228]:
import pandas as pd
import numpy as np
import random
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [229]:
fname = 'pitches_2017.pkl'
df_17 = pd.read_pickle(fname, compression='zip')

#convert the pitch type for UN (unknown) to np.nan
df_17['pitch_type'] = df_17['pitch_type'].replace({'UN':np.nan})

In [230]:
df_17.head()

Unnamed: 0,index,pitch_type,game_date,release_speed,player_name,batter,pitcher,events,description,zone,stand,p_throws,home_team,away_team,type,hit_location,bb_type,balls,strikes,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,fielder_2,hit_distance_sc,effective_speed,release_spin_rate,release_extension,game_pk,pitcher.1,fielder_2.1,fielder_3,fielder_4,fielder_5,fielder_6,fielder_7,fielder_8,fielder_9,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,at_bat_number,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment
0,593,FC,2017-10-01,91.9,Roberto Osuna,595885.0,532077.0,field_out,hit_into_play,1.0,L,R,NYY,TOR,X,6.0,popup,3.0,2.0,,,,2.0,9.0,Bot,607257.0,236.0,90.975,2422.0,5.941,492528.0,532077.0,607257.0,475253.0,572365.0,446381.0,620446.0,485567.0,606192.0,608701.0,0.023,0.023,0.0,1.0,0.0,0.0,63.0,6.0,Cutter,1.0,2.0,1.0,2.0,2.0,1.0,1.0,2.0,Infield shift,Strategic
1,614,FC,2017-10-01,91.8,Roberto Osuna,595885.0,532077.0,,foul,3.0,L,R,NYY,TOR,S,,,3.0,1.0,,,,2.0,9.0,Bot,607257.0,,91.493,2415.0,6.005,492528.0,532077.0,607257.0,475253.0,572365.0,446381.0,620446.0,485567.0,606192.0,608701.0,,,,,,,63.0,5.0,Cutter,1.0,2.0,1.0,2.0,2.0,1.0,1.0,2.0,Infield shift,Strategic
2,618,FF,2017-10-01,94.7,Roberto Osuna,595885.0,532077.0,,ball,12.0,L,R,NYY,TOR,B,,,2.0,1.0,,,,2.0,9.0,Bot,607257.0,20.0,94.15,2450.0,6.126,492528.0,532077.0,607257.0,475253.0,572365.0,446381.0,620446.0,485567.0,606192.0,608701.0,,,,,,,63.0,4.0,4-Seam Fastball,1.0,2.0,1.0,2.0,2.0,1.0,1.0,2.0,Infield shift,Strategic
3,636,CH,2017-10-01,82.8,Roberto Osuna,595885.0,532077.0,,ball,13.0,L,R,NYY,TOR,B,,,1.0,1.0,,,,2.0,9.0,Bot,607257.0,,82.564,2063.0,6.284,492528.0,532077.0,607257.0,475253.0,572365.0,446381.0,620446.0,485567.0,606192.0,608701.0,,,,,,,63.0,3.0,Changeup,1.0,2.0,1.0,2.0,2.0,1.0,1.0,2.0,Infield shift,Strategic
4,650,FT,2017-10-01,95.9,Roberto Osuna,595885.0,532077.0,,ball,13.0,L,R,NYY,TOR,B,,,0.0,1.0,,,,2.0,9.0,Bot,607257.0,,95.761,2554.0,6.33,492528.0,532077.0,607257.0,475253.0,572365.0,446381.0,620446.0,485567.0,606192.0,608701.0,,,,,,,63.0,2.0,2-Seam Fastball,1.0,2.0,1.0,2.0,2.0,1.0,1.0,2.0,Infield shift,Strategic


In [231]:
def gen_pitcher_percentages(df):
    df = df.copy()
    #get all the unique pitcher names in the df
    pitcher_list = df.player_name.unique().tolist()
    #initialize empty dictionary to store each pitcher and their pitches and percentages for each pitch
    pitcher_dict = {}
    #iterate over each pitcher:
    for pitcher in pitcher_list:
        #assign the normalized value_counts to a variable
        pitch_percentages = df[df.player_name == pitcher].pitch_type.value_counts(normalize=True)
        #convert that Series object to a dict and assign it as the value to the pitcher dictionary 
        #(pitcher name as key)
        pitcher_dict[pitcher] = pitch_percentages.to_dict()
    return pitcher_dict

def fill_pitch_type_nans(df):
    df = df.copy()
    #grab the rows where pitch_type is null:
    nulls = df[df.pitch_type.isna()]
    
    #generate the dictionary of pitch type % by pitcher
    pitcher_dict = gen_pitcher_percentages(df)
    
    #iterate over each null row
    for index, row in nulls.iterrows():
        #use the % for that pitcher for each pitch type he throws to generate a random pitch type with 
        #that % as weight
        pitch = random.choices(population=list(pitcher_dict[row.player_name].keys()), 
                               weights=list(pitcher_dict[row.player_name].values()), 
                               k=1)[0]
        #fill the NaN value with the randomly generated pitch
        df.at[index, 'pitch_type'] = pitch
    return df

In [232]:
df_17.pitch_type.isna().sum()

2215

In [233]:
%%time
df_17 = fill_pitch_type_nans(df_17)

CPU times: user 45.4 s, sys: 297 ms, total: 45.7 s
Wall time: 45.9 s


In [234]:
df_17.pitch_type.isna().sum()

0

In [235]:
df_17.pitch_type.value_counts()

FF    253995
SL    117659
FT     94623
CH     71769
CU     56714
SI     52402
FC     37615
KC     21399
FS     11780
KN      2697
FO       247
EP       187
PO       117
SC        39
Name: pitch_type, dtype: int64

## Categorize pitch type into fastball, breaking, or offspeed, (or pitchout)

#### Fastballs
FA = fastball, FF = four-seam fastball, FT = two-seam fastball, FC = fastball (cutter), 

FS / SI / SF = fastball (sinker, split-fingered)

#### Breaking
SL = slider, CB / CU = curveball, KC = knuckle-curve, SC = screwball

#### Offspeed
CH = changeup, KN = knuckleball, EP = eephus

#### Pitchout
PO / FO = pitch out



In [236]:
#create map for pitch type into categories:
pitch_type_map = {'FA':'fastball', 'FF':'fastball', 'FT':'fastball', 'FC':'fastball',
                  'FS':'fastball', 'SI':'fastball', 'SF':'fastball', 'SL':'breaking',
                  'CB':'breaking', 'CU':'breaking', 'SC':'breaking', 'KC':'breaking',
                  'CH':'offspeed', 'KN':'offspeed', 'EP':'offspeed', 'FO':'pitchout', 
                  'PO':'pitchout'}

#create pitch cateogory feature
df_17['pitch_cat'] = df_17['pitch_type']
df_17['pitch_cat'] = df_17['pitch_cat'].replace(pitch_type_map)

In [237]:
df_17['pitch_cat'].value_counts(dropna=False)

fastball    450415
breaking    195811
offspeed     74653
pitchout       364
Name: pitch_cat, dtype: int64