In [1]:
import pandas as pd
import numpy as np
import random
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [2]:
fname = 'pitches_2017.pkl'
df_17 = pd.read_pickle(fname, compression='zip')

#convert the pitch type for UN (unknown) to np.nan
df_17['pitch_type'] = df_17['pitch_type'].replace({'UN':np.nan})

In [3]:
def strikezone_wrangle(df):
    df = df.copy()
    
    #create swung column
    def swung(x):
        swung = ['foul','hit_into_play','swinging_strike','hit_into_play_no_out',
                 'hit_into_play_score','foul_tip','swinging_strike_blocked',
                 'foul_bunt','missed_bunt']
        return 1 if x in swung else 0
    df['batter_swung'] = df['description'].apply(swung)
    
    #initialize in_strikezone and chased features:
    df['in_strikezone'] = 1
    df['chased'] = 0
    
    #iterate thru each row
    for index, row in df.iterrows():
        #if ball is outside the strikezone, change the value for that row to 0
        if row.plate_z > row.sz_top or row.plate_z < row.sz_bot or row.plate_x < -0.73 or row.plate_x > 0.73:
            df.at[index, 'in_strikezone'] = 0
        #if batter_swung at ball outside the strike zone, change chased value to 1:
        if df.at[index, 'batter_swung'] == 1 and df.at[index, 'in_strikezone'] == 0:
            df.at[index, 'chased'] = 1
    return df

In [4]:
%%time
df_17 = strikezone_wrangle(df_17)

CPU times: user 4min 30s, sys: 7.57 s, total: 4min 38s
Wall time: 4min 53s


In [5]:
def gen_pitcher_percentages(df):
    df = df.copy()
    #get all the unique pitcher names in the df
    pitcher_list = df.player_name.unique().tolist()
    #initialize empty dictionary to store each pitcher and their pitches and percentages for each pitch
    pitcher_dict = {}
    #iterate over each pitcher:
    for pitcher in pitcher_list:
        #assign the normalized value_counts to a variable
        pitch_percentages = df[df.player_name == pitcher].pitch_type.value_counts(normalize=True)
        #convert that Series object to a dict and assign it as the value to the pitcher dictionary 
        #(pitcher name as key)
        pitcher_dict[pitcher] = pitch_percentages.to_dict()
    return pitcher_dict

def fill_pitch_type_nans(df):
    df = df.copy()
    #grab the rows where pitch_type is null:
    nulls = df[df.pitch_type.isna()]
    
    #generate the dictionary of pitch type % by pitcher
    pitcher_dict = gen_pitcher_percentages(df)
    
    #iterate over each null row
    for index, row in nulls.iterrows():
        #use the % for that pitcher for each pitch type he throws to generate a random pitch type with 
        #that % as weight
        pitch = random.choices(population=list(pitcher_dict[row.player_name].keys()), 
                               weights=list(pitcher_dict[row.player_name].values()), 
                               k=1)[0]
        #fill the NaN value with the randomly generated pitch
        df.at[index, 'pitch_type'] = pitch
    return df

In [6]:
df_17.pitch_type.isna().sum()

2215

In [7]:
%%time
df_17 = fill_pitch_type_nans(df_17)

CPU times: user 1min 17s, sys: 868 ms, total: 1min 18s
Wall time: 1min 18s


In [8]:
df_17.pitch_type.isna().sum()

0

In [9]:
df_17.pitch_type.value_counts()

FF    254009
SL    117639
FT     94618
CH     71751
CU     56735
SI     52410
FC     37632
KC     21385
FS     11780
KN      2694
FO       248
EP       187
PO       116
SC        39
Name: pitch_type, dtype: int64

## Categorize pitch type into fastball, breaking, or offspeed, (or pitchout)

#### Fastballs
FA = fastball, FF = four-seam fastball, FT = two-seam fastball, FC = fastball (cutter), 

FS / SI / SF = fastball (sinker, split-fingered)

#### Breaking
SL = slider, CB / CU = curveball, KC = knuckle-curve, SC = screwball

#### Offspeed
CH = changeup, KN = knuckleball, EP = eephus

#### Pitchout
PO / FO = pitch out



In [10]:
#create map for pitch type into categories:
pitch_type_map = {'FA':'fastball', 'FF':'fastball', 'FT':'fastball', 'FC':'fastball',
                  'FS':'fastball', 'SI':'fastball', 'SF':'fastball', 'SL':'breaking',
                  'CB':'breaking', 'CU':'breaking', 'SC':'breaking', 'KC':'breaking',
                  'CH':'offspeed', 'KN':'offspeed', 'EP':'offspeed', 'FO':'pitchout', 
                  'PO':'pitchout'}

#create pitch cateogory feature
df_17['pitch_cat'] = df_17['pitch_type']
df_17['pitch_cat'] = df_17['pitch_cat'].replace(pitch_type_map)

In [11]:
df_17['pitch_cat'].value_counts(dropna=False)

fastball    450449
breaking    195798
offspeed     74632
pitchout       364
Name: pitch_cat, dtype: int64

# Batter Scouting Report
## Batter features vs diff pitch type categories:

In [12]:
%%time
def make_batters_dict(df):
    df = df.copy()
    #make list of the unique batter ids
    batters = list(df['batter'].unique())
    #initialize empty dictionary to store the batter stats
    batters_dict = {}
    #set a break flag to False for error-checking
    brk = False
    #iterate thru each unique batter
    for batter in batters:
        if brk:
            break
        #make subset of the df for that batter and assign to variable batter_df    
        batter_df = df[df['batter'] == batter]
        
        #assign all pitch categories to list:
        all_pitch_cats = ['fastball', 'breaking', 'offspeed', 'pitchout']
        #assign the pitch categories to a list
        pitch_cats = batter_df['pitch_cat'].unique().tolist()
        #get the normalized value counts of pitches by category that batter has faced
        vc = batter_df.pitch_cat.value_counts(normalize=True)
        #initialize empty dict for each batter
        batter_dict = {}
        
        #if there are any pitch categories the batter has not faced, 
        unfaced_cats = list(set(all_pitch_cats) - set(pitch_cats))
        
        #assign NaNs to his dictionary for that category
        for cat in unfaced_cats:
            if cat == 'pitchout':
                batter_dict[cat + '_perc_faced'] = 0
            else:
                batter_dict[cat + '_perc_faced'] = np.nan
                batter_dict[cat + '_chase_perc'] = np.nan
                batter_dict[cat + '_bip_swung_perc'] = np.nan
                batter_dict[cat + '_taken_strike_perc'] = np.nan
                batter_dict[cat + '_est_woba'] = np.nan
                batter_dict[cat + '_babip'] = np.nan
                batter_dict[cat + '_iso_value'] = np.nan
        
        for cat in pitch_cats:
            if brk:
                break
        
            #assign the % of pitches faced by the batter for that category to his batter dict
            
            batter_dict[cat + '_perc_faced'] = vc[cat] * 100
        
            #continue out of the loop for pitchout category since ball in play stats are NaN
            if cat == 'pitchout':
                continue
        
            #grab subset of batter df for the pitch category
            cat_df = batter_df[batter_df['pitch_cat'] == cat]
        
            #calculate batters chase % for pitch type category on balls outside the strikezone
            out_of_strikezone = len(cat_df[cat_df['in_strikezone'] == 0]) #num of times ball was out of zone
            chased_count = len(cat_df[cat_df['chased'] == 1]) #num of times batter chased
            try:
                chase_perc = (chased_count / out_of_strikezone) * 100
            except ZeroDivisionError:
                chase_perc = np.nan
            #assign the chase perc to the batter dict
            batter_dict[cat + '_chase_perc'] = chase_perc
        
            #calc ball in play % for each swing for each pitch cat:
            ball_in_play_count = len(cat_df[cat_df['type'] == 'X']) #type X means ball hit into play
            swung_count = cat_df['batter_swung'].sum() #counts all the 1s in the swung column
            #assign the ball in play % per swing to the batter dict
            batter_dict[cat + '_bip_swung_perc'] = (ball_in_play_count / swung_count) * 100
        
            #calculate taken strike %
            taken_strike_count = len(cat_df[(cat_df['in_strikezone'] == 1) & (cat_df['batter_swung'] == 0)])
            pitches_in_zone_count = cat_df['in_strikezone'].sum() #counts the 1s in the in zone col
            #assign to batter_dict
            batter_dict[cat + '_taken_strike_perc'] = (taken_strike_count / pitches_in_zone_count) * 100
        
            #for each pitch type category, get the batters stats on balls hit in play
            stats = ['estimated_woba_using_speedangle', 'babip_value', 'iso_value']
            for stat in stats:
                #drop Nans from the stat column and assign to new subset, for each stat
                stat_cat_df = cat_df.dropna(subset=[stat])
                if stat == 'estimated_woba_using_speedangle':
                    #get the mean avg_est_woba
                    avg_est_woba = stat_cat_df['estimated_woba_using_speedangle'].mean()
                    #assign that value to the batters dictionary
                    batter_dict[cat + '_est_woba'] = avg_est_woba
                    if avg_est_woba == np.nan:
                        print(batter)
                        brk = True
                        break
                elif stat == 'babip_value':
                    avg_babip = stat_cat_df['babip_value'].mean()
                    batter_dict[cat + '_babip'] = avg_babip
                else:
                    avg_iso_value = stat_cat_df['iso_value'].mean()
                    batter_dict[cat + '_iso_value'] = avg_iso_value
            
        #assign the batter dictionary to the main dictionary of all batters
        batters_dict[batter] = batter_dict
    if not brk:
        print('iteration completed successfully')
    return batters_dict

batters_dict = make_batters_dict(df_17)



iteration completed successfully
CPU times: user 1min 22s, sys: 893 ms, total: 1min 23s
Wall time: 1min 24s


#### Example batter:

In [13]:
batter = list(batters_dict.keys())[0]

batters_dict[batter]

{'pitchout_perc_faced': 0,
 'fastball_perc_faced': 62.62068965517241,
 'fastball_chase_perc': 22.134387351778656,
 'fastball_bip_swung_perc': 35.54502369668246,
 'fastball_taken_strike_perc': 22.885572139303484,
 'fastball_est_woba': 0.38004000000000004,
 'fastball_babip': 0.08547008547008547,
 'fastball_iso_value': 0.1794871794871795,
 'offspeed_perc_faced': 12.551724137931034,
 'offspeed_chase_perc': 27.27272727272727,
 'offspeed_bip_swung_perc': 35.13513513513514,
 'offspeed_taken_strike_perc': 24.0,
 'offspeed_est_woba': 0.5106153846153847,
 'offspeed_babip': 0.10526315789473684,
 'offspeed_iso_value': 0.5263157894736842,
 'breaking_perc_faced': 24.82758620689655,
 'breaking_chase_perc': 23.423423423423422,
 'breaking_bip_swung_perc': 30.158730158730158,
 'breaking_taken_strike_perc': 46.3768115942029,
 'breaking_est_woba': 0.37794736842105264,
 'breaking_babip': 0.20588235294117646,
 'breaking_iso_value': 0.08823529411764706}

In [14]:
len(batters_dict)

957

In [17]:
batters_df = pd.DataFrame.from_dict(batters_dict, orient='index')
batters_df = batters_df.reset_index().rename(columns={'index':'batter'})
batters_df.head()

Unnamed: 0,batter,pitchout_perc_faced,fastball_perc_faced,fastball_chase_perc,fastball_bip_swung_perc,fastball_taken_strike_perc,fastball_est_woba,fastball_babip,fastball_iso_value,offspeed_perc_faced,offspeed_chase_perc,offspeed_bip_swung_perc,offspeed_taken_strike_perc,offspeed_est_woba,offspeed_babip,offspeed_iso_value,breaking_perc_faced,breaking_chase_perc,breaking_bip_swung_perc,breaking_taken_strike_perc,breaking_est_woba,breaking_babip,breaking_iso_value
0,112526.0,0.0,64.0625,47.368421,36.363636,40.909091,0.16325,0.0,0.0,1.5625,100.0,0.0,,,,,34.375,73.333333,14.285714,57.142857,0.0675,0.0,0.0
1,134181.0,0.0,61.026616,37.931034,41.317365,26.631854,0.397986,0.265385,0.223077,8.111534,19.047619,36.585366,43.181818,0.382133,0.166667,0.166667,30.86185,28.75817,36.734694,40.331492,0.337958,0.150943,0.122642
2,136860.0,0.0,62.292531,31.048951,40.957447,29.62963,0.363623,0.196141,0.180064,11.20332,45.695364,50.434783,29.230769,0.263638,0.203125,0.0625,26.504149,39.814815,33.606557,38.502674,0.309512,0.152672,0.083969
3,150029.0,0.230415,65.284178,21.929825,37.974684,45.177665,0.444667,0.189655,0.183908,10.061444,31.395349,38.181818,37.777778,0.208571,0.131579,0.052632,24.423963,29.100529,32.824427,41.085271,0.346605,0.116883,0.103896
4,276520.0,0.0,82.142857,31.25,54.285714,32.432432,0.316368,0.173913,0.0,1.190476,,0.0,0.0,,,,16.666667,54.545455,0.0,100.0,,0.0,0.0


In [18]:
batters_df.isna().sum()

batter                          0
pitchout_perc_faced             0
fastball_perc_faced             1
fastball_chase_perc            21
fastball_bip_swung_perc        20
fastball_taken_strike_perc      9
fastball_est_woba              67
fastball_babip                 27
fastball_iso_value             27
offspeed_perc_faced           189
offspeed_chase_perc           214
offspeed_bip_swung_perc       219
offspeed_taken_strike_perc    240
offspeed_est_woba             280
offspeed_babip                242
offspeed_iso_value            242
breaking_perc_faced            73
breaking_chase_perc           112
breaking_bip_swung_perc       113
breaking_taken_strike_perc    119
breaking_est_woba             211
breaking_babip                125
breaking_iso_value            125
dtype: int64

## Next steps:
-merge batters_df, one to many, with the main df (df_17 in this case)
    - join on df['batter'] == batters_df['batter']