In [72]:
import pandas as pd
import numpy as np
import random
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [73]:
cd ..

/home/jm/Desktop/lambda_projects/pitch_predict


In [74]:
cd data

/home/jm/Desktop/lambda_projects/pitch_predict/data


In [75]:
fname = 'pitches_2017.pkl'
df_17 = pd.read_pickle(fname, compression='zip')

#convert the pitch type for UN (unknown) to np.nan
df_17['pitch_type'] = df_17['pitch_type'].replace({'UN':np.nan})

In [76]:
def strikezone_wrangle(df):
    df = df.copy()
    
    #create swung column
    def swung(x):
        swung = ['foul','hit_into_play','swinging_strike','hit_into_play_no_out',
                 'hit_into_play_score','foul_tip','swinging_strike_blocked',
                 'foul_bunt','missed_bunt']
        return 1 if x in swung else 0
    df['batter_swung'] = df['description'].apply(swung)
    
    #initialize in_strikezone and chased features:
    df['in_strikezone'] = 1
    df['chased'] = 0
    
    #iterate thru each row
    for index, row in df.iterrows():
        #if ball is outside the strikezone, change the value for that row to 0
        if row.plate_z > row.sz_top or row.plate_z < row.sz_bot or row.plate_x < -0.73 or row.plate_x > 0.73:
            df.at[index, 'in_strikezone'] = 0
        #if batter_swung at ball outside the strike zone, change chased value to 1:
        if df.at[index, 'batter_swung'] == 1 and df.at[index, 'in_strikezone'] == 0:
            df.at[index, 'chased'] = 1
    return df

In [77]:
%%time
df_17 = strikezone_wrangle(df_17)

CPU times: user 2min 19s, sys: 2.92 s, total: 2min 22s
Wall time: 2min 25s


In [78]:
def gen_pitcher_percentages(df):
    df = df.copy()
    #get all the unique pitcher names in the df
    pitcher_list = df.player_name.unique().tolist()
    #initialize empty dictionary to store each pitcher and their pitches and percentages for each pitch
    pitcher_dict = {}
    #iterate over each pitcher:
    for pitcher in pitcher_list:
        #assign the normalized value_counts to a variable
        pitch_percentages = df[df.player_name == pitcher].pitch_type.value_counts(normalize=True)
        #convert that Series object to a dict and assign it as the value to the pitcher dictionary 
        #(pitcher name as key)
        pitcher_dict[pitcher] = pitch_percentages.to_dict()
    return pitcher_dict

def fill_pitch_type_nans(df):
    df = df.copy()
    #grab the rows where pitch_type is null:
    nulls = df[df.pitch_type.isna()]
    
    #generate the dictionary of pitch type % by pitcher
    pitcher_dict = gen_pitcher_percentages(df)
    
    #iterate over each null row
    for index, row in nulls.iterrows():
        #use the % for that pitcher for each pitch type he throws to generate a random pitch type with 
        #that % as weight
        pitch = random.choices(population=list(pitcher_dict[row.player_name].keys()), 
                               weights=list(pitcher_dict[row.player_name].values()), 
                               k=1)[0]
        #fill the NaN value with the randomly generated pitch
        df.at[index, 'pitch_type'] = pitch
    return df

In [79]:
df_17.pitch_type.isna().sum()

2215

In [80]:
%%time
df_17 = fill_pitch_type_nans(df_17)

CPU times: user 43.2 s, sys: 440 ms, total: 43.7 s
Wall time: 44.1 s


In [81]:
df_17.pitch_type.isna().sum()

0

In [82]:
df_17.pitch_type.value_counts()

FF    253995
SL    117637
FT     94614
CH     71753
CU     56762
SI     52405
FC     37617
KC     21404
FS     11778
KN      2688
FO       248
EP       187
PO       116
SC        39
Name: pitch_type, dtype: int64

## Categorize pitch type into fastball, breaking, or offspeed, (or pitchout)

#### Fastballs
FA = fastball, FF = four-seam fastball, FT = two-seam fastball, FC = fastball (cutter), 

FS / SI / SF = fastball (sinker, split-fingered)

#### Breaking
SL = slider, CB / CU = curveball, KC = knuckle-curve, SC = screwball

#### Offspeed
CH = changeup, KN = knuckleball, EP = eephus

#### Pitchout
PO / FO = pitch out



In [83]:
#create map for pitch type into categories:
pitch_type_map = {'FA':'fastball', 'FF':'fastball', 'FT':'fastball', 'FC':'fastball',
                  'FS':'fastball', 'SI':'fastball', 'SF':'fastball', 'SL':'breaking',
                  'CB':'breaking', 'CU':'breaking', 'SC':'breaking', 'KC':'breaking',
                  'CH':'offspeed', 'KN':'offspeed', 'EP':'offspeed', 'FO':'pitchout', 
                  'PO':'pitchout'}

#create pitch cateogory feature
df_17['pitch_cat'] = df_17['pitch_type']
df_17['pitch_cat'] = df_17['pitch_cat'].replace(pitch_type_map)

In [84]:
df_17['pitch_cat'].value_counts(dropna=False)

fastball    450409
breaking    195842
offspeed     74628
pitchout       364
Name: pitch_cat, dtype: int64

# Batter Scouting Report
## Batter features vs diff pitch type categories:

In [85]:
batters = list(df_17.batter.unique())

In [86]:
len(batters)

957

In [87]:
%%time
batters_dict = {}
brk = False
for batter in batters:
    if brk:
        break
    batter_df = df_17[df_17['batter'] == batter]
    pitch_cats = batter_df['pitch_cat'].value_counts().index.tolist()
    vc = batter_df.pitch_cat.value_counts(normalize=True)
    batter_dict = {}
    
    for cat in pitch_cats:
        if brk:
            break
        batter_dict[cat] = {}
        
        #get the total % of pitches the batter faced for that pitch category
        batter_dict[cat]['perc_faced'] = vc[cat] * 100
        
        #grab subset of batter df for the pitch category
        cat_df = batter_df[batter_df['pitch_cat'] == cat]
        
        #calculate batters chase % for pitch type category on balls outside the strikezone
        out_of_strikezone = len(cat_df[cat_df['in_strikezone'] == 0]) #num of times ball was out of zone
        chased_count = len(cat_df[cat_df['chased'] == 1]) #num of times batter chased
        try:
            chase_perc = chased_count / out_of_strikezone
        except:
            chase_perc = np.nan
            
        #assign the chase perc to the batter dict
        batter_dict[cat]['chase_perc'] = chase_perc * 100
        
        stats = ['estimated_woba_using_speedangle', 'babip_value', 'iso_value']
        for stat in stats:
            stat_cat_df = cat_df.dropna(subset=[stat])
            if stat == 'estimated_woba_using_speedangle':
                avg_est_woba = stat_cat_df['estimated_woba_using_speedangle'].mean()
                batter_dict[cat]['est_woba'] = avg_est_woba
                if avg_est_woba == np.nan:
                    print(batter)
                    brk = True
                    break
            elif stat == 'babip_value':
                avg_babip = stat_cat_df['babip_value'].mean()
                batter_dict[cat]['babip'] = avg_babip
            else:
                avg_iso_value = stat_cat_df['iso_value'].mean()
                batter_dict[cat]['iso_value'] = avg_iso_value
    batters_dict[batter] = batter_dict

CPU times: user 35.2 s, sys: 196 ms, total: 35.3 s
Wall time: 35.4 s


In [88]:
#batter example:
ex_batter = batters[0]

batters_dict[ex_batter]

{'fastball': {'perc_faced': 62.62068965517241,
  'chase_perc': 22.134387351778656,
  'est_woba': 0.37744594594594594,
  'babip': 0.07758620689655173,
  'iso_value': 0.1724137931034483},
 'breaking': {'perc_faced': 24.82758620689655,
  'chase_perc': 23.423423423423422,
  'est_woba': 0.38765,
  'babip': 0.22857142857142856,
  'iso_value': 0.11428571428571428},
 'offspeed': {'perc_faced': 12.551724137931034,
  'chase_perc': 27.27272727272727,
  'est_woba': 0.5106153846153847,
  'babip': 0.10526315789473684,
  'iso_value': 0.5263157894736842}}

In [89]:
len(batters_dict)

957