# Import

## Install Packages 

## Import Packages 

In [None]:
import pandas as pd
import numpy as np
import random 

## Import Data

In [None]:
# Pickle file names, one per season: '2010.pkl' through '2018.pkl'.
filenames = [f'{year}.pkl' for year in range(2010, 2019)]

# Matching dictionary keys for the loaded seasons: 'df_10' through 'df_18'.
seasons = [f'df_{short_year}' for short_year in range(10, 19)]

In [None]:
# Load every season's zip-compressed pickle into a dict keyed by its 'df_YY' label.
season_dataframes = {}

for filename, season in zip(filenames, seasons):
    season_dataframes[season] = pd.read_pickle(
        "Season_pickles/" + filename, compression='zip')

## Concatenate Data

In [None]:
# Stack all loaded seasons row-wise into a single pitch-level dataframe.
pitches = pd.concat(list(season_dataframes.values()))

# Clean

## All Instances

**Issue**: There are some instances where no data is recorded

**Solution**: Drop these instances from the data

In [None]:
# Remove rows in which every single column is missing (row-wise is the default axis).
pitches = pitches.dropna(how='all')



---



## Pitch Type

**Feature Name**: `pitch_type`

**Feature Description**: The type of pitch derived from Statcast.

**Issue**: Feature is supposed to contain a 2 character string, but many values (265) are filled with long strings of numerical characters. Example: 160421_181540

**Solution**: Replace values longer than 2 characters in length with NaN

In [None]:
# Keep only values whose string form is at most 2 characters long; anything
# longer (e.g. '160421_181540') becomes NaN.  Values that are already missing
# stay missing: their string form ('nan' / 'None') is longer than 2 characters.
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.0,
# and a column-wise mask replaces the much slower row-wise `DataFrame.apply`.
_valid_pitch_code = pitches['pitch_type'].astype(str).str.len() <= 2
pitches['pitch_type'] = pitches['pitch_type'].where(_valid_pitch_code, np.nan)

**Issue**: Many values of this feature are recorded as 'UN'

**Solution**: Replace value with np.NaN

In [None]:
# 'UN' entries carry no pitch information, so treat them as missing.
pitches['pitch_type'] = pitches['pitch_type'].replace('UN', np.nan)

**Issue**: The pitch type feature is filled with NaN values

**Solution**: We will create a mapping from each pitcher's ID to his normalized pitch counts. Using these normalized values as weights, we will select a random pitch type and fill the NaN value for that pitcher. We will use df.apply, but this could be time-optimized by using series vectorization. 

In [None]:
# Build the pitcher -> {pitch type: weight} lookup

# Every distinct pitcher ID found in the data
pitcher_list = pitches['pitcher'].unique().tolist()

# For each pitcher, the share of his recorded (non-null) pitches per pitch type
pitcher_dict = {
    pitcher_id: pitches.loc[pitches['pitcher'] == pitcher_id, 'pitch_type']
                       .value_counts(normalize=True)
                       .to_dict()
    for pitcher_id in pitcher_list
}

# Round-trip through a DataFrame so every pitcher has an entry for every pitch
# type; types a pitcher never threw get weight 0 instead of being absent
pitcher_dict = pd.DataFrame(pitcher_dict).fillna(0).to_dict()

In [None]:
# Select replacement pitch type and fill NaN values

def pick_a_pitch(pitcher_id):
    """
    Return a random pitch type label for the given pitcher.

    The label is drawn from the keys of ``pitcher_dict[pitcher_id]`` using the
    associated values (the pitcher's prior pitch type proportions) as weights.

    When no weight is positive (no pitch type was ever recorded for this
    pitcher) there is nothing to sample from, so NaN is returned instead of
    letting ``random.choices`` fail on an empty or all-zero weight vector.

    Raises KeyError if ``pitcher_id`` is not a key of ``pitcher_dict``.
    """
    tendencies = pitcher_dict[pitcher_id]
    population = list(tendencies)
    weights = list(tendencies.values())

    if not any(weight > 0 for weight in weights):
        return np.nan

    return random.choices(population, weights, k=1)[0]

# Row-wise fill: keep known pitch types, sample one for rows where it is null
def _fill_missing_pitch(row):
    if pd.isnull(row['pitch_type']):
        return pick_a_pitch(row['pitcher'])
    return row['pitch_type']


pitches['pitch_type'] = pitches.apply(_fill_missing_pitch, axis=1)

In [None]:
# Raw two-letter pitch codes grouped by the broad category they are mapped to
_codes_by_category = {
    'fastball': ['FA', 'FF', 'FT', 'FC', 'FS', 'SI', 'SF'],
    'breaking': ['SL', 'CB', 'CU', 'SC', 'KC', 'FO'],
    'offspeed': ['CH', 'KN', 'EP'],
    'pitchout': ['PO', 'IN'],
}

# Flatten to the code -> category lookup used by Series.map
pitch_type_map = {code: category
                  for category, codes in _codes_by_category.items()
                  for code in codes}

# Keep the raw code as the subtype, then collapse pitch_type to its category
# (codes missing from the map become NaN)
pitches['pitch_subtype'] = pitches['pitch_type']
pitches['pitch_type'] = pitches['pitch_subtype'].map(pitch_type_map)



---



## Count

**Feature**: Count ratio

**Description**: The ratio of balls and strikes for the current at bat

**Issue**: There are two existing features related to the count. We need to represent the count as a categorical feature.

**Solution**: Classify the pitcher's position regarding the count (Ahead, Behind, Neutral)

In [None]:
# Ball counts of 4 or 5 are recording errors; cap them at 3.
pitches['balls'] = pitches['balls'].replace([4, 5], 3)

In [None]:
# Two-character count code: the balls digit followed by the strikes digit (e.g. '32')
balls_digit = pitches['balls'].astype('int').astype('str')
strikes_digit = pitches['strikes'].astype('int').astype('str')
pitches['count_status'] = balls_digit + strikes_digit

In [None]:
# Count codes ('<balls><strikes>') grouped by the pitcher's standing in the count
_counts_by_status = {
    'neutral': ['00', '21', '32'],
    'behind': ['10', '20', '30', '31'],
    'ahead': ['01', '02', '11', '12', '22'],
}

# Flatten to the code -> status lookup used by Series.map
count_status_mapping = {count: status
                        for status, counts in _counts_by_status.items()
                        for count in counts}

# Replace each count code with its status (codes not listed become NaN)
pitches['count_status'] = pitches['count_status'].map(count_status_mapping)



---



## Score Differential

**Feature**: Score Differential

**Description**: The absolute value of the difference in home team score and away team score

In [None]:
# Size of the lead, regardless of which team holds it.
pitches['score_differential'] = (pitches['home_score'] - pitches['away_score']).abs()



---



## Bases Loaded

**Feature**: Bases Loaded

**Description**: A binary indication of the bases being loaded or not

In [None]:
# Turn each base column into a 0/1 occupancy flag: any recorded value becomes 1
# (value * 0 + 1) and a missing value becomes 0.
for base in ('on_1b', 'on_2b', 'on_3b'):
    pitches[base] = (pitches[base] * 0 + 1).fillna(0)

# The bases are loaded only when all three occupancy flags are set
occupied_bases = pitches['on_1b'] + pitches['on_2b'] + pitches['on_3b']
pitches['bases_loaded'] = (occupied_bases == 3).astype('int64')



---



## Swung

**Feature**: swung

**Description**: Binary feature describing whether or not the batter swung at the pitch

In [None]:
# Pitch descriptions that count as the batter having swung
swung = [
    'foul',
    'hit_into_play',
    'swinging_strike',
    'hit_into_play_no_out',
    'hit_into_play_score',
    'foul_tip',
    'swinging_strike_blocked',
    'foul_bunt',
    'missed_bunt',
]

# 1 when the description is one of the swing outcomes, 0 otherwise (including NaN)
pitches['batter_swung'] = pitches['description'].isin(swung).astype('int64')

In [None]:
# Flag pitches that miss the zone: above sz_top, below sz_bot, or with plate_x
# further than 0.73 from zero on either side.  Missing values compare as False.
pitches['ball_high'] = pitches['plate_z'].gt(pitches['sz_top'])
pitches['ball_low'] = pitches['plate_z'].lt(pitches['sz_bot'])
pitches['ball_left'] = pitches['plate_x'] < -0.73
pitches['ball_right'] = pitches['plate_x'] > 0.73

In [None]:
# Count how many of the four "missed the zone" flags are set for each pitch
zone_misses = (pitches[['ball_high', 'ball_low', 'ball_left', 'ball_right']]
               .astype(int)
               .sum(axis=1))

# A pitch is in the strike zone (1) only when none of the flags is set
pitches['in_strikezone'] = (zone_misses == 0).astype('int64')

In [None]:
# swung (1) minus in-zone (0) equals 1 only for swings at pitches outside the zone
swung_outside_zone = (pitches['batter_swung'] - pitches['in_strikezone']) == 1
pitches['chased'] = swung_outside_zone.astype('int64')

## Batters Data

In [None]:
# Use the first batter ID that appears in the data as a worked example
sample_batter = pitches['batter'].unique()[0]
sample_batter

In [None]:
# All pitches thrown to the sample batter
batter_df = pitches.loc[pitches['batter'] == sample_batter]
batter_df.head()

In [None]:
# Share of the sample batter's pitches falling in each pitch type
# (the denominator is all of his rows, including any with a null pitch type)
next_probs = batter_df.groupby('pitch_type').size() / len(batter_df)
next_probs

In [None]:
# Same pitch-type shares as a plain dict, normalized over non-null pitch types only
batter_df['pitch_type'].value_counts(normalize = True).to_dict()

In [None]:
# Joint share of each (pitch_type, chased) pair among the sample batter's pitches,
# divided by the pitch-type share: the chased / not-chased split within each type
joint_probs = batter_df.groupby(['pitch_type', 'chased']).size().div(len(batter_df))
pd.DataFrame(joint_probs.div(next_probs, axis=0, level='pitch_type'))

In [None]:
# Share (in %) of each pitch type among the pitches the sample batter has faced.
batter_dict = {}

# Skip NaN (unmapped) pitch types: NaN cannot be concatenated into a key name.
pitch_types = pitches['pitch_type'].dropna().unique().tolist()
pitch_type_percentages = batter_df['pitch_type'].value_counts(normalize=True)
for pitch_type in pitch_types:
    # A pitch type present in the full data but never thrown to this batter has
    # no entry in the value counts, so report it as 0 % instead of raising.
    batter_dict[pitch_type + '_perc_faced'] = pitch_type_percentages.get(pitch_type, 0) * 100

In [None]:
# Show the sample batter's stats collected so far
batter_dict

In [None]:
for pitch_type in pitch_types:
    # Sample batter's pitches of this type only
    cat_df = batter_df.loc[batter_df['pitch_type'] == pitch_type]

    # Chase %: swings at pitches outside the zone per pitch outside the zone
    in_zone_total = cat_df['in_strikezone'].sum()
    out_of_strikezone = len(cat_df) - in_zone_total
    chased_count = cat_df['chased'].sum()
    chase_perc = (chased_count / out_of_strikezone) * 100
    batter_dict[pitch_type + '_chase_perc'] = chase_perc

    # Ball-in-play % per swing (type 'X' rows are the ones counted as in play)
    put_in_play = cat_df['type'] == 'X'
    ball_in_play_count = len(cat_df[put_in_play])
    swung_count = cat_df['batter_swung'].sum()
    batter_dict[pitch_type + '_bip_swung_perc'] = (ball_in_play_count / swung_count) * 100

In [None]:
# Show the sample batter's stats after adding the chase and ball-in-play percentages
batter_dict

In [None]:
for pitch_type in pitch_types:
    # Rebuild the subset for this pitch type on every iteration; reusing the
    # `cat_df` left over from an earlier cell would assign the same value to
    # every pitch type.
    cat_df = batter_df[batter_df['pitch_type'] == pitch_type]
    ball_in_play_count = len(cat_df[cat_df['type'] == 'X'])
    swung_count = cat_df['batter_swung'].sum()
    batter_dict[pitch_type + '_bip_swung_perc'] = (ball_in_play_count / swung_count) * 100

In [None]:
 # calc ball in play % for each swing for each pitch cat
 # (the original fragment was indented as if inside a loop and used an undefined
 # `cat`; it is written here as a complete loop so the cell can run on its own)
for cat in pitch_types:
    cat_df = batter_df[batter_df['pitch_type'] == cat]
    ball_in_play_count = len(cat_df[cat_df['type'] == 'X'])  # type X means ball hit into play
    swung_count = cat_df['batter_swung'].sum()  # counts all the 1s in the swung column
    # assign the ball in play % per swing to the batter dict
    batter_dict[cat + '_bip_swung_perc'] = (ball_in_play_count / swung_count) * 100
        

In [None]:
def _safe_percentage(numerator, denominator):
    """Return ``numerator / denominator * 100``, or NaN for a zero denominator."""
    if denominator == 0:
        return np.nan
    return (numerator / denominator) * 100


def make_batters_df(prior_df):
    """
    Build a per-batter scouting report from pitch-level data.

    Parameters
    ----------
    prior_df : DataFrame
        Pitch-level rows. The columns read are 'batter', 'pitch_type',
        'in_strikezone', 'chased', 'type', 'batter_swung',
        'estimated_woba_using_speedangle', 'babip_value' and 'iso_value'.
        The input is copied and never modified.

    Returns
    -------
    DataFrame
        One row per distinct value of 'batter', with a 'batter' column and,
        for each pitch category ``<cat>``:

        * ``<cat>_perc_faced``        - % of the batter's non-null pitches
        * ``<cat>_chase_perc``        - chased pitches per out-of-zone pitch, in %
        * ``<cat>_bip_swung_perc``    - type 'X' pitches per swing, in %
        * ``<cat>_taken_strike_perc`` - unswung in-zone pitches per in-zone pitch, in %
        * ``<cat>_est_woba`` / ``<cat>_babip`` / ``<cat>_iso_value`` - column means

        For 'pitchout' only ``pitchout_perc_faced`` is produced (0 when never
        faced). A category the batter never faced, or faced fewer than 100
        times, gets NaN for all seven values. Any ratio with a zero
        denominator is NaN.

    Rows whose 'pitch_type' is null are ignored when building categories; they
    cannot be turned into column names.
    """
    df = prior_df.copy()

    all_pitch_cats = ['fastball', 'breaking', 'offspeed', 'pitchout']
    # (source column, suffix of the resulting feature) for the averaged stats
    in_play_stats = [
        ('estimated_woba_using_speedangle', '_est_woba'),
        ('babip_value', '_babip'),
        ('iso_value', '_iso_value'),
    ]
    # every per-category feature, used to blank out unfaced / small-sample categories
    nan_suffixes = ['_perc_faced', '_chase_perc', '_bip_swung_perc',
                    '_taken_strike_perc', '_est_woba', '_babip', '_iso_value']

    # batter id -> {feature name: value}
    batters_dict = {}

    for batter in df['batter'].unique():
        batter_df = df[df['batter'] == batter]

        # categories this batter has actually faced (null pitch types excluded)
        pitch_cats = batter_df['pitch_type'].dropna().unique().tolist()
        # normalized value counts of pitches by category (nulls excluded)
        vc = batter_df['pitch_type'].value_counts(normalize=True)

        batter_dict = {}
        unfaced_cats = list(set(all_pitch_cats) - set(pitch_cats))

        for cat in pitch_cats:
            batter_dict[cat + '_perc_faced'] = vc[cat] * 100

            # only the share faced is tracked for pitchouts
            if cat == 'pitchout':
                continue

            cat_df = batter_df[batter_df['pitch_type'] == cat]

            # fewer than 100 pitches is treated as too small a sample:
            # handle the category like an unfaced one (all NaN)
            if len(cat_df) < 100:
                unfaced_cats.append(cat)
                continue

            pitches_in_zone_count = cat_df['in_strikezone'].sum()
            out_of_strikezone = len(cat_df) - pitches_in_zone_count

            batter_dict[cat + '_chase_perc'] = _safe_percentage(
                cat_df['chased'].sum(), out_of_strikezone)

            # type 'X' rows per swing
            ball_in_play_count = (cat_df['type'] == 'X').sum()
            batter_dict[cat + '_bip_swung_perc'] = _safe_percentage(
                ball_in_play_count, cat_df['batter_swung'].sum())

            # in-zone pitches the batter did not swing at
            taken_strike_count = ((cat_df['in_strikezone'] == 1)
                                  & (cat_df['batter_swung'] == 0)).sum()
            batter_dict[cat + '_taken_strike_perc'] = _safe_percentage(
                taken_strike_count, pitches_in_zone_count)

            # Series.mean skips NaN, so rows without a recorded stat are ignored;
            # the mean is NaN when no row has a value
            for column, suffix in in_play_stats:
                batter_dict[cat + suffix] = cat_df[column].mean()

        # unfaced or small-sample categories: fill every feature with NaN
        for cat in unfaced_cats:
            if cat == 'pitchout':
                batter_dict[cat + '_perc_faced'] = 0
            else:
                for suffix in nan_suffixes:
                    batter_dict[cat + suffix] = np.nan

        batters_dict[batter] = batter_dict

    print('iteration completed successfully')

    batters_df = pd.DataFrame.from_dict(batters_dict, orient='index')
    batters_df = batters_df.reset_index().rename(columns={'index': 'batter'})
    return batters_df

In [None]:
batters_df = make_batters_df(pitches)
batters_df.head()

In [None]:
def downcast_dtypes(df):
    """
    Return a copy of ``df`` with smaller dtypes.

    * 'int' columns are downcast with ``pd.to_numeric(..., downcast='unsigned')``.
    * 'float' columns are downcast with ``pd.to_numeric(..., downcast='float')``.
    * 'object' columns (except 'pitch_type') whose number of unique values is
      below half the row count become 'category'.

    Column order changes: untouched columns come first, followed by the
    integer, float and categorical columns in that order.
    """
    frame = df.copy()

    int_cols = list(frame.select_dtypes('int').columns)
    float_cols = list(frame.select_dtypes('float').columns)

    # object columns with few distinct values are worth storing as categories
    half_row_count = len(frame) / 2
    cat_cols = [
        col for col in frame.select_dtypes('object').columns
        if col != 'pitch_type' and len(frame[col].unique()) < half_row_count
    ]

    ints = frame[int_cols].apply(pd.to_numeric, downcast='unsigned')
    floats = frame[float_cols].apply(pd.to_numeric, downcast='float')
    cats = frame[cat_cols].astype('category')

    untouched = frame.drop(columns=int_cols + float_cols + cat_cols)
    return pd.concat([untouched, ints, floats, cats], axis=1)

In [None]:
def pre_process_step1(combined):
    """
    First pre-processing pass over the combined pitch data.

    Works on a copy of ``combined``: cleans 'pitch_type' and 'balls', then
    runs the feature-building helpers in order and returns the result. The
    helpers are defined outside this cell; the per-step comments below
    describe what each is expected to add.
    """
    df = combined.copy()
    #convert the pitch type for UN (unknown) to np.nan
    df['pitch_type'] = df['pitch_type'].replace({'UN':np.nan})
    #fix faulty data that has the number of balls listed as 4 or 5
    #(5 is handled too, matching the notebook's own 'balls' cleaning cell)
    df['balls'] = df['balls'].replace({4.0: 3.0, 5.0: 3.0})

    #count, count_cat, score_diff, on_base 1/0, bases_loaded
    df = make_game_features(df)

    #batter_swung, in_strikezone, chased
    df = make_strikezone_swung_and_chase_features(df)

    #get aggregate pitcher %s dict from prior data:
    pitcher_dict = gen_pitcher_percentages(df)

    #fill the NaNs for pitch_type using randomized guess from pitcher tendencies
    df = fill_pitch_type_nans(df, pitcher_dict)

    #pitch_type category feature
    df = make_pitch_type_cat(df)

    return df

# Update the data one period at a time, using only aggregates from earlier games:
def pre_process_step2(pre_processed_step1, start_dates, end_dates):
    """
    Attach a batter scouting report to each period of the data.

    For period ``i`` the rows with ``start_dates[i] <= game_date <= end_dates[i]``
    are left-merged on 'batter' with ``make_batters_df`` computed from all rows
    whose game_date is earlier than ``start_dates[i]``. The per-period results
    are concatenated in period order and returned.
    """
    df = pre_processed_step1.copy()
    game_dates = df['game_date']

    # one merged frame per period, concatenated at the end
    period_frames = []

    for i, period_start in enumerate(start_dates):
        period_end = end_dates[i]

        # rows strictly before the period vs. rows inside it
        prior_df = df[game_dates < period_start]
        current_df = df[(game_dates >= period_start) & (game_dates <= period_end)]

        # scouting report built from the prior rows only
        scouting_report = make_batters_df(prior_df)
        period_frames.append(
            pd.merge(current_df, scouting_report, how='left', on='batter'))

    return pd.concat(period_frames, sort=False)

In [None]:
def get_pitch_tendencies(pitcher_df):
    """
    Summarize how often each pitch type appears in ``pitcher_df``.

    Returns a tuple ``(overall, by_count)``:

    * ``overall``  - {pitch_type: share} over all rows
    * ``by_count`` - {count_cat: {pitch_type: share}} computed separately for
      each distinct value of the 'count_cat' column
    """
    def pitch_shares(frame):
        # normalized value counts of pitch types as a plain dict
        return frame['pitch_type'].value_counts(normalize=True).to_dict()

    count_cats = pitcher_df['count_cat'].unique().tolist()
    by_count = {
        count_cat: pitch_shares(pitcher_df[pitcher_df['count_cat'] == count_cat])
        for count_cat in count_cats
    }
    return pitch_shares(pitcher_df), by_count

In [None]:
def make_tendency_features(pitcher_df, pitcher_tendencies_overall, pitcher_tendencies_by_count):
    """
    Add pitch tendency columns to a copy of ``pitcher_df``.

    For every pitch type in ``pitcher_tendencies_overall`` two columns are added:

    * ``overall_<pitch_type>_perc``   - the overall share, identical on every row
    * ``count_cat_<pitch_type>_perc`` - the share for the row's 'count_cat',
      or 0 when that count category / pitch type pair has no entry
    """
    df = pitcher_df.copy()

    for pitch_type in pitcher_tendencies_overall.keys():
        overall_feature = 'overall_' + pitch_type + '_perc'
        count_cat_feature = 'count_cat_' + pitch_type + '_perc'

        def by_count_share(count_cat):
            # missing count category or missing pitch type both fall back to 0
            return pitcher_tendencies_by_count.get(count_cat, {}).get(pitch_type, 0)

        # fill the column with the pitch type label, then swap in its overall share
        df[overall_feature] = pitch_type
        df[overall_feature] = df[overall_feature].apply(
            lambda label: pitcher_tendencies_overall[label])
        df[count_cat_feature] = df['count_cat'].apply(by_count_share)

    return df

In [None]:
# First day of each evaluation period (2018 season, then 2019 through August)
start_dates = [
    '2018-03-29',
    '2018-05-01',
    '2018-06-01',
    '2018-07-01',
    '2018-08-01',
    '2018-09-01',
    '2019-03-28',
    '2019-05-01',
    '2019-06-01',
    '2019-07-01',
    '2019-08-01',
]

# Last day (inclusive) of the period starting at the same position in start_dates
end_dates = [
    '2018-04-30',
    '2018-05-31',
    '2018-06-30',
    '2018-07-31',
    '2018-08-31',
    '2018-10-01',
    '2019-04-30',
    '2019-05-31',
    '2019-06-30',
    '2019-07-31',
    '2019-08-31',
]

  
def add_pitcher_scouting_report(pitcher_df, pitcher_df17, start_dates, end_dates):
    """
    Add pitch tendency features to each period of a pitcher's data.

    The two input frames are concatenated. For period ``i`` the rows with
    ``start_dates[i] <= game_date <= end_dates[i]`` receive the features from
    ``make_tendency_features``, using tendencies from ``get_pitch_tendencies``
    computed on all rows dated before ``start_dates[i]``. The per-period frames
    are concatenated in period order and returned.
    """
    combined = pd.concat([pitcher_df, pitcher_df17], sort=False)
    game_dates = combined['game_date']

    # one feature frame per period, concatenated at the end
    period_frames = []

    for i, period_start in enumerate(start_dates):
        period_end = end_dates[i]

        # rows strictly before the period vs. rows inside it
        prior_df = combined[game_dates < period_start]
        current_df = combined[(game_dates >= period_start) & (game_dates <= period_end)]

        # tendencies come from the prior rows, features go on the current rows
        overall, by_count = get_pitch_tendencies(prior_df)
        period_frames.append(make_tendency_features(current_df, overall, by_count))

    return pd.concat(period_frames, sort=False)

In [None]:
def make_game_batting_order(game_df):
    """
    Add batting order features to the pitches of a single game.

    Returns a copy of ``game_df`` sorted by at-bat and pitch number with:

    * 'at_bat_number' renumbered to run sequentially from 1
    * 'batting_order_slot' - 1-9 for the batters seen in the first nine
      at-bats (in order of appearance), 'PH' for any batter first seen later,
      and 'pitcher' for any batter whose id is in the module-level
      ``pitcher_list``
    * 'pitcher_AB' - True when the batter id is in ``pitcher_list``

    Returns None when a batter id cannot be looked up in the slot mapping.
    """
    ordered = game_df.sort_values(by=['at_bat_number', 'pitch_number'])
    every_batter = ordered['batter'].unique().tolist()

    # renumber the at-bats 1, 2, 3, ... in their sorted order
    original_numbers = ordered['at_bat_number'].unique().tolist()
    renumbering = {old: new for new, old in enumerate(original_numbers, start=1)}
    ordered['at_bat_number'] = ordered['at_bat_number'].replace(renumbering)

    # batters appearing in the first nine at-bats get slots 1-9 by appearance
    leadoff_batters = ordered.loc[ordered['at_bat_number'] < 10, 'batter'].unique().tolist()
    slot_by_batter = dict(zip(leadoff_batters, range(1, 10)))

    # everyone else who bats later in the game is labelled 'PH' (pinch hitter)
    for batter in set(every_batter) - set(leadoff_batters):
        slot_by_batter[batter] = 'PH'

    try:
        ordered['batting_order_slot'] = ordered['batter'].apply(
            lambda batter_id: slot_by_batter[batter_id])
    except KeyError:
        return None

    # batters that are themselves pitchers override their numeric / 'PH' slot
    ordered['pitcher_AB'] = ordered['batter'].isin(pitcher_list)
    ordered['batting_order_slot'] = ordered['batting_order_slot'].where(
        ~ordered['pitcher_AB'], other='pitcher')
    return ordered

In [None]:
def _add_pitchcount_and_trailing_features(game_df, pitcher_tendencies_overall,
                                          overall_strike_perc, missing_zone_fill):
    """Add pitch count, previous-pitch (L1_*) and trailing-5 (L5_*) columns to one game, in place."""
    game_df['pitch_count'] = range(1, game_df.shape[0] + 1)

    # previous pitch's type, result and zone
    game_df['L1_pitch_type'] = game_df['pitch_type'].shift(periods=1)
    game_df['L1_pitch_result'] = game_df['type'].shift(periods=1)
    game_df['L1_pitch_result'] = game_df['L1_pitch_result'].replace({np.nan: 'first pitch'})
    game_df['L1_pitch_zone'] = game_df['zone'].shift(periods=1)
    game_df['L1_pitch_zone'] = game_df['L1_pitch_zone'].fillna(missing_zone_fill)

    known_pitches = list(pitcher_tendencies_overall.keys())
    known_weights = list(pitcher_tendencies_overall.values())

    for index, row in game_df.iterrows():
        current_pitch = row['pitch_count']

        # the first pitch has no predecessor: sample one from the overall tendencies
        if current_pitch == 1:
            game_df.at[index, 'L1_pitch_type'] = random.choices(
                population=known_pitches, weights=known_weights, k=1)[0]

        # fewer than 5 previous pitches: fall back to the overall tendencies
        if current_pitch < 6:
            for pitch in known_pitches:
                game_df.at[index, 'L5_' + pitch + '_perc'] = pitcher_tendencies_overall[pitch] * 100
                # set inside the loop so the column is created in the same position as before
                game_df.at[index, 'L5_strike_perc'] = overall_strike_perc
            continue

        # the five pitches immediately before this one
        subset = game_df[(game_df['pitch_count'] > current_pitch - 6)
                         & (game_df['pitch_count'] < current_pitch)]
        subset_percentages = subset['pitch_type'].value_counts(normalize=True).to_dict()

        # 0 when none of the last five pitches was a strike
        game_df.at[index, 'L5_strike_perc'] = (
            subset['type'].value_counts(normalize=True).get('S', 0) * 100)

        # pitch types not thrown in the last five get 0
        for pitch in known_pitches:
            game_df.at[index, 'L5_' + pitch + '_perc'] = subset_percentages.get(pitch, 0) * 100


def make_game_pitchcount_and_trailing_pitch_features(pitcher_df, pitcher_list):
    """
    Add per-game pitch count, previous-pitch and trailing-five-pitch features.

    Each game ('game_pk') in ``pitcher_df`` is processed separately, in order
    of first appearance:

    * 'pitch_count' - 1-based position of the pitch within the game
    * 'L1_pitch_type', 'L1_pitch_result', 'L1_pitch_zone' - the previous
      pitch's 'pitch_type', 'type' and 'zone'; the first pitch of a game gets
      a randomly sampled type (weighted by overall tendencies) and the result
      'first pitch'
    * 'L5_<pitch>_perc', 'L5_strike_perc' - shares (in %) over the previous
      five pitches; the first five pitches of a game use the overall values
    * the batting order columns added by ``make_game_batting_order``

    Games for which ``make_game_batting_order`` returns None or an empty frame
    are skipped with a printed message. The remaining games are concatenated
    and returned; if no game survives, an empty frame is returned.

    ``pitcher_list`` is accepted for interface compatibility but is not used
    by this function.
    """
    df = pitcher_df.copy()

    print('#pitches in df before: ' + str(len(df)))

    pitcher_tendencies_overall, _ = get_pitch_tendencies(df)
    games = df['game_pk'].unique().tolist()

    # overall strike % (fills L5_strike_perc for the first 5 pitches of a game);
    # 0 when the data contains no strikes at all
    overall_strike_perc = df['type'].value_counts(normalize=True).get('S', 0) * 100

    game_frames = []
    for position, game in enumerate(games):
        game_df = df[df['game_pk'] == game].copy()

        # NOTE(review): the first game has always filled a missing previous zone
        # with -1 and every later game with 0. That looks unintentional, but it
        # is preserved so the output stays the same - confirm which is intended.
        missing_zone_fill = -1 if position == 0 else 0
        _add_pitchcount_and_trailing_features(
            game_df, pitcher_tendencies_overall, overall_strike_perc, missing_zone_fill)

        # apply the batting order features to the game
        game_df = make_game_batting_order(game_df)
        if game_df is None or game_df.empty:
            print('skipping game because of bat data: ' + str(game))
            continue

        game_frames.append(game_df)

    # a single concat works for any number of kept games and avoids
    # re-copying the accumulated frame on every iteration
    if game_frames:
        new_df = pd.concat(game_frames)
    else:
        new_df = df.iloc[0:0]

    print('# pitches in df after: ' + str(len(new_df)))

    return new_df

In [None]:
# Scouting report columns: seven stats for each of the three main pitch
# categories, plus the share of pitchouts faced
batter_cols = [
    category + suffix
    for category in ('fastball', 'breaking', 'offspeed')
    for suffix in ('_perc_faced', '_chase_perc', '_bip_swung_perc',
                   '_taken_strike_perc', '_est_woba', '_babip', '_iso_value')
] + ['pitchout_perc_faced']


def fill_batting_nans(pitcher_df, batting_order_slot_map):
    """
    Fill missing batter scouting values with per-batting-order-slot fallbacks.

    For each distinct 'batting_order_slot', NaNs in every ``batter_cols`` column
    of that slot's rows are replaced by ``batting_order_slot_map[slot][col]``.
    The input is not modified. Rows come back regrouped: each slot's rows are
    moved to the end of the frame as the slot is processed.
    """
    result = pitcher_df.copy()
    for slot in result['batting_order_slot'].unique().tolist():
        # pull this slot's rows out, patch them, then append them back
        slot_rows = result[result['batting_order_slot'] == slot].copy()
        result = result.drop(slot_rows.index)

        fallbacks = batting_order_slot_map[slot]
        for col in batter_cols:
            slot_rows[col] = slot_rows[col].fillna(fallbacks[col])

        result = pd.concat([result, slot_rows])
        print('finished w/ slot: ' + str(slot))
    return result

In [None]:
def get_left_right_pitch_tendencies(pitcher_df):
    """
    Summarize pitch category shares separately by batter stance.

    Rows are split on 'stand' ('L' / 'R'). Returns the tuple
    ``(overall_left, overall_right, by_count_left, by_count_right)`` where the
    ``overall_*`` dicts map pitch_cat -> share and the ``by_count_*`` dicts map
    each distinct 'count_cat' of the whole frame to such a dict (empty when
    that side has no rows in the count category).
    """
    def category_shares(frame):
        # normalized value counts of pitch categories as a plain dict
        return frame['pitch_cat'].value_counts(normalize=True).to_dict()

    # split into pitches thrown to left-handed and right-handed batters
    left = pitcher_df[pitcher_df['stand'] == 'L'].copy()
    right = pitcher_df[pitcher_df['stand'] == 'R'].copy()

    count_cats = pitcher_df['count_cat'].unique().tolist()
    by_count_left = {cat: category_shares(left[left['count_cat'] == cat])
                     for cat in count_cats}
    by_count_right = {cat: category_shares(right[right['count_cat'] == cat])
                      for cat in count_cats}

    return category_shares(left), category_shares(right), by_count_left, by_count_right

In [None]:
def make_tendency_features(pitcher_df, overall_left, overall_right, by_count_left, by_count_right):
    """
    Add stance-specific pitch tendency columns (in %).

    Rows are split on 'stand' ('L' / 'R'); rows with any other value are
    dropped. For every pitch type in that side's overall dict two columns are
    added to the side's rows:

    * ``overall_<pitch_type>_perc``   - the side's overall share * 100
    * ``count_cat_<pitch_type>_perc`` - the side's share for the row's
      'count_cat' * 100, or 0 when there is no such entry

    The two sides are concatenated and sorted by game_date, game_pk,
    at_bat_number and pitch_number.
    """
    def with_tendencies(side_df, overall, by_count):
        # adds both feature columns for each pitch type of one stance
        for pitch_type in overall.keys():
            overall_feature = 'overall_' + pitch_type + '_perc'
            count_cat_feature = 'count_cat_' + pitch_type + '_perc'

            def by_count_perc(count_cat):
                try:
                    return by_count[count_cat][pitch_type] * 100
                except KeyError:
                    return 0

            # fill the column with the label, then swap in its overall percentage
            side_df[overall_feature] = pitch_type
            side_df[overall_feature] = side_df[overall_feature].apply(
                lambda label: overall[label] * 100)
            side_df[count_cat_feature] = side_df['count_cat'].apply(by_count_perc)
        return side_df

    left = with_tendencies(pitcher_df[pitcher_df['stand'] == 'L'].copy(),
                           overall_left, by_count_left)
    right = with_tendencies(pitcher_df[pitcher_df['stand'] == 'R'].copy(),
                            overall_right, by_count_right)

    combined = pd.concat([left, right], sort=False)
    return combined.sort_values(by=['game_date', 'game_pk', 'at_bat_number', 'pitch_number'])


In [None]:
def add_pitcher_scouting_report(pitcher_df, pitcher_df17, start_dates, end_dates):
    """Build tendency features period by period from strictly earlier pitches.

    The two frames are stacked; then, for each index ``i`` of ``start_dates``,
    the rows dated before ``start_dates[i]`` feed
    ``get_left_right_pitch_tendencies`` and the rows dated within
    ``[start_dates[i], end_dates[i]]`` (inclusive) are annotated through
    ``make_tendency_features``. The annotated periods are concatenated and
    returned.
    """
    combined = pd.concat([pitcher_df, pitcher_df17], sort=False)

    # one annotated frame per period, stitched together at the end
    period_frames = []

    for period in range(len(start_dates)):
        # history = everything strictly before the period opens
        history = combined[combined['game_date'] < start_dates[period]]

        # window = pitches thrown inside the period (both bounds inclusive)
        in_window = (combined['game_date'] >= start_dates[period]) & (combined['game_date'] <= end_dates[period])
        window = combined[in_window].copy()

        # tendencies are learned from history only ...
        left_overall, right_overall, left_by_count, right_by_count = get_left_right_pitch_tendencies(history)

        # ... and written onto the window rows
        period_frames.append(
            make_tendency_features(window, left_overall, right_overall, left_by_count, right_by_count))

    return pd.concat(period_frames, sort=False)


In [None]:
def make_game_batting_order(game_df, pitcher_ids=None):
    """Renumber a game's at-bats and label each row with a batting-order slot.

    Parameters
    ----------
    game_df : pandas.DataFrame
        Pitches of one game; must contain ``at_bat_number``, ``pitch_number``
        and ``batter``. The caller's frame is not modified (work happens on
        the sorted copy).
    pitcher_ids : collection of batter ids, optional
        Ids that should be treated as pitchers when they bat. Defaults to the
        notebook-level ``pitcher_list`` so existing one-argument calls behave
        as before.

    Returns
    -------
    pandas.DataFrame or None
        The frame sorted by at-bat and pitch number with:

        * ``at_bat_number`` renumbered 1..k in order of appearance,
        * ``batting_order_slot``: 1-9 for the distinct batters seen in the
          first nine at-bats, ``'PH'`` for any batter first seen later, and
          ``'pitcher'`` wherever ``pitcher_AB`` is True,
        * ``pitcher_AB``: whether the batter id is in ``pitcher_ids``.

        ``None`` is returned when a batter id cannot be looked up in the slot
        map (the ``KeyError`` path of the original implementation).
    """
    if pitcher_ids is None:
        # Backward-compatible default: fall back to the notebook global.
        pitcher_ids = pitcher_list

    game_df = game_df.sort_values(by=['at_bat_number', 'pitch_number'])
    all_batters = game_df['batter'].unique().tolist()

    # re-set the at_bat_number for the game to be sequential starting at 1
    at_bat_keys = game_df['at_bat_number'].unique().tolist()
    at_bat_map = dict(zip(at_bat_keys, range(1, len(at_bat_keys) + 1)))
    game_df['at_bat_number'] = game_df['at_bat_number'].replace(at_bat_map)

    # distinct batters of the first 9 at-bats, in order of appearance,
    # become batting-order slots 1-9 (zip stops early if there are fewer)
    first_9_batters = game_df.loc[game_df['at_bat_number'] < 10, 'batter'].unique().tolist()
    batting_order_map = dict(zip(first_9_batters, range(1, 10)))

    # anyone who first bats later in the game is a pinch hitter ('PH')
    for batter in set(all_batters) - set(first_9_batters):
        batting_order_map[batter] = 'PH'

    try:
        game_df['batting_order_slot'] = game_df['batter'].apply(lambda x: batting_order_map[x])
    except KeyError:
        # Preserve the existing contract: signal an unusable game with None.
        return None

    # Vectorised membership test instead of a per-row `x in list` scan.
    game_df['pitcher_AB'] = game_df['batter'].isin(pitcher_ids)
    game_df['batting_order_slot'] = game_df['batting_order_slot'].mask(game_df['pitcher_AB'], 'pitcher')
    return game_df

In [None]:
def get_pitch_tendencies(pitcher_df):
    """Return the pitch-category mix overall and per count category.

    Returns a 2-tuple: ``{pitch_cat: share}`` over all rows, and
    ``{count_cat: {pitch_cat: share}}`` with one entry for every distinct
    ``count_cat`` value. Shares are normalized value counts of ``pitch_cat``.
    """
    def _pitch_mix(frame):
        # normalized frequency of each pitch category in `frame`
        return frame['pitch_cat'].value_counts(normalize=True).to_dict()

    overall_mix = _pitch_mix(pitcher_df)

    # same computation, restricted to the rows of each count category
    mix_by_count = {
        count_label: _pitch_mix(pitcher_df[pitcher_df['count_cat'] == count_label])
        for count_label in pitcher_df['count_cat'].unique().tolist()
    }
    return overall_mix, mix_by_count


In [None]:
def _trailing_window_percentages(window_cats, window_results, pitch_types):
    """Summarise one trailing window of pitches.

    Returns ``({pitch: percentage of the window}, strike percentage)``. A pitch
    type absent from the window gets 0, as does the strike percentage when the
    window holds no 'S' result.
    """
    shares = window_cats.value_counts(normalize=True).to_dict()
    pitch_percs = {pitch: shares.get(pitch, 0) * 100 for pitch in pitch_types}
    strike_perc = window_results.value_counts(normalize=True).get('S', 0) * 100
    return pitch_percs, strike_perc


def make_game_pitchcount_and_trailing_pitch_features_and_batting_order(pitcher_df, pitcher_list):
    """Add per-game pitch counts, lag-1 features, trailing-window mixes and batting order.

    For every ``game_pk`` in ``pitcher_df`` (rows are numbered in the order
    they appear -- assumes the frame is already in pitch order; TODO confirm):

    * ``pitch_count``: 1..n within the game.
    * ``L1_*``: previous-pitch category, result, zone and ball-location flags.
      Missing previous results become ``'first pitch'``; missing zone/ball
      flags become -1; the first pitch's ``L1_pitch_type`` is drawn at random,
      weighted by the pitcher's overall pitch mix.
    * ``L5_<pitch>_perc`` / ``L15_<pitch>_perc`` and ``L5_strike_perc`` /
      ``L15_strike_perc``: percentages over the previous 5 / up to 15 pitches
      of the game. The first five pitches of a game use the pitcher's overall
      figures instead.
    * batting-order columns from ``make_game_batting_order``.

    Parameters
    ----------
    pitcher_df : pandas.DataFrame
        One pitcher's pitches; not modified.
    pitcher_list : list
        Accepted for interface compatibility; not referenced in this function.

    Returns
    -------
    pandas.DataFrame
        All games concatenated and sorted by date, game, at-bat and pitch.
        Games for which ``make_game_batting_order`` returns ``None`` are left
        out. Progress counts are printed before and after.
    """
    df = pitcher_df.copy()
    all_games = []

    print('#pitches in df before: ' + str(len(df)))

    pitcher_tendencies_overall, _ = get_pitch_tendencies(df)
    pitch_types = list(pitcher_tendencies_overall.keys())
    pitch_weights = list(pitcher_tendencies_overall.values())

    # Pitcher-level figures are identical for every game: compute them once.
    overall_percs = {pitch: pitcher_tendencies_overall[pitch] * 100 for pitch in pitch_types}
    overall_strike_perc = df['type'].value_counts(normalize=True).get('S', 0) * 100

    # Column creation order: L5/L15 per pitch type, then the strike columns.
    trailing_columns = []
    for pitch in pitch_types:
        trailing_columns.append('L5_' + pitch + '_perc')
        trailing_columns.append('L15_' + pitch + '_perc')
    trailing_columns.extend(['L5_strike_perc', 'L15_strike_perc'])

    l1_fill_columns = ['L1_pitch_zone', 'L1_ball_high', 'L1_ball_low', 'L1_ball_left', 'L1_ball_right']

    for game in df['game_pk'].unique().tolist():

        # subset for this game and number its pitches 1..n
        game_df = df[df['game_pk'] == game].copy()
        n_pitches = game_df.shape[0]
        game_df['pitch_count'] = range(1, n_pitches + 1)

        # lag-1 (previous pitch) features
        game_df['L1_pitch_type'] = game_df['pitch_cat'].shift(periods=1)
        game_df['L1_pitch_result'] = game_df['type'].shift(periods=1)
        game_df['L1_pitch_result'] = game_df['L1_pitch_result'].replace({np.nan:'first pitch'})
        game_df['L1_pitch_zone'] = game_df['zone'].shift(periods=1)
        game_df['L1_ball_high'] = game_df['ball_high'].shift(periods=1)
        game_df['L1_ball_low'] = game_df['ball_low'].shift(periods=1)
        game_df['L1_ball_left'] = game_df['ball_left'].shift(periods=1)
        game_df['L1_ball_right'] = game_df['ball_right'].shift(periods=1)
        game_df[l1_fill_columns] = game_df[l1_fill_columns].fillna(-1)

        # the first pitch has no predecessor: draw one from the overall mix,
        # the same approach used when pitch_type itself was missing
        if n_pitches:
            game_df.at[game_df.index[0], 'L1_pitch_type'] = random.choices(
                population=pitch_types, weights=pitch_weights, k=1)[0]

        # trailing 5 / 15 pitch features, built positionally: the row at
        # position `pos` has pitch_count == pos + 1, so the pitches before it
        # are exactly the positional slice [start:pos]
        pitch_cats = game_df['pitch_cat']
        pitch_results = game_df['type']
        trailing = {column: [] for column in trailing_columns}

        for pos in range(n_pitches):
            if pos < 5:
                # fewer than 5 prior pitches: fall back to overall tendencies
                l5_percs = l15_percs = overall_percs
                l5_strike = l15_strike = overall_strike_perc
            else:
                l5_percs, l5_strike = _trailing_window_percentages(
                    pitch_cats.iloc[pos - 5:pos], pitch_results.iloc[pos - 5:pos], pitch_types)
                # up to 15 prior pitches; all prior pitches when fewer exist
                l15_start = max(pos - 15, 0)
                l15_percs, l15_strike = _trailing_window_percentages(
                    pitch_cats.iloc[l15_start:pos], pitch_results.iloc[l15_start:pos], pitch_types)

            for pitch in pitch_types:
                trailing['L5_' + pitch + '_perc'].append(l5_percs[pitch])
                trailing['L15_' + pitch + '_perc'].append(l15_percs[pitch])
            trailing['L5_strike_perc'].append(l5_strike)
            trailing['L15_strike_perc'].append(l15_strike)

        # assign positionally (plain arrays) so index labels play no role
        for column in trailing_columns:
            game_df[column] = np.asarray(trailing[column], dtype=float)

        # apply the batting order features to the game
        game_df = make_game_batting_order(game_df)

        # a None result marks a game whose batting order could not be built
        if game_df is not None:
            all_games.append(game_df)

    new_df = pd.concat(all_games).sort_values(by=['game_date', 'game_pk', 'at_bat_number', 'pitch_number'])

    print('# pitches in df after: ' + str(len(new_df)))

    return new_df

In [None]:
def make_prev_ab_walk_basehit_run_and_homerun_features(pitcher_df):
    """Flag every pitch with what happened in the previous at-bat of the game.

    For each ``game_pk`` the at-bats are taken in ascending ``at_bat_number``
    order and each at-bat (from the second one on) is compared with the one
    before it. All rows of the later at-bat receive:

    * ``prev_ab_walk``       -- previous at-bat ended in 'walk' or 'hit_by_pitch'
    * ``prev_ab_basehit``    -- ... in 'single', 'double', 'triple' or 'home_run'
    * ``prev_ab_strikeout``  -- ... in 'strikeout'
    * ``prev_ab_run_scored`` -- ``bat_score`` at the start of this at-bat is
      higher than at the start of the previous one
    * ``prev_ab_homerun``    -- a run scored and the previous at-bat ended in
      'home_run'

    The ending event of an at-bat is the ``events`` value on its row with the
    highest ``pitch_number``. Rows of the first at-bat of a game stay False.

    Parameters
    ----------
    pitcher_df : pandas.DataFrame
        Needs ``game_pk``, ``at_bat_number``, ``pitch_number``, ``events``,
        ``bat_score``, ``game_date`` and ``pitch_count``. Not modified.

    Returns
    -------
    pandas.DataFrame
        All games concatenated, sorted by game date, game and pitch count.
    """
    walk_events = {'walk', 'hit_by_pitch'}
    basehit_events = {'single', 'double', 'triple', 'home_run'}
    flag_columns = ['prev_ab_run_scored', 'prev_ab_homerun', 'prev_ab_walk',
                    'prev_ab_basehit', 'prev_ab_strikeout']

    all_games = []
    # iterate over each game
    for game in pitcher_df['game_pk'].unique():
        # make subset df for that game
        game_df = pitcher_df[pitcher_df['game_pk'] == game].copy()

        # distinct at-bats of this game in ascending order
        at_bats = game_df['at_bat_number'].dropna().sort_values().unique()

        # at-bat numbers that earn each flag (kept separate from the
        # event-name sets above so the two can never be confused)
        flagged = {column: [] for column in flag_columns}

        # pair every at-bat with its predecessor, starting at the 2nd at-bat
        for prev_ab, ab in zip(at_bats[:-1], at_bats[1:]):
            prev_rows = game_df[game_df['at_bat_number'] == prev_ab]
            current_rows = game_df[game_df['at_bat_number'] == ab]

            # ending event = event on the last pitch (highest pitch_number);
            # located positionally so index labels/order do not matter
            last_pitch_pos = prev_rows['pitch_number'].values.argmax()
            prev_event = prev_rows['events'].iloc[last_pitch_pos]

            if prev_event in walk_events:
                flagged['prev_ab_walk'].append(ab)
            elif prev_event in basehit_events:
                flagged['prev_ab_basehit'].append(ab)
            elif prev_event == 'strikeout':
                flagged['prev_ab_strikeout'].append(ab)

            # a run scored during the previous at-bat if the batting team's
            # score is higher at the start of this one
            prev_score = prev_rows['bat_score'].values[0]
            current_score = current_rows['bat_score'].values[0]
            if current_score > prev_score:
                flagged['prev_ab_run_scored'].append(ab)
                if prev_event == 'home_run':
                    flagged['prev_ab_homerun'].append(ab)

        # write each flag onto all rows of the at-bats that earned it
        for column in flag_columns:
            game_df[column] = game_df['at_bat_number'].isin(flagged[column])

        all_games.append(game_df)

    return pd.concat(all_games).sort_values(by=['game_date', 'game_pk', 'pitch_count'])

In [None]:
# Batter-profile columns whose missing values fill_batting_nans() fills in.
batter_cols = ['fastball_perc_faced','fastball_chase_perc','fastball_bip_swung_perc', 'fastball_taken_strike_perc',
               'fastball_est_woba', 'fastball_babip', 'fastball_iso_value', 'breaking_perc_faced', 'breaking_chase_perc',
               'breaking_bip_swung_perc', 'breaking_taken_strike_perc', 'breaking_est_woba', 'breaking_babip', 
               'breaking_iso_value', 'offspeed_perc_faced', 'offspeed_chase_perc', 'offspeed_bip_swung_perc',
               'offspeed_taken_strike_perc', 'offspeed_est_woba', 'offspeed_babip', 'offspeed_iso_value',
               'pitchout_perc_faced']

def fill_batting_nans(pitcher_df, batting_order_slot_map):
    """Fill missing batter-profile values with per-batting-slot defaults.

    For every distinct ``batting_order_slot`` and every column in
    ``batter_cols``, NaN cells on rows of that slot are replaced by
    ``batting_order_slot_map[slot][col]``. Non-missing cells are untouched.

    Parameters
    ----------
    pitcher_df : pandas.DataFrame
        Needs ``batting_order_slot``, every column in ``batter_cols`` and the
        sort columns ``game_date``, ``game_pk``, ``pitch_count``. Not modified.
    batting_order_slot_map : mapping
        ``batting_order_slot_map[slot][col]`` gives the fill value.

    Returns
    -------
    pandas.DataFrame
        A filled copy sorted by game date, game and pitch count.

    Raises
    ------
    KeyError
        If a slot present in the data (or one of ``batter_cols``) is missing
        from ``batting_order_slot_map``.
    """
    df = pitcher_df.copy()
    slots = df['batting_order_slot']
    for slot in slots.unique().tolist():
        slot_defaults = batting_order_slot_map[slot]
        # Boolean row mask instead of drop-by-label + concat: dropping by
        # index label also removes other slots' rows when labels repeat.
        in_slot = (slots == slot).values
        for col in batter_cols:
            needs_fill = in_slot & df[col].isna().values
            df[col] = df[col].mask(needs_fill, slot_defaults[col])
        print('finished w/ slot: ' + str(slot))
    df = df.sort_values(by=['game_date', 'game_pk', 'pitch_count'])
    return df

In [None]:
def add_pb_matchup_priors(pitcher_df, pitcher_df17, start_dates, end_dates):
    """Add pitcher-vs-batter prior pitch-mix columns (``PB_<pitch>``) per period.

    The two frames are stacked. For each index ``i`` of ``start_dates`` the
    rows dated before ``start_dates[i]`` form the prior and the rows dated in
    ``[start_dates[i], end_dates[i]]`` (inclusive) are annotated:

    * For every batter in the period, the by-count pitch mix is taken from the
      pitcher's prior pitches to that batter. If there are none, the prior mix
      against batters of the same ``stand`` is used instead.
    * Each row gets ``PB_<pitch>`` = prior share (times 100) of that pitch type
      in the row's ``count_cat``, or 0 when no prior share exists. One column
      is made for every non-null prior ``pitch_cat`` except ``'PO'``.

    Only rows whose ``count_cat`` is 'ahead', 'behind' or 'neutral' are carried
    into the result. Periods without any such rows contribute nothing.

    Parameters
    ----------
    pitcher_df, pitcher_df17 : pandas.DataFrame
        Need ``game_date``, ``batter``, ``stand``, ``count_cat``,
        ``pitch_cat``, ``game_pk`` and ``pitch_count``.
    start_dates, end_dates : sequences indexable by position
        Period bounds; ``end_dates`` must be at least as long as
        ``start_dates``.

    Returns
    -------
    pandas.DataFrame
        Annotated rows of all periods, sorted by date, game and pitch count.
        The pitch types of each period are printed as a progress trace.
    """
    df = pd.concat([pitcher_df, pitcher_df17], sort=False)

    # one annotated frame per period (concat them together later)
    df_list = []

    # iterate over each period
    for i, start_date in enumerate(start_dates):
        end_date = end_dates[i]

        # make the prior and current dfs:
        prior_df = df[df['game_date'] < start_date]
        current_df = df[(df['game_date'] >= start_date) & (df['game_date'] <= end_date)]

        # all pitch types this pitcher has thrown in the past, minus pitchouts
        pitch_types = prior_df['pitch_cat'].dropna().unique().tolist()
        if 'PO' in pitch_types:
            pitch_types.remove('PO')
        print(pitch_types)

        # by-count tendencies vs each handedness, computed at most once
        stand_fallback = {}
        current_df_list = []

        for batter in current_df['batter'].unique().tolist():
            batter_prior = prior_df[prior_df['batter'] == batter]
            batter_current = current_df[current_df['batter'] == batter]

            if batter_prior.empty:
                # pitcher has never faced this batter before: use the prior
                # tendencies vs batters of the same handedness
                stand = batter_current['stand'].values[0]
                if stand not in stand_fallback:
                    _, stand_fallback[stand] = get_pitch_tendencies(prior_df[prior_df['stand'] == stand])
                by_count = stand_fallback[stand]
            else:
                _, by_count = get_pitch_tendencies(batter_prior)

            # iterate over the different count_cat types:
            for count_cat in ('ahead', 'behind', 'neutral'):
                count_subset = batter_current[batter_current['count_cat'] == count_cat].copy()
                if count_subset.empty:
                    continue
                # prior mix for this count category ({} -> every share is 0)
                shares = by_count.get(count_cat, {})
                for pitch in pitch_types:
                    count_subset['PB_' + pitch] = shares.get(pitch, 0) * 100
                current_df_list.append(count_subset)

        # a period with no usable rows must not reach pd.concat (it raises
        # on an empty list)
        if current_df_list:
            df_list.append(pd.concat(current_df_list, sort=False))

    new_df = pd.concat(df_list, sort=False).sort_values(by=['game_date', 'game_pk', 'pitch_count'])
    return new_df