In [None]:
def get_pitch_tendencies(pitcher_df):
    """Summarise how often each pitch type occurs in ``pitcher_df``.

    Reads the 'pitch_type' and 'count_cat' columns.

    Returns a 2-tuple:
      * dict mapping pitch type -> relative frequency over all rows
      * dict mapping every distinct 'count_cat' value -> the same kind of
        pitch-type frequency dict, computed only from the rows in that
        count category
    """
    def pitch_shares(rows):
        #normalized value counts: fraction of the given rows per pitch type
        return rows['pitch_type'].value_counts(normalize=True).to_dict()

    #distinct count categories, in order of first appearance
    count_categories = pitcher_df['count_cat'].unique().tolist()

    shares_by_count = {
        category: pitch_shares(pitcher_df[pitcher_df['count_cat'] == category])
        for category in count_categories
    }
    return pitch_shares(pitcher_df), shares_by_count

In [None]:
#one (first day, last day) pair per period; the two parallel lists below are
#derived from it so the start and end of a period stay side by side
_PERIOD_BOUNDS = [
    ('2018-03-29', '2018-04-30'),
    ('2018-05-01', '2018-05-31'),
    ('2018-06-01', '2018-06-30'),
    ('2018-07-01', '2018-07-31'),
    ('2018-08-01', '2018-08-31'),
    ('2018-09-01', '2018-10-01'),
    ('2019-03-28', '2019-04-30'),
    ('2019-05-01', '2019-05-31'),
    ('2019-06-01', '2019-06-30'),
    ('2019-07-01', '2019-07-31'),
    ('2019-08-01', '2019-08-31'),
]

start_dates = [first_day for first_day, _ in _PERIOD_BOUNDS]

end_dates = [last_day for _, last_day in _PERIOD_BOUNDS]

  
def add_pitcher_scouting_report(pitcher_df, pitcher_df17, start_dates, end_dates):
    """Add pitch-tendency features to each period's rows, using only earlier rows.

    ``pitcher_df`` and ``pitcher_df17`` are stacked into one frame. For the
    i-th period (``start_dates[i]`` .. ``end_dates[i]``, both inclusive):

      * rows with 'game_date' strictly before the period start are the history
      * get_left_right_pitch_tendencies() is run on that history
      * make_tendency_features() writes the resulting tendencies onto a copy
        of the period's rows

    Returns the per-period results concatenated. Rows whose 'game_date' falls
    in none of the periods are not part of the result.

    NOTE(review): 'game_date' is compared directly against the period
    strings -- presumably it holds 'YYYY-MM-DD' strings or datetimes; confirm.
    """
    combined = pd.concat([pitcher_df, pitcher_df17], sort=False)
    game_dates = combined['game_date']

    #one feature frame per period, stitched together at the end
    period_frames = []

    for idx, period_start in enumerate(start_dates):
        period_end = end_dates[idx]

        #history = everything before the period, current = everything inside it
        history = combined[game_dates < period_start]
        in_period = (game_dates >= period_start) & (game_dates <= period_end)
        period_rows = combined[in_period].copy()

        #tendencies come from the history only
        left_all, right_all, left_by_count, right_by_count = get_left_right_pitch_tendencies(history)

        period_frames.append(
            make_tendency_features(period_rows, left_all, right_all, left_by_count, right_by_count)
        )

    return pd.concat(period_frames, sort=False)

In [None]:
def get_left_right_pitch_tendencies(pitcher_df):
    """Compute pitch-category frequencies separately vs. 'L' and 'R' batters.

    Reads the 'stand', 'pitch_cat' and 'count_cat' columns.

    Returns a 4-tuple ``(overall_left, overall_right, by_count_left,
    by_count_right)``:
      * the ``overall_*`` dicts map pitch category -> relative frequency
        among the rows with that 'stand' value
      * the ``by_count_*`` dicts map every distinct 'count_cat' value of the
        whole input -> a pitch-category frequency dict for that side; the
        inner dict is empty when that side has no rows in the category
    """
    def category_shares(rows):
        #normalized value counts: fraction of the given rows per pitch category
        return rows['pitch_cat'].value_counts(normalize=True).to_dict()

    vs_left = pitcher_df[pitcher_df['stand'] == 'L']
    vs_right = pitcher_df[pitcher_df['stand'] == 'R']

    #categories are taken from the full frame so both sides share the same keys
    count_categories = pitcher_df['count_cat'].unique().tolist()

    left_by_count = {
        cc: category_shares(vs_left[vs_left['count_cat'] == cc])
        for cc in count_categories
    }
    right_by_count = {
        cc: category_shares(vs_right[vs_right['count_cat'] == cc])
        for cc in count_categories
    }

    return category_shares(vs_left), category_shares(vs_right), left_by_count, right_by_count

In [None]:
def make_tendency_features(pitcher_df, overall_left, overall_right, by_count_left, by_count_right):
    """Write pitch-tendency percentage columns onto each pitch row.

    Rows are split on the 'stand' column ('L' / 'R'); rows with any other
    'stand' value are not part of the result. For every pitch category that
    is a key of ``overall_left`` or ``overall_right`` two columns are added:

      * 'overall_<pitch>_perc'   -- overall share vs. the row's batter side * 100
      * 'count_cat_<pitch>_perc' -- share for the row's 'count_cat' vs. that
        side * 100

    A pitch category (or count category) that is missing from the
    dictionaries of the row's batter side yields 0, so a category seen
    against only one side no longer leaves NaN in the other side's rows.

    Parameters:
        pitcher_df: frame with 'stand', 'count_cat', 'game_date', 'game_pk',
            'at_bat_number' and 'pitch_number' columns; it is not modified.
        overall_left / overall_right: dict pitch category -> fraction.
        by_count_left / by_count_right: dict count category ->
            dict pitch category -> fraction.

    Returns:
        A new frame with the 'L' and 'R' rows plus the feature columns,
        sorted by 'game_date', 'game_pk', 'at_bat_number', 'pitch_number'.
    """
    #left-side categories first, then the ones only seen against right-handers,
    #so both sides get the same columns in the same order
    pitch_cats = list(overall_left)
    pitch_cats += [pitch for pitch in overall_right if pitch not in overall_left]

    def add_side_features(side_rows, overall, by_count):
        for pitch in pitch_cats:
            #the overall share is one number per side: assign it as a scalar
            side_rows['overall_' + pitch + '_perc'] = overall.get(pitch, 0.0) * 100

            #pitch is bound as a default so the lookup does not depend on
            #the loop variable at call time
            def by_count_perc(count_cat, pitch=pitch):
                return by_count.get(count_cat, {}).get(pitch, 0) * 100

            side_rows['count_cat_' + pitch + '_perc'] = side_rows['count_cat'].apply(by_count_perc)
        return side_rows

    #copies keep the caller's frame untouched
    left = add_side_features(pitcher_df[pitcher_df['stand'] == 'L'].copy(), overall_left, by_count_left)
    right = add_side_features(pitcher_df[pitcher_df['stand'] == 'R'].copy(), overall_right, by_count_right)

    return pd.concat([left, right], sort=False).sort_values(by=['game_date', 'game_pk', 'at_bat_number', 'pitch_number'])


In [None]:
def add_pb_matchup_priors(pitcher_df, pitcher_df17, start_dates, end_dates):
    """Add 'PB_<pitch>' pitcher-vs-batter prior columns to each period's rows.

    ``pitcher_df`` and ``pitcher_df17`` are stacked into one frame. For the
    i-th period (``start_dates[i]`` .. ``end_dates[i]``, both inclusive) the
    rows with 'game_date' strictly before the period start are the history.
    For every batter in the period:

      * if the history has rows against that batter, by-count tendencies come
        from get_pitch_tendencies() on those rows
      * otherwise they come from the history rows with the same 'stand' value
        as the batter's first row in the period

    Each period row then gets one 'PB_<pitch>' column per pitch category
    found in the history (excluding 'PO'), holding the tendency for the
    row's 'count_cat' * 100, or 0 when there is no such entry.

    Only rows whose 'count_cat' is 'ahead', 'behind' or 'neutral' are kept.
    A period with no such rows is skipped instead of raising.

    Returns:
        The processed rows of all periods sorted by 'game_date', 'game_pk',
        'pitch_count'. When no period contributes any row, an empty frame
        with the columns of the stacked input is returned.
    """
    df = pd.concat([pitcher_df, pitcher_df17], sort=False)

    #one processed frame per non-empty period (concatenated at the end)
    df_list = []

    for i, start_date in enumerate(start_dates):
        end_date = end_dates[i]

        #make the prior and current dfs:
        prior_df = df[df['game_date'] < start_date]
        current_df = df[(df['game_date'] >= start_date) & (df['game_date'] <= end_date)]

        #pitch categories thrown before this period, without 'PO'
        pitch_types = [p for p in prior_df['pitch_cat'].unique().tolist() if p != 'PO']
        #debug output: the pitch categories that get a PB_ column this period
        print(pitch_types)

        period_chunks = []

        for batter in current_df['batter'].unique().tolist():
            batter_rows = current_df[current_df['batter'] == batter]
            batter_history = prior_df[prior_df['batter'] == batter]

            #NOTE(review): get_pitch_tendencies keys its dicts on 'pitch_type'
            #while the lookups below use 'pitch_cat' values -- confirm the two
            #columns share labels, otherwise every PB_ value comes out as 0
            if batter_history.empty:
                #pitcher has never faced this batter: fall back to the prior
                #tendencies vs. batters with the same handedness
                stand = batter_rows['stand'].values[0]
                _, by_count = get_pitch_tendencies(prior_df[prior_df['stand'] == stand])
            else:
                _, by_count = get_pitch_tendencies(batter_history)

            for count_cat in ['ahead', 'behind', 'neutral']:
                count_subset = batter_rows[batter_rows['count_cat'] == count_cat].copy()
                if count_subset.empty:
                    continue

                #a missing count category or pitch category counts as 0
                cat_tendencies = by_count.get(count_cat, {})
                for pitch in pitch_types:
                    count_subset['PB_' + pitch] = cat_tendencies.get(pitch, 0) * 100

                period_chunks.append(count_subset)

        #pd.concat raises on an empty list, so a period without rows is skipped
        if period_chunks:
            df_list.append(pd.concat(period_chunks, sort=False))

    if not df_list:
        #nothing fell into any period: hand back an empty frame
        return df.iloc[0:0].copy()

    return pd.concat(df_list, sort=False).sort_values(by=['game_date', 'game_pk', 'pitch_count'])