In [1]:
import pandas as pd
import numpy as np
import random
import pickle
import gc
#import psycopg2
from matplotlib import pyplot as plt
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, r2_score, roc_auc_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# Raise pandas' display limits to 500 columns / 500 rows for frames shown in cell output.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [16]:
# Pitcher whose pickles are loaded; the name is spliced into the file names built below.
pitcher = 'Bauer'

# Load the saved "best models" table for this pitcher (zip-compressed pickle).
# NOTE(review): absolute, machine-specific path -- consider a configurable base directory.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
# NOTE(review): `path` is rebound to a different file in a later cell, so do not rely on it afterwards.
path = '/home/ec2-user/SageMaker/RC-v1.2-Predictive-Modelling/modelling_noteboooks/'
models_df = pd.read_pickle(path+pitcher+'_ordinal_multiclass_best_models_v1.pkl', compression='zip').reset_index(drop=True)

In [21]:
#replace some categoricals with custom strategic ordinal encoding scale:
def custom_ordinal_ecode(df):
    """Encode the categorical columns of a pitch-level frame as hand-chosen numeric codes.

    The work is done on a copy, so the frame passed in is not modified.
    Steps, in execution order:

    1. Map the lagged description, pitch-result and pitch-type columns (and
       ``pitch_type`` itself), then ``count_cat`` and ``_count``, through fixed
       lookup tables and cast them to ``int`` (``float`` for pitch types, whose
       'N/A' code is 2.5).
    2. Swap the 0 and -1 codes of the swung/chased flags.
    3. Replace the remaining string labels ('L', 'R', 'Top', ...) with numbers.
       This replacement is applied to *every* column of the frame.
    4. Convert ``year`` and ``catcher_id`` to their category codes; both must
       therefore have a categorical dtype on entry.
    5. Cast the remaining count-like and label columns to ``int``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in this function.

    Returns
    -------
    pandas.DataFrame
        The encoded copy.
    """
    encoded = df.copy()

    # ---- lookup tables ------------------------------------------------------
    description_codes = {
        'called_strike': 1,
        'swinging_strike': 2,
        'foul_tip': 3,
        'foul': 4,
        'swinging_strike_blocked': 5,
        'foul_bunt': 6,
        'missed_bunt': 6,
        'bunt_foul_tip': 6,
        'N/A': 7,
        'pitchout': 7,
        'hit_into_play': 8,
        'ball': 9,
        'blocked_ball': 10,
        'hit_by_pitch': 11,
        'hit_into_play_no_out': 12,
        'hit_into_play_score': 13,
    }
    pitch_result_codes = {'S': 1, 'N/A': 2, 'X': 3, 'B': 4}
    pitch_type_codes = {
        'FA': 1, 'FF': 1,
        'FT': 2, 'FC': 2, 'FS': 2, 'SI': 2, 'SF': 2,
        'N/A': 2.5,
        'SL': 3,
        'CB': 4, 'CU': 4,
        'SC': 5, 'KC': 5,
        'CH': 6,
        'KN': 7,
        'EP': 8,
        'FO': 9, 'PO': 9,
    }
    count_cat_codes = {'ahead': 1, 'neutral': 2, 'behind': 3}
    count_codes = {
        '02': 1, '12': 2, '01': 3, '22': 4, '11': 5, '00': 6,
        '21': 7, '32': 8, '10': 9, '20': 10, '31': 11, '30': 12,
    }
    leftover_codes = {
        'L': -1, 'R': 2,
        'Top': -1, 'Bot': 1,
        'Standard': 0, 'Infield shift': 1, 'Strategic': 2, '4th outfielder': 3,
    }

    # ---- 1. table-driven encoding ------------------------------------------
    # Each entry: (columns to encode together, lookup table, dtype to cast to).
    grouped_plan = [
        (['L1_description', 'L2_description', 'L3_description'], description_codes, 'int'),
        (['L1_pitch_result', 'L2_pitch_result'], pitch_result_codes, 'int'),
        (['L1_pitch_type', 'L2_pitch_type', 'L3_pitch_type', 'pitch_type'], pitch_type_codes, 'float'),
    ]
    for columns, lookup, dtype in grouped_plan:
        encoded[columns] = encoded[columns].replace(lookup).astype(dtype)

    for column, lookup in (('count_cat', count_cat_codes), ('_count', count_codes)):
        encoded[column] = encoded[column].replace(lookup).astype('int')

    # ---- 2. swung / chased flags -------------------------------------------
    # Per the original author's note: unknown (-1) becomes 0 and
    # "did not swing/chase" (0) becomes -1; every other value is kept.
    def flip_zero_and_minus_one(value):
        return -1 if value == 0 else (0 if value == -1 else value)

    for column in ('L1_batter_swung', 'L1_chased', 'L2_chased', 'L3_chased'):
        encoded[column] = encoded[column].apply(flip_zero_and_minus_one)

    # ---- 3. remaining string labels (applied frame-wide) -------------------
    encoded = encoded.replace(leftover_codes)

    # ---- 4. categorical dtypes -> integer codes ----------------------------
    for column in ('year', 'catcher_id'):
        encoded[column] = encoded[column].cat.codes

    # ---- 5. final integer casts --------------------------------------------
    count_like_columns = [
        'outs_when_up', 'inning', 'at_bat_number', 'pitch_number', 'balls', 'strikes',
        'pitch_count', 'L1_pitch_zone', 'L1_batter_swung', 'L1_chased', 'L2_pitch_zone',
        'L2_chased', 'L3_pitch_zone', 'L3_chased', 'batting_order_slot', 'month',
    ]
    encoded[count_like_columns] = encoded[count_like_columns].astype('int')

    label_columns = ['stand', 'inning_topbot', 'if_fielding_alignment', 'of_fielding_alignment']
    encoded[label_columns] = encoded[label_columns].astype('int')

    return encoded

def train_test_split_by_date(df, train_fraction):
    """Split ``df`` chronologically on ``game_date`` without splitting a single date.

    The cutoff date is the date found ``int(len(df) * train_fraction)`` rows
    into the date-sorted ``game_date`` column.  Rows strictly before the cutoff
    go to the training set; rows on or after it go to the test set.  Because
    the whole cutoff date lands in the test set, the training share can be
    smaller than ``train_fraction``.

    The cutoff is looked up by position in the sorted dates, so the result does
    not depend on the frame's index labels or on its row order.  For a frame
    that is already date-sorted with a default index this is the same cutoff as
    a plain positional lookup.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``game_date`` column of mutually comparable values.
    train_fraction : float
        Target share of rows for training, between 0 and 1 inclusive.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``.  When the cutoff position falls past the last row
        (``train_fraction == 1`` or an empty frame) every row is returned as
        training data and the test frame is empty.

    Raises
    ------
    ValueError
        If ``train_fraction`` is outside ``[0, 1]``.
    """
    if not 0 <= train_fraction <= 1:
        raise ValueError('train_fraction must be between 0 and 1, got ' + str(train_fraction))

    cutoff_pos = int(len(df) * train_fraction)

    if cutoff_pos >= len(df):
        # No row sits at the cutoff position: everything is training data.
        train, test = df.copy(), df.iloc[0:0].copy()
    else:
        # Positional lookup on the sorted dates; `.loc[cutoff_pos]` would be a
        # label lookup and breaks on any non-default index.
        train_end_date = df['game_date'].sort_values().iloc[cutoff_pos]
        train = df[df['game_date'] < train_end_date]
        test = df[df['game_date'] >= train_end_date]

    print('train shape: ' + str(train.shape))
    print('test shape: '+ str(test.shape))
    return train, test

def scale_numerics(X, X_test):
    """Scale the continuous feature columns with a ``RobustScaler`` fitted on ``X``.

    The scaler is fitted on the training frame only and then applied to both
    frames.  The listed columns are overwritten in place in both ``X`` and
    ``X_test``; the same two objects are also returned.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features; must contain every column listed below.
    X_test : pandas.DataFrame
        Test features with the same columns.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(X, X_test)`` with the listed columns scaled.
    """
    # Per-pitch-group rate columns (fastball / breaking / offspeed).
    pitch_group_cols = [
        'fastball_perc_faced', 'fastball_chase_perc', 'fastball_bip_swung_perc',
        'fastball_taken_strike_perc', 'fastball_est_woba', 'fastball_babip', 'fastball_iso_value',
        'breaking_perc_faced', 'breaking_chase_perc', 'breaking_bip_swung_perc',
        'breaking_taken_strike_perc', 'breaking_est_woba', 'breaking_babip', 'breaking_iso_value',
        'offspeed_perc_faced', 'offspeed_chase_perc', 'offspeed_bip_swung_perc',
        'offspeed_taken_strike_perc', 'offspeed_est_woba', 'offspeed_babip', 'offspeed_iso_value',
    ]
    # Pitch-out rate plus overall / per-count-category percentages.
    usage_cols = [
        'pitchout_perc_faced',
        'overall_fastball_perc', 'count_cat_fastball_perc',
        'overall_breaking_perc', 'count_cat_breaking_perc',
        'overall_offspeed_perc', 'count_cat_offspeed_perc',
    ]
    # L5 / L15 percentages and the PB_* columns.
    window_and_pb_cols = [
        'L5_fastball_perc', 'L15_fastball_perc',
        'L5_breaking_perc', 'L15_breaking_perc',
        'L5_offspeed_perc', 'L15_offspeed_perc',
        'L5_strike_perc', 'L15_strike_perc',
        'PB_fastball', 'PB_breaking', 'PB_offspeed',
    ]
    numeric_cols = pitch_group_cols + usage_cols + window_and_pb_cols

    robust = RobustScaler()

    # Fit on the training rows only, then reuse the fitted statistics for the test rows.
    train_values = X[numeric_cols].values
    X[numeric_cols] = robust.fit_transform(train_values)

    test_values = X_test[numeric_cols].values
    X_test[numeric_cols] = robust.transform(test_values)

    return X, X_test

In [22]:
# Load the pitch-level frame for the selected pitcher (zip-compressed pickle).
# NOTE(review): absolute, machine-specific path -- consider a configurable base directory.
# NOTE(review): this rebinds `path`, which an earlier cell used for the models directory.
# reset_index() is called without drop=True -- presumably the source of the
# 'index' column that the modelling cell lists in drop_cols; confirm.
path = '/home/ec2-user/SageMaker/RC-v1.2-Predictive-Modelling/pitcher_df_pickles/' + pitcher + '_df.pkl'
pitcher_df = pd.read_pickle(path, compression='zip').reset_index()

In [23]:
# Ordinal-encode the categorical columns. This rebinds pitcher_df to the
# encoded copy, so the cell is meant to run once per kernel session.
pitcher_df = custom_ordinal_ecode(pitcher_df)

# Date-based hold-out with a target training share of 85%.
train, test = train_test_split_by_date(pitcher_df, 0.85)

# Columns kept out of the feature matrix: identifiers, the date, and the target itself.
target = 'pitch_type'
drop_cols = ['index', 'player_name', 'game_date', 'pitch_cat', 'pitcher', target]

# Feature matrix / target vector for each split.
X, y = train.drop(columns=drop_cols), train[target]
X_test, y_test = test.drop(columns=drop_cols), test[target]

# Scale the continuous features (scaler fitted on the training split only).
X, X_test = scale_numerics(X, X_test)

train shape: (5093, 92)
test shape: (1024, 92)


In [24]:
X.head()

Unnamed: 0,on_3b,on_2b,on_1b,outs_when_up,inning,catcher_id,sz_top,sz_bot,at_bat_number,pitch_number,bat_score,stand,inning_topbot,if_fielding_alignment,score_diff,bases_loaded,fastball_perc_faced,fastball_chase_perc,fastball_bip_swung_perc,fastball_taken_strike_perc,fastball_est_woba,fastball_babip,fastball_iso_value,breaking_perc_faced,breaking_chase_perc,breaking_bip_swung_perc,breaking_taken_strike_perc,breaking_est_woba,breaking_babip,breaking_iso_value,offspeed_perc_faced,offspeed_chase_perc,offspeed_bip_swung_perc,offspeed_taken_strike_perc,offspeed_est_woba,offspeed_babip,offspeed_iso_value,pitchout_perc_faced,balls,strikes,of_fielding_alignment,_count,count_cat,overall_fastball_perc,count_cat_fastball_perc,overall_breaking_perc,count_cat_breaking_perc,overall_offspeed_perc,count_cat_offspeed_perc,pitch_count,L1_pitch_type,L1_pitch_result,L1_description,L1_pitch_zone,L1_batter_swung,L1_chased,L2_pitch_type,L2_pitch_result,L2_description,L2_pitch_zone,L2_chased,L3_pitch_type,L3_description,L3_pitch_zone,L3_chased,L5_fastball_perc,L15_fastball_perc,L5_breaking_perc,L15_breaking_perc,L5_offspeed_perc,L15_offspeed_perc,L5_strike_perc,L15_strike_perc,batting_order_slot,pitcher_AB,prev_ab_run_scored,prev_ab_homerun,prev_ab_walk,prev_ab_basehit,prev_ab_strikeout,PB_fastball,PB_breaking,PB_offspeed,month,year,on_base
0,0.0,0.0,0.0,0,1,1,3.4075,1.5189,1,1,0.0,-1,1,2,0.0,0,1.087622,0.851568,0.600844,-0.035039,-1.219767,2.020168,-1.364293,-0.507207,1.073528,1.374483,-0.714364,-1.231213,1.853559,-0.574075,-0.898124,1.212747,1.923593,0.054627,-0.716414,1.920684,-1.045649,0.0,0,0,0,6,2,-0.885291,0.0,-0.401096,0.0,0.998031,0.840942,1,2.5,2,7,-1,0,0,2.5,2,7,-1,0,2.5,7,-1,0,-0.121056,0.030615,-0.066536,0.0,0.375184,0.062776,0.397417,0.096126,1,0.0,0.0,0.0,0.0,0.0,0.0,0.067152,-0.192982,0.908115,4,0,0
1,0.0,0.0,0.0,0,1,1,3.4768,1.5882,1,2,0.0,-1,1,2,0.0,0,1.087622,0.851568,0.600844,-0.035039,-1.219767,2.020168,-1.364293,-0.507207,1.073528,1.374483,-0.714364,-1.231213,1.853559,-0.574075,-0.898124,1.212747,1.923593,0.054627,-0.716414,1.920684,-1.045649,0.0,0,1,0,3,1,-0.885291,-0.57316,-0.401096,0.661002,0.998031,0.844812,2,1.0,1,1,3,-1,-1,2.5,2,7,-1,0,2.5,7,-1,0,-0.121056,0.030615,-0.066536,0.0,0.375184,0.062776,0.397417,0.096126,1,0.0,0.0,0.0,0.0,0.0,0.0,-0.444544,0.362573,0.911778,4,0,0
2,0.0,0.0,0.0,0,1,1,3.1996,1.2689,1,3,0.0,-1,1,2,0.0,0,1.087622,0.851568,0.600844,-0.035039,-1.219767,2.020168,-1.364293,-0.507207,1.073528,1.374483,-0.714364,-1.231213,1.853559,-0.574075,-0.898124,1.212747,1.923593,0.054627,-0.716414,1.920684,-1.045649,0.0,1,1,0,5,1,-0.885291,-0.57316,-0.401096,0.661002,0.998031,0.844812,3,1.0,4,9,12,-1,-1,1.0,1,1,3,-1,2.5,7,-1,0,-0.121056,0.030615,-0.066536,0.0,0.375184,0.062776,0.397417,0.096126,1,0.0,0.0,0.0,0.0,0.0,0.0,-0.444544,0.362573,0.911778,4,0,0
3,0.0,0.0,0.0,0,1,1,3.4248,1.5882,1,4,0.0,-1,1,0,0.0,0,1.087622,0.851568,0.600844,-0.035039,-1.219767,2.020168,-1.364293,-0.507207,1.073528,1.374483,-0.714364,-1.231213,1.853559,-0.574075,-0.898124,1.212747,1.923593,0.054627,-0.716414,1.920684,-1.045649,0.0,1,2,0,2,1,-0.885291,-0.57316,-0.401096,0.661002,0.998031,0.844812,4,1.0,1,4,1,1,-1,1.0,4,9,12,-1,1.0,1,3,-1,-0.121056,0.030615,-0.066536,0.0,0.375184,0.062776,0.397417,0.096126,1,0.0,0.0,0.0,0.0,0.0,0.0,-0.444544,0.362573,0.911778,4,0,0
4,0.0,0.0,0.0,0,1,1,3.2087,1.2775,1,5,0.0,-1,1,0,0.0,0,1.087622,0.851568,0.600844,-0.035039,-1.219767,2.020168,-1.364293,-0.507207,1.073528,1.374483,-0.714364,-1.231213,1.853559,-0.574075,-0.898124,1.212747,1.923593,0.054627,-0.716414,1.920684,-1.045649,0.0,2,2,0,4,1,-0.885291,-0.57316,-0.401096,0.661002,0.998031,0.844812,5,3.0,4,9,14,-1,-1,1.0,1,4,1,-1,1.0,9,12,-1,-0.121056,0.030615,-0.066536,0.0,0.375184,0.062776,0.397417,0.096126,1,0.0,0.0,0.0,0.0,0.0,0.0,-0.444544,0.362573,0.911778,4,0,0


In [None]:
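#TODO: choose one of the candidate models from models_df (the best-models table loaded above)

#TODO: fit the chosen model to X, y


#TODO: predict on X_test with the fitted model and compare y_pred against y_test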