In [22]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [4]:
# Load the raw pitch-level data. The path is relative, so the CSV must sit in
# the notebook's working directory.
pitches = pd.read_csv('pitchdata_3_18.csv')

In [5]:
# List the raw column names available in the CSV.
pitches.columns

Index(['PITCHNO', 'TM_DATE', 'TM_TIME', 'PAOFINNING', 'PITCHOFPA', 'PITCHER',
       'PITCHERID', 'PITCHERTHROWS', 'PITCHERTEAM', 'BATTER', 'BATTERID',
       'BATTERSIDE', 'BATTERTEAM', 'PITCHERSET', 'INNING', 'TOP_BOTTOM',
       'OUTS', 'BALLS', 'STRIKES', 'TAGGEDPITCHTYPE', 'AUTOPITCHTYPE',
       'PITCHCALL', 'KORBB', 'RELSPEED', 'EFFECTIVEVELO', 'VERTRELANGLE',
       'HORZRELANGLE', 'SPINRATE', 'SPINAXIS', 'TILT', 'RELHEIGHT', 'RELSIDE',
       'EXTENSION', 'VERTBREAK', 'INDUCEDVERTBREAK', 'HORZBREAK',
       'PLATELOCHEIGHT', 'PLATELOCSIDE', 'ZONESPEED', 'SPEEDDROP',
       'VERTAPPRANGLE', 'HORZAPPRANGLE', 'ZONETIME', 'PITCHLASTMEASUREDX',
       'PITCHLASTMEASUREDY', 'PITCHLASTMEASUREDZ', 'PFXX', 'PFXZ', 'X0', 'Y0',
       'Z0', 'VX0', 'VY0', 'VZ0', 'AX0', 'AY0', 'AZ0', 'GAMEID',
       'PI_PITCH_TYPE'],
      dtype='object')

In [6]:
# NOTE(review): pitch_list is not referenced by any of the cells visible here —
# confirm it is used further down, otherwise remove it.
pitch_list = []

In [7]:
# Name of the integer label column that pitch_labels() adds.
pitch_label_col = "Pitch Label"
# Column holding the pitch-type code; pitch_labels() converts it to a categorical.
pitch_type_col = "PI_PITCH_TYPE"
# Numeric columns that normalize_cols() min-max rescales.
cols_norm = ['RELSPEED','VERTRELANGLE','HORZRELANGLE','SPINRATE','SPINAXIS','RELHEIGHT','RELSIDE','EXTENSION','VERTBREAK','INDUCEDVERTBREAK','HORZBREAK','PLATELOCHEIGHT','PLATELOCSIDE','VERTAPPRANGLE','HORZAPPRANGLE','PFXX','PFXZ','X0','Y0','Z0','VX0','VY0','VZ0','AX0','AY0','AZ0']
# Columns kept by get_columns(), in output order. 'TM_DATE_TIME' is not a raw CSV
# column: get_columns() derives it from 'TM_DATE' and 'TM_TIME'.
cols = ['PITCHNO', 'TM_DATE_TIME','PAOFINNING','PITCHOFPA','PITCHER','PITCHERID','PITCHERTHROWS','BATTER','BATTERID','BATTERSIDE','BATTERTEAM','PITCHERSET','INNING','OUTS','BALLS','STRIKES','PITCHCALL','KORBB','RELSPEED','VERTRELANGLE','HORZRELANGLE','SPINRATE','SPINAXIS','TILT','RELHEIGHT','RELSIDE','EXTENSION','VERTBREAK','INDUCEDVERTBREAK','HORZBREAK','PLATELOCHEIGHT','PLATELOCSIDE','VERTAPPRANGLE','HORZAPPRANGLE','PFXX','PFXZ','X0','Y0','Z0','VX0','VY0','VZ0','AX0','AY0','AZ0','GAMEID','PI_PITCH_TYPE']

In [8]:
def get_columns(df, columns=None):
    """Build the timestamp column, encode handedness, and select columns.

    The input frame is left untouched: a new frame is built instead of
    assigning into `df`. Encoding in place would make the calling cell
    non-repeatable, because a second run would push the already-encoded 1/0
    values through the 'Right'/'Left' mapping and turn them into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'TM_DATE', 'TM_TIME', 'BATTERSIDE' and 'PITCHERTHROWS',
        plus every column requested in `columns` other than 'TM_DATE_TIME'.
    columns : list of str, optional
        Columns to return, in order. Defaults to the notebook-level `cols`.

    Returns
    -------
    pandas.DataFrame
        New frame restricted to `columns`, where 'TM_DATE_TIME' is the parsed
        concatenation of 'TM_DATE' and 'TM_TIME', and 'BATTERSIDE' /
        'PITCHERTHROWS' are encoded as 'Right' -> 1, 'Left' -> 0. Any other
        value in those two columns becomes NaN.
    """
    if columns is None:
        columns = cols

    handedness = {'Right': 1, 'Left': 0}
    # assign() returns a new frame, so the caller's `df` keeps its raw values.
    encoded = df.assign(
        TM_DATE_TIME=pd.to_datetime(df['TM_DATE'] + ' ' + df['TM_TIME']),
        BATTERSIDE=df['BATTERSIDE'].map(handedness),
        PITCHERTHROWS=df['PITCHERTHROWS'].map(handedness),
    )
    return encoded[list(columns)]

In [9]:
def normalize_cols(df, columns=None):
    """Min-max rescale the selected numeric columns on a copy of `df`.

    Each selected column is transformed as (x - min) / (max - min). A column
    whose values are all equal has a zero range; dividing by it would give
    0/0 = NaN on every row, so such a column is mapped to 0 instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in `columns`.
    columns : list of str, optional
        Columns to rescale. Defaults to the notebook-level `cols_norm`.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with the selected columns rescaled; other columns and
        the input frame itself are unchanged.
    """
    if columns is None:
        columns = cols_norm
    columns = list(columns)

    df_copy = df.copy()
    if not columns:
        return df_copy

    col_min = df_copy[columns].min()
    col_range = df_copy[columns].max() - col_min
    # Zero range means a constant column: divide by 1 so it becomes 0, not NaN.
    col_range = col_range.mask(col_range == 0, 1)

    df_copy[columns] = (df_copy[columns] - col_min) / col_range
    return df_copy

In [10]:
def pitch_labels(df):
    """Add an integer label column for the pitch type.

    The column named by `pitch_type_col` is converted to a categorical dtype
    and its category codes are written to the column named by
    `pitch_label_col`. Both assignments are made on `df` itself, which is
    then returned (no copy is taken).
    """
    pitch_type = df[pitch_type_col].astype('category')
    df[pitch_type_col] = pitch_type
    df[pitch_label_col] = pitch_type.cat.codes
    return df

In [11]:
def outcome_encoded(df):
    """Add an integer-coded 'PitchOutcome' column derived from 'PITCHCALL'.

    'PITCHCALL' is converted to a categorical dtype and its category codes
    are stored in 'PitchOutcome'. Both assignments are made on `df` itself,
    which is then returned (no copy is taken).
    """
    source_col, target_col = 'PITCHCALL', 'PitchOutcome'
    calls = df[source_col].astype('category')
    df[source_col] = calls
    df[target_col] = calls.cat.codes
    return df

In [12]:
# Derive TM_DATE_TIME, encode batter/pitcher handedness, keep the columns in `cols`.
pitches_select = get_columns(pitches)

In [13]:
# Min-max rescale the `cols_norm` columns; normalize_cols() works on a copy.
pitches_norm = normalize_cols(pitches_select)

In [14]:
# pitch_labels() assigns into the frame it receives and returns that same frame,
# so `pitches_labels` and `pitches_norm` refer to one object from here on.
pitches_labels = pitch_labels(pitches_norm)

In [15]:
# Preview the first rows after selection, scaling and label encoding.
pitches_labels.head()

Unnamed: 0,PITCHNO,TM_DATE_TIME,PAOFINNING,PITCHOFPA,PITCHER,PITCHERID,PITCHERTHROWS,BATTER,BATTERID,BATTERSIDE,...,Z0,VX0,VY0,VZ0,AX0,AY0,AZ0,GAMEID,PI_PITCH_TYPE,Pitch Label
0,1,2018-03-29 14:40:39,1,1,"Hamels, Cole",430935,0,"Springer, George",543807,1,...,0.506461,0.340135,0.239717,0.275225,0.748064,0.538883,0.624597,20180329-Arlington-1,FA,3
1,2,2018-03-29 14:40:53,1,2,"Hamels, Cole",430935,0,"Springer, George",543807,1,...,0.498633,0.373942,0.243286,0.299847,0.761498,0.549332,0.627089,20180329-Arlington-1,SI,6
2,3,2018-03-29 14:41:08,1,3,"Hamels, Cole",430935,0,"Springer, George",543807,1,...,0.517026,0.301988,0.205955,0.314069,0.754353,0.576244,0.633082,20180329-Arlington-1,FA,3
3,4,2018-03-29 14:41:52,2,1,"Hamels, Cole",430935,0,"Bregman, Alex",608324,1,...,0.525111,0.357516,0.209435,0.334423,0.731836,0.559973,0.668849,20180329-Arlington-1,FA,3
4,5,2018-03-29 14:42:05,2,2,"Hamels, Cole",430935,0,"Bregman, Alex",608324,1,...,0.505105,0.271808,0.208083,0.286454,0.742238,0.59039,0.67609,20180329-Arlington-1,FA,3


In [16]:
# Check that 'Pitch Label' was appended after 'PI_PITCH_TYPE'.
print(pitches_labels.columns)

Index(['PITCHNO', 'TM_DATE_TIME', 'PAOFINNING', 'PITCHOFPA', 'PITCHER',
       'PITCHERID', 'PITCHERTHROWS', 'BATTER', 'BATTERID', 'BATTERSIDE',
       'BATTERTEAM', 'PITCHERSET', 'INNING', 'OUTS', 'BALLS', 'STRIKES',
       'PITCHCALL', 'KORBB', 'RELSPEED', 'VERTRELANGLE', 'HORZRELANGLE',
       'SPINRATE', 'SPINAXIS', 'TILT', 'RELHEIGHT', 'RELSIDE', 'EXTENSION',
       'VERTBREAK', 'INDUCEDVERTBREAK', 'HORZBREAK', 'PLATELOCHEIGHT',
       'PLATELOCSIDE', 'VERTAPPRANGLE', 'HORZAPPRANGLE', 'PFXX', 'PFXZ', 'X0',
       'Y0', 'Z0', 'VX0', 'VY0', 'VZ0', 'AX0', 'AY0', 'AZ0', 'GAMEID',
       'PI_PITCH_TYPE', 'Pitch Label'],
      dtype='object')


In [17]:
def get_next_cols(df, group_col=None, fill_value=-1):
    """Add "NEXT ..." columns holding values taken from the following row.

    For each of BALLS, STRIKES, BATTER, BATTERID, OUTS and INNING a column
    named 'NEXT <name>' is written onto `df`, containing that column's value
    from the next row in the frame's current row order. The next pitch type
    itself is not added here.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the six source columns. It is modified in place.
    group_col : str, optional
        When given, a row only takes its "next" values from the following row
        if both rows share the same value in this column; otherwise it gets
        `fill_value`. This stops the last row of one group (e.g. one game)
        from picking up the first row of the next. Rows of a group are
        expected to be contiguous. The default (None) shifts across the whole
        frame, as before.
    fill_value : scalar, optional
        Placeholder used where there is no following row. Defaults to -1,
        including for the string-valued 'NEXT BATTER' column.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with the six 'NEXT ...' columns added.
    """
    next_names = {'BALLS': 'NEXT BALLS', 'STRIKES': 'NEXT STRIKES',
                  'BATTER': 'NEXT BATTER', 'BATTERID': 'NEXT BATTERID',
                  'OUTS': 'NEXT OUTS', 'INNING': 'NEXT INNING'}

    # Work out once which rows are the last of their group (or of the frame).
    ends_group = None
    if group_col is not None:
        keys = df[group_col].tolist()
        n_rows = len(keys)
        ends_group = [i + 1 == n_rows or keys[i + 1] != keys[i]
                      for i in range(n_rows)]

    for col, next_name in next_names.items():
        # Append the placeholder, then drop the first value: row i now holds
        # the value from row i + 1, and the list keeps the frame's length.
        shifted = (df[col].tolist() + [fill_value])[1:]
        if ends_group is not None:
            shifted = [fill_value if is_end else value
                       for value, is_end in zip(shifted, ends_group)]
        df[next_name] = shifted

    return df

In [18]:
# get_next_cols() writes the NEXT * columns onto its argument and returns it,
# so `p` is the same object as `pitches_labels`.
p = get_next_cols(pitches_labels)

In [19]:
# Preview the first 100 rows to check that each NEXT * value matches the row below it.
p.head(100)

Unnamed: 0,PITCHNO,TM_DATE_TIME,PAOFINNING,PITCHOFPA,PITCHER,PITCHERID,PITCHERTHROWS,BATTER,BATTERID,BATTERSIDE,...,AZ0,GAMEID,PI_PITCH_TYPE,Pitch Label,NEXT BALLS,NEXT STRIKES,NEXT BATTER,NEXT BATTERID,NEXT OUTS,NEXT INNING
0,1,2018-03-29 14:40:39,1,1,"Hamels, Cole",430935,0,"Springer, George",543807,1,...,0.624597,20180329-Arlington-1,FA,3,1,0,"Springer, George",543807,0,1
1,2,2018-03-29 14:40:53,1,2,"Hamels, Cole",430935,0,"Springer, George",543807,1,...,0.627089,20180329-Arlington-1,SI,6,2,0,"Springer, George",543807,0,1
2,3,2018-03-29 14:41:08,1,3,"Hamels, Cole",430935,0,"Springer, George",543807,1,...,0.633082,20180329-Arlington-1,FA,3,0,0,"Bregman, Alex",608324,0,1
3,4,2018-03-29 14:41:52,2,1,"Hamels, Cole",430935,0,"Bregman, Alex",608324,1,...,0.668849,20180329-Arlington-1,FA,3,1,0,"Bregman, Alex",608324,0,1
4,5,2018-03-29 14:42:05,2,2,"Hamels, Cole",430935,0,"Bregman, Alex",608324,1,...,0.676090,20180329-Arlington-1,FA,3,0,0,"Altuve, Jose",514888,0,1
5,6,2018-03-29 14:42:50,3,1,"Hamels, Cole",430935,0,"Altuve, Jose",514888,1,...,0.585546,20180329-Arlington-1,SI,6,0,1,"Altuve, Jose",514888,0,1
6,7,2018-03-29 14:43:07,3,2,"Hamels, Cole",430935,0,"Altuve, Jose",514888,1,...,0.496494,20180329-Arlington-1,CH,0,1,1,"Altuve, Jose",514888,0,1
7,8,2018-03-29 14:43:33,3,3,"Hamels, Cole",430935,0,"Altuve, Jose",514888,1,...,0.559215,20180329-Arlington-1,FC,4,0,0,"Correa, Carlos",621043,1,1
8,9,2018-03-29 14:44:16,4,1,"Hamels, Cole",430935,0,"Correa, Carlos",621043,1,...,0.461582,20180329-Arlington-1,CU,2,1,0,"Correa, Carlos",621043,1,1
9,10,2018-03-29 14:44:54,4,2,"Hamels, Cole",430935,0,"Correa, Carlos",621043,1,...,0.573580,20180329-Arlington-1,FC,4,2,0,"Correa, Carlos",621043,1,1
