In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [2]:
pitches = pd.read_csv('pitchdata_3_18.csv')

In [3]:
pitches.columns

Index(['PITCHNO', 'TM_DATE', 'TM_TIME', 'PAOFINNING', 'PITCHOFPA', 'PITCHER',
       'PITCHERID', 'PITCHERTHROWS', 'PITCHERTEAM', 'BATTER', 'BATTERID',
       'BATTERSIDE', 'BATTERTEAM', 'PITCHERSET', 'INNING', 'TOP_BOTTOM',
       'OUTS', 'BALLS', 'STRIKES', 'TAGGEDPITCHTYPE', 'AUTOPITCHTYPE',
       'PITCHCALL', 'KORBB', 'RELSPEED', 'EFFECTIVEVELO', 'VERTRELANGLE',
       'HORZRELANGLE', 'SPINRATE', 'SPINAXIS', 'TILT', 'RELHEIGHT', 'RELSIDE',
       'EXTENSION', 'VERTBREAK', 'INDUCEDVERTBREAK', 'HORZBREAK',
       'PLATELOCHEIGHT', 'PLATELOCSIDE', 'ZONESPEED', 'SPEEDDROP',
       'VERTAPPRANGLE', 'HORZAPPRANGLE', 'ZONETIME', 'PITCHLASTMEASUREDX',
       'PITCHLASTMEASUREDY', 'PITCHLASTMEASUREDZ', 'PFXX', 'PFXZ', 'X0', 'Y0',
       'Z0', 'VX0', 'VY0', 'VZ0', 'AX0', 'AY0', 'AZ0', 'GAMEID',
       'PI_PITCH_TYPE'],
      dtype='object')

In [4]:
pitch_list = []

In [5]:
pitch_label_col = "Pitch Type"
pitch_type_col = "PI_PITCH_TYPE"
cols_norm = ['RELSPEED','VERTRELANGLE','HORZRELANGLE','SPINRATE','SPINAXIS','RELHEIGHT','RELSIDE','EXTENSION','VERTBREAK','INDUCEDVERTBREAK','HORZBREAK','PLATELOCHEIGHT','PLATELOCSIDE','VERTAPPRANGLE','HORZAPPRANGLE','PFXX','PFXZ','X0','Y0','Z0','VX0','VY0','VZ0','AX0','AY0','AZ0']
cols = ['PITCHNO', 'TM_DATE_TIME','PAOFINNING','PITCHOFPA','PITCHER','PITCHERID','PITCHERTHROWS','BATTER','BATTERID','BATTERSIDE','BATTERTEAM','PITCHERSET','INNING', 'TOP_BOTTOM','OUTS','BALLS','STRIKES','PITCHCALL','KORBB','RELSPEED','VERTRELANGLE','HORZRELANGLE','SPINRATE','SPINAXIS','TILT','RELHEIGHT','RELSIDE','EXTENSION','VERTBREAK','INDUCEDVERTBREAK','HORZBREAK','PLATELOCHEIGHT','PLATELOCSIDE','VERTAPPRANGLE','HORZAPPRANGLE','PFXX','PFXZ','X0','Y0','Z0','VX0','VY0','VZ0','AX0','AY0','AZ0','GAMEID','PI_PITCH_TYPE']

In [6]:
# This function encodes several columns including `batter side` and `pitcher throwing hand`
def get_columns(df):
    df['TM_DATE_TIME'] = pd.to_datetime(df['TM_DATE'] + ' ' + df['TM_TIME'])
    df['BATTERSIDE'] = df['BATTERSIDE'].map({'Right': 1, 'Left': 0})
    df['PITCHERTHROWS'] = df['PITCHERTHROWS'].map({'Right': 1, 'Left': 0})
    return df[cols]

In [7]:
# This function normalizes each numerical column
def normalize_cols(df):
    df_copy = df.copy()
    df_copy[cols_norm] = df_copy[cols_norm].apply(lambda x: (x - x.min()) / (x.max() - x.min()))
    return df_copy

In [8]:
# This function encodes the label (pitch type) of the current pitch
def pitch_labels(df):
    df[pitch_type_col] = df[pitch_type_col].astype('category')
    df[pitch_label_col] = df[pitch_type_col].cat.codes
    return df

In [9]:
# This function encodes the outcome of the current pitch.
def encoded(df):
    pitchcall_col = 'PITCHCALL'
    pitchcall_encoded_col = 'PitchOutcome'
    df[pitchcall_col] = df[pitchcall_col].astype('category')
    df[pitchcall_encoded_col] = df[pitchcall_col].cat.codes
    
    pitcher_col = 'PITCHER'
    pitcher_encoded_col = 'Pitcher Id Norm'
    df[pitcher_col] = df[pitcher_col].astype('category')
    df[pitcher_encoded_col] = df[pitcher_col].cat.codes
    
    batter_col = 'BATTER'
    batter_encoded_col = 'Batter Id Norm'
    df[batter_col] = df[batter_col].astype('category')
    df[batter_encoded_col] = df[batter_col].cat.codes
    return df

In [10]:
pitches_select = get_columns(pitches)

In [11]:
pitches_encoded = encoded(pitches_select)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/

In [12]:
pitches_norm = normalize_cols(pitches_encoded)

In [13]:
pitches_labels = pitch_labels(pitches_norm)

In [14]:
pitches_labels.head()

Unnamed: 0,PITCHNO,TM_DATE_TIME,PAOFINNING,PITCHOFPA,PITCHER,PITCHERID,PITCHERTHROWS,BATTER,BATTERID,BATTERSIDE,...,VZ0,AX0,AY0,AZ0,GAMEID,PI_PITCH_TYPE,PitchOutcome,Pitcher Id Norm,Batter Id Norm,Pitch Type
0,1,2018-03-29 14:40:39,1,1,"Hamels, Cole",430935,0,"Springer, George",543807,1,...,0.275225,0.748064,0.538883,0.624597,20180329-Arlington-1,FA,0,105,345,3
1,2,2018-03-29 14:40:53,1,2,"Hamels, Cole",430935,0,"Springer, George",543807,1,...,0.299847,0.761498,0.549332,0.627089,20180329-Arlington-1,SI,0,105,345,6
2,3,2018-03-29 14:41:08,1,3,"Hamels, Cole",430935,0,"Springer, George",543807,1,...,0.314069,0.754353,0.576244,0.633082,20180329-Arlington-1,FA,3,105,345,3
3,4,2018-03-29 14:41:52,2,1,"Hamels, Cole",430935,0,"Bregman, Alex",608324,1,...,0.334423,0.731836,0.559973,0.668849,20180329-Arlington-1,FA,0,105,43,3
4,5,2018-03-29 14:42:05,2,2,"Hamels, Cole",430935,0,"Bregman, Alex",608324,1,...,0.286454,0.742238,0.59039,0.67609,20180329-Arlington-1,FA,3,105,43,3


In [15]:
print(pitches_labels.columns)

Index(['PITCHNO', 'TM_DATE_TIME', 'PAOFINNING', 'PITCHOFPA', 'PITCHER',
       'PITCHERID', 'PITCHERTHROWS', 'BATTER', 'BATTERID', 'BATTERSIDE',
       'BATTERTEAM', 'PITCHERSET', 'INNING', 'TOP_BOTTOM', 'OUTS', 'BALLS',
       'STRIKES', 'PITCHCALL', 'KORBB', 'RELSPEED', 'VERTRELANGLE',
       'HORZRELANGLE', 'SPINRATE', 'SPINAXIS', 'TILT', 'RELHEIGHT', 'RELSIDE',
       'EXTENSION', 'VERTBREAK', 'INDUCEDVERTBREAK', 'HORZBREAK',
       'PLATELOCHEIGHT', 'PLATELOCSIDE', 'VERTAPPRANGLE', 'HORZAPPRANGLE',
       'PFXX', 'PFXZ', 'X0', 'Y0', 'Z0', 'VX0', 'VY0', 'VZ0', 'AX0', 'AY0',
       'AZ0', 'GAMEID', 'PI_PITCH_TYPE', 'PitchOutcome', 'Pitcher Id Norm',
       'Batter Id Norm', 'Pitch Type'],
      dtype='object')


In [16]:
# Adds a few columns for the purposes of predicting the pitch given information about the following pitch 
# and the upcoming situation.
# Columns include:
# 1. Next batter
# 2. Next pitch count (Balls, Strikes)
# 3. Next outs
# 4. Next inning
# 5. Next pitch (for prediction purposes) ? 
def get_next_cols(df):
    new_cols = {'BALLS': 'NEXT BALLS', 'STRIKES': 'NEXT STRIKES', 
                'BATTER': 'NEXT BATTER', 'Batter Id Norm': 'NEXT BATTERID',
                'OUTS': 'NEXT OUTS', 'INNING': 'NEXT INNING', 'PAOFINNING': 'PA NEXT',
                'Pitcher Id Norm': 'NEXT PITCHERID'}
    
    for col in new_cols:
        
        df[new_cols[col]] = df.groupby(['GAMEID', 'TOP_BOTTOM'])[col].shift(-1)
        
    return df 

In [17]:
p = get_next_cols(pitches_labels)

In [18]:
next_pitch = p.copy()

In [19]:
next_pitch['Next Pitch Label'] = next_pitch.groupby(['GAMEID', 'TOP_BOTTOM'])['Pitch Type'].shift(-1)

In [20]:
next_pitch.head()

Unnamed: 0,PITCHNO,TM_DATE_TIME,PAOFINNING,PITCHOFPA,PITCHER,PITCHERID,PITCHERTHROWS,BATTER,BATTERID,BATTERSIDE,...,Pitch Type,NEXT BALLS,NEXT STRIKES,NEXT BATTER,NEXT BATTERID,NEXT OUTS,NEXT INNING,PA NEXT,NEXT PITCHERID,Next Pitch Label
0,1,2018-03-29 14:40:39,1,1,"Hamels, Cole",430935,0,"Springer, George",543807,1,...,3,1.0,0.0,"Springer, George",345.0,0.0,1.0,1.0,105.0,6.0
1,2,2018-03-29 14:40:53,1,2,"Hamels, Cole",430935,0,"Springer, George",543807,1,...,6,2.0,0.0,"Springer, George",345.0,0.0,1.0,1.0,105.0,3.0
2,3,2018-03-29 14:41:08,1,3,"Hamels, Cole",430935,0,"Springer, George",543807,1,...,3,0.0,0.0,"Bregman, Alex",43.0,0.0,1.0,2.0,105.0,3.0
3,4,2018-03-29 14:41:52,2,1,"Hamels, Cole",430935,0,"Bregman, Alex",608324,1,...,3,1.0,0.0,"Bregman, Alex",43.0,0.0,1.0,2.0,105.0,3.0
4,5,2018-03-29 14:42:05,2,2,"Hamels, Cole",430935,0,"Bregman, Alex",608324,1,...,3,0.0,0.0,"Altuve, Jose",11.0,0.0,1.0,3.0,105.0,6.0


In [21]:
next_pitch.columns

Index(['PITCHNO', 'TM_DATE_TIME', 'PAOFINNING', 'PITCHOFPA', 'PITCHER',
       'PITCHERID', 'PITCHERTHROWS', 'BATTER', 'BATTERID', 'BATTERSIDE',
       'BATTERTEAM', 'PITCHERSET', 'INNING', 'TOP_BOTTOM', 'OUTS', 'BALLS',
       'STRIKES', 'PITCHCALL', 'KORBB', 'RELSPEED', 'VERTRELANGLE',
       'HORZRELANGLE', 'SPINRATE', 'SPINAXIS', 'TILT', 'RELHEIGHT', 'RELSIDE',
       'EXTENSION', 'VERTBREAK', 'INDUCEDVERTBREAK', 'HORZBREAK',
       'PLATELOCHEIGHT', 'PLATELOCSIDE', 'VERTAPPRANGLE', 'HORZAPPRANGLE',
       'PFXX', 'PFXZ', 'X0', 'Y0', 'Z0', 'VX0', 'VY0', 'VZ0', 'AX0', 'AY0',
       'AZ0', 'GAMEID', 'PI_PITCH_TYPE', 'PitchOutcome', 'Pitcher Id Norm',
       'Batter Id Norm', 'Pitch Type', 'NEXT BALLS', 'NEXT STRIKES',
       'NEXT BATTER', 'NEXT BATTERID', 'NEXT OUTS', 'NEXT INNING', 'PA NEXT',
       'NEXT PITCHERID', 'Next Pitch Label'],
      dtype='object')

In [22]:
input_cols = ['PAOFINNING', 'PITCHOFPA', 'Pitcher Id Norm', 'PITCHERTHROWS', 'BATTERSIDE', 'INNING', 'OUTS', 
             'BALLS', 'STRIKES', 'PitchOutcome', 'RELSPEED', 'VERTRELANGLE', 'HORZRELANGLE', 'SPINRATE',
             'SPINAXIS', 'RELHEIGHT', 'RELSIDE', 'EXTENSION', 'VERTBREAK', 'INDUCEDVERTBREAK', 'HORZBREAK',
             'PLATELOCHEIGHT', 'PLATELOCSIDE', 'VERTAPPRANGLE', 'HORZAPPRANGLE', 'PFXX', 'PFXZ', 'X0', 'Z0',
             'VX0', 'VY0', 'VZ0', 'AX0', 'AY0', 'AZ0', 'Batter Id Norm', 'NEXT BALLS', 'NEXT STRIKES', 'NEXT OUTS',
             'NEXT INNING', 'PA NEXT', 'NEXT PITCHERID', 'Pitch Type', 'Next Pitch Label']

In [23]:
input_df = next_pitch[input_cols]
input_df.head()

Unnamed: 0,PAOFINNING,PITCHOFPA,Pitcher Id Norm,PITCHERTHROWS,BATTERSIDE,INNING,OUTS,BALLS,STRIKES,PitchOutcome,...,AZ0,Batter Id Norm,NEXT BALLS,NEXT STRIKES,NEXT OUTS,NEXT INNING,PA NEXT,NEXT PITCHERID,Pitch Type,Next Pitch Label
0,1,1,105,0,1,1,0,0,0,0,...,0.624597,345,1.0,0.0,0.0,1.0,1.0,105.0,3,6.0
1,1,2,105,0,1,1,0,1,0,0,...,0.627089,345,2.0,0.0,0.0,1.0,1.0,105.0,6,3.0
2,1,3,105,0,1,1,0,2,0,3,...,0.633082,345,0.0,0.0,0.0,1.0,2.0,105.0,3,3.0
3,2,1,105,0,1,1,0,0,0,0,...,0.668849,43,1.0,0.0,0.0,1.0,2.0,105.0,3,3.0
4,2,2,105,0,1,1,0,1,0,3,...,0.67609,43,0.0,0.0,0.0,1.0,3.0,105.0,3,6.0


In [26]:
train = input_df.sample(frac=0.7).dropna()
X = train[input_cols[0:len(input_cols) - 1]]
y = train[input_cols[-1]]
clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
clf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [None]:
train.columns

In [27]:
predictions = clf.predict(X)

In [28]:
train['predictions'] = predictions

In [34]:
train.loc[train['predictions'] != 3.0].head()

Unnamed: 0,PAOFINNING,PITCHOFPA,Pitcher Id Norm,PITCHERTHROWS,BATTERSIDE,INNING,OUTS,BALLS,STRIKES,PitchOutcome,...,Batter Id Norm,NEXT BALLS,NEXT STRIKES,NEXT OUTS,NEXT INNING,PA NEXT,NEXT PITCHERID,Pitch Type,Next Pitch Label,predictions
11307,2,5,202,1,1,2,1,2,2,1,...,106,2.0,2.0,1.0,2.0,2.0,202.0,6,6.0,6.0
4588,6,1,185,1,1,2,2,0,0,4,...,51,0.0,1.0,2.0,2.0,6.0,185.0,6,0.0,6.0
3471,1,1,263,1,1,5,0,0,0,1,...,124,0.0,1.0,0.0,5.0,1.0,263.0,6,0.0,6.0
11112,3,1,162,1,0,5,2,0,0,0,...,173,1.0,0.0,2.0,5.0,3.0,162.0,6,6.0,6.0
4520,1,2,287,1,0,1,0,0,1,1,...,125,0.0,2.0,0.0,1.0,1.0,287.0,6,6.0,6.0
