In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
pitches = pd.read_csv('pitchdata_3_18.csv')

In [None]:
df_list = []

In [None]:
for month in range(3, 11):
    if month == 5 or month == 7:
        pitch1 = pd.read_csv('pitchdata_' + str(month) + '_15_18.csv')
        pitch2 = pd.read_csv('pitchdata_' + str(month) + '_31_18.csv')
        df_list.append(pitch1)
        df_list.append(pitch2)
    elif month == 6 or month == 8 or month == 9:
        pitch1 = pd.read_csv('pitchdata_' + str(month) + '_15_18.csv')
        pitch2 = pd.read_csv('pitchdata_' + str(month) + '_30_18.csv')
        df_list.append(pitch1)
        df_list.append(pitch2)
    else:
        pitch = pd.read_csv('pitchdata_' + str(month) + '_18.csv')
        df_list.append(pitch) 

In [None]:
pitches = pd.concat(df_list)

In [3]:
pitch_label_col = "Pitch Type"
pitch_type_col = "PI_PITCH_TYPE"
cols_norm = ['RELSPEED','VERTRELANGLE','HORZRELANGLE','SPINRATE','SPINAXIS','RELHEIGHT','RELSIDE','EXTENSION','VERTBREAK','INDUCEDVERTBREAK','HORZBREAK','PLATELOCHEIGHT','PLATELOCSIDE','VERTAPPRANGLE','HORZAPPRANGLE','PFXX','PFXZ','X0','Y0','Z0','VX0','VY0','VZ0','AX0','AY0','AZ0']
cols = ['PITCHNO', 'TM_DATE_TIME','PAOFINNING','PITCHOFPA','PITCHER','PITCHERID','PITCHERTHROWS','BATTER','BATTERID','BATTERSIDE','BATTERTEAM','PITCHERSET','INNING', 'TOP_BOTTOM','OUTS','BALLS','STRIKES','PITCHCALL','KORBB','RELSPEED','VERTRELANGLE','HORZRELANGLE','SPINRATE','SPINAXIS','TILT','RELHEIGHT','RELSIDE','EXTENSION','VERTBREAK','INDUCEDVERTBREAK','HORZBREAK','PLATELOCHEIGHT','PLATELOCSIDE','VERTAPPRANGLE','HORZAPPRANGLE','PFXX','PFXZ','X0','Y0','Z0','VX0','VY0','VZ0','AX0','AY0','AZ0','GAMEID','PI_PITCH_TYPE']

In [4]:
# This function encodes several columns including `batter side` and `pitcher throwing hand`
def get_columns(df):
    df['TM_DATE_TIME'] = pd.to_datetime(df['TM_DATE'] + ' ' + df['TM_TIME'])
    df['BATTERSIDE'] = df['BATTERSIDE'].map({'Right': 1, 'Left': 0})
    df['PITCHERTHROWS'] = df['PITCHERTHROWS'].map({'Right': 1, 'Left': 0})
    return df[cols]

In [5]:
# This function normalizes each numerical column
def normalize_cols(df):
    df_copy = df.copy()
    df_copy[cols_norm] = df_copy[cols_norm].apply(lambda x: (x - x.min()) / (x.max() - x.min()))
    return df_copy

In [6]:
# This function encodes the label (pitch type) of the current pitch
def pitch_labels(df):
    df[pitch_type_col] = df[pitch_type_col].astype('category')
    df[pitch_label_col] = df[pitch_type_col].cat.codes
    return df

In [7]:
# This function encodes the outcome of the current pitch.
def encoded(df):
    pitchcall_col = 'PITCHCALL'
    pitchcall_encoded_col = 'PitchOutcome'
    df[pitchcall_col] = df[pitchcall_col].astype('category')
    df[pitchcall_encoded_col] = df[pitchcall_col].cat.codes
    
    pitcher_col = 'PITCHER'
    pitcher_encoded_col = 'Pitcher Id Norm'
    df[pitcher_col] = df[pitcher_col].astype('category')
    df[pitcher_encoded_col] = df[pitcher_col].cat.codes
    
    batter_col = 'BATTER'
    batter_encoded_col = 'Batter Id Norm'
    df[batter_col] = df[batter_col].astype('category')
    df[batter_encoded_col] = df[batter_col].cat.codes
    return df

In [8]:
# Adds a few columns for the purposes of predicting the pitch given information about the following pitch 
# and the upcoming situation.
# Columns include:
# 1. Next batter
# 2. Next pitch count (Balls, Strikes)
# 3. Next outs
# 4. Next inning

def get_next_cols(df):
    new_cols = {'BALLS': 'NEXT BALLS', 'STRIKES': 'NEXT STRIKES', 
                'BATTER': 'NEXT BATTER', 'Batter Id Norm': 'NEXT BATTERID',
                'OUTS': 'NEXT OUTS', 'INNING': 'NEXT INNING', 'PAOFINNING': 'PA NEXT',
                'Pitcher Id Norm': 'NEXT PITCHERID'}
    
    for col in new_cols:
        
        df[new_cols[col]] = df.groupby(['GAMEID', 'TOP_BOTTOM'])[col].shift(-1)
        
    return df

In [17]:
def get_next_pitch_label(df, pitch_label_col='Next Pitch Label'):
    df[pitch_label_col] = df.groupby(['GAMEID', 'TOP_BOTTOM'])['Pitch Type'].shift(-1)
    return df 

In [10]:
pitches_select = get_columns(pitches)

In [11]:
pitches_encoded = encoded(pitches_select)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/

In [12]:
pitches_norm = normalize_cols(pitches_encoded)

In [13]:
pitches_labels = pitch_labels(pitches_norm)

In [14]:
p = get_next_cols(pitches_labels)

In [56]:
next_pitch = get_next_pitch_label(p)

In [57]:
next_pitch.head()

Unnamed: 0,PITCHNO,TM_DATE_TIME,PAOFINNING,PITCHOFPA,PITCHER,PITCHERID,PITCHERTHROWS,BATTER,BATTERID,BATTERSIDE,...,Pitch Type,NEXT BALLS,NEXT STRIKES,NEXT BATTER,NEXT BATTERID,NEXT OUTS,NEXT INNING,PA NEXT,NEXT PITCHERID,Next Pitch Label
0,1,2018-03-29 14:40:39,1,1,"Hamels, Cole",430935,0,"Springer, George",543807,1,...,3,1.0,0.0,"Springer, George",345.0,0.0,1.0,1.0,105.0,6.0
1,2,2018-03-29 14:40:53,1,2,"Hamels, Cole",430935,0,"Springer, George",543807,1,...,6,2.0,0.0,"Springer, George",345.0,0.0,1.0,1.0,105.0,3.0
2,3,2018-03-29 14:41:08,1,3,"Hamels, Cole",430935,0,"Springer, George",543807,1,...,3,0.0,0.0,"Bregman, Alex",43.0,0.0,1.0,2.0,105.0,3.0
3,4,2018-03-29 14:41:52,2,1,"Hamels, Cole",430935,0,"Bregman, Alex",608324,1,...,3,1.0,0.0,"Bregman, Alex",43.0,0.0,1.0,2.0,105.0,3.0
4,5,2018-03-29 14:42:05,2,2,"Hamels, Cole",430935,0,"Bregman, Alex",608324,1,...,3,0.0,0.0,"Altuve, Jose",11.0,0.0,1.0,3.0,105.0,6.0


In [21]:
input_cols = ['PAOFINNING', 'PITCHOFPA', 'Pitcher Id Norm', 'PITCHERTHROWS', 'BATTERSIDE', 'INNING', 'OUTS', 
             'BALLS', 'STRIKES', 'PitchOutcome', 'RELSPEED', 'VERTRELANGLE', 'HORZRELANGLE', 'SPINRATE',
             'SPINAXIS', 'RELHEIGHT', 'RELSIDE', 'EXTENSION', 'VERTBREAK', 'INDUCEDVERTBREAK', 'HORZBREAK',
             'PLATELOCHEIGHT', 'PLATELOCSIDE', 'VERTAPPRANGLE', 'HORZAPPRANGLE', 'PFXX', 'PFXZ', 'X0', 'Z0',
             'VX0', 'VY0', 'VZ0', 'AX0', 'AY0', 'AZ0', 'Batter Id Norm', 'NEXT BALLS', 'NEXT STRIKES', 'NEXT OUTS',
             'NEXT INNING', 'PA NEXT', 'NEXT PITCHERID', 'Pitch Type', 'Next Pitch Label']

## The following code selects only the columns that are to be included in the training process

In [22]:
input_df = next_pitch[input_cols]
input_df.head()

Unnamed: 0,PAOFINNING,PITCHOFPA,Pitcher Id Norm,PITCHERTHROWS,BATTERSIDE,INNING,OUTS,BALLS,STRIKES,PitchOutcome,...,AZ0,Batter Id Norm,NEXT BALLS,NEXT STRIKES,NEXT OUTS,NEXT INNING,PA NEXT,NEXT PITCHERID,Pitch Type,Next Pitch Label
0,1,1,105,0,1,1,0,0,0,0,...,0.624597,345,1.0,0.0,0.0,1.0,1.0,105.0,3,6.0
1,1,2,105,0,1,1,0,1,0,0,...,0.627089,345,2.0,0.0,0.0,1.0,1.0,105.0,6,3.0
2,1,3,105,0,1,1,0,2,0,3,...,0.633082,345,0.0,0.0,0.0,1.0,2.0,105.0,3,3.0
3,2,1,105,0,1,1,0,0,0,0,...,0.668849,43,1.0,0.0,0.0,1.0,2.0,105.0,3,3.0
4,2,2,105,0,1,1,0,1,0,3,...,0.67609,43,0.0,0.0,0.0,1.0,3.0,105.0,3,6.0


## This code sets up a Random Forest Classifier training data, releases a new dataset with the model's predictions as well as the model itself

In [23]:
def RandomForest(df, sample_fraction=0.7, input_cols=input_cols):
    
    train, test = train_test_split(df, test_size=0.2)
    
    X_train = train[input_cols[0:len(input_cols) - 1]]
    y_train = train[input_cols[-1]]
    
    X_test = test[input_cols[0:len(input_cols) - 1]]
    y_test = test[input_cols[-1]]
    
    RFModel = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)
    RFModel.fit(X_train, y_train)
    
    predictions = clf.predict(X_train)
    train['predictions'] = predictions
    
    predictions_test = clf.predict(X_test)
    test['predictions'] = predictions_test
    
    score_train = RFModel.score(X_train, y_train)
    score_test = RFModel.score(X_test, y_test)
    
    print("Training accuracy: " + str(score_train))
    print("Testing accuracy: " + str(score_test))
    
    return train, test, RFModel

## Now we need to break up the data so that we can use a LSTM. I think we'll need to use sorting:
1. Game ID
2. Top Bottom

Then split up the data that have different values for the above. 

In [116]:
next_pitch['values'] = next_pitch[input_cols[:len(input_cols) - 1]].apply(lambda x: list(x), axis=1)

In [117]:
data_ready = next_pitch.groupby(['GAMEID', 'TOP_BOTTOM']).agg({'values': lambda x: list(x), 
                                                               'Next Pitch Label': lambda x : list(x)})
data_ready.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,values,Next Pitch Label
GAMEID,TOP_BOTTOM,Unnamed: 2_level_1,Unnamed: 3_level_1
20180329-Arlington-1,Bottom,"[[1.0, 1.0, 272.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0....","[3.0, 7.0, 3.0, 3.0, 3.0, 3.0, 3.0, 2.0, 7.0, ..."
20180329-Arlington-1,Top,"[[1.0, 1.0, 105.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0....","[6.0, 3.0, 3.0, 3.0, 6.0, 0.0, 4.0, 2.0, 4.0, ..."
20180329-CITI-1,Bottom,"[[1.0, 1.0, 158.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0....","[6.0, 4.0, 3.0, 6.0, 6.0, 3.0, 7.0, 3.0, 6.0, ..."
20180329-CITI-1,Top,"[[1.0, 1.0, 260.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0....","[6.0, 3.0, 6.0, 6.0, 6.0, 0.0, 3.0, 6.0, 7.0, ..."
20180329-ChaseField-1,Bottom,"[[1.0, 1.0, 97.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0...","[7.0, 3.0, 7.0, 3.0, 3.0, 7.0, 7.0, 7.0, 3.0, ..."


In [118]:
values = data_ready['values'].values
labels = data_ready['Next Pitch Label'].values

In [131]:
fin_values = []
fin_labels = []

In [132]:
# Fin values should roughly correspond to 
for val in values:
    fin_values.append(val[0])
for label in labels:
    fin_labels.append(label)

In [134]:
def generate_batch(batch_size, values, labels):
    
    total = len(values)
    
    indices = np.random.choice(total, batch_size, replace=False)
    
    batch_values = []
    batch_labels = []
    
    for ind in indices:
        batch_values.append(values[ind])
        batch_labels.append(labels[ind])
    
    return batch_values, batch_labels

In [None]:
class LSTMModel():
    
    def __init__(self, rnn_size, output_size, learning_rate=1e-4):

        self.inputs = tf.placeholder(tf.float32, shape=[None, None, embedding_size])
        self.labels = tf.placeholder(tf.int32, shape=[None, 1])
    
        lm_cell = tf.nn.rnn_cell.LSTMCell(rnn_size)
    
        outputs, states = tf.nn.dynamic_rnn(lm_cell, self.inputs, dtype=tf.float32)
    
        self.output_logits = tf.layers.dense(outputs, output_size)
    
        self.loss = tf.losses.sparse_softmax_cross_entropy(self.labels, self.output_logits)
        
        optimizer = tf.train.AdamOptimizer(learning_rate)
        
        self.global_step = tf.train.get_or_create_global_step()
        self.train_op = optimizer.minimize(self.loss)
        self.saver = tf.train.Saver()