In [1]:
import math

from sklearn import metrics
import pandas as pd
import numpy as np

data_folder = "./data/"

#Get and load data
# pitch_14_17_file = "pitcher_2014_2017.csv"
bball_data_2_file = "Baseball Data-2.csv"

TRAINING_FEATURES = ['RunsScored', 'VertBreak', 'HorzBreak', 'PlateLocSide', 'ZoneSpeed',
       'VertApprAngle', 'HorzApprAngle', 'ZoneTime', 'BallStrikeNum', 'norm_PlateLocHeight']
       
LABELS_FEATURE = ['GroundTruth']
SUBCATEGORY_KEYS = ['Pitcher', 'Batter', 'PitcherThrows', 'BatterSide', 'TaggedPitchType']
PREPROCESSING_KEYS = ['Balls', 'Strikes', 'PitchCall', 'PlateLocHeight']

PCT_FOR_TRAIN = 0.7

In [2]:
#normalizing function which moves 0 of PlateLocHeight to center of strike zone from ground
#returns a value 2.5ft (30in) less than the original value
def normalize_PlateLocHeight(PlateLocHeight):
    return ((PlateLocHeight * 12) - 30)/12


#preprocessing function used to calculate the plate location (this will be different than PitchCall (even in terms of BallCalled vs StrikeCalled))
#returns classification: heart (strike) = 0, shadow (strike) = 1, shadow (ball) = 2, chase (ball) = 3, waste (ball) = 4
def PlateZone(PlateLocHeight, PlateLocSide):
    FOOT = 12

    #waste zone
    #outside 84in to 6in, -20in to 20in horizontal, (strike zone * 200%)
    if ((PlateLocHeight > 7 or PlateLocHeight < 0.5) and (PlateLocSide < -(20/FOOT) or PlateLocSide > (20/FOOT))):
        return 4

    #heart zone
    #inside 38in to 22in vertical, -6.7in to 6.7in horizontal, (strike zone size * 67%)
    if ((PlateLocHeight < (38/FOOT) and PlateLocHeight > (22/FOOT)) and (PlateLocSide > (-6.7/FOOT) and PlateLocSide < (6.7/FOOT))):
        return 0

    #strike zone 
    #inside 42in to 18in vertical, -10in to 10in horizontal
    if (PlateLocHeight < (42/FOOT) and PlateLocHeight > (18/FOOT) and (PlateLocSide > (-10/FOOT) and PlateLocSide < (10/FOOT))):
        return 1

    #shadow zone
    #inside 46in to 14in vertical, -13.3in to 13.3in horizontal, (strike zone size * 133%)
    if (PlateLocHeight < (46/FOOT) and PlateLocHeight > (14/FOOT) and (PlateLocSide > (-13.3/FOOT) and PlateLocSide < (13.3/FOOT))):
        return 2

    #chase inside 84in to 6in, -20in to 20in horizontal, (strike zone * 200%)
    return 3


#preprocessing function used to generate a single number that will be used to classify the ball/strike count before the current pitch 
#returns int [0 - 11]
def PitchCount(balls, strikes):
    # Strikes: 0  1   2
    # Balls v|---------- 
    #       0| 0  1   2 
    #       1| 3  4   5
    #       2| 6  7   8
    #       3| 9  10  11
    if(balls == 0):
        if(strikes == 0):
            return 0
        if (strikes == 1):
            return 1
        return 2
    if(balls == 1):
        if(strikes == 0):
            return 3
        if (strikes == 1):
            return 4
        return 5
    if(balls == 2):
        if(strikes == 0):
            return 6
        if (strikes == 1):
            return 7
        return 8
    if(balls == 3):
        if(strikes == 0):
            return 9
        if (strikes == 1):
            return 10
        return 11


#preprocessing function generates the ground truth hitability of a pitch
#these values will definitely need to be adjusted
def GenerateGroundTruthLabels(pitchCall):
    if pitchCall == 'BallCalled':
        return 0
    if pitchCall == 'BallIntentional' or pitchCall == 'HitByPitch':
        return 1
    if pitchCall == 'StrikeSwinging' or pitchCall == 'StrikeCalled':
        return 2
    else:
        return 3

def GetSubCategory(data=None,feature=None, key=None):
    if data is None or feature is None or key is None:
        print("feature and key are required parameters, data is optional")
        return None
    
    #determine the rows to be dropped
    sub_data = data.loc[data[feature] == key]
    return sub_data

In [3]:
from sklearn.neural_network import MLPClassifier

#use this cell to find the optimal hyperparameters for the model
def TrainValidateNN(training_data, validation_data, learning_rate_init, batch_size, hidden_layers):
    X = training_data.loc[:,~training_data.columns.isin(LABELS_FEATURE)]
    y = training_data.loc[:,training_data.columns == LABELS_FEATURE[0]]

    vX = validation_data.loc[:,~validation_data.columns.isin(LABELS_FEATURE)]
    vy = validation_data.loc[:,validation_data.columns == LABELS_FEATURE[0]]

    parameter_accuracy = []
    for hl in hidden_layers:
        for bs in batch_size:
            for lri in learning_rate_init:
                mlp = MLPClassifier(solver='adam',max_iter=1000,hidden_layer_sizes=hl, batch_size=bs,learning_rate_init=lri)
                mlp.fit(X,y.values.ravel())

                #then check accuracy of validation data
                val_pred = mlp.predict(vX)
                score = metrics.accuracy_score(vy,val_pred)
                #put parameters and accuracy in matrix
                parameter_accuracy.append((hl,bs,lri,score))
                print(hl,bs,lri,score)



    #select parameters with highest accuracy
    parameter_accuracy.sort(key = lambda x:x[3])

    best_parameters = parameter_accuracy[-1]
    print(best_parameters)

    # #train new neural net using best hyperparameters found above
    best_neural_net = MLPClassifier(solver='adam',hidden_layer_sizes=best_parameters[0], batch_size=best_parameters[1], learning_rate_init=best_parameters[2])
    best_neural_net = best_neural_net.fit(X, y.values.ravel())

    res_pred = best_neural_net.predict(vX)
    score = metrics.accuracy_score(vy,res_pred)
    print(score*100)

    return best_neural_net, best_parameters[0], best_parameters[1], best_parameters[2]

In [4]:
#import the columns we will need for training and preprocessing
bball_data = pd.read_csv(data_folder+bball_data_2_file, usecols=['Pitcher', 'PitcherThrows', 'Batter', 'BatterSide', 'PitchCall', 'RunsScored', 'VertBreak', 'HorzBreak', 'ZoneSpeed', 'VertApprAngle', 'HorzApprAngle', 'ZoneTime', 'PlateLocHeight', 'PlateLocSide', 'Balls', 'Strikes', 'TaggedPitchType'])

#run preprocessing functions and normalize data
bball_data['BallStrikeNum'] = bball_data.apply(lambda pitch : PitchCount(pitch['Balls'], pitch['Strikes']), axis=1)
bball_data['GroundTruth'] = bball_data.apply(lambda pitch : GenerateGroundTruthLabels(pitch['PitchCall']), axis=1)
bball_data['norm_PlateLocHeight'] = bball_data.apply(lambda pitch : normalize_PlateLocHeight(pitch['PlateLocHeight']), axis=1)
#bball_data['Zone'] = bball_data.apply(lambda pitch : PlateZone(pitch['PlateLocHeight'], pitch['PlateLocSide']))

#drop all features used for preprocessing
bball_data.drop(labels=PREPROCESSING_KEYS, axis=1, inplace=True)

#Remove NaN valued rows
bball_data.dropna(inplace=True)
bball_data.reset_index(drop=True, inplace=True)

#drop all features that could be useful subcategories
bball_data.drop(labels=SUBCATEGORY_KEYS, axis = 1, inplace=True)

#splitting data into training, validation, testing
total_samples = len(bball_data.index)
training_samples = math.floor(PCT_FOR_TRAIN*total_samples)
validation_samples = math.ceil((1-PCT_FOR_TRAIN)*total_samples)

sum = training_samples+validation_samples

print("total samples:",total_samples,
        "\ntraining samples:",training_samples,
        "\nvalidation samples:",validation_samples,
        "\nsum of training, and validation:",sum)

#makes shuffled version of the data
indices = np.arange(total_samples)
np.random.shuffle(indices)
shuffled_bball_data = bball_data.reindex(indices).reset_index(drop=True)

#gets the amount of random data points as determined by set proportion
training_data = shuffled_bball_data.iloc[0:training_samples]
validation_data = shuffled_bball_data.iloc[training_samples:training_samples+validation_samples]

training_data.keys()

total samples: 1029479 
training samples: 720635 
validation samples: 308844 
sum of training, and validation: 1029479


Index(['RunsScored', 'VertBreak', 'HorzBreak', 'PlateLocSide', 'ZoneSpeed',
       'VertApprAngle', 'HorzApprAngle', 'ZoneTime', 'BallStrikeNum',
       'GroundTruth', 'norm_PlateLocHeight'],
      dtype='object')

In [12]:
hidden_layers = [(100,100,100,50,50,50,25,25,25)
                ]
batch_sizes = [320,640,1280,2560]
learning_rates = [0.001,0.003,0.005]

best_hyperparams = TrainValidateNN(training_data=training_data,validation_data=validation_data,learning_rate_init=learning_rates, batch_size=batch_sizes, hidden_layers=hidden_layers)

(100, 100, 100, 50, 50, 50, 25, 25, 25) 320 0.001 0.6813180764398855
(100, 100, 100, 50, 50, 50, 25, 25, 25) 320 0.003 0.6806251699887321
(100, 100, 100, 50, 50, 50, 25, 25, 25) 320 0.005 0.6723070546942793
(100, 100, 100, 50, 50, 50, 25, 25, 25) 640 0.001 0.6815512038440119
(100, 100, 100, 50, 50, 50, 25, 25, 25) 640 0.003 0.6811788475735323
(100, 100, 100, 50, 50, 50, 25, 25, 25) 640 0.005 0.6757942521143361
(100, 100, 100, 50, 50, 50, 25, 25, 25) 1280 0.001 0.6816030099338177
(100, 100, 100, 50, 50, 50, 25, 25, 25) 1280 0.003 0.6828463560891583
(100, 100, 100, 50, 50, 50, 25, 25, 25) 1280 0.005 0.6816256750981078
(100, 100, 100, 50, 50, 50, 25, 25, 25) 2560 0.001 0.6819073707114272
(100, 100, 100, 50, 50, 50, 25, 25, 25) 2560 0.003 0.6807514473326339
(100, 100, 100, 50, 50, 50, 25, 25, 25) 2560 0.005 0.681570631127689
((100, 100, 100, 50, 50, 50, 25, 25, 25), 1280, 0.003, 0.6828463560891583)
67.84007460076931
