In [53]:
import math
#from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
import pandas as pd
import numpy as np

# Directory (relative to the notebook) that holds the input CSV files.
data_folder = "./data/"

# Input data file names (joined onto data_folder when the CSV is read).
# An alternative data set that is currently not used:
# pitch_14_17_file = "pitcher_2014_2017.csv"
bball_data_2_file = "Baseball Data-2.csv"

# Names of the model input columns; used as the default feature names when
# a fitted tree is exported by plot_tree.
TRAINING_FEATURES = ['RunsScored', 'VertBreak', 'HorzBreak', 'PlateLocSide', 'ZoneSpeed',
       'VertApprAngle', 'HorzApprAngle', 'ZoneTime', 'BallStrikeNum', 'norm_PlateLocHeight']
       
# Column that holds the class label; the training helpers split X / y on it.
LABELS_FEATURE = ['GroundTruth']
# Columns that identify a sub-population; dropped from the frame before training.
SUBCATEGORY_KEYS = ['Pitcher', 'Batter', 'PitcherThrows', 'BatterSide', 'TaggedPitchType']
# Raw columns consumed by the preprocessing functions and dropped afterwards.
PREPROCESSING_KEYS = ['Balls', 'Strikes', 'PitchCall', 'PlateLocHeight']

# Fraction of the rows assigned to the training split.
PCT_FOR_TRAIN = 0.7

In [54]:
def normalize_PlateLocHeight(PlateLocHeight):
    """Re-reference a plate-crossing height from the ground to the strike-zone centre.

    The input is converted from feet to inches, shifted down by 30in (2.5ft)
    and converted back to feet, so the result is 2.5ft less than the input.
    """
    INCHES_PER_FOOT = 12
    ZONE_CENTER_INCHES = 30

    height_inches = PlateLocHeight * INCHES_PER_FOOT
    centered_inches = height_inches - ZONE_CENTER_INCHES
    return centered_inches / INCHES_PER_FOOT


def PlateZone(PlateLocHeight, PlateLocSide):
    """Classify where a pitch crossed the plate into one of five zones.

    This is a purely geometric classification and can differ from PitchCall
    (even for BallCalled vs StrikeCalled).

    Both arguments are compared against bounds expressed in feet
    (inch values divided by 12).

    Returns:
        0 -- heart  (strike): 22in..38in high, within 6.7in of centre
        1 -- shadow (strike): 18in..42in high, within 10in of centre
        2 -- shadow (ball):   14in..46in high, within 13.3in of centre
        3 -- chase  (ball):   anything else inside 6in..84in high, within 20in of centre
        4 -- waste  (ball):   outside the chase rectangle
    """
    FOOT = 12

    #waste zone
    #outside 84in to 6in vertical OR outside -20in to 20in horizontal, (strike zone * 200%)
    #A pitch is outside the rectangle as soon as EITHER coordinate is out of
    #range, so the two tests are joined with `or`; requiring both would only
    #catch the far corners.
    too_high_or_low = PlateLocHeight > 7 or PlateLocHeight < 0.5
    too_wide = PlateLocSide < -(20/FOOT) or PlateLocSide > (20/FOOT)
    if too_high_or_low or too_wide:
        return 4

    #heart zone
    #inside 38in to 22in vertical, -6.7in to 6.7in horizontal, (strike zone size * 67%)
    if (22/FOOT) < PlateLocHeight < (38/FOOT) and (-6.7/FOOT) < PlateLocSide < (6.7/FOOT):
        return 0

    #strike zone
    #inside 42in to 18in vertical, -10in to 10in horizontal
    if (18/FOOT) < PlateLocHeight < (42/FOOT) and (-10/FOOT) < PlateLocSide < (10/FOOT):
        return 1

    #shadow zone
    #inside 46in to 14in vertical, -13.3in to 13.3in horizontal, (strike zone size * 133%)
    if (14/FOOT) < PlateLocHeight < (46/FOOT) and (-13.3/FOOT) < PlateLocSide < (13.3/FOOT):
        return 2

    #chase: inside 84in to 6in, -20in to 20in horizontal, (strike zone * 200%)
    return 3


def PitchCount(balls, strikes):
    """Collapse the ball/strike count before the current pitch into one number.

    The result is the row-major index into this table (int in [0, 11]):

        Strikes:  0   1   2
        Balls  |-----------
            0  |  0   1   2
            1  |  3   4   5
            2  |  6   7   8
            3  |  9  10  11

    Any strike value other than 0 or 1 falls into the last column.
    A ball value other than 0, 1, 2 or 3 yields None.
    """
    for row, ball_total in enumerate((0, 1, 2, 3)):
        if balls == ball_total:
            if strikes == 0:
                column = 0
            elif strikes == 1:
                column = 1
            else:
                column = 2
            return row * 3 + column
    return None


def GenerateGroundTruthLabels(pitchCall):
    """Map a PitchCall value to the ground-truth hitability class of the pitch.

    These values will definitely need to be adjusted.

    Returns:
        0 for 'BallCalled'
        1 for 'BallIntentional' or 'HitByPitch'
        2 for 'StrikeSwinging' or 'StrikeCalled'
        3 for every other value
    """
    label_groups = (
        (('BallCalled',), 0),
        (('BallIntentional', 'HitByPitch'), 1),
        (('StrikeSwinging', 'StrikeCalled'), 2),
    )
    for call_names, label in label_groups:
        if pitchCall in call_names:
            return label
    return 3

def GetSubCategory(data=None,feature=None, key=None):
    """Return the rows of `data` whose `feature` column equals `key`.

    Parameters:
        data    -- DataFrame to filter (required)
        feature -- name of the column to compare (required)
        key     -- value the column must equal for a row to be kept (required)

    Returns:
        The matching rows selected with `.loc`, or None (after printing a
        message) when any of the three arguments is missing.
    """
    if data is None or feature is None or key is None:
        # All three arguments are needed; the defaults of None only exist so
        # that a missing argument is reported here rather than raising.
        print("data, feature and key are all required parameters")
        return None

    #select the rows to keep
    return data.loc[data[feature] == key]

In [55]:
#import the columns we will need for training and preprocessing
bball_data = pd.read_csv(data_folder+bball_data_2_file, usecols=['Pitcher', 'PitcherThrows', 'Batter', 'BatterSide', 'PitchCall', 'RunsScored', 'VertBreak', 'HorzBreak', 'ZoneSpeed', 'VertApprAngle', 'HorzApprAngle', 'ZoneTime', 'PlateLocHeight', 'PlateLocSide', 'Balls', 'Strikes', 'TaggedPitchType'])

#Remove NaN valued rows BEFORE deriving features.
#The derived columns hide missing raw values: a missing PitchCall would be
#labelled 3 and a missing Strikes value would land in the last count column,
#and once the raw columns are dropped those rows can no longer be detected.
bball_data = bball_data.dropna().reset_index(drop=True)

#run preprocessing functions and normalize data
#(column order BallStrikeNum, GroundTruth, norm_PlateLocHeight is kept so the
# feature columns line up with TRAINING_FEATURES)
bball_data['BallStrikeNum'] = bball_data.apply(lambda pitch : PitchCount(pitch['Balls'], pitch['Strikes']), axis=1)
#single-column transforms are applied column-wise instead of row-by-row
bball_data['GroundTruth'] = bball_data['PitchCall'].map(GenerateGroundTruthLabels)
bball_data['norm_PlateLocHeight'] = normalize_PlateLocHeight(bball_data['PlateLocHeight'])

#drop all features used for preprocessing
bball_data = bball_data.drop(labels=PREPROCESSING_KEYS, axis=1)

#drop rows whose derived features could not be computed
#(PitchCount returns None for a ball count outside 0..3)
bball_data = bball_data.dropna().reset_index(drop=True)

#optional: restrict the data to rows where [feature] has the value [key]
#bball_data=GetSubCategory(data=bball_data,feature=,key=)

#drop all features that could be useful subcategories
bball_data = bball_data.drop(labels=SUBCATEGORY_KEYS, axis=1)

#splitting data into training, validation
total_samples = len(bball_data.index)
training_samples = math.floor(PCT_FOR_TRAIN*total_samples)
#take the remainder rather than ceil((1-PCT_FOR_TRAIN)*total): 1-0.7 is not
#exactly 0.3 in floating point, so the two counts could add up to total+1
validation_samples = total_samples - training_samples

#named split_total (not `sum`) so the builtin sum() stays usable in later cells
split_total = training_samples+validation_samples
assert split_total == total_samples

print("total samples:",total_samples,
        "\ntraining samples:",training_samples,
        "\nvalidation samples:",validation_samples,
        "\nsum of training, and validation:",split_total)

#makes shuffled version of the data
#a fixed seed makes the train/validation split identical on every run
SPLIT_SEED = 42
indices = np.random.RandomState(SPLIT_SEED).permutation(total_samples)
shuffled_bball_data = bball_data.reindex(indices).reset_index(drop=True)

#gets the amount of random data points as determined by set proportion
training_data = shuffled_bball_data.iloc[0:training_samples]
validation_data = shuffled_bball_data.iloc[training_samples:training_samples+validation_samples]

training_data.keys()

total samples: 1029479 
training samples: 720635 
validation samples: 308844 
sum of training, and validation: 1029479


Index(['RunsScored', 'VertBreak', 'HorzBreak', 'PlateLocSide', 'ZoneSpeed',
       'VertApprAngle', 'HorzApprAngle', 'ZoneTime', 'BallStrikeNum',
       'GroundTruth', 'norm_PlateLocHeight'],
      dtype='object')

In [63]:
from sklearn import tree

#scikit-learn doesn't support categorical columns
def TrainValidateTree(training_data, validation_data, possible_min_leaf_samples=None, possible_depths=None):
    """Grid-search a DecisionTreeClassifier and return the best configuration.

    One tree is fitted on `training_data` for every combination of
    `min_samples_leaf` and `max_depth`, scored by accuracy on
    `validation_data`, and its parameters, score and node count are printed.
    A fresh tree is then fitted with the highest-scoring combination and its
    validation accuracy (in percent) is printed.

    Parameters:
        training_data / validation_data -- DataFrames containing the label
            column named by LABELS_FEATURE[0]; every other column is a feature.
        possible_min_leaf_samples -- iterable of min_samples_leaf candidates;
            None means a single candidate of 1.
        possible_depths -- iterable of max_depth candidates; None means a
            single candidate of None (unlimited depth).

    Returns:
        (fitted best tree, best min_samples_leaf, best max_depth)
    """
    # A grid left at None previously failed when iterated; fall back to a
    # single-candidate grid instead.
    if possible_min_leaf_samples is None:
        possible_min_leaf_samples = [1]
    if possible_depths is None:
        possible_depths = [None]

    label_column = LABELS_FEATURE[0]

    X = training_data.loc[:,~training_data.columns.isin(LABELS_FEATURE)]
    # 1-D label vectors, the same form TrainValidateForest passes to fit()
    y = training_data[label_column].values.ravel()

    vX = validation_data.loc[:,~validation_data.columns.isin(LABELS_FEATURE)]
    vy = validation_data[label_column].values.ravel()

    #validation (hyper-parameter tuning)
    parameter_accuracy = []
    for msl in possible_min_leaf_samples:
        for d in possible_depths:
            #train on training with this iteration of parameters
            decision_tree = tree.DecisionTreeClassifier(min_samples_leaf=msl, max_depth=d)
            decision_tree = decision_tree.fit(X, y)
            #then check accuracy of validation data
            val_pred = decision_tree.predict(vX)
            score = metrics.accuracy_score(vy,val_pred)
            #put parameters and accuracy in matrix
            parameter_accuracy.append((msl,d,score,decision_tree.tree_.node_count))
            print(msl,d,score,decision_tree.tree_.node_count)

    #select parameters with highest accuracy (last entry after an ascending sort)
    parameter_accuracy.sort(key = lambda x:x[2])

    best_parameters = parameter_accuracy[-1]
    print(best_parameters)

    #train new decision tree using best hyperparameters found above
    best_decision_tree = tree.DecisionTreeClassifier(min_samples_leaf=best_parameters[0], max_depth=best_parameters[1])
    # Fit the estimator that was just constructed. Fitting `decision_tree`
    # here would return the LAST grid candidate, not the best one.
    best_decision_tree = best_decision_tree.fit(X, y)

    res_pred = best_decision_tree.predict(vX)
    score = metrics.accuracy_score(vy,res_pred)
    print(score*100)

    return best_decision_tree, best_parameters[0], best_parameters[1]

In [57]:
#use graphviz to make pdf and image of tree
import graphviz

def plot_tree(decision_tree, feature_labels=TRAINING_FEATURES, out_name="bball_basic_decision_tree_labeled"):
    """Export a fitted decision tree with graphviz and return the rendered source.

    Prints the tree's node count, builds a colour-coded graphviz description
    with readable feature and class names, and renders it to `out_name`.

    Parameters:
        decision_tree  -- fitted tree classifier
        feature_labels -- feature names, in the column order used for fitting
        out_name       -- base name passed to graphviz's render()

    Returns:
        The graphviz.Source object. Returning it (rather than ending on a bare
        `graph` expression, which does nothing inside a function) lets a
        notebook cell that ends with this call display the tree inline.
    """
    print(decision_tree.tree_.node_count)

    class_labels = ['BallCalled','BallIntentional/HitByPitch','StrikeSwinging/StrikeCalled',"Correct Swing"]

    # export_graphviz pairs class_names positionally with the classes the tree
    # was fitted on, so pick the label for each class actually present.
    # Assumes the fitted classes are the integer labels 0..3 produced by
    # GenerateGroundTruthLabels -- TODO confirm if the labelling changes.
    present_class_labels = [class_labels[int(c)] for c in decision_tree.classes_]

    #tree with recognizable labels and color coded nodes corresponding to classes
    dot_data = tree.export_graphviz(decision_tree, out_file=None,
                                    feature_names = feature_labels,
                                    class_names = present_class_labels,
                                    filled=True, rounded = True,
                                    special_characters=True)
    graph = graphviz.Source(dot_data)
    graph.render(out_name)
    return graph

In [64]:
# Hyper-parameter grid for the single decision tree:
# d   -- candidate maximum depths
# mls -- candidate minimum numbers of samples per leaf
d = [8, 12, 14]
mls = [1000, 3000, 5000]


# Fits one tree per (min_samples_leaf, max_depth) pair; the result is the
# tuple (model, min_samples_leaf, max_depth) returned by TrainValidateTree.
best_hyperparams_and_model = TrainValidateTree(training_data = training_data, validation_data = validation_data, possible_min_leaf_samples=mls, possible_depths=d)
# Element 0 of the tuple is the model; export it with graphviz.
plot_tree(best_hyperparams_and_model[0], feature_labels=TRAINING_FEATURES)

Index(['RunsScored', 'VertBreak', 'HorzBreak', 'PlateLocSide', 'ZoneSpeed',
       'VertApprAngle', 'HorzApprAngle', 'ZoneTime', 'BallStrikeNum',
       'norm_PlateLocHeight'],
      dtype='object')


KeyboardInterrupt: 

In [None]:
from sklearn import ensemble

#use this cell to find the optimal hyperparameters for the model
def TrainValidateForest(training_data, validation_data, possible_subtrees=None, possible_min_leaf_samples=None, possible_depths=None):
    """Grid-search a RandomForestClassifier and return the best configuration.

    One forest is fitted on `training_data` for every combination of
    `min_samples_leaf`, `max_depth` and `n_estimators`, scored by accuracy on
    `validation_data`, and its parameters and score are printed. A fresh
    forest is then fitted with the highest-scoring combination and its
    validation accuracy (in percent) is printed.

    Parameters:
        training_data / validation_data -- DataFrames containing the label
            column named by LABELS_FEATURE[0]; every other column is a feature.
        possible_subtrees -- iterable of n_estimators candidates; None means a
            single candidate of 100 trees.
        possible_min_leaf_samples -- iterable of min_samples_leaf candidates;
            None means a single candidate of 1.
        possible_depths -- iterable of max_depth candidates; None means a
            single candidate of None (unlimited depth).

    Returns:
        (fitted best forest, best min_samples_leaf, best max_depth, best n_estimators)
    """
    # A grid left at None previously failed when iterated; fall back to a
    # single-candidate grid instead.
    if possible_subtrees is None:
        possible_subtrees = [100]
    if possible_min_leaf_samples is None:
        possible_min_leaf_samples = [1]
    if possible_depths is None:
        possible_depths = [None]

    label_column = LABELS_FEATURE[0]

    X = training_data.loc[:,~training_data.columns.isin(LABELS_FEATURE)]
    # 1-D label vectors for fit() and accuracy_score()
    y = training_data[label_column].values.ravel()

    vX = validation_data.loc[:,~validation_data.columns.isin(LABELS_FEATURE)]
    vy = validation_data[label_column].values.ravel()

    parameter_accuracy = []
    for msl in possible_min_leaf_samples:
        for d in possible_depths:
            for n_trees in possible_subtrees:
                #train on training with this iteration of parameters
                decision_forest = ensemble.RandomForestClassifier(n_estimators=n_trees, min_samples_leaf=msl, max_depth=d)
                decision_forest = decision_forest.fit(X, y)
                #then check accuracy of validation data
                val_pred = decision_forest.predict(vX)
                score = metrics.accuracy_score(vy,val_pred)
                #put parameters and accuracy in matrix
                parameter_accuracy.append((msl,d,n_trees,score))
                print(msl,d,n_trees,score)

    #select parameters with highest accuracy (last entry after an ascending sort)
    parameter_accuracy.sort(key = lambda x:x[3])

    best_parameters = parameter_accuracy[-1]
    print(best_parameters)

    #train new random forest using best hyperparameters found above
    best_decision_forest = ensemble.RandomForestClassifier(min_samples_leaf=best_parameters[0], max_depth=best_parameters[1], n_estimators=best_parameters[2])
    # Fit the estimator that was just constructed. Fitting `decision_forest`
    # here would return the LAST grid candidate, not the best one.
    best_decision_forest = best_decision_forest.fit(X, y)

    res_pred = best_decision_forest.predict(vX)
    score = metrics.accuracy_score(vy,res_pred)
    print(score*100)

    return best_decision_forest, best_parameters[0], best_parameters[1], best_parameters[2]

In [None]:
# Hyper-parameter grid for the random forest:
# n_trees -- candidate numbers of trees per forest
# d       -- candidate maximum depths
# mls     -- candidate minimum numbers of samples per leaf
# Every combination is fitted once, i.e. 4 * 4 * 3 = 48 forests.
n_trees = [50, 100, 200, 300]
d = [8, 10, 12, 14]
mls = [1000, 3000, 5000]
# The result is the tuple (model, min_samples_leaf, max_depth, n_estimators)
# returned by TrainValidateForest.
best_hyperparams = TrainValidateForest(training_data=training_data,validation_data=validation_data,possible_subtrees=n_trees, possible_depths=d, possible_min_leaf_samples=mls)