In [34]:
import math
#from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
import pandas as pd
import numpy as np

data_folder = "./data/"

#Get and load data
# pitch_14_17_file = "pitcher_2014_2017.csv"
bball_data_2_file = "Baseball Data-2.csv"

TRAINING_FEATURES = ['RunsScored', 'VertBreak', 'HorzBreak', 'ZoneSpeed',
       'VertApprAngle', 'HorzApprAngle', 'ZoneTime', 'BallStrikeNum', 'Zone']
       
LABELS_FEATURE = ['GroundTruth']
PREPROCESSING_KEYS = ['Balls', 'Strikes', 'PitchCall', 'PlateLocHeight', 'PlateLocSide']

SUBCATEGORY_KEYS = ['Pitcher', 'Batter', 'PitcherThrows', 'BatterSide', 'TaggedPitchType']
TaggedPitchType_Keys = ['Fastball', 'Curveball', 'ChangeUp', 'Slider']
PitcherThrows_Keys = ['Left', 'Right']
BatterSide_Keys = ['Left', 'Right']

PCT_FOR_TRAIN = 0.6

In [35]:
#normalizing function which moves 0 of PlateLocHeight to center of strike zone from ground
#returns a value 2.5ft (30in) less than the original value
def normalize_PlateLocHeight(PlateLocHeight):
    return ((PlateLocHeight * 12) - 30)/12


#preprocessing function used to calculate the plate location (this will be different than PitchCall (even in terms of BallCalled vs StrikeCalled))
#returns classification: heart (strike) = 0, shadow (strike) = 1, shadow (ball) = 2, chase (ball) = 3, waste (ball) = 4
def PlateZone(PlateLocHeight, PlateLocSide):
    FOOT = 12

    #waste zone
    #outside 84in to 6in, -20in to 20in horizontal, (strike zone * 200%)
    if ((PlateLocHeight > 7 or PlateLocHeight < 0.5) and (PlateLocSide < -(20/FOOT) or PlateLocSide > (20/FOOT))):
        return 4

    #heart zone
    #inside 38in to 22in vertical, -6.7in to 6.7in horizontal, (strike zone size * 67%)
    if ((PlateLocHeight < (38/FOOT) and PlateLocHeight > (22/FOOT)) and (PlateLocSide > (-6.7/FOOT) and PlateLocSide < (6.7/FOOT))):
        return 0

    #strike zone 
    #inside 42in to 18in vertical, -10in to 10in horizontal
    if (PlateLocHeight < (42/FOOT) and PlateLocHeight > (18/FOOT) and (PlateLocSide > (-10/FOOT) and PlateLocSide < (10/FOOT))):
        return 1

    #shadow zone
    #inside 46in to 14in vertical, -13.3in to 13.3in horizontal, (strike zone size * 133%)
    if (PlateLocHeight < (46/FOOT) and PlateLocHeight > (14/FOOT) and (PlateLocSide > (-13.3/FOOT) and PlateLocSide < (13.3/FOOT))):
        return 2

    #chase inside 84in to 6in, -20in to 20in horizontal, (strike zone * 200%)
    return 3


#preprocessing function used to generate a single number that will be used to classify the ball/strike count before the current pitch 
#returns int [0 - 11]
def PitchCount(balls, strikes):
    # Strikes: 0  1   2
    # Balls v|---------- 
    #       0| 0  1   2 
    #       1| 3  4   5
    #       2| 6  7   8
    #       3| 9  10  11
    if(balls == 0):
        if(strikes == 0):
            return 0
        if (strikes == 1):
            return 1
        return 2
    if(balls == 1):
        if(strikes == 0):
            return 3
        if (strikes == 1):
            return 4
        return 5
    if(balls == 2):
        if(strikes == 0):
            return 6
        if (strikes == 1):
            return 7
        return 8
    if(balls == 3):
        if(strikes == 0):
            return 9
        if (strikes == 1):
            return 10
        return 11


#preprocessing function generates the ground truth hitability of a pitch
#these values will definitely need to be adjusted
def GenerateGroundTruthLabels(pitchCall):
    if pitchCall == 'BallCalled':
        return 0
    if pitchCall == 'BallIntentional' or pitchCall == 'HitByPitch':
        return 1
    if pitchCall == 'StrikeSwinging' or pitchCall == 'StrikeCalled':
        return 2
    else:
        return 3

def GetSubCategory(data, feature, key):
    #determine the rows to be dropped
    sub_data = data.loc[data[feature] == key]
    return sub_data

In [36]:
from sklearn import tree
import time

#scikit learn doesnt support categorical columns
def TrainValidateTree(training_data, validation_data, possible_min_leaf_samples=None, possible_depths=None):
    X = training_data.loc[:,~training_data.columns.isin(LABELS_FEATURE)]
    y = training_data.loc[:,training_data.columns == LABELS_FEATURE[0]]

    vX = validation_data.loc[:,~validation_data.columns.isin(LABELS_FEATURE)]
    vy = validation_data.loc[:,validation_data.columns == LABELS_FEATURE[0]]

    #validation (hyper-parameter tuning)
    parameter_accuracy = []
    for msl in possible_min_leaf_samples:
        for d in possible_depths:
            #train on training with this iteration of parameters
            decision_tree = tree.DecisionTreeClassifier(min_samples_leaf=msl, max_depth=d)
            begin_time = time.time()
            decision_tree = decision_tree.fit(X, y)
            train_time = time.time() - begin_time
            #then check accuracy of validation data
            begin_time = time.time()
            val_pred = decision_tree.predict(vX)
            val_time = time.time() - begin_time
            score = metrics.accuracy_score(vy,val_pred)
            #put parameters and accuracy in matrix
            tree_metrics = msl,d,score,decision_tree.tree_.node_count,train_time,val_time

            parameter_accuracy.append(tree_metrics)
            print(tree_metrics)

    #select parameters with highest accuracy
    parameter_accuracy.sort(key = lambda x:x[2])

    best_parameters = parameter_accuracy[-1]
    print(best_parameters)

    #train new decision tree using best hyperparameters found above
    best_decision_tree = tree.DecisionTreeClassifier(min_samples_leaf=best_parameters[0], max_depth=best_parameters[1])
    begin_time = time.time()
    best_decision_tree = decision_tree.fit(X, y)
    train_time = time.time() - begin_time

    begin_time = time.time()
    res_pred = best_decision_tree.predict(vX)
    val_time = time.time() - begin_time
    score = metrics.accuracy_score(vy,res_pred)
    print(score*100)

    return best_decision_tree, best_parameters[0], best_parameters[1], train_time, val_time

In [37]:
from sklearn import ensemble
import time

#use this cell to find the optimal hyperparameters for the model
def TrainValidateForest(training_data, validation_data, jobs=-1, subtrees=[100], depths=[None], \
                        min_sample_split=[2], min_samples_leaf=[1], max_features=['auto'], max_leaf_nodes=[None]):

    X = training_data.loc[:,~training_data.columns.isin(LABELS_FEATURE)]
    y = training_data.loc[:,training_data.columns == LABELS_FEATURE[0]]

    vX = validation_data.loc[:,~validation_data.columns.isin(LABELS_FEATURE)]
    vy = validation_data.loc[:,validation_data.columns == LABELS_FEATURE[0]]

    parameter_accuracy = []
    for n_trees in subtrees:
        for d in depths:
            for mss in min_sample_split:
                for msl in min_samples_leaf:
                    for mf in max_features:
                        for mln in max_leaf_nodes:
                            #train on training with this iteration of parameters
                            decision_forest = ensemble.RandomForestClassifier(n_estimators=n_trees, max_depth=d, \
                                min_samples_split=mss, min_samples_leaf=msl, max_features=mf, max_leaf_nodes=mln, n_jobs=jobs)

                            begin_time = time.time()
                            decision_forest = decision_forest.fit(X, y.values.ravel())
                            train_time = time.time() - begin_time

                            #then check accuracy of validation data
                            begin_time = time.time()
                            val_pred = decision_forest.predict(vX)
                            val_time = time.time() - begin_time

                            score = metrics.accuracy_score(vy,val_pred)

                            #put parameters and accuracy in matrix
                            tree_metrics = score,n_trees,d,mss,msl,mf,mln,train_time,val_time
                            parameter_accuracy.append(tree_metrics)
                            print(tree_metrics)

    #select parameters with highest accuracy
    parameter_accuracy.sort(key = lambda x:x[0])

    best_parameters = parameter_accuracy[-1]
    print(best_parameters)

    #train new decision tree using best hyperparameters found above
    best_decision_forest = ensemble.RandomForestClassifier(n_estimators=best_parameters[1], max_depth=best_parameters[2], min_samples_split=best_parameters[3], min_samples_leaf=best_parameters[4], max_features=best_parameters[5], max_leaf_nodes=best_parameters[6], n_jobs=jobs)

    begin_time = time.time()
    best_decision_forest = decision_forest.fit(X, y.values.ravel())
    train_time = time.time() - begin_time

    begin_time = time.time()
    res_pred = best_decision_forest.predict(vX)
    val_time = time.time() - begin_time

    score = metrics.accuracy_score(vy,res_pred)
    print(score*100)

    return best_decision_forest, score, best_parameters[1], best_parameters[2], best_parameters[3], best_parameters[4], best_parameters[5], best_parameters[6], train_time, val_time

In [38]:
#use graphviz to make pdf and image of tree
import graphviz

def plot_tree(decision_tree, feature_labels=TRAINING_FEATURES, name="bball_basic_decision_tree_labeled", folder_name=None):
    print(decision_tree.tree_.node_count)

    class_labels = ['BallCalled','BallIntentional/HitByPitch','StrikeSwinging/StrikeCalled',"Correct Swing"]

    if folder_name is None:
    #tree with recognizable labels and color coded nodes corresponding to classes
        dot_data = tree.export_graphviz(decision_tree, out_file=None,
                                        feature_names = feature_labels,
                                        class_names = class_labels,
                                        filled=True, rounded = True,
                                        special_characters=True)
        graph = graphviz.Source(dot_data)
        graph.render(filename=name, cleanup=True, format='png', directory="tree_plots")
    
    else:
        dot_data = tree.export_graphviz(decision_tree, out_file=None,
                                        feature_names = feature_labels,
                                        class_names = class_labels,
                                        filled=True, rounded = True,
                                        special_characters=True)
        graph = graphviz.Source(dot_data)
        graph.render(filename=name, cleanup=True, format='png', directory="tree_plots")


In [43]:
#import the columns we will need for training and preprocessing
bball_data = pd.read_csv(data_folder+bball_data_2_file, usecols=['Pitcher', 'PitcherThrows', 'Batter', 'BatterSide', 'PitchCall', 'RunsScored', 'VertBreak', 'HorzBreak', 'ZoneSpeed', 'VertApprAngle', 'HorzApprAngle', 'ZoneTime', 'PlateLocHeight', 'PlateLocSide', 'Balls', 'Strikes', 'TaggedPitchType'])

#run preprocessing functions and normalize data
bball_data['BallStrikeNum'] = bball_data.apply(lambda pitch : PitchCount(pitch['Balls'], pitch['Strikes']), axis=1)
bball_data['GroundTruth'] = bball_data.apply(lambda pitch : GenerateGroundTruthLabels(pitch['PitchCall']), axis=1)
#bball_data['norm_PlateLocHeight'] = bball_data.apply(lambda pitch : normalize_PlateLocHeight(pitch['PlateLocHeight']), axis=1)
bball_data['Zone'] = bball_data.apply(lambda pitch : PlateZone(pitch['PlateLocHeight'], pitch['PlateLocSide']), axis=1)

#drop all features used for preprocessing
bball_data.drop(labels=PREPROCESSING_KEYS, axis=1, inplace=True)

#get data will all [key] as value for [feature]
#bball_data=GetSubCategory(data=bball_data,feature='TaggedPitchType',key='Fastball')

#Remove NaN valued rows
bball_data.dropna(inplace=True)
bball_data.reset_index(drop=True, inplace=True)

rows_by_key = []
for curr_key in TaggedPitchType_Keys:
        rows_by_key.append(GetSubCategory(data=bball_data,feature='TaggedPitchType',key=curr_key))

split_data = []
for i in range(len(rows_by_key)):
        #drop all colums that cointain strings
        curr_data = rows_by_key[i]
        curr_data.drop(labels=SUBCATEGORY_KEYS, axis = 1, inplace = True)
        curr_data.dropna(inplace=True)
        curr_data.reset_index(drop=True, inplace=True)

        print(TaggedPitchType_Keys[i])
        this_keyset = []

        #splitting data into training, validation, testing
        total_samples = len(curr_data.index)
        training_samples = math.floor(PCT_FOR_TRAIN*total_samples)
        validation_samples = math.ceil((1-PCT_FOR_TRAIN)*total_samples)

        this_keyset.append(total_samples)
        this_keyset.append(training_samples)
        this_keyset.append(validation_samples)

        sum = training_samples+validation_samples

        print("total samples:",total_samples,
                "\ntraining samples:",training_samples,
                "\nvalidation samples:",validation_samples,
                "\nsum of training, and validation:",sum)

        #makes shuffled version of the data
        indices = np.arange(total_samples)
        np.random.shuffle(indices)
        shuffled_bball_data = curr_data.reindex(indices).reset_index(drop=True)

        #gets the amount of random data points as determined by set proportion
        training_data = shuffled_bball_data.iloc[0:training_samples]
        validation_data = shuffled_bball_data.iloc[training_samples:training_samples+validation_samples]

        this_keyset.append(training_data)
        this_keyset.append(validation_data)

        split_data.append(this_keyset)

Fastball
total samples: 580335 
training samples: 348201 
validation samples: 232134 
sum of training, and validation: 580335
Curveball
total samples: 85542 
training samples: 51325 
validation samples: 34217 
sum of training, and validation: 85542
ChangeUp
total samples: 109613 
training samples: 65767 
validation samples: 43846 
sum of training, and validation: 109613
Slider
total samples: 176522 
training samples: 105913 
validation samples: 70609 
sum of training, and validation: 176522


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [40]:
#d = [8, 14]
#mls = [1000]

#best_hyperparams_and_model = TrainValidateTree(training_data = training_data, validation_data = validation_data, possible_min_leaf_samples=mls, possible_depths=d)
#plot_tree(best_hyperparams_and_model[0])

In [45]:
n_trees = [100]
d = [None]
mss = [100]
msl = [100]
mf = [len(TRAINING_FEATURES)]
mln = [None]

#n_trees = list(n_trees[0])
#d = list(d[0])
#mss = list(mss[0])
#msl = list(msl[0])
#mf = list(mf[0])
#mln = list(mln[0])

model_list = []

for subfeature_idx in range(len(split_data)):
    curr_row = split_data[subfeature_idx]

    td = curr_row[3]
    vd = curr_row[4]
    model_list.append(TrainValidateForest(training_data=td, validation_data=vd, subtrees=n_trees, depths=d, \
                        min_sample_split=mss, min_samples_leaf=msl, max_features=mf, max_leaf_nodes=mln))

#i = 0
#for tree_ in model_list[0].estimators_:
#    plot_tree(decision_tree=tree_, name='forest_tree'+str(i))
#    i += 1

(0.6502321934744587, 100, None, 100, 50, 1, None, 13.762155055999756, 0.7470080852508545)
(0.6614800072371992, 100, None, 100, 50, 3, None, 28.470919132232666, 0.8792951107025146)
(0.6613938501038193, 100, None, 100, 50, 9, None, 69.62489604949951, 0.7145979404449463)
(0.6614800072371992, 100, None, 100, 50, 3, None, 28.470919132232666, 0.8792951107025146)


KeyboardInterrupt: 