# This Notebook serves to examine baseline non-neural models for the Pitch Prediction task
- Random Forest
- Logistic Regression
- Support Vector Machines (the SVM cell below currently uses a linear kernel)

In [10]:
import tensorflow as tf 
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [11]:
# Columns pulled from the pitch CSVs. The model functions in this notebook
# treat the LAST entry as the label column and every entry before it as a
# feature, so the order of this list matters.
input_cols = [
    'PAOFINNING', 'PITCHOFPA', 'Pitcher Id Norm', 'PITCHERTHROWS',
    'BATTERSIDE', 'INNING', 'OUTS', 'BALLS', 'STRIKES', 'PitchOutcome',
    'RELSPEED', 'VERTRELANGLE', 'HORZRELANGLE', 'SPINRATE', 'SPINAXIS',
    'RELHEIGHT', 'RELSIDE', 'EXTENSION', 'VERTBREAK', 'INDUCEDVERTBREAK',
    'HORZBREAK', 'PLATELOCHEIGHT', 'PLATELOCSIDE', 'VERTAPPRANGLE',
    'HORZAPPRANGLE', 'PFXX', 'PFXZ', 'X0', 'Z0', 'VX0', 'VY0', 'VZ0',
    'AX0', 'AY0', 'AZ0', 'Batter Id Norm', 'NEXT BALLS', 'NEXT STRIKES',
    'NEXT OUTS', 'NEXT INNING', 'PA NEXT', 'NEXT PITCHERID', 'Pitch Type',
    'Next Pitch Label',  # label column (must stay last)
]

In [12]:
# Read the train and test splits from CSV files in the working directory.
train_csv_path = "pitchesTrain.csv"
test_csv_path = "pitchesTest.csv"

train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

# First Model: Random Forest

In [15]:
def RandomForest(train, test, input_cols=input_cols):
    """Fit a random-forest classifier and report train/test accuracy.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that contain every column named in ``input_cols``.
    input_cols : sequence of str
        Column names. The last entry is the label column; every entry
        before it is used as a feature.

    Returns
    -------
    tuple
        ``(train_with_preds, test_with_preds, RFModel)``. The two frames are
        new DataFrames equal to the inputs plus a ``'predictions'`` column;
        ``RFModel`` is the fitted ``RandomForestClassifier``.

    Notes
    -----
    The frames passed in are left untouched. Writing the predictions into
    the caller's frames made the notebook's global ``train``/``test`` change
    as a hidden side effect of calling this function.
    """
    # Split the column list into features and the trailing label column.
    *feature_cols, label_col = input_cols

    X_train, y_train = train[feature_cols], train[label_col]
    X_test, y_test = test[feature_cols], test[label_col]

    RFModel = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)
    RFModel.fit(X_train, y_train)

    train_predictions = RFModel.predict(X_train)
    test_predictions = RFModel.predict(X_test)

    # Classifier.score() is mean accuracy, which would re-run predict() on the
    # same data; derive it from the predictions we already have instead.
    score_train = (y_train == train_predictions).mean()
    score_test = (y_test == test_predictions).mean()

    print("Training accuracy: " + str(score_train))
    print("Testing accuracy: " + str(score_test))

    # assign() returns new frames, so the caller's DataFrames are not mutated.
    return (
        train.assign(predictions=train_predictions),
        test.assign(predictions=test_predictions),
        RFModel,
    )

In [16]:
# Fit the random-forest baseline; only the fitted model is kept, the two
# returned frames are discarded.
_, _, RFModel = RandomForest(train=train, test=test, input_cols=input_cols)

Training accuracy: 0.44855835465459554
Testing accuracy: 0.4453113468148367


# Second Model: Logistic Regression

In [6]:
def LogReg(train, test, input_cols=input_cols):
    """Fit a multinomial logistic regression and report train/test accuracy.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that contain every column named in ``input_cols``.
    input_cols : sequence of str
        Column names. The last entry is the label column; every entry
        before it is used as a feature.

    Returns
    -------
    tuple
        ``(train_with_preds, test_with_preds, LRModel)``. The two frames are
        new DataFrames equal to the inputs plus a ``'predictions'`` column;
        ``LRModel`` is the fitted ``LogisticRegression``.

    Notes
    -----
    The frames passed in are left untouched. Writing the predictions into
    the caller's frames made the notebook's global ``train``/``test`` change
    as a hidden side effect of calling this function.
    """
    # Split the column list into features and the trailing label column.
    *feature_cols, label_col = input_cols

    X_train, y_train = train[feature_cols], train[label_col]
    X_test, y_test = test[feature_cols], test[label_col]

    # NOTE(review): features are passed to lbfgs unscaled and with the default
    # iteration budget; check for convergence warnings when this cell runs.
    LRModel = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
    LRModel.fit(X_train, y_train)

    train_predictions = LRModel.predict(X_train)
    test_predictions = LRModel.predict(X_test)

    # Classifier.score() is mean accuracy, which would re-run predict() on the
    # same data; derive it from the predictions we already have instead.
    score_train = (y_train == train_predictions).mean()
    score_test = (y_test == test_predictions).mean()

    print("Training accuracy: " + str(score_train))
    print("Testing accuracy: " + str(score_test))

    # assign() returns new frames, so the caller's DataFrames are not mutated.
    return (
        train.assign(predictions=train_predictions),
        test.assign(predictions=test_predictions),
        LRModel,
    )

In [None]:
# Fit the logistic-regression baseline on the train/test frames loaded
# earlier; only the fitted model is kept. (LogReg requires both a train and a
# test frame, and no `input_df` is defined in this notebook.)
_, _, LRModel = LogReg(train, test, input_cols=input_cols)

# Third Model: Support Vector Machine

In [7]:
def SVM(train, test, input_cols=input_cols):
    """Fit a support vector classifier and report train/test accuracy.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that contain every column named in ``input_cols``.
    input_cols : sequence of str
        Column names. The last entry is the label column; every entry
        before it is used as a feature.

    Returns
    -------
    tuple
        ``(train_with_preds, test_with_preds, SVMModel)``. The two frames are
        new DataFrames equal to the inputs plus a ``'predictions'`` column;
        ``SVMModel`` is the fitted ``SVC``.

    Notes
    -----
    The frames passed in are left untouched. Writing the predictions into
    the caller's frames made the notebook's global ``train``/``test`` change
    as a hidden side effect of calling this function.
    """
    # Split the column list into features and the trailing label column.
    *feature_cols, label_col = input_cols

    X_train, y_train = train[feature_cols], train[label_col]
    X_test, y_test = test[feature_cols], test[label_col]

    # NOTE(review): the notebook intro mentions non-linear SVMs, but this is
    # configured with kernel='linear' -- confirm which one is intended.
    # NOTE(review): probability=True is kept so the returned model still
    # offers predict_proba; nothing in this function uses it, and per the
    # scikit-learn docs it slows down fitting -- confirm it is needed.
    SVMModel = SVC(
        C=1.0,
        cache_size=200,
        class_weight=None,
        coef0=0.0,
        degree=3,
        kernel='linear',
        max_iter=-1,
        probability=True,
        random_state=None,
        shrinking=True,
        tol=0.001,
        verbose=False,
    )
    SVMModel.fit(X_train, y_train)

    train_predictions = SVMModel.predict(X_train)
    test_predictions = SVMModel.predict(X_test)

    # Classifier.score() is mean accuracy, which would re-run predict() on the
    # same data; derive it from the predictions we already have instead.
    score_train = (y_train == train_predictions).mean()
    score_test = (y_test == test_predictions).mean()

    print("Training accuracy: " + str(score_train))
    print("Testing accuracy: " + str(score_test))

    # assign() returns new frames, so the caller's DataFrames are not mutated.
    return (
        train.assign(predictions=train_predictions),
        test.assign(predictions=test_predictions),
        SVMModel,
    )

In [None]:
# Fit the SVM baseline on the train/test frames loaded earlier; only the
# fitted model is kept. (SVM requires both a train and a test frame, and no
# `input_df` is defined in this notebook.)
_, _, SVMModel = SVM(train, test, input_cols=input_cols)