# Random Forest Pipeline

This notebook will build a Random Forest Classifier on the Statcast data. We will tune hyperparmeters to boost performance, and then apply interpretability methods, LIME and SHAP to the results. 

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from data_import import preprocess_data
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split, cross_validate, validation_curve
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.pipeline import make_pipeline
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
from Evaluation import *

In [3]:
#load in the preprocessed data
data = preprocess_data('Statcast_data.csv')
target = data['description']
#filter out 'player_name'; will not use as a feature
#and filter out the target
data = data.iloc[:, :-2]
data.head()



Unnamed: 0,release_speed,release_spin_rate,release_pos_x,release_pos_y,release_pos_z,pfx_x,pfx_z,vx0,vy0,vz0,...,pitch_name_Changeup,pitch_name_Curveball,pitch_name_Cutter,pitch_name_Sinker,pitch_name_Slider,pitch_name_Split Finger,pitch_name_nan,p_throws_L,p_throws_R,p_throws_nan
0,1.073523,0.225683,2.080234,-0.016458,-1.073886,2.119752,-0.27965,-1.984984,-1.064687,1.276245,...,0,0,0,0,0,0,0,1,0,0
1,1.340953,0.25821,2.033107,-0.393337,-0.823324,1.22874,0.483185,-1.856961,-1.349626,0.535466,...,0,0,0,0,0,0,0,1,0,0
2,-1.316632,0.898992,2.124057,1.138366,-1.320666,-0.982587,-1.009283,-1.006819,1.330334,1.589316,...,0,0,0,0,1,0,0,1,0,0
3,1.257381,0.274474,2.013077,-0.965694,-1.152964,1.662236,0.401776,-2.347235,-1.209132,-0.252616,...,0,0,0,0,0,0,0,1,0,0
4,1.307524,0.625765,2.099451,-0.293616,-1.431626,1.619067,0.139219,-2.665304,-1.265463,0.268337,...,0,0,0,0,0,0,0,1,0,0


In [4]:
#first, create a test set
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state = 777, test_size = .2)

#then instatiate the model we will use: Random Forest
forest = RandomForestClassifier()

In [5]:
eval_model(forest, X_train, y_train, cv = 5)

Mean train_accuracy Value: 0.9922612089115344
train_accuracy scores: [0.99196748 0.99178548 0.99276839 0.99241661 0.99236808]

Mean test_accuracy Value: 0.6323141347418302
test_accuracy scores: [0.63372968 0.62887649 0.63419724 0.63427656 0.63049071]

Mean train_f1 Value: 0.9878087494499166
train_f1 scores: [0.98735918 0.98706114 0.98860508 0.98804811 0.98797024]

Mean test_f1 Value: 0.23671642659584013
test_f1 scores: [0.2329505  0.23292206 0.23645021 0.24172285 0.23953651]

Mean train_roc_auc Value: 0.999592119377775
train_roc_auc scores: [0.99959433 0.99956299 0.99964614 0.99959014 0.999567  ]

Mean test_roc_auc Value: 0.5291673747700535
test_roc_auc scores: [0.52949212 0.52223255 0.52724669 0.53222646 0.53463906]



In [9]:
train_scores, valid_scores = validation_curve(forest, X_train, y_train, "max_depth",
                                             np.logspace(-7, 3, 3),
                                               cv=5)

In [19]:
valid_scores

array([[0.68041737, 0.68041737, 0.68040186, 0.68043489, 0.68043489],
       [0.68041737, 0.68041737, 0.68040186, 0.68043489, 0.68043489],
       [0.6476098 , 0.64872604, 0.65230052, 0.64815804, 0.6481095 ]])