# MODELING TEMPLATE

### FOLLOW THIS TEMPLATE FOR ALL MODELING PROCESSES

### DATA IS THE IMBALANCED DATA WITH SELECTED FEATURES 

In [1]:
#start with all dependencies

import numpy as np
import pandas as pd
from Evaluation import *
import sklearn
import sklearn.metrics as metrics
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.preprocessing import Imputer
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline, make_pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
%matplotlib inline
import imblearn
import seaborn as sns
sns.set()
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)



In [2]:
#Read in the data as pandas dataframe
data = 'Statcast_data.csv'

def load_data(file):
    '''Load the Statcast_data.csv file into a pandas dataframe.
       Resolve unnecessary column in csv file'''
    baseball = pd.read_csv(file, index_col = 0)
    baseball['description'] = baseball['description'].replace({'blocked_ball': 0, 'ball': 0, "called_strike": 1})
    baseball['position_x'] = baseball['release_pos_x'] + baseball['pfx_x']
    baseball['position_z'] = baseball['release_pos_z'] + baseball['pfx_z']
    baseball['pitch_name'] = baseball['pitch_name'].replace('Knuckle Curve', 'Curveball')
    baseball = baseball[baseball.pitch_name != 'Eephus'] 
    
    final_df = baseball.loc[:, ['p_throws','pitch_name', 'position_x', 'position_z', 
                           'release_spin_rate', 'sz_top', 'sz_bot','vx0', 'vy0', 'vz0', 'description']]

    final_df=pd.get_dummies(final_df, prefix = 'pitch')
    final_df['player_name'] = baseball['player_name']
    
    final_df = final_df.dropna()
    return final_df




bsb = load_data(data)
X = bsb.drop(columns = ['player_name', 'description'])
y = bsb['description']

X.head()



Unnamed: 0,position_x,position_z,release_spin_rate,sz_top,sz_bot,vx0,vy0,vz0,pitch_L,pitch_R,pitch_2-Seam Fastball,pitch_4-Seam Fastball,pitch_Changeup,pitch_Curveball,pitch_Cutter,pitch_Sinker,pitch_Slider,pitch_Split Finger
0,4.9868,5.6846,2314.0,3.2971,1.5059,-9.8035,-138.113,0.1339,1,0,1,0,0,0,0,0,0,0
1,4.1077,6.3981,2324.0,3.3136,1.573,-9.0084,-140.5865,-2.4218,1,0,0,1,0,0,0,0,0,0
2,2.3349,4.9982,2521.0,3.9119,1.708,-3.7285,-117.3223,1.214,1,0,0,0,0,0,0,0,1,0
3,4.4509,6.1538,2329.0,3.5553,1.5639,-12.0533,-139.3669,-5.1407,1,0,0,1,0,0,0,0,0,0
4,4.5827,5.8014,2437.0,3.345,1.6241,-14.0287,-139.8559,-3.3434,1,0,0,1,0,0,0,0,0,0


# Logistic Regression 

## 1: Without PCA
Without PCA

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 777)

logit_reg = LogisticRegression()
sampler = RandomUnderSampler(ratio = 1, random_state=777)

logit_pipe_no_pca = make_pipeline(sampler, logit_reg)

logit_pipe_no_pca_results = cross_validate(logit_pipe_no_pca, X_train, y_train, 
                            scoring = ['accuracy', 'f1', 'roc_auc'], 
                            cv =5, return_estimator=True, return_train_score = True)

for result in ['train_accuracy', 'test_accuracy', 'train_f1', 'test_f1', 'train_roc_auc', 'test_roc_auc']:
    print(f"Mean {result} Value: {np.mean(logit_pipe_no_pca_results[result])}")
    print(f"{result} scores: {logit_pipe_no_pca_results[result]}")
    print() 

Mean train_accuracy Value: 0.5727747981838016
train_accuracy scores: [0.57312831 0.57490711 0.57148128 0.5706907  0.5736666 ]

Mean test_accuracy Value: 0.5724902332588193
test_accuracy scores: [0.56609044 0.57557711 0.5769474  0.56962159 0.57421463]

Mean train_f1 Value: 0.4892347539638376
train_f1 scores: [0.48980299 0.48991272 0.48922603 0.48660658 0.49062544]

Mean test_f1 Value: 0.4888899644589613
test_f1 scores: [0.48046949 0.49073547 0.49474413 0.48796087 0.49053986]

Mean train_roc_auc Value: 0.5892749975200653
train_roc_auc scores: [0.59084795 0.59002018 0.5871227  0.58921055 0.5891736 ]

Mean test_roc_auc Value: 0.588528203305888
test_roc_auc scores: [0.58262026 0.5909469  0.59483526 0.58692751 0.58731108]



In [4]:
logit_pipe_no_pca_results['estimator'][0][1].coef_

array([[-4.02404641e-02,  2.66302193e-01,  1.83339257e-04,
         6.41872180e-01, -1.20465341e+00, -2.32675213e-02,
         1.75793975e-02,  8.20138388e-02, -6.01481291e-02,
         7.70620101e-02,  4.19285845e-01,  3.50362920e-01,
        -4.86525257e-01, -1.02239139e-01,  6.28436745e-02,
         5.33577947e-01, -1.00459398e-01, -6.26897566e-01]])

In [5]:
coefs = [coef for coef in logit_pipe_no_pca_results['estimator'][0][1].coef_ ]
print("Feature Coefficient Values: \n")
for col, coef in zip(X_train.columns, coefs[0]):
    print( col, coef)

Feature Coefficient Values: 

position_x -0.04024046414317446
position_z 0.2663021928355917
release_spin_rate 0.00018333925717775937
sz_top 0.6418721803925094
sz_bot -1.2046534137235971
vx0 -0.023267521254397957
vy0 0.017579397511534072
vz0 0.08201383881658676
pitch_L -0.06014812908005663
pitch_R 0.07706201009126125
pitch_2-Seam Fastball 0.4192858449577251
pitch_4-Seam Fastball 0.3503629198025354
pitch_Changeup -0.48652525699550225
pitch_Curveball -0.1022391392576193
pitch_Cutter 0.06284367445160861
pitch_Sinker 0.5335779473753631
pitch_Slider -0.10045939781937029
pitch_Split Finger -0.6268975662471908


## Interpretation

## draw a few rules. since negative, if this goes up, more toward ball yhat goes more toward 0, the chance of strike goes down.

Looking at the coefficient values, it apppers that a fastball and its derivatives, a 2-seam fastball and a sinker, have a positive affect on determining a ball or a strike. Off speed pitches, like a Changeup, seem to have the opposite affect.Thus, one 'rule' that is inferable from the results is that throwing a fastball increases the chances of throwing a strike, while throwing an off speed deacreases the liklihood of a strike. 

This, however, also assumes that the values of the other pitches are held constant. In delivering a baseball pitch, that assumption simply does not hold; the process is too complex to be able to constantly provide the same values of velocity and movement. 

Of course, just the type of pitch alone is not very useful in determining a strike or ball.  Looking at the other features, the position of the pitch in the z dimension also seems to show that increasing the value of this feature leads to a higher chance of prediction for a strike. The top of the strike zone also has a significant coefficient value, which intuitively makes sense as well; the bigger a batter is, the larger his strike zone is, which allows for a better chance of the pitcher throwing a strike. 

Thus, some human subjective rules that we can infer is that fastball and related pitches thrown with a higher level trajectory may have a higher chance of being called a strike.  

In [6]:
predictions = logit_pipe_no_pca_results['estimator'][0].predict(X_test)

def eval_test_set(x_test_predictions, y_test):
    print(f"Accuracy Score: {metrics.accuracy_score(y_test, x_test_predictions)}")
    print()
    print(f"AUC Score: {metrics.roc_auc_score(y_test, x_test_predictions)}")
    print()
    print(f"F1 Score: {metrics.f1_score(y_test, x_test_predictions)}")
    print()
    print(f"Classification Report: \n {metrics.classification_report(y_test, x_test_predictions)}")
    print()
    print(f" Confustion Matrix: \n {metrics.confusion_matrix(y_test, x_test_predictions)}")

eval_test_set(predictions, y_test)



Accuracy Score: 0.5735698700313063

AUC Score: 0.5910787303168575

F1 Score: 0.49168834106072606

Classification Report: 
               precision    recall  f1-score   support

           0       0.76      0.54      0.63     21437
           1       0.40      0.64      0.49     10186

    accuracy                           0.57     31623
   macro avg       0.58      0.59      0.56     31623
weighted avg       0.64      0.57      0.59     31623


 Confustion Matrix: 
 [[11616  9821]
 [ 3664  6522]]


## With PCA


In [65]:
logit_pipe_with_pca_cv_results_lst = []
for n_components in [2,3,4,5,6]:
    print(f"Number of Components: {n_components}")
    print('-'*75)
    print()
    pca = PCA(n_components=n_components)
    
    pipe = Pipeline(steps = [
        ('sampler', sampler), 
        ('pca', pca),
        ('logit', logit_reg)
    ])
    
    logit_pipe_with_pca_results = cross_validate(pipe, X_train, y_train, scoring = ['accuracy', 'f1', 'roc_auc'], 
                                cv =3, return_estimator=True, return_train_score = True)

    logit_pipe_with_pca_cv_results_lst.append(logit_pipe_with_pca_results)
    for result in ['train_accuracy', 'test_accuracy', 'train_f1', 'test_f1', 'train_roc_auc', 'test_roc_auc']:
        print(f"Mean {result} Value: {np.mean(logit_pipe_with_pca_results[result])}")
        print(f"{result} scores: {logit_pipe_with_pca_results[result]}")
        print() 

Number of Components: 2
---------------------------------------------------------------------------

Mean train_accuracy Value: 0.490534213157147
train_accuracy scores: [0.49145387 0.48908214 0.49106663]

Mean test_accuracy Value: 0.49097690581665704
test_accuracy scores: [0.49286911 0.49198368 0.48807792]

Mean train_f1 Value: 0.41868041599152833
train_f1 scores: [0.41869544 0.41915479 0.41819102]

Mean test_f1 Value: 0.4187723353159356
test_f1 scores: [0.41689997 0.41905037 0.42036666]

Mean train_roc_auc Value: 0.5154582861550399
train_roc_auc scores: [0.51482316 0.51512406 0.51642765]

Mean test_roc_auc Value: 0.5151094108313564
test_roc_auc scores: [0.51562455 0.51690123 0.51280245]

Number of Components: 3
---------------------------------------------------------------------------

Mean train_accuracy Value: 0.48743516277632803
train_accuracy scores: [0.48615701 0.48675785 0.48939063]

Mean test_accuracy Value: 0.4867710589062379
test_accuracy scores: [0.48999146 0.48673434 0.483

In [66]:
best_logit_pipe_with_pca_predictions = logit_pipe_with_pca_cv_results_lst[-1]['estimator'][0].predict(X_test)

eval_test_set(best_logit_pipe_with_pca_predictions, y_test)

Accuracy Score: 0.5748663947127091

AUC Score: 0.5748512179227437

F1 Score: 0.4655323209032361

Classification Report: 
               precision    recall  f1-score   support

           0       0.74      0.57      0.65     21437
           1       0.39      0.57      0.47     10186

    accuracy                           0.57     31623
   macro avg       0.57      0.57      0.56     31623
weighted avg       0.63      0.57      0.59     31623


 Confustion Matrix: 
 [[12324  9113]
 [ 4331  5855]]


In [60]:
logit_pipe_with_pca_cv_results_lst[-2]['estimator'][0]

Pipeline(memory=None,
         steps=[('sampler',
                 RandomUnderSampler(random_state=777, ratio=1,
                                    replacement=False, return_indices=False,
                                    sampling_strategy=1)),
                ('pca',
                 PCA(copy=True, iterated_power='auto', n_components=5,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('logit',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='warn', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)