# Phillies Questionnaire

In [141]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

In [142]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [143]:
dat = pd.read_csv('./strikeouts.csv')

In [144]:
dat.head()

Unnamed: 0,Name,Team,fangraphs_id,G,IP,ERA,FIP,xFIP,AVG,K%,BB%,Swing%,Contact%,GB%,LD%,FB%,2ndHalfK%,2ndHalfIP
0,Clayton Kershaw,Dodgers,2036,19,132.1,2.18,3.02,2.76,0.194,0.314,0.044,0.508,0.731,0.453,0.197,0.35,0.249,42.2
1,Max Scherzer,Nationals,3137,18,128.1,2.1,2.62,3.08,0.162,0.355,0.055,0.519,0.692,0.387,0.146,0.467,0.324,72.1
2,Chris Sale,Red Sox,10603,18,127.2,2.75,2.1,2.67,0.198,0.359,0.044,0.513,0.684,0.363,0.218,0.419,0.366,86.2
3,Chris Archer,Rays,6345,19,123.0,3.95,3.17,3.44,0.245,0.285,0.074,0.469,0.719,0.422,0.224,0.354,0.303,78.0
4,Ivan Nova,Pirates,1994,18,120.2,3.21,4.14,4.17,0.252,0.139,0.031,0.495,0.849,0.479,0.228,0.293,0.212,66.1


In [145]:
dat.shape

(296, 18)

In [146]:
# We drop name and team because we can simply use fangraphs_id to identify each player
features = dat.drop(['Name', 'Team', 'fangraphs_id'], axis = 1)
features.shape

(296, 15)

In [147]:
# Labels are the values we want to predict
labels = np.array(features['2ndHalfK%'])

features = features.drop('2ndHalfK%', axis = 1)

# Saving feature names for later use
feature_list = list(features.columns)

features = np.array(features)

In [148]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25,
                                                                           random_state = 42)

In [149]:
# The baseline predictions are first half K%'s
baseline_preds = test_features[:, feature_list.index('K%')]

# Baseline errors, and display average baseline error
baseline_errors = abs(baseline_preds - test_labels)
print('Average baseline error: ', round(np.mean(baseline_errors), 3), 'percent.')

Average baseline error:  0.039 percent.


In [150]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=1000, random_state=42)

# Train the model on training data
rf.fit(train_features, train_labels);

In [151]:
# Make predictions with the test data
predictions = rf.predict(test_features)

# Calculate the absolute errors
errors = abs(predictions - test_labels)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 3), 'percent.')

# Plot predicted vs true prices
#plt.scatter(Y_test, Y_pred, alpha=0.5)
#plt.xlabel("2nd Half K%")
#plt.ylabel("Predicted 2nd Half K%")
#plt.title("Predicted 2nd Half K% vs. 2nd Half K%");

Mean Absolute Error: 0.041 percent.


In [152]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)

# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 3), '%.')

Accuracy: 80.748 %.


## Variable Importances

In [153]:
# Get numerical feature importances
importances = list(rf.feature_importances_)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: K%                   Importance: 0.43
Variable: Contact%             Importance: 0.14
Variable: LD%                  Importance: 0.05
Variable: BB%                  Importance: 0.04
Variable: GB%                  Importance: 0.04
Variable: FB%                  Importance: 0.04
Variable: 2ndHalfIP            Importance: 0.04
Variable: G                    Importance: 0.03
Variable: FIP                  Importance: 0.03
Variable: xFIP                 Importance: 0.03
Variable: AVG                  Importance: 0.03
Variable: Swing%               Importance: 0.03
Variable: IP                   Importance: 0.02
Variable: ERA                  Importance: 0.02


## Using 2 most important features

In [154]:
# New random forest with only the two most important variables
rf_most_important = RandomForestRegressor(n_estimators= 1000, random_state=42)

# Extract the two most important features
important_indices = [feature_list.index('K%'), feature_list.index('Contact%'), feature_list.index('LD%'),
                    feature_list.index('BB%'), feature_list.index('GB%'), feature_list.index('FB%')]
train_important = train_features[:, important_indices]
test_important = test_features[:, important_indices]

# Train the random forest
rf_most_important.fit(train_important, train_labels)

# Make predictions and determine the error
predictions = rf_most_important.predict(test_important)

errors = abs(predictions - test_labels)

# Display the performance metrics
print('Mean Absolute Error:', round(np.mean(errors), 3), 'percent.')

mape = np.mean(100 * (errors / test_labels))
accuracy = 100 - mape

print('Accuracy:', round(accuracy, 2), '%.')

Mean Absolute Error: 0.044 percent.
Accuracy: 79.39 %.
