In [1]:
import time

import cuml
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, VarianceThreshold
import xgboost as xgb

import data
import features
from training import calc_result_stats, split_X_y, train
from utils import print_time, write_dict

# Load Data
Load the data, only keeping the columns that we can use in both the training and holdout sets

In [2]:
# Raw columns requested from the training data. The list is passed as-is to
# data.load_train below.
cols = [
    "AwayScore",
    "B1",
    "B2",
    "B3",
    "Balls",
    "BatterHand",
    "DayNight",
    "GameDate",
    "GameNumber",
    "GameSeqNum",
    "HomeScore",
    "Inning",
    "Outs",
    "PAOfInning",
    "PitchBreakHorz",
    "PitchBreakVert",
    "PitchOfPA",
    "PitchResult",
    "PitchType",
    "PitcherHand",
    "PitcherID",
    "PlateLocX",
    "PlateLocZ",
    "ReleaseLocX",
    "ReleaseLocY",
    "ReleaseLocZ",
    "ReleaseSpeed",
    "ReleaseVelocityX",
    "ReleaseVelocityY",
    "ReleaseVelocityZ",
    "Season",
    "SpinRate",
    "Strikes",
    "TopInning",
]

# Training frame, restricted to the columns listed above.
df = data.load_train(columns=cols)
# The holdout set is loaded twice: df_test is cleaned and feature-engineered
# before prediction, while df_test_original is left as loaded so the
# predictions can later be written out next to the raw holdout columns.
df_test = data.load_holdout()
df_test_original = data.load_holdout()

# Clean Data
Here we one-hot encode all categorical columns, make the label column (SwingAndMiss), and then set the index to a unique index. Each of these functions can be found in the data.py file.

In [3]:
def clean_data(df):
    """Return ``df`` after the shared cleaning steps from the ``data`` module.

    The frame is first passed through ``data.one_hot_encode`` and the result
    is then passed through ``data.set_index``. The same function is used for
    both the training and the holdout frames.
    """
    return data.set_index(data.one_hot_encode(df))

In [4]:
# Clean the training frame, then add the label column. The label step is
# applied only here, not inside clean_data, which is also used on the holdout set.
df = data.make_label(clean_data(df))


# Create Features
I decided to create some manual features that I thought could be helpful in creating predictions. Below I create those features. The code for each function can be found in features.py.

In [5]:
def create_features(df):
    """Add the hand-crafted feature columns from ``features.py`` to ``df``.

    The steps run in the order listed below; each one receives the frame
    returned by the previous step, and the frame returned by the last step
    is the return value.
    """
    feature_steps = (
        features.appearance_gap,
        features.pa_of_game,
        features.pa_of_season,
        features.pitch_of_game,
        features.pitch_of_game_pitch_type,
        features.pitch_of_season,
        features.cumulative_avg_movements_game,
        features.cumulative_avg_velocities_game,
        features.runs_ahead,
    )
    for add_feature in feature_steps:
        df = add_feature(df)
    return df

In [6]:
# Add the engineered feature columns to the (already cleaned and labelled) training frame.
df = create_features(df)

# Remove any Intermediate Features
I also removed any features for the Knuckleball pitch type because there were no knuckleballs thrown in the holdout set

In [7]:
# Model input columns. The same list is reused later to select the holdout
# columns, so the training and holdout frames get identical columns in an
# identical order.
# NOTE(review): there is no "PitchType_SL" entry although SL-specific features
# (PitchOfGameSL, ...AvgGameSL) are listed — presumably SL is the dropped
# reference level of the one-hot encoding; confirm against data.one_hot_encode.
final_cols = [
    "Inning",
    "PAOfInning",
    "PitchOfPA",
    "Balls",
    "Strikes",
    "Outs",
    "B1",
    "B2",
    "B3",
    "ReleaseLocX",
    "ReleaseLocY",
    "ReleaseLocZ",
    "ReleaseVelocityX",
    "ReleaseVelocityY",
    "ReleaseVelocityZ",
    "ReleaseSpeed",
    "PitchBreakVert",
    "PitchBreakHorz",
    "SpinRate",
    "PlateLocX",
    "PlateLocZ",
    "BatterHand_L",
    "PitcherHand_L",
    "DayNight_Day",
    "PitchType_CB",
    "PitchType_CF",
    "PitchType_CH",
    "PitchType_FB",
    "PitchType_SF",
    "PitchType_SI",
    "TopInning_BOTTOM",
    "AppearanceGap",
    "PAOfGame",
    "PAOfSeason",
    "PitchOfGame",
    "PitchOfGameCB",
    "PitchOfGameCF",
    "PitchOfGameCH",
    "PitchOfGameFB",
    "PitchOfGameSF",
    "PitchOfGameSI",
    "PitchOfGameSL",
    "PitchOfSeason",
    "PitchBreakHorzAvgGame",
    "PitchBreakHorzAvgGameCB",
    "PitchBreakHorzAvgGameCF",
    "PitchBreakHorzAvgGameCH",
    "PitchBreakHorzAvgGameFB",
    "PitchBreakHorzAvgGameSF",
    "PitchBreakHorzAvgGameSI",
    "PitchBreakHorzAvgGameSL",
    "PitchBreakVertAvgGame",
    "PitchBreakVertAvgGameCB",
    "PitchBreakVertAvgGameCF",
    "PitchBreakVertAvgGameCH",
    "PitchBreakVertAvgGameFB",
    "PitchBreakVertAvgGameSF",
    "PitchBreakVertAvgGameSI",
    "PitchBreakVertAvgGameSL",
    "PitcherID",
    "ReleaseSpeedAvgGame",
    "ReleaseSpeedAvgGameCB",
    "ReleaseSpeedAvgGameCF",
    "ReleaseSpeedAvgGameCH",
    "ReleaseSpeedAvgGameFB",
    "ReleaseSpeedAvgGameSF",
    "ReleaseSpeedAvgGameSI",
    "ReleaseSpeedAvgGameSL",
    "RunsAhead",
]
# Keep only the model inputs plus the SwingAndMiss label column.
df = df.loc[:, final_cols + ["SwingAndMiss"]]

# Training
I decided to use a tree-based model for this classification problem. The main reason was the project requirement to report "what inputs to your model(s) seem to be driving that prediction for that particular pitch the most?" With tree-based methods, it is easy to see which features are most important in the classification.

Because the classes are very imbalanced, I use the scale_pos_weight parameter of XGBoost to give swing and misses (the positive class) more weight during training. When I tried models without weighting, they had high accuracy but a poor F1 score, because they predicted "not a swing and miss" for all pitches.

I use cross validation to tune two parameters of this model: max_depth and scale_pos_weight. The cross-validation function is train(), which is imported from training.py.


In [None]:
# Grid search: scale_pos_weight in 2..10 crossed with max_depth in 4, 6, ..., 12.
# Every combination is cross-validated with train() and its summary statistics
# are collected in all_results.
all_results = []
for weight in range(2, 11):
    for max_depth in range(4, 13, 2):
        candidate_params = dict(
            eval_metric="auc",
            gamma=0,
            objective="binary:logistic",
            max_depth=max_depth,
            n_estimators=1000,
            scale_pos_weight=weight,
            tree_method="gpu_hist",
            use_label_encoder=False,
        )
        clf = xgb.XGBClassifier(**candidate_params)

        results = train(df, clf, feature_selection=None)
        stats = calc_result_stats(results)
        all_results.append(
            dict(gamma=0, max_depth=max_depth, stats=stats, weight=weight)
        )

# Ascending order: the combination with the highest val_acc_1_mean ends up last.
all_results.sort(key=lambda entry: entry["stats"]["val_acc_1_mean"])

In [8]:
# Final Model
# Fit a single classifier on the whole training frame using max_depth=4 and
# scale_pos_weight=9 (the parameters reported in the Results section).
clf = xgb.XGBClassifier(
    eval_metric="auc",
    gamma=0,
    objective="binary:logistic",
    max_depth=4,
    n_estimators=1000,
    scale_pos_weight=9,
    tree_method="gpu_hist",
    use_label_encoder=False
)
train_X, train_y = split_X_y(df)
clf.fit(train_X, train_y)

# Run the holdout frame through the same cleaning, feature creation and column
# selection as the training frame (no label column is created or selected here).
df_test = clean_data(df_test)
df_test = create_features(df_test)
df_test = df_test.loc[:, final_cols]
holdout_predictions = clf.predict(df_test)
df_test["Prediction"] = holdout_predictions
# Presumably the frames are GPU (cuDF) DataFrames and to_pandas() moves them to
# host memory — confirm against data.load_holdout.
df_test = df_test.to_pandas()

# Save original DataFrame with predictions to csv
# NOTE(review): the predictions are attached to df_test_original by position.
# This assumes clean_data/create_features neither reorder nor drop holdout
# rows — confirm, otherwise predictions end up on the wrong pitches.
df_test_original = df_test_original.to_pandas()
df_test_original["Prediction"] = holdout_predictions
df_test_original.to_csv("Predictions.csv", index=False)

# Results
Because the data is imbalanced, I had to decide how to balance overall accuracy and accuracy per class for my final model. I decided to choose a model with a high true positive rate for predicting a swing and miss, at the cost of a higher false positive rate (pitches that were not a swing and miss being predicted as one). I mainly made this decision because the problem set is interested in determining what factors lead to swing and misses. A model that is biased more towards predicting swing and misses will be better suited for this.

The final parameters I chose were a class weight of 9 and a max depth of 4.

Because XGBoost uses a forest of trees, there is no definite order of what features are the most important. However, I aggregated the number of times a feature appeared in the first 7 splits (3 layers of the tree). The following table shows how many times the feature appeared in the tree.

| Feature | Appearances |
|------|------|
| PlateLocX | 500 |
| PlateLocZ | 483 |
| ReleaseVelocityX | 308 |
| PitchBreakVert | 305 |
| PitchBreakHorz | 260 |
| ReleaseLocX | 208 |
| ReleaseVelocityZ | 302 |
| ReleaseLocZ | 189 |
| SpinRate | 170 |
| ReleaseLocY | 168 |


# Find pitchers with best swing and miss rate
The final part of the problem set is to identify which pitchers and pitches in the holdout set are the best at creating swing and misses. To do this, for each pitcher and pitch type in the holdout set I calculated the number of predicted swing-and-miss pitches and the number of pitches thrown. I then removed any pitcher/pitch-type combinations that were in the lower quartile of pitches thrown to make sure there was an adequate sample size for each one.

In [9]:
# Per (PitcherID, PitchType) totals of predicted swing-and-misses and pitches thrown.
# The built-in grouped sum()/size() replace the per-group Python lambdas that
# were previously run through groupby.apply; size() counts every row of a
# group, exactly like g.shape[0] did.
prediction_groups = df_test_original.groupby(["PitcherID", "PitchType"])["Prediction"]
swing_and_miss_ct = prediction_groups.sum()
pitch_count = prediction_groups.size()
holdout_results = pd.DataFrame(
    {"SwingAndMissCt": swing_and_miss_ct, "PitchCount": pitch_count}
)
holdout_results["SwingAndMissAvg"] = (
    holdout_results["SwingAndMissCt"] / holdout_results["PitchCount"]
)

# Drop pitcher/pitch-type combinations at or below the 25th percentile of pitch
# count so every remaining row has a reasonable sample size.
pitch_count_cutoff = holdout_results["PitchCount"].quantile(q=0.25)
holdout_results = holdout_results[holdout_results["PitchCount"] > pitch_count_cutoff]

# Show the ranking twice: by raw predicted swing-and-miss count, then by rate.
print(holdout_results.sort_values(by="SwingAndMissCt", ascending=False))
print(holdout_results.sort_values(by="SwingAndMissAvg", ascending=False))

                     SwingAndMissCt  PitchCount  SwingAndMissAvg
PitcherID PitchType                                             
363727    FB                    583        1483         0.393122
353975    FB                    582        1537         0.378660
353472    SL                    552         965         0.572021
          FB                    519        1334         0.389055
353975    SL                    468         986         0.474645
...                             ...         ...              ...
408384    SI                     15          83         0.180723
363201    CB                     15          56         0.267857
334902    SI                     14         102         0.137255
340757    SI                     13          56         0.232143
348223    CB                     13          50         0.260000

[240 rows x 3 columns]
                     SwingAndMissCt  PitchCount  SwingAndMissAvg
PitcherID PitchType                                             
3

# Future Work
Overall I feel as though there are a lot of improvements that can be made to my work. Firstly, I do not use any temporal component in my model, which could definitely improve accuracy. I could create features based on the results of the most recent previous pitches. Additionally, time series models such as an LSTM could prove useful. 

I did not spend any time trying neural networks due to the interpretability requirements of the assignment, but there are still ways to use feature selection algorithms in combination with neural networks to see what factors are contributing.