In [1]:
#Import Libraries
import pandas as pd
import numpy as np
import warnings
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, cross_val_score
from catboost import CatBoostClassifier
from lightgbm.sklearn import LGBMClassifier

warnings.filterwarnings("ignore", category=FutureWarning, message="'force_all_finite' was renamed to 'ensure_all_finite'")

In [2]:
# Read Data
dict_dat = pd.read_csv("/Users/gamerboyj/data-dictionary.csv")
train_dat = pd.read_csv("/Users/gamerboyj/data-train.csv")
test_dat = pd.read_csv("/Users/gamerboyj/data-test.csv")

In [3]:
# Split Data
X_train = train_dat.drop(columns=["pitch_id", "gamedate", "venue_id",
"lf_id", "cf_id", "rf_id", "first_fielder", "is_airout"])
y_train = train_dat["is_airout"]
X_test = test_dat.drop(columns=["pitch_id","gamedate","venue_id","p_airout"])

In [4]:
# Preprocess Data
categorical_feats = ["level", "bat_side", "pitch_side", "inning",
"top", "pre_balls", "pre_strikes", "pre_outs"]
numeric_feats = ["temperature", "exit_speed", "hit_spin_rate", "vert_exit_angle", "horz_exit_angle"]

ct = make_column_transformer(
(make_pipeline(SimpleImputer(), StandardScaler()), numeric_feats),
(OneHotEncoder(handle_unknown='ignore'), categorical_feats))
kf = KFold(n_splits=5, shuffle=True, random_state=1)

In [5]:
# Function that performs hyperparameter tuning, fits the model, and prints the best score and best parameters
models = []

def fit_to_result(name, model, param_grid, X_train, y_train):
    pipe_model = make_pipeline(ct, model)
    model_gs = GridSearchCV(pipe_model, param_grid=param_grid, n_jobs=-1, return_train_score=True, cv=kf)
    model_gs.fit(X_train, y_train)
    model_results_dict = {"Model Name": name, "Best Score": model_gs.best_score_, "Best Params": model_gs.best_params_}
    print(model_results_dict)
    models.append(model_results_dict)

In [6]:
# Hyperparameter optimization for Logistic Regression
lr_name = "Logistic Regression"
lr_param_grid = {
    "logisticregression__C": [0.001, 0.01, 0.1, 1, 10, 100]
    }
lr = LogisticRegression(random_state=1, max_iter=1000)
fit_to_result(lr_name, lr, lr_param_grid, X_train, y_train)

{'Model Name': 'Logistic Regression', 'Best Score': 0.6338514621500639, 'Best Params': {'logisticregression__C': 0.001}}


In [7]:
#Hyperparameter optimization for Random Forest
rf_name = "Random Forest"
rf_param_grid = {
    "randomforestclassifier__n_estimators": [100], 
    "randomforestclassifier__max_depth": [10, 20, None]
    }
rf = RandomForestClassifier(random_state=1)
rf.class_weight = "balanced"
fit_to_result(rf_name, rf, rf_param_grid, X_train, y_train)

{'Model Name': 'Random Forest', 'Best Score': 0.8789990610974454, 'Best Params': {'randomforestclassifier__max_depth': None, 'randomforestclassifier__n_estimators': 100}}


In [8]:
#Hyperparameter optimization for CatBoost
cb_name = "CatBoost" 
cb_param_grid = {
    "catboostclassifier__iterations": [1000], 
    "catboostclassifier__learning_rate": [0.1, 0.3],
    "catboostclassifier__depth": [4, 6, 8]
    }
cb = CatBoostClassifier(random_state=1, verbose=False)
fit_to_result(cb_name, cb, cb_param_grid, X_train, y_train)

{'Model Name': 'CatBoost', 'Best Score': 0.8953065776776701, 'Best Params': {'catboostclassifier__depth': 6, 'catboostclassifier__iterations': 1000, 'catboostclassifier__learning_rate': 0.1}}


In [9]:
#Hyperparameter optimization for LightGBM
lgbm_name = "LightGBM"
lgbm_param_grid = {
    "lgbmclassifier__n_estimators": [100], 
    "lgbmclassifier__learning_rate": [0.1, 0.3],
    "lgbmclassifier__max_depth": [4, 6, 8]
    }
lgbm = LGBMClassifier(random_state=1, verbose=-1)
fit_to_result(lgbm_name, lgbm, lgbm_param_grid, X_train, y_train)



{'Model Name': 'LightGBM', 'Best Score': 0.8928708295989545, 'Best Params': {'lgbmclassifier__learning_rate': 0.3, 'lgbmclassifier__max_depth': 6, 'lgbmclassifier__n_estimators': 100}}


In [10]:
# Show best model with optimal parameters
highest_score_model = max(models, key=lambda x: x["Best Score"])
print(highest_score_model)

{'Model Name': 'CatBoost', 'Best Score': 0.8953065776776701, 'Best Params': {'catboostclassifier__depth': 6, 'catboostclassifier__iterations': 1000, 'catboostclassifier__learning_rate': 0.1}}


In [11]:
# Fit the best model
unique_model_params = {
    "Logistic Regression": {
        "random_state": 1,
        "max_iter": 1000
        },
    "Random Forest": {
        "random_state": 1,
        "class_weight": "balanced"
        },
    "CatBoost": {
        "random_state": 1,
        "verbose": False
        },
    "LightGBM": {
        "random_state": 1,
        "verbose": -1
        }
    }

best_model_name = highest_score_model["Model Name"]
best_params = highest_score_model["Best Params"].copy()

if best_model_name == "Logistic Regression":
    best_model = LogisticRegression(**best_params)
elif best_model_name == "Random Forest":
    best_model = RandomForestClassifier(**best_params)
elif best_model_name == "CatBoost":
    best_params = {
        key.replace('catboostclassifier__', ''): value
        for key, value in highest_score_model['Best Params'].items()
        }
    best_model = CatBoostClassifier(**best_params)
elif best_model_name == "LightGBM":
    best_model = LGBMClassifier(**best_params)
else:
    raise ValueError(f"Unknown model name: {best_model_name}")

pipe_best_model = make_pipeline(ct, best_model)
pipe_best_model.fit(X_train, y_train)
prob = pipe_best_model.predict_proba(X_test)

0:	learn: 0.6003121	total: 13.2ms	remaining: 13.2s
1:	learn: 0.5403708	total: 24.6ms	remaining: 12.3s
2:	learn: 0.4943681	total: 34.2ms	remaining: 11.4s
3:	learn: 0.4543252	total: 43.8ms	remaining: 10.9s
4:	learn: 0.4264925	total: 54.4ms	remaining: 10.8s
5:	learn: 0.4057564	total: 65.4ms	remaining: 10.8s
6:	learn: 0.3894597	total: 76.4ms	remaining: 10.8s
7:	learn: 0.3745433	total: 87.2ms	remaining: 10.8s
8:	learn: 0.3614630	total: 99.3ms	remaining: 10.9s
9:	learn: 0.3546474	total: 111ms	remaining: 10.9s
10:	learn: 0.3452350	total: 121ms	remaining: 10.9s
11:	learn: 0.3387165	total: 132ms	remaining: 10.9s
12:	learn: 0.3326445	total: 143ms	remaining: 10.9s
13:	learn: 0.3254728	total: 154ms	remaining: 10.8s
14:	learn: 0.3217131	total: 165ms	remaining: 10.8s
15:	learn: 0.3180784	total: 178ms	remaining: 11s
16:	learn: 0.3137071	total: 189ms	remaining: 10.9s
17:	learn: 0.3100198	total: 199ms	remaining: 10.9s
18:	learn: 0.3073839	total: 209ms	remaining: 10.8s
19:	learn: 0.3047844	total: 221ms	

In [12]:
# Fill in predicted probability values for p_airout
p_airout_df = pd.DataFrame(prob, columns=["prob_0", "p_airout"])
test_probs = pd.concat([test_dat.drop(columns=["p_airout"]), p_airout_df], axis=1)
print(test_probs.head())

                               pitch_id    gamedate  temperature level  \
0  001e5980-3d49-11ee-a040-75d2e9a8133a  2023-08-17           77     A   
1  004e75c0-3fb0-11ee-8b6f-dfb7a529cdcb  2023-08-20           93     A   
2  006e64c0-22af-11ee-bd6c-4f4458ae8744  2023-07-14           68     A   
3  0073e030-05a9-11ee-933c-3d57a8ddb0c4  2023-06-07           81     B   
4  0088ee40-535a-11ee-bc12-fb267e2cefd4  2023-09-14           80     A   

  bat_side pitch_side  inning  top  pre_balls  pre_strikes  pre_outs  \
0        R          L       4    0          3            2         2   
1        L          L       8    1          3            1         0   
2        R          R       6    0          0            1         2   
3        R          R       5    1          2            2         1   
4        R          R       3    1          2            2         1   

   venue_id  exit_speed  hit_spin_rate  vert_exit_angle  horz_exit_angle  \
0    673187    79.51539    1298.605969        

In [13]:
importance_values = pipe_best_model.named_steps["catboostclassifier"].get_feature_importance()
feature_names = pipe_best_model.named_steps["columntransformer"].get_feature_names_out()
feature_pairs = list(zip(feature_names, importance_values))
feature_pairs.sort(key=lambda x: x[1], reverse=True)
for f, i in feature_pairs:
    print(f"{f}: {i}")

pipeline__horz_exit_angle: 56.89168853826807
pipeline__vert_exit_angle: 15.488457514643343
pipeline__exit_speed: 14.326777578239545
pipeline__hit_spin_rate: 4.11974840148584
pipeline__temperature: 2.1898989695792523
onehotencoder__bat_side_R: 1.4496160035794186
onehotencoder__bat_side_L: 1.3825059814697922
onehotencoder__level_A: 0.3098616254108883
onehotencoder__pre_strikes_1: 0.26120124345608503
onehotencoder__pitch_side_L: 0.24429326600759738
onehotencoder__pre_strikes_2: 0.20242457828303534
onehotencoder__pre_outs_1: 0.19786725418288964
onehotencoder__inning_6: 0.1868417646857508
onehotencoder__top_0: 0.18135943507048335
onehotencoder__pre_strikes_0: 0.17440650205173977
onehotencoder__pre_balls_2: 0.17088764528725167
onehotencoder__level_B: 0.1606707526231668
onehotencoder__pre_balls_0: 0.16010586987087755
onehotencoder__pre_balls_1: 0.16010367079657098
onehotencoder__pre_balls_3: 0.1599895806781692
onehotencoder__inning_1: 0.15792589093006928
onehotencoder__inning_3: 0.15453998895

In [14]:
hpipe_best_model.score(X_train, y_train)

0.9220342315380162

In [16]:
feature_importances = pd.DataFrame({
    "feature": X_train.columns,
    "importance": pipe_best_model.feature_importances_
})

AttributeError: 'Pipeline' object has no attribute 'feature_importances_'