In [1]:
import os
from datetime import datetime
import joblib
import pandas as pd
import warnings
from colorama import Fore
import json
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from hyperopt import hp, fmin, tpe
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import numpy as np
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score, precision_score

cols = ['PitcherTeam', 'Pitcher', 'PitcherThrows', 'TaggedPitchType', 'RelSpeed', 'SpinRate', 'SpinAxis', 'Tilt',
        'VertBreak', 'HorzBreak', 'HomeTeam', 'yt_Efficiency']
features = ['RelSpeed', 'SpinRate', 'VertBreak', 'HorzBreak', 'yt_Efficiency', 'SpinAxis']

data = pd.DataFrame()
directory = 'Game Data/Joliet Slammers'
try:
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            file_path = os.path.join(directory, filename)
            df = pd.read_csv(file_path)
            data = pd.concat([data, df], ignore_index=True)
except OSError:
    print("ERROR: File not found, try again")

In [2]:
data.PitcherTeam = data.PitcherTeam.replace({'Joliet slammers': 'Joliet Slammers'})

In [3]:
data = data[((data['HomeTeam'] == 'Joliet Slammers') & (data['PitcherTeam'] == 'Joliet Slammers')) | ((data['HomeTeam'] == 'Schaumburg Boomers') & (data['PitcherTeam'] == 'Schaumburg Boomers')) | ((data['HomeTeam'] == 'Lake Erie Crushers') & (data['PitcherTeam'] == 'Lake Erie Crushers'))].dropna(subset=features + ['TaggedPitchType'])

In [4]:
data['TaggedPitchType'].value_counts()

Fastball     2293
Slider       1057
Sinker        743
Changeup      478
Cutter        446
Curveball     352
Splitter      126
Name: TaggedPitchType, dtype: int64

In [5]:
pitch_name_mapping = {
    'Fastball': 0,
    'Sinker': 1,
    'Cutter': 2,
    'Splitter': 3,
    'Slider': 4,
    'Curveball': 5,
    'Changeup': 6,
}

data['TaggedPitchType'] = data['TaggedPitchType'].replace(pitch_name_mapping)

data['PitcherThrows'] = data['PitcherThrows'].replace({
    'Right': 0,
    'Left': 1,
    'R': 0,
    'L': 1
})

data.loc[data['PitcherThrows'] == 1, 'HorzBreak'] *= -1
data.loc[data['PitcherThrows'] == 1, 'SpinAxis'] = 360 - data.loc[data['PitcherThrows'] == 1, 'SpinAxis']
    
X = data[features]
y = data.TaggedPitchType

In [6]:
def objective(space, X_train, X_test, y_train, y_test):
    model = XGBClassifier(
        max_depth=int(space['max_depth']),
        gamma=space['gamma'],
        reg_alpha=int(space['reg_alpha']),
        reg_lambda=space['reg_lambda'],
        colsample_bytree=space['colsample_bytree'],
        min_child_weight=int(space['min_child_weight']),
        n_estimators=int(space['n_estimators']))
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return -accuracy

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=np.random.seed())

space = {
    'max_depth': hp.quniform("max_depth", 3, 18, 1),
    'gamma': hp.uniform('gamma', 1, 9),
    'reg_alpha': hp.quniform('reg_alpha', 40, 180, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'min_child_weight': hp.quniform('min_child_weight', 0, 10, 1),
    'n_estimators': hp.quniform('n_estimators', 50, 200, 1),
    'seed': 12
}

best_params = fmin(fn=lambda params: objective(params, X_train, X_test, y_train, y_test),
                   space=space,
                   algo=tpe.suggest,
                   max_evals=50)
best_params = {
    'max_depth': int(best_params['max_depth']),
    'gamma': float(best_params['gamma']),
    'reg_alpha': int(best_params['reg_alpha']),
    'reg_lambda': float(best_params['reg_lambda']),
    'colsample_bytree': float(best_params['colsample_bytree']),
    'min_child_weight': int(best_params['min_child_weight']),
    'n_estimators': int(best_params['n_estimators'])
}
print("Best parameters:", best_params)

model = XGBClassifier(**best_params)
model.fit(X_train, y_train)

100%|█████████████████████████████████████████████████| 50/50 [01:08<00:00,  1.38s/trial, best loss: -0.83803457688808]
Best parameters: {'max_depth': 17, 'gamma': 1.398767785636796, 'reg_alpha': 44, 'reg_lambda': 0.4452402022741278, 'colsample_bytree': 0.803505451297938, 'min_child_weight': 6, 'n_estimators': 50}


In [7]:
y_pred_final = model.predict(X_test)
accuracy_final = accuracy_score(y_test, y_pred_final)
print("Final accuracy on test set:", accuracy_final)

Final accuracy on test set: 0.83803457688808


In [8]:
joblib.dump(model, 'model/fl-pitch-tagging-model.joblib')

['model/fl-pitch-tagging-model.joblib']