## Tuning
Now that we know the XGB mode performs best, we can hypertune the parameters to squeeze some extra precision and accuracy out of the model

In [1]:
from pybaseball import statcast, cache
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV
import joblib
from tqdm import tqdm
cache.enable()

In [2]:
gb = joblib.load('models/three-feature.joblib')

In [3]:
data = pd.read_csv('data/data.csv')

In [4]:
columns = ['events', 'launch_speed', 'launch_angle', 'spray_angle']
features = ['launch_speed', 'launch_angle', 'spray_angle']
care_about = ['home_run', 'field_out', 'single', 'double', 'triple']
filtered_df = data[data['events'].isin(care_about)].loc[:, data.columns.isin(columns)].dropna().reset_index(drop=True)

Using a 10,000 row subset of the data in order to perform a randomized search to hypertune the parameters...

In [5]:
filtered_df_train = filtered_df.iloc[0: 10000]

In [6]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

y = le.fit_transform(filtered_df_train.events)

In [7]:
param_grid = {
    'n_estimators': np.arange(50, 151, 10),
    'learning_rate': np.arange(0.01, 0.2, 0.05),
    'max_depth': np.arange(3, 8),
    'min_child_weight': np.arange(1, 6),
    'gamma': np.arange(0, 0.5, 0.1),
    'subsample': np.arange(0.7, 1.0, 0.1),
    'colsample_bytree': np.arange(0.7, 1.0, 0.1)
}

random_search = RandomizedSearchCV(gb, param_distributions=param_grid, n_iter=50, scoring='accuracy', n_jobs=-1, cv=5)

random_search.fit(filtered_df_train[features], y)

best_params = random_search.best_params_
best_model = random_search.best_estimator_

print("Best Hyperparameters:")
print(best_params)

Best Hyperparameters:
{'subsample': 0.7999999999999999, 'n_estimators': 150, 'min_child_weight': 1, 'max_depth': 4, 'learning_rate': 0.060000000000000005, 'gamma': 0.1, 'colsample_bytree': 0.9999999999999999}


In [8]:
X = filtered_df[features]
y = le.fit_transform(filtered_df.events)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

best_model = XGBClassifier(**best_params)

best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')

print("Updated Accuracy:", accuracy)
print("Updated Precision:", precision)

Updated Accuracy: 0.7966784978747645
Updated Precision: 0.7849428369392488


  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
joblib.dump(best_model, 'models/tuned-model.joblib')

['models/tuned-model.joblib']