In [44]:
# General Imports
import pickle as pkl
import numpy as np
import pandas as pd
import warnings
import joblib

# Domain Imports
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from xgboost import XGBClassifier
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import log_loss
import tensorflow as tf
tf.config.set_visible_devices([], 'GPU')
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from keras import regularizers
from IPython.display import clear_output

# Local Package Imports
from build_datasets.dataset_builder import DatasetBuilder
from multimodal_communication import cloud_functions as cf
from utils import ml_pipe

# Turn Off Warnings
warnings.filterwarnings("ignore")


## Define General Functions and Constants

In [2]:
def build_train_test_set(fpath, encode_y=False, y_col='play_type'):
    """Load a pickled dataset and split it 80/20 into train and test sets.

    Parameters
    ----------
    fpath : str
        Path to a pickle file. The unpickled object must support
        ``.drop(columns=...)`` and column lookup by name.
    encode_y : bool, default False
        When True, the target column is ordinal-encoded to a list of Python
        ints and the fitted encoder is pickled to 'data/y-label_encoder.pkl'.
    y_col : str, default 'play_type'
        Name of the column used as the target.

    Returns
    -------
    x_train, x_test, y_train, y_test
        Output of ``train_test_split`` with ``random_state=42`` and
        ``test_size=.2``.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(fpath, "rb") as dataset_file:
        dataset = pkl.load(dataset_file)

    # Both candidate target columns are removed from the feature matrix.
    features = dataset.drop(columns=['play_type', 'is_on_base'])
    target = dataset[y_col]

    if encode_y:
        label_encoder = OrdinalEncoder()
        encoded_column = label_encoder.fit_transform(np.array(target).reshape(-1,1))
        target = [int(row[0]) for row in encoded_column]
        # Persist the fitted encoder so the integer labels can be mapped back later.
        with open('data/y-label_encoder.pkl', 'wb') as encoder_file:
            pkl.dump(label_encoder, encoder_file)

    x_train, x_test, y_train, y_test = train_test_split(
        features, target, random_state=42, test_size=.2
    )

    return x_train, x_test, y_train, y_test

In [62]:
# Path to the pickled daily-stats dataset that the split-building calls read from
fpath = "../../../../MLB-Data/daily_stats_dfs/daily_stats_df_updated_2025-03-05.pkl"

# Shared cross-validation splitter: 4 stratified, shuffled folds with a fixed seed
strat_kfold = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# Build the train and test sets with integer-encoded labels
(x_train, x_test, y_train, y_test) = build_train_test_set(fpath, encode_y=True)



# Models
## Dumb Guesser Model

In [4]:
# Rebuild the split with the raw (un-encoded) target labels
x_train, x_test, y_train, y_test = build_train_test_set(fpath)

# Baseline: always predict each play type with its share of the training labels.
# Its negative log loss is sum_k p_k * log(p_k) over the observed play types.
observed_plays, play_counts = np.unique(y_train, return_counts=True)
n_train_rows = len(y_train)

league_averages_plays = {}
dumb_log_loss_plays = 0
for play, count in zip(observed_plays, play_counts):
    play_share = int(count) / n_train_rows
    league_averages_plays[play] = play_share
    dumb_log_loss_plays += play_share * np.log(play_share)

print("The Crappy Average Estimator for Predicting Plays Has a Neg Log Loss of: {}".format(round(dumb_log_loss_plays,3)))

The Crappy Average Estimator for Predicting Plays Has a Neg Log Loss of: -1.632


## Logistic Model

In [5]:
# Wrap a logistic regression (iteration cap raised to 1000) in the project's ml_pipe
logistic_model = LogisticRegression(max_iter=1000)
logistic_pipe = ml_pipe(model=logistic_model)

### Determine Best Dataset

### Train Baseline Model

Given the 10_40_75_504 dataset is the most accurate, we move forward with that for Logistic Optimization

In [6]:
# Cross-validate the untuned logistic pipeline to get a baseline neg-log-loss.
# No warning catcher is used here; warnings are already silenced globally in the imports cell.
logistic_fold_scores = cross_val_score(
    logistic_pipe,
    x_train,
    y_train,
    cv=strat_kfold,
    scoring='neg_log_loss',
    n_jobs=4,
)
score = logistic_fold_scores.mean()

print(f"Logistic Baseline Score: {round(score, 2)}")

Logistic Baseline Score: -1.58


### Determine Optimal Model

In [8]:
# Build a parameter dictionary to grid search the model on
# (a wider grid is kept below, commented out, for reference)
#parameters = {'model__C':[.0001, .001, .01, 1, 10, 100], "model__class_weight":[None, 'balanced']}
parameters = {'model__C':[.5, 1, 2, 3]}

# Build the GridSearch
grid_search = GridSearchCV(logistic_pipe, parameters, cv=strat_kfold, scoring='neg_log_loss', n_jobs=10, verbose=3)
grid_search.fit(x_train, y_train)

# 'neg_log_loss' is a higher-is-better score (closer to 0 is better), so the better of
# the tuned score and the baseline `score` from the previous cell is the maximum.
best_logistic_score = max(grid_search.best_score_, score)

# Clear output from GridSearch Verbose
clear_output(wait=False)

# Print Results
print(f'Best Model Parameters: {grid_search.best_params_}')
print(f"Best Model Score: {grid_search.best_score_}")

Best Model Parameters: {'model__C': 0.5}
Best Model Score: -1.5830343452840343


### Save Best Performing Model

In [12]:
# Take the best pipeline found by the logistic grid search and fit it on the training split.
# NOTE(review): the GridSearchCV above is built without `refit=`, so scikit-learn's default
# (refit=True) applies and best_estimator_ should already be fitted on x_train; this
# explicit fit looks redundant — confirm before removing.
best_logistic_pipe = grid_search.best_estimator_
best_logistic_pipe.fit(x_train, y_train)

In [14]:
# Persist the fitted logistic pipeline so the testing section can reload it from disk
with open('data/models/logistic_regression_model.pkl', 'wb') as model_file:
    pkl.dump(best_logistic_pipe, model_file)

## XGBoost

In [15]:
# Wrap an XGBoost classifier (library-default hyperparameters) in the project's ml_pipe
xgb_model = XGBClassifier()
xgb_pipe = ml_pipe(model=xgb_model)

### Train Baseline Model

In [None]:
# Rebuild the split with integer-encoded labels (encode_y=True) for XGBoost
x_train, x_test, y_train, y_test = build_train_test_set(fpath, encode_y=True)

# Cross-validate the untuned XGBoost pipeline to get a baseline neg-log-loss
xgb_fold_scores = cross_val_score(
    xgb_pipe,
    x_train,
    y_train,
    cv=strat_kfold,
    scoring='neg_log_loss',
    n_jobs=10,
)
score = xgb_fold_scores.mean()

print(f"XGBoost Baseline Score: {round(score, 2)}")

XGBoost Baseline Score: -1.64


### Determine Optimal Model

In [18]:
# Build a parameter dictionary to grid search the model on
# (a wider grid is kept below, commented out, for reference)
# parameters = {
#     'model__learning_rate': [0.01, 0.1, 0.2],
#     'model__n_estimators': [100, 200, 300],
#     'model__max_depth': [3, 5, 7],
#     'model__min_child_weight': [1, 3, 5],
#     'model__subsample': [0.6, 0.8, 1.0],
#     'model__colsample_bytree': [0.3, 0.5, 0.7],
#     'model__gamma': [0, 0.1, 0.2],
#     'model__alpha': [0, 0.1, 1],
#     'model__lambda': [0, 0.1, 1],
# }

parameters = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [3, 5],
    'model__subsample': [0.7, 1.0],
    'model__colsample_bytree': [0.7, 1.0],
    'model__gamma': [0, 0.1, 0.5],
}

# Build the GridSearch
grid_search = GridSearchCV(xgb_pipe, parameters, cv=strat_kfold, scoring='neg_log_loss', n_jobs=10, verbose=3)
grid_search.fit(x_train, y_train)

# 'neg_log_loss' is a higher-is-better score (closer to 0 is better), so the better of
# the tuned score and the baseline `score` from the previous cell is the maximum.
best_xgb_score = max(grid_search.best_score_, score)

# Clear output from GridSearch Verbose
clear_output(wait=False)

# Print Results
print(f'Best Model Parameters: {grid_search.best_params_}')
print(f"Best Model Score: {grid_search.best_score_}")

Best Model Parameters: {'model__colsample_bytree': 0.7, 'model__gamma': 0.5, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__subsample': 1.0}
Best Model Score: -1.5805157102571048


### Save Best Performing Model

In [19]:
# Take the best pipeline found by the XGBoost grid search and fit it on the training split.
# NOTE(review): the GridSearchCV above is built without `refit=`, so scikit-learn's default
# (refit=True) applies and best_estimator_ should already be fitted on x_train; this
# explicit fit looks redundant — confirm before removing.
best_xgb_pipe = grid_search.best_estimator_

best_xgb_pipe.fit(x_train, y_train)

Because we did a grid search on the mini x_train, we do a quick check on the scoring for the full dataset

In [20]:
# Persist the fitted XGBoost pipeline so the testing section can reload it from disk
with open('data/models/XGBoost_model.pkl', 'wb') as model_file:
    pkl.dump(best_xgb_pipe, model_file)

## Neural Network

In [63]:
# Rebuild the split with integer-encoded play_type labels for the neural network
x_train, x_test, y_train, y_test = build_train_test_set(
    fpath,
    encode_y=True,
    y_col='play_type',
)

In [50]:
# Fit the model-less pipeline on the raw training features. The transformed matrix gets
# its own name so that `x_train` stays raw: re-running this cell (or fitting another
# pipeline on `x_train` later) then never sees already-transformed data.
NN_pipe = ml_pipe(model=None)
x_train_nn = np.array(NN_pipe.fit_transform(x_train), dtype=np.float32)

# Integer class labels, as required by the sparse categorical cross-entropy loss
y_train_encoded = np.array(y_train, dtype=np.int32)

# Stop once val_loss has not improved for 5 epochs, and keep the best-val_loss model on
# disk. The checkpoint is written to the path the evaluation cell loads
# ('data/models/best_NN_model.keras'); it previously went to the working directory, so
# evaluation could pick up a stale or missing file.
callbacks_list = [keras.callbacks.EarlyStopping(monitor='val_loss', patience=5),
                  keras.callbacks.ModelCheckpoint(filepath='data/models/best_NN_model.keras', monitor='val_loss', save_best_only=True)]

# Define the model. The input width is taken from the transformed features instead of
# being hard-coded.
# NOTE(review): 12 output units presumably equal the number of encoded play types
# (the evaluation cell also uses labels=np.arange(12)) — confirm against the data.
model = Sequential([
    Dense(256, input_shape=(x_train_nn.shape[1],), activation='relu', kernel_regularizer=regularizers.l2()),
    Dropout(0.2),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(12, activation='softmax')
])

# Compile the model. The metric repeats the loss function so the cross-entropy can be
# read without the L2 penalty that is included in the reported `loss`.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['sparse_categorical_crossentropy']
)

# Fit the model, holding out the last 20% of the training rows for validation
history = model.fit(x_train_nn, y_train_encoded, validation_split=.2,
                    epochs=100, batch_size=32, callbacks=callbacks_list, verbose=1)


Epoch 1/100
[1m2454/2454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 744us/step - loss: 2.0301 - sparse_categorical_crossentropy: 1.6798 - val_loss: 1.6263 - val_sparse_categorical_crossentropy: 1.5922
Epoch 2/100
[1m2454/2454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 694us/step - loss: 1.6293 - sparse_categorical_crossentropy: 1.5967 - val_loss: 1.6092 - val_sparse_categorical_crossentropy: 1.5820
Epoch 3/100
[1m2454/2454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 672us/step - loss: 1.6162 - sparse_categorical_crossentropy: 1.5886 - val_loss: 1.6084 - val_sparse_categorical_crossentropy: 1.5824
Epoch 4/100
[1m2454/2454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 690us/step - loss: 1.6135 - sparse_categorical_crossentropy: 1.5886 - val_loss: 1.6011 - val_sparse_categorical_crossentropy: 1.5785
Epoch 5/100
[1m2454/2454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 679us/step - loss: 1.6042 - sparse_categorical_crossentropy: 1.5824 

# Testing Models

## Logistic Model

In [51]:
# Reload the saved logistic pipeline. The handle is deliberately not named `fpath`:
# that name holds the dataset path used by build_train_test_set elsewhere in the notebook,
# and rebinding it to a (closed) file handle breaks any later or re-run call that uses it.
with open('data/models/logistic_regression_model.pkl', 'rb') as model_file:
    logistic_pipe = pkl.load(model_file)

# Score the held-out test split with (positive) log loss
test_predictions = logistic_pipe.predict_proba(x_test)
test_score = log_loss(y_test, test_predictions)

print(f"The Logistic Model's Final Test Score: {round(test_score, 2)}")

The Logistic Model's Final Test Score: 1.57


In [52]:
# Reload the saved XGBoost pipeline. The handle is deliberately not named `fpath`:
# that name holds the dataset path used by build_train_test_set elsewhere in the notebook,
# and rebinding it to a (closed) file handle breaks any later or re-run call that uses it.
with open('data/models/XGBoost_model.pkl', 'rb') as model_file:
    xgb_pipe = pkl.load(model_file)

# Score the held-out test split with (positive) log loss
test_predictions = xgb_pipe.predict_proba(x_test)
test_score = log_loss(y_test, test_predictions)

print(f"The XGBoost Model's Final Test Score: {round(test_score, 2)}")

The XGBoost Model's Final Test Score: 1.57


In [65]:
# Load the saved network from data/models
NN_model = tf.keras.models.load_model('data/models/best_NN_model.keras')

# Reuse the pipeline that was fitted on the raw training features in the NN training
# cell. Fitting a fresh pipeline on `x_train` here is unsafe: the training cell
# overwrites `x_train` with its transformed version, so a new fit would learn a
# different transformation from the one the network was trained on.
test_pipe = NN_pipe
processed_x_test = test_pipe.transform(x_test)

test_predictions = NN_model.predict(processed_x_test)

# Pass the full label set explicitly so the score is defined even if a class is absent from y_test
labels=np.arange(12)
test_score = log_loss(y_test, test_predictions, labels=labels)

print(f"The Neual Network's Final Test Score: {round(test_score, 2)}")

[1m767/767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 532us/step
The Neual Network's Final Test Score: 1.57


Because the model retains its low log loss, and given that it is easily integrated into the pipe format, we will carry the XGBoost model forward into predictions. We start by training it on the FULL dataset from 2016-2018 so that we can eventually model games from 2019 with it.

In [33]:
# Load the full processed dataset used to train the final model.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
fpath="../build_datasets/data/processed_data/final_dataset_nonML_2021-2023_rolling_windows_20_45_75_504"
with open(fpath, "rb") as file:
    data = pkl.load(file)

x = data.drop(columns=['play_type', 'is_on_base'])#, 'batter', 'pitcher'])
y = data['play_type']
# Ordinal-encode the play types to Python ints.
# NOTE(review): this encoder is fitted on this dataset and is not saved here; confirm its
# category order matches the encoder stored in 'data/y-label_encoder.pkl'.
y = [int(row[0]) for row in OrdinalEncoder().fit_transform(np.array(y).reshape(-1,1))]

# Build the final pipeline with the `ml_pipe` helper imported at the top of the notebook.
# The previous `dataset_builder().ml_pipe(...)` referenced a name that is never bound
# (only `DatasetBuilder` and `ml_pipe` are imported) and raised a NameError.
# NOTE(review): gamma=0 differs from the grid-search result printed above (gamma=0.5) —
# confirm which value is intended.
final_pipe = ml_pipe(model=XGBClassifier(colsample_bytree=0.7, gamma=0, max_depth=3, n_estimators=100, subsample=1))

final_pipe.fit(x, y)

In [34]:
# Persist the final fitted pipeline. The handle is deliberately not named `fpath`, so the
# notebook-wide path variable is not rebound to a closed file handle.
with open('data/models/final_model_xgb.pkl' , 'wb') as model_file:
    pkl.dump(final_pipe, model_file)