In [None]:
PUT THE CONSTANTS IMPORT FROM BUILD DATASETS INTO AN INIT??

In [1]:
# General Imports
import pickle as pkl
import numpy as np
import pandas as pd
import warnings

# Domain Imports
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from xgboost import XGBClassifier
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import log_loss
import tensorflow as tf
tf.config.set_visible_devices([], 'GPU')
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from IPython.display import clear_output

# Local Package Imports
from build_datasets.build_datasets import dataset_builder
import gcloud_helper as gc

# Turn Off Warnings
warnings.filterwarnings("ignore")


## Define General Functions and Constants

In [2]:
def build_train_test_set(fpath, encode_y=False, y_col='play_type'):
    with open(fpath, "rb") as file:
        data = pkl.load(file)
        x = data.drop(columns=['play_type', 'is_on_base'])#, 'batter', 'pitcher'])
        y = data[y_col]

        if encode_y:
            encoder = OrdinalEncoder()
            y = encoder.fit_transform(np.array(y).reshape(-1,1))
            y = [int(x[0]) for x in y]
            with open('data/y-label_encoder.pkl', 'wb') as fpath:
                pkl.dump(encoder, fpath)

    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size=.2)

    return x_train, x_test, y_train, y_test

In [8]:
rolling_windows = [[20, 45, 75, 504],
                   ]

nonML_fpaths = [
    f"../build_datasets/data/processed_data/final_dataset_nonML_2021-2023_rolling_windows_{'_'.join(map(str, windows))}" 
    if len(windows) > 0 
    else f"../build_datasets/Data/final_dataset_nonML_2021-2023_rolling_windows_{windows[0]}"
    for windows in rolling_windows
]

In [9]:
strat_kfold = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# Create global training and testing data
# Define the filepath to grab data
fpath = f"../build_datasets/data/processed_data/final_dataset_nonML_2021-2023_rolling_windows_20_45_75_504"

# Create the train and test sets
global_x_train, global_x_test, global_y_train, global_y_test = build_train_test_set(fpath)

# Create encoded versions of y_train and y_test
global_y_train_encoded = [int(x[0]) for x in OrdinalEncoder().fit_transform(np.array(global_y_train).reshape(-1,1))]
global_y_test_encoded = [int(x[0]) for x in OrdinalEncoder().fit_transform(np.array(global_y_test).reshape(-1,1))]

# Models
## Dumb Guesser Model

In [10]:
dataset_fpath = f"../build_datasets/data/processed_data/final_dataset_nonML_2021-2023_rolling_windows_20_45_75_504"
x_train, x_test, y_train, y_test = build_train_test_set(dataset_fpath)

dumb_log_loss_plays = 0
league_averages_plays = {}
for play in np.unique(y_train):
    play_share = len(y_train[y_train == play])/len(y_train)

    league_averages_plays[play] = play_share
    dumb_log_loss_plays += play_share * np.log(play_share) # Generalization of the log loss formula to a baseline guesser case
    
    
print("The Crappy Average Estimator for Predicting Plays Has a Neg Log Loss of: {}".format(round(dumb_log_loss_plays,3)))

The Crappy Average Estimator for Predicting Plays Has a Neg Log Loss of: -1.642


## Logistic Model

In [11]:
# Define the pipeline for a Logistic Model
logistic_pipe = dataset_builder().ml_pipe(model=LogisticRegression(max_iter = 1000))

### Determine Best Dataset

In [12]:
for dataset_fpath in nonML_fpaths:
    x_train, x_test, y_train, y_test = build_train_test_set(dataset_fpath)

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        logistic_pipe.fit(x_train, y_train)

    # Get predicted probabilities for the training and test sets
    y_train_prob = logistic_pipe.predict_proba(x_train)

    # Calculate negative log loss
    train_neg_log_loss = log_loss(y_train, y_train_prob)

    print(f'Train Negative Log Loss on Windows = {dataset_fpath.split("s_")[-1]}: {train_neg_log_loss:.4f}')

Train Negative Log Loss on Windows = 20_45_75_504: 1.5895


### Train Baseline Model

Given the 10_40_75_504 dataset is the most accurate, we move forward with that for Logistic Optimization

In [13]:
# Make the baseline model with a warning catcher for logistic convergence failure
score = cross_val_score(logistic_pipe, global_x_train,
                         global_y_train, cv=strat_kfold, scoring='neg_log_loss', n_jobs=4).mean()

print(f"Logistic Baseline Score: {round(score, 2)}")

Logistic Baseline Score: -1.59


### Determine Optimal Model

In [14]:
# Build a parameter dictionary to grid search the model on 
#parameters = {'model__C':[.0001, .001, .01, 1, 10, 100], "model__class_weight":[None, 'balanced']}
parameters = {'model__C':[.5, 1, 2, 3]}

# Build the GridSearch
grid_search = GridSearchCV(logistic_pipe, parameters, cv=strat_kfold, scoring='neg_log_loss', n_jobs=4, verbose=3)
grid_search.fit(x_train, y_train)

best_logistic_score = min(grid_search.best_score_, score)

# Clear output from GridSearch Verbose
clear_output(wait=False)

# Print Results
print(f'Best Model Parameters: {grid_search.best_params_}')
print(f"Best Model Score: {grid_search.best_score_}")

Best Model Parameters: {'model__C': 1}
Best Model Score: -1.593079114358973


### Save Best Performing Model

In [16]:
best_logistic_pipe = dataset_builder().ml_pipe(model=LogisticRegression(max_iter = 1000, C=1))
best_logistic_pipe.fit(global_x_train, global_y_train)

In [17]:
with open('data/models/logistic_regression_model.pkl', 'wb') as path:
    pkl.dump(best_logistic_pipe, path)

## XGBoost

In [18]:
# Define the pipeline for a Logistic Model
xgb_pipe = dataset_builder().ml_pipe(model=XGBClassifier())

In [19]:
for dataset_fpath in nonML_fpaths:
    x_train, x_test, y_train, y_test = build_train_test_set(dataset_fpath, encode_y=True)
    x_train_mini, y_train_mini = x_train[:100000], y_train[:100000]

    # Fit the pipe to our data
    xgb_pipe.fit(x_train_mini, y_train_mini)

    # Get predicted probabilities for the training and test sets
    y_train_prob = xgb_pipe.predict_proba(x_train)

    # Calculate negative log loss
    train_neg_log_loss = log_loss(y_train, y_train_prob)

    print(f'Train Negative Log Loss on Windows = {dataset_fpath.split("s_")[-1]}: {train_neg_log_loss:.4f}')

Train Negative Log Loss on Windows = 20_45_75_504: 1.4754


### Train Baseline Model
Once again we move forward with the 10_40_75_504 dataset

In [22]:
# Define the filepath to grab data
fpath = f"../build_datasets/data/processed_data/final_dataset_nonML_2021-2023_rolling_windows_20_45_75_504"

# Create the train and test sets
x_train, x_test, y_train, y_test = build_train_test_set(fpath, encode_y=True)

# Make the baseline model with a warning catcher for logistic convergence failure
score = cross_val_score(xgb_pipe, x_train,
                         y_train, cv=strat_kfold, scoring='neg_log_loss', n_jobs=4).mean()

print(f"XGBoost Baseline Score: {round(score, 2)}")

Python(53915) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(53916) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(53917) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(53918) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


XGBoost Baseline Score: -1.6


### Determine Optimal Model

In [23]:
# Build a parameter dictionary to grid search the model on 
# parameters = {
#     'model__learning_rate': [0.01, 0.1, 0.2],
#     'model__n_estimators': [100, 200, 300],
#     'model__max_depth': [3, 5, 7],
#     'model__min_child_weight': [1, 3, 5],
#     'model__subsample': [0.6, 0.8, 1.0],
#     'model__colsample_bytree': [0.3, 0.5, 0.7],
#     'model__gamma': [0, 0.1, 0.2],
#     'model__alpha': [0, 0.1, 1],
#     'model__lambda': [0, 0.1, 1],
# }

parameters = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [3, 5],
    'model__subsample': [0.7, 1.0],
    'model__colsample_bytree': [0.7, 1.0],
    'model__gamma': [0, 0.1, 0.5],
}

# Build the GridSearch
grid_search = GridSearchCV(xgb_pipe, parameters, cv=strat_kfold, scoring='neg_log_loss', n_jobs=4, verbose=3)
grid_search.fit(x_train_mini, y_train_mini)

best_xgb_score = min(grid_search.best_score_, score)

# Clear output from GridSearch Verbose
clear_output(wait=False)

# Print Results
print(f'Best Model Parameters: {grid_search.best_params_}')
print(f"Best Model Score: {grid_search.best_score_}")

Best Model Parameters: {'model__colsample_bytree': 0.7, 'model__gamma': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__subsample': 1.0}
Best Model Score: -1.5913549915516465


### Save Best Performing Model

In [24]:
best_xgb_pipe = dataset_builder().ml_pipe(model=grid_search.best_estimator_['model'])\

best_xgb_pipe.fit(global_x_train, global_y_train_encoded)

Becuase we did a grid search on the mini x_train, we do a quick check on the scoring for the full dataset

In [25]:
# Make the baseline model with a warning catcher for logistic convergence failure
score = cross_val_score(best_xgb_pipe, x_train,
                         y_train, cv=strat_kfold, scoring='neg_log_loss', n_jobs=4).mean()

print(f"XGBoost Optimal Score: {round(score, 2)}")

XGBoost Optimal Score: -1.58


In [27]:
with open('data/models/XGBoost_model.pkl', 'wb') as path:
    pkl.dump(best_xgb_pipe, path)

## Neural Network

In [29]:
NN_pipe = dataset_builder().ml_pipe(model=None)
x_train = NN_pipe.fit_transform(global_x_train)

# Ensure data is in the correct format
x_train = np.array(x_train, dtype=np.float32)
y_train_encoded = np.array(global_y_train_encoded, dtype=np.int32)

# Early stopping callback to prevent overfitting
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,  # Number of epochs with no improvement before stopping
    restore_best_weights=True  # Restore the best weights after stopping
)

# Define the model
model = Sequential([
    Dense(64, input_shape=(108,), activation='relu'),
    Dense(32, activation='relu'),
    Dense(12, activation='softmax')
])

# Compile the model
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['sparse_categorical_crossentropy']
)

# Fit the model
# Fit the model
history = model.fit(x_train, y_train_encoded, validation_split=.2,
                        epochs=100, batch_size=32, callbacks=[early_stopping], verbose=1)

# Save the model
model.save('data/models/NN_64_32.keras')

Epoch 1/100
[1m8822/8822[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 626us/step - loss: 1.6604 - sparse_categorical_crossentropy: 1.6604 - val_loss: 1.5877 - val_sparse_categorical_crossentropy: 1.5877
Epoch 2/100
[1m8822/8822[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 599us/step - loss: 1.5861 - sparse_categorical_crossentropy: 1.5861 - val_loss: 1.5844 - val_sparse_categorical_crossentropy: 1.5844
Epoch 3/100
[1m8822/8822[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 633us/step - loss: 1.5800 - sparse_categorical_crossentropy: 1.5800 - val_loss: 1.5842 - val_sparse_categorical_crossentropy: 1.5842
Epoch 4/100
[1m8822/8822[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 567us/step - loss: 1.5781 - sparse_categorical_crossentropy: 1.5781 - val_loss: 1.5826 - val_sparse_categorical_crossentropy: 1.5826
Epoch 5/100
[1m8822/8822[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 572us/step - loss: 1.5758 - sparse_categorical_crossentropy: 1.5758 

# Testing Models

## Logistic Model

In [30]:
with open('data/models/logistic_regression_model.pkl', 'rb') as fpath:
    logistic_pipe = pkl.load(fpath)

test_predictions = logistic_pipe.predict_proba(global_x_test)
test_score = log_loss(global_y_test, test_predictions)

print(f"The Logistic Model's Final Test Score: {round(test_score, 2)}")

The Logistic Model's Final Test Score: 1.59


In [31]:
with open('data/models/XGBoost_model.pkl', 'rb') as fpath:
    xgb_pipe = pkl.load(fpath)

test_predictions = xgb_pipe.predict_proba(global_x_test)
test_score = log_loss(global_y_test, test_predictions)

print(f"The XGBoost Model's Final Test Score: {round(test_score, 2)}")

The XGBoost Model's Final Test Score: 1.58


In [32]:
NN_model = tf.keras.models.load_model('data/models/NN_64_32.keras')

test_pipe = dataset_builder().ml_pipe(model=None).fit(global_x_train)
processed_x_test = test_pipe.transform(global_x_test)

test_predictions = NN_model.predict(processed_x_test)
test_score = log_loss(global_y_test, test_predictions)

print(f"The Neual Network's Final Test Score: {round(test_score, 2)}")

[1m2757/2757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 249us/step


ValueError: y_true and y_pred contain different number of classes 11, 12. Please provide the true labels explicitly through the labels argument. Classes found in y_true: ['double' 'double_play' 'error' 'field_out' 'fielders_choice' 'home_run'
 'sacrifice' 'single' 'strikeout' 'triple' 'walk']

Because the model retains it's low log loss, and given it is easily integreated into the pipe format, we will carry the XGBoost model forward into predictions. We start by training it on the FULL dataset from 2016-2018 so that we eventually model games from 2019 with it.

In [33]:
fpath="../build_datasets/data/processed_data/final_dataset_nonML_2021-2023_rolling_windows_20_45_75_504"
with open(fpath, "rb") as file:
    data = pkl.load(file)
    x = data.drop(columns=['play_type', 'is_on_base'])#, 'batter', 'pitcher'])
    y = data['play_type']
    y = [int(x[0]) for x in OrdinalEncoder().fit_transform(np.array(y).reshape(-1,1))]

final_pipe = dataset_builder().ml_pipe(model=XGBClassifier(colsample_bytree=0.7, gamma=0, max_depth=3, n_estimators=100, subsample=1))

final_pipe.fit(x, y)

In [34]:
with open('data/models/final_model_xgb.pkl' , 'wb') as fpath:
    pkl.dump(final_pipe, fpath)