In [4]:
# Colab-only setup: mount Google Drive at /content/drive so the data and model
# paths used later in this notebook (all under /content/drive/MyDrive) resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [52]:
import itertools
import xgboost as xgb
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score
import time
import pickle

In [42]:
# Folder on the mounted Google Drive that holds the project's input CSV and,
# later on, the pickled model.
data_directory = '/content/drive/MyDrive/FA2023/IntroToAI/jlamber4-CSE30124-Fall2023-submissions/ProjectData/'
# Name of the CSV loaded via `data_directory + filename`.
# NOTE: `filename` is reassigned to 'best_model.pkl' in the model-saving cell,
# so re-run this cell before re-running the CSV load.
filename = 'model_data.csv'

In [44]:
# Load the pitch-level CSV and drop every row that has a missing value in any
# column; the resulting frame is displayed as the cell output.
csv_path = data_directory + filename
data = pd.read_csv(csv_path).dropna()
data

Unnamed: 0.1,Unnamed: 0,batter,p_throws,pitch_type_condensed,out_base_count_state,pitcher,release_speed,release_pos_x,release_pos_z,release_pos_y,...,plate_x,plate_z,stand,sz_bot,sz_top,events,avgEV,avgLA,zSwingPCT,oSwingPCT
0,1,408234,L,CH,2_000_22,682227,84.1,1.01,6.17,54.52,...,0.83,3.09,R,1.80,3.56,foul,78.368750,-9.562500,0.592593,0.232558
1,2,408234,L,CH,1_011_12,676664,85.5,1.59,4.35,54.31,...,1.56,1.95,R,1.83,3.58,ball,78.368750,-9.562500,0.592593,0.232558
2,3,408234,L,CH,1_000_21,663776,81.8,1.24,5.80,54.48,...,0.69,1.09,R,1.79,3.56,ball,78.368750,-9.562500,0.592593,0.232558
3,4,408234,L,CH,1_000_12,669684,84.5,2.39,5.16,54.01,...,0.95,2.54,R,1.77,3.34,strike,78.368750,-9.562500,0.592593,0.232558
4,5,408234,L,CH,0_110_21,664285,89.4,0.66,5.96,54.29,...,0.38,3.07,R,1.82,3.47,strike,78.368750,-9.562500,0.592593,0.232558
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
707452,707453,663609,R,SL,2_100_12,608371,83.3,-1.90,5.70,54.77,...,-0.44,3.42,R,1.65,3.61,foul,80.421696,14.992125,0.682665,0.313929
707453,707454,663609,R,SL,1_110_21,571656,83.2,-1.37,5.70,54.26,...,-0.29,3.95,R,1.82,3.82,ball,80.421696,14.992125,0.682665,0.313929
707454,707455,687093,R,SL,1_000_10,663773,82.7,-1.73,6.09,53.95,...,0.29,0.91,R,1.64,3.36,strike,80.421696,14.992125,0.682665,0.313929
707455,707456,644374,R,SL,2_000_01,665734,86.0,-0.83,6.01,53.96,...,1.75,1.41,R,1.53,3.37,ball,80.421696,14.992125,0.682665,0.313929


In [46]:
# `out_base_count_state` is an underscore-separated string such as '2_000_22'.
# The first field is parsed as the number of outs; the third field holds two
# digits that are parsed as balls (first digit) and strikes (second digit).
state_fields = data['out_base_count_state'].str.split('_')
count_field = state_fields.str[2]

data['outs'] = state_fields.str[0].map(int)
data['balls'] = count_field.str[0].map(int)
data['strikes'] = count_field.str[1].map(int)

# dropna() left gaps in the index; renumber rows 0..n-1 so that positional and
# label-based lookups agree in the cells below.
data = data.reset_index(drop=True)

In [67]:
# Columns fed to the model: pitch release/movement/location, strike-zone
# bounds, the aggregate avgEV/avgLA/swing-rate columns, and the game-state
# counts parsed above.
feature_columns = [
    'p_throws', 'release_speed', 'release_pos_x', 'release_pos_y', 'release_pos_z',
    'pfx_x', 'pfx_z', 'plate_x', 'plate_z',
    'stand', 'sz_bot', 'sz_top', 'avgEV', 'avgLA', 'zSwingPCT', 'oSwingPCT',
    'outs', 'balls', 'strikes',
]
# Handedness columns are strings and get one-hot encoded.
one_hot_columns = ['stand', 'p_throws']

X = pd.get_dummies(data[feature_columns], columns=one_hot_columns)

# Encode the target as integer category codes and keep the code -> label
# lookup so predictions can be translated back to event names.
events_categorical = data['events'].astype('category')
y = events_categorical.cat.codes
category_mapping = dict(enumerate(events_categorical.cat.categories))

In [89]:
# Exhaustive grid search over 3 x 3 x 3 = 27 XGBoost settings, each scored by
# stratified 5-fold cross-validation accuracy.
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [50, 100, 200]
}

# One dict per parameter combination, e.g. {'max_depth': 3, 'learning_rate': 0.01, ...}.
all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]

num_folds = 5
# Fixed random_state so every parameter combination is scored on the same folds.
kf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

# Take the number of classes from the encoded target instead of hard-coding it,
# so the cell keeps working if the set of event labels in the data changes.
num_classes = int(y.nunique())

results = []

for params in all_params:
    print(f"Testing parameters: {params}")
    fold_results = []
    fold_models = []
    for fold_number, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
        print(f"  Cross-validation fold {fold_number}/{num_folds}")
        start_time = time.time()

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model = xgb.XGBClassifier(**params, objective='multi:softprob', eval_metric='mlogloss',
                                  num_class=num_classes, use_label_encoder=False)
        model.fit(X_train, y_train)
        fold_models.append(model)

        # Accuracy on the held-out fold; this is the model-selection metric.
        test_pred = model.predict(X_test)
        test_accuracy = accuracy_score(y_test, test_pred)
        fold_results.append(test_accuracy)

        # The reported time covers both fitting and scoring this fold.
        end_time = time.time()
        print(f"    Completed in {end_time - start_time:.2f} seconds")

    avg_test_accuracy = np.mean(fold_results)

    # 'model' keeps the list of per-fold fitted models; a later cell reads it.
    results.append({
        'params': params,
        'average_test_accuracy': avg_test_accuracy,
        'model': fold_models
    })
    print(results[-1]['average_test_accuracy'])

results_df = pd.DataFrame(results)
print(results_df)

Testing parameters: {'max_depth': 3, 'learning_rate': 0.01, 'n_estimators': 50}
  Cross-validation fold 1/5
    Completed in 32.61 seconds
  Cross-validation fold 2/5
    Completed in 32.08 seconds
  Cross-validation fold 3/5
    Completed in 32.07 seconds
  Cross-validation fold 4/5
    Completed in 32.11 seconds
  Cross-validation fold 5/5
    Completed in 32.51 seconds
0.5594760495173293
Testing parameters: {'max_depth': 3, 'learning_rate': 0.01, 'n_estimators': 100}
  Cross-validation fold 1/5
    Completed in 63.26 seconds
  Cross-validation fold 2/5
    Completed in 59.78 seconds
  Cross-validation fold 3/5
    Completed in 62.50 seconds
  Cross-validation fold 4/5
    Completed in 63.09 seconds
  Cross-validation fold 5/5
    Completed in 62.32 seconds
0.5644041041748367
Testing parameters: {'max_depth': 3, 'learning_rate': 0.01, 'n_estimators': 200}
  Cross-validation fold 1/5
    Completed in 119.56 seconds
  Cross-validation fold 2/5
    Completed in 123.57 seconds
  Cross-va

In [90]:
# Pick the parameter combination with the highest mean cross-validation accuracy.
best_model_index = results_df['average_test_accuracy'].idxmax()
best_params = results_df.loc[best_model_index, 'params']

# Refit the winning configuration on the full data set. Each model stored in
# results_df was fitted on only 4 of the 5 folds, so keeping one of them would
# discard a fifth of the rows and make the final model depend on an arbitrary
# fold.
best_model = xgb.XGBClassifier(**best_params, objective='multi:softprob', eval_metric='mlogloss',
                               num_class=int(y.nunique()), use_label_encoder=False)
best_model.fit(X, y)

print("Best Model Parameters:", best_params)

Best Model Parameters: {'max_depth': 7, 'learning_rate': 0.1, 'n_estimators': 200}


In [97]:
# Display the grid-search summary: one row per parameter combination with its
# mean cross-validation accuracy and the list of per-fold fitted models.
results_df

Unnamed: 0,params,average_test_accuracy,model
0,"{'max_depth': 3, 'learning_rate': 0.01, 'n_est...",0.559476,"[XGBClassifier(base_score=None, booster=None, ..."
1,"{'max_depth': 3, 'learning_rate': 0.01, 'n_est...",0.564404,"[XGBClassifier(base_score=None, booster=None, ..."
2,"{'max_depth': 3, 'learning_rate': 0.01, 'n_est...",0.565828,"[XGBClassifier(base_score=None, booster=None, ..."
3,"{'max_depth': 3, 'learning_rate': 0.05, 'n_est...",0.566359,"[XGBClassifier(base_score=None, booster=None, ..."
4,"{'max_depth': 3, 'learning_rate': 0.05, 'n_est...",0.572261,"[XGBClassifier(base_score=None, booster=None, ..."
5,"{'max_depth': 3, 'learning_rate': 0.05, 'n_est...",0.581088,"[XGBClassifier(base_score=None, booster=None, ..."
6,"{'max_depth': 3, 'learning_rate': 0.1, 'n_esti...",0.572467,"[XGBClassifier(base_score=None, booster=None, ..."
7,"{'max_depth': 3, 'learning_rate': 0.1, 'n_esti...",0.580837,"[XGBClassifier(base_score=None, booster=None, ..."
8,"{'max_depth': 3, 'learning_rate': 0.1, 'n_esti...",0.586635,"[XGBClassifier(base_score=None, booster=None, ..."
9,"{'max_depth': 5, 'learning_rate': 0.01, 'n_est...",0.565999,"[XGBClassifier(base_score=None, booster=None, ..."


In [91]:
# Persist the selected model to Google Drive as a pickle.
# NOTE: this rebinds `filename` (previously the CSV name); the model-loading
# cell below rebuilds `model_path` from it, so the name is kept as is.
data_directory = '/content/drive/MyDrive/FA2023/IntroToAI/jlamber4-CSE30124-Fall2023-submissions/ProjectData/'
filename = 'best_model.pkl'
model_path = data_directory + filename

with open(model_path, 'wb') as model_file:
    pickle.dump(best_model, model_file)

print(f"Model saved successfully at {model_path}")

Model saved successfully at /content/drive/MyDrive/FA2023/IntroToAI/jlamber4-CSE30124-Fall2023-submissions/ProjectData/best_model.pkl


In [93]:
# test prediction function

def predict_outcome_probabilities(features, model=None, category_names=None):
    """Return the predicted probability of each outcome label for one pitch.

    Parameters
    ----------
    features : pandas.Series or pandas.DataFrame
        Feature values laid out like the training matrix. A Series is treated
        as a single row. If a DataFrame with several rows is given, only the
        first row's probabilities are returned.
    model : object, optional
        Fitted classifier exposing ``predict_proba``. Defaults to the
        notebook-level ``best_model``.
    category_names : sequence, optional
        Outcome labels indexed by class code. Defaults to the categories of
        ``data['events']``.

    Returns
    -------
    dict
        Maps each outcome label to its predicted probability.
    """
    if isinstance(features, pd.Series):
        # A row taken from a frame that mixes float and bool columns is an
        # object-dtype Series, so the transposed frame has object columns.
        # infer_objects() restores numeric/bool dtypes, which the model needs.
        features = features.to_frame().transpose().infer_objects()

    # Fall back to the notebook globals only when the caller gave no override.
    if model is None:
        model = best_model
    if category_names is None:
        category_names = data['events'].astype('category').cat.categories

    probabilities = model.predict_proba(features)[0]

    outcome_probabilities = {category_names[i]: prob for i, prob in enumerate(probabilities)}

    return outcome_probabilities

In [92]:
# testing reading in the model works

# Round-trip check: reload the pickle written by the saving cell and run one
# prediction with it. `filename` must still be 'best_model.pkl' at this point.
# pickle.load can execute arbitrary code, so only load files you wrote yourself.
model_path = data_directory + filename

with open(model_path, 'rb') as model_file:
    best_model = pickle.load(model_file)

# Outcome probabilities for the row labelled 1 in the feature matrix.
predict_outcome_probabilities(X.loc[1])