In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import joblib
import math
import scipy.stats as stats
from catboost import Pool
import optuna
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score, precision_score
import sqlite3

In [3]:
with sqlite3.connect("../../../Desktop/MLB Statcast.db") as conn:
    sc_23 = pd.read_sql_query("SELECT * FROM statcast_data_2023", conn)
    sc_22 = pd.read_sql_query("SELECT * FROM statcast_data_2022", conn)
    sc_21 = pd.read_sql_query("SELECT * FROM statcast_data_2021", conn)
    sc_20 = pd.read_sql_query("SELECT * FROM statcast_data_2020", conn)
conn.close()

In [4]:
spin_23 = pd.read_csv('../../../Documents/spin_dir_2023.csv').drop(columns=['release_speed'])
spin_22 = pd.read_csv('../../../Documents/spin_dir_2022.csv').drop(columns=['release_speed'])
spin_21 = pd.read_csv('../../../Documents/spin_dir_2021.csv').drop(columns=['release_speed'])
spin_20 = pd.read_csv('../../../Documents/spin_dir_2020.csv').drop(columns=['release_speed'])

In [5]:
sc_23 = pd.merge(sc_23, spin_23, left_on=['player_name', 'pitch_type'], right_on=['last_name, first_name', 'api_pitch_type'], how='left')
sc_22 = pd.merge(sc_22, spin_22, left_on=['player_name', 'pitch_type'], right_on=['last_name, first_name', 'api_pitch_type'], how='left')
sc_21 = pd.merge(sc_21, spin_21, left_on=['player_name', 'pitch_type'], right_on=['last_name, first_name', 'api_pitch_type'], how='left')
sc_20 = pd.merge(sc_20, spin_20, left_on=['player_name', 'pitch_type'], right_on=['last_name, first_name', 'api_pitch_type'], how='left')

In [6]:
total_sc = pd.concat([sc_23, sc_22, sc_21, sc_20])

In [7]:
features = ['release_speed', 'spin_axis', 'active_spin', 'pfx_x', 'pfx_z', 'release_extension', 'release_pos_x', 'release_pos_z']
total_sc = total_sc[total_sc[features].notnull().all(axis=1)]

In [8]:
total_sc['pfx_x'] *= 12
total_sc.loc[total_sc['p_throws'] == 'L', 'pfx_x'] *= -1
total_sc.loc[total_sc['p_throws'] == 'L', 'release_pos_x'] *= -1
total_sc.loc[total_sc['p_throws'] == 'L', 'spin_axis'] = 360 - total_sc.loc[total_sc['p_throws'] == 'L', 'spin_axis']
total_sc['pfx_z'] *= 12

In [9]:
total_sc = total_sc[~total_sc['pitch_name'].isin(['Pitch Out', 'Eephus', 'Knuckleball'])]

In [10]:
fastballs = ['4-Seam Fastball', 'Sinker']
offspeed = ['Split-Finger', 'Changeup', 'Forkball']
breaking = ['Curveball', 'Slider', 'Cutter', 'Knuckle Curve', 'Sweeper', 'Screwball', 'Slow Curve']

In [11]:
def get_pitch_data(total_sc, pitch_name, players):
    pitch_data = total_sc.loc[(total_sc['pitch_name'] == pitch_name) & (total_sc['player_name'].isin(players))]
    grouped_data = pitch_data.groupby('player_name').agg({
        'release_speed': 'median',
        'pfx_x': 'median',
        'pfx_z': 'median'
    }).reset_index()
    
    return grouped_data

all_players = total_sc['player_name'].unique()

fastball_data = get_pitch_data(total_sc, '4-Seam Fastball', all_players)
fastball_data = fastball_data.dropna(subset=['release_speed'])
fastball_velo_map = dict(zip(fastball_data['player_name'], fastball_data['release_speed']))
fastball_h_mov_map = dict(zip(fastball_data['player_name'], fastball_data['pfx_x']))
fastball_v_mov_map = dict(zip(fastball_data['player_name'], fastball_data['pfx_z']))

sinker_data = get_pitch_data(total_sc, 'Sinker', all_players)
sinker_data = sinker_data.dropna(subset=['release_speed'])
sinker_velo_map = dict(zip(sinker_data['player_name'], sinker_data['release_speed']))
sinker_h_mov_map = dict(zip(sinker_data['player_name'], sinker_data['pfx_x']))
sinker_v_mov_map = dict(zip(sinker_data['player_name'], sinker_data['pfx_z']))

cutter_data = get_pitch_data(total_sc, 'Cutter', all_players)
cutter_data = cutter_data.dropna(subset=['release_speed'])
cutter_velo_map = dict(zip(cutter_data['player_name'], cutter_data['release_speed']))
cutter_h_mov_map = dict(zip(cutter_data['player_name'], cutter_data['pfx_x']))
cutter_v_mov_map = dict(zip(cutter_data['player_name'], cutter_data['pfx_z']))

In [12]:
pitch_data = {
    '4-Seam Fastball': {
        'velo_map': fastball_velo_map,
        'h_mov_map': fastball_h_mov_map,
        'v_mov_map': fastball_v_mov_map
    },
    'Sinker': {
        'velo_map': sinker_velo_map,
        'h_mov_map': sinker_h_mov_map,
        'v_mov_map': sinker_v_mov_map
    },
    'Cutter': {
        'velo_map': cutter_velo_map,
        'h_mov_map': cutter_h_mov_map,
        'v_mov_map': cutter_v_mov_map
    }
}

def calculate_differences(row):
    player_name = row['player_name']
    for pitch_type in ['4-Seam Fastball', 'Sinker', 'Cutter']:
        if player_name in pitch_data[pitch_type]['velo_map']:
            velo_dif = row['release_speed'] - pitch_data[pitch_type]['velo_map'][player_name]
            h_mov_dif = row['pfx_x'] - pitch_data[pitch_type]['h_mov_map'][player_name]
            v_mov_dif = row['pfx_z'] - pitch_data[pitch_type]['v_mov_map'][player_name]
            return velo_dif, h_mov_dif, v_mov_dif

    return np.nan, np.nan, np.nan

total_sc['velo_dif'], total_sc['h_mov_dif'], total_sc['v_mov_dif'] = zip(*total_sc.apply(calculate_differences, axis=1))

In [13]:
total_sc = total_sc[(total_sc['velo_dif'] <= 0) | (total_sc['pitch_name'].isin(['Sinker', '4-Seam Fastball']))]

In [14]:
features_with_dif = features + ['h_mov_dif', 'v_mov_dif', 'velo_dif']

In [15]:
total_sc['description'] = np.where(total_sc['description'] == 'hit_into_play', total_sc['events'], total_sc['description'])
field_outs = ['force_out', 'grounded_into_double_play', 'fielders_choice_out', 'fielders_choice', 'field_out', 'double_play', 'sac_fly', 'field_error', 'sac_fly_double_play', 'triple_play']
total_sc['description'] = total_sc['description'].replace(field_outs, 'field_out')

In [16]:
pitch_data = {
    '4-Seam Fastball': {
        'velo_map': fastball_velo_map,
        'h_mov_map': fastball_h_mov_map,
        'v_mov_map': fastball_v_mov_map
    },
    'Sinker': {
        'velo_map': sinker_velo_map,
        'h_mov_map': sinker_h_mov_map,
        'v_mov_map': sinker_v_mov_map
    },
    'Cutter': {
        'velo_map': cutter_velo_map,
        'h_mov_map': cutter_h_mov_map,
        'v_mov_map': cutter_v_mov_map
    }
}

def calculate_differences(row):
    player_name = row['player_name']
    for pitch_type in ['4-Seam Fastball', 'Sinker', 'Cutter']:
        if player_name in pitch_data[pitch_type]['velo_map']:
            velo_dif = row['release_speed'] - pitch_data[pitch_type]['velo_map'][player_name]
            h_mov_dif = row['pfx_x'] - pitch_data[pitch_type]['h_mov_map'][player_name]
            v_mov_dif = row['pfx_z'] - pitch_data[pitch_type]['v_mov_map'][player_name]
            return velo_dif, h_mov_dif, v_mov_dif

    return np.nan, np.nan, np.nan

total_sc['velo_dif'], total_sc['h_mov_dif'], total_sc['v_mov_dif'] = zip(*total_sc.apply(calculate_differences, axis=1))

In [17]:
total_sc['whiff'] = total_sc['description'].isin(['swinging_strike', 'swinging_strike_blocked'])
total_sc['foul'] = total_sc['description'].isin(['foul', 'foul_tip'])
total_sc['in_play'] = total_sc['description'].isin(['single', 'double', 'triple', 'home_run', 'field_out'])
total_sc['swing'] = (total_sc['whiff'] | total_sc['foul'] | total_sc['in_play'])

total_sc['take'] = (~total_sc['swing'] & (total_sc['description'].isin(['hit_by_pitch', 'ball', 'called_strike', 'blocked_ball'])))
total_sc['hbp'] = total_sc['description'] == 'hit_by_pitch'
total_sc['ball'] = total_sc['description'].isin(['blocked_ball', 'ball'])
total_sc['strike'] = total_sc['description'] == 'called_strike'

total_sc['single'] = total_sc['description'] == 'single'
total_sc['double'] = total_sc['description'] == 'double'
total_sc['triple'] = total_sc['description'] == 'triple'
total_sc['home_run'] = total_sc['description'] == 'home_run'
total_sc['field_out'] = total_sc['description'] == 'field_out'

In [18]:
total_sc.loc[total_sc['swing'] & total_sc['foul'], 'type_swing'] = 'foul'
total_sc.loc[total_sc['swing'] & total_sc['in_play'], 'type_swing'] = 'in_play'
total_sc.loc[total_sc['swing'] & total_sc['whiff'], 'type_swing'] = 'whiff'

total_sc.loc[total_sc['take'] & total_sc['hbp'], 'type_take'] = 'hbp'
total_sc.loc[total_sc['take'] & total_sc['ball'], 'type_take'] = 'ball'
total_sc.loc[total_sc['take'] & total_sc['strike'], 'type_take'] = 'strike'

total_sc.loc[total_sc['in_play'] & total_sc['single'], 'type_in_play'] = 'single'
total_sc.loc[total_sc['in_play'] & total_sc['double'], 'type_in_play'] = 'double'
total_sc.loc[total_sc['in_play'] & total_sc['triple'], 'type_in_play'] = 'triple'
total_sc.loc[total_sc['in_play'] & total_sc['home_run'], 'type_in_play'] = 'home_run'
total_sc.loc[total_sc['in_play'] & total_sc['field_out'], 'type_in_play'] = 'field_out'

In [19]:
total_sc = total_sc.dropna(subset=['swing', 'take'])
total_sc = total_sc[total_sc['swing'] != total_sc['take']]

In [20]:
from sklearn.preprocessing import LabelEncoder
from hyperopt import hp, fmin, tpe

def objective(space, X_train, X_test, y_train, y_test):
    model = XGBClassifier(
        max_depth=int(space['max_depth']),
        gamma=space['gamma'],
        reg_alpha=int(space['reg_alpha']),
        reg_lambda=space['reg_lambda'],
        colsample_bytree=space['colsample_bytree'],
        min_child_weight=int(space['min_child_weight']),
        n_estimators=int(space['n_estimators']))
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return -accuracy

le_swing = LabelEncoder()

swing_df = total_sc[(total_sc['swing']) & total_sc['pitch_name'].isin(fastballs)]
swing_X = swing_df[features]
swing_y = le_swing.fit_transform(swing_df['type_swing'])

X_train, X_test, y_train, y_test = train_test_split(swing_X, swing_y, test_size=0.2, random_state=np.random.seed())

space = {
    'max_depth': hp.quniform("max_depth", 3, 18, 1),
    'gamma': hp.uniform('gamma', 1, 9),
    'reg_alpha': hp.quniform('reg_alpha', 40, 180, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'min_child_weight': hp.quniform('min_child_weight', 0, 10, 1),
    'n_estimators': hp.quniform('n_estimators', 50, 200, 1),
    'seed': 12
}

best_params = fmin(fn=lambda params: objective(params, X_train, X_test, y_train, y_test),
                   space=space,
                   algo=tpe.suggest,
                   max_evals=10)
best_params = {
    'max_depth': int(best_params['max_depth']),
    'gamma': float(best_params['gamma']),
    'reg_alpha': int(best_params['reg_alpha']),
    'reg_lambda': float(best_params['reg_lambda']),
    'colsample_bytree': float(best_params['colsample_bytree']),
    'min_child_weight': int(best_params['min_child_weight']),
    'n_estimators': int(best_params['n_estimators'])
}
print("Best parameters:", best_params)

swing_model = XGBClassifier(**best_params)
swing_model.fit(X_train, y_train)

100%|████████████████████████████████████████████████| 10/10 [06:16<00:00, 37.69s/trial, best loss: -0.471025326143058]
Best parameters: {'max_depth': 7, 'gamma': 6.506444472877417, 'reg_alpha': 114, 'reg_lambda': 0.3742616375105561, 'colsample_bytree': 0.7097371742998273, 'min_child_weight': 6, 'n_estimators': 175}


In [21]:
swing_df = total_sc[(total_sc['swing']) & total_sc['pitch_name'].isin(breaking)]
swing_X = swing_df[features_with_dif]
swing_y = le_swing.fit_transform(swing_df['type_swing'])

X_train, X_test, y_train, y_test = train_test_split(swing_X, swing_y, test_size=0.2, random_state=np.random.seed())

best_params = fmin(fn=lambda params: objective(params, X_train, X_test, y_train, y_test),
                   space=space,
                   algo=tpe.suggest,
                   max_evals=10)
best_params = {
    'max_depth': int(best_params['max_depth']),
    'gamma': float(best_params['gamma']),
    'reg_alpha': int(best_params['reg_alpha']),
    'reg_lambda': float(best_params['reg_lambda']),
    'colsample_bytree': float(best_params['colsample_bytree']),
    'min_child_weight': int(best_params['min_child_weight']),
    'n_estimators': int(best_params['n_estimators'])
}
print(best_params)

swing_bb_model = XGBClassifier(**best_params)
swing_bb_model.fit(X_train, y_train)

100%|██████████████████████████████████████████████| 10/10 [08:35<00:00, 51.59s/trial, best loss: -0.39594816923058446]
{'max_depth': 17, 'gamma': 1.101935826081121, 'reg_alpha': 54, 'reg_lambda': 0.6853673162324807, 'colsample_bytree': 0.9122069104070036, 'min_child_weight': 5, 'n_estimators': 127}


In [22]:
swing_df = total_sc[(total_sc['swing']) & total_sc['pitch_name'].isin(offspeed)]
swing_X = swing_df[features_with_dif]
swing_y = le_swing.fit_transform(swing_df['type_swing'])

X_train, X_test, y_train, y_test = train_test_split(swing_X, swing_y, test_size=0.2, random_state=np.random.seed())
best_params = fmin(fn=lambda params: objective(params, X_train, X_test, y_train, y_test),
                   space=space,
                   algo=tpe.suggest,
                   max_evals=10)
best_params = {
    'max_depth': int(best_params['max_depth']),
    'gamma': float(best_params['gamma']),
    'reg_alpha': int(best_params['reg_alpha']),
    'reg_lambda': float(best_params['reg_lambda']),
    'colsample_bytree': float(best_params['colsample_bytree']),
    'min_child_weight': int(best_params['min_child_weight']),
    'n_estimators': int(best_params['n_estimators'])
}
print(best_params)

swing_offs_model = XGBClassifier(**best_params)
swing_offs_model.fit(X_train, y_train)

100%|██████████████████████████████████████████████| 10/10 [01:54<00:00, 11.45s/trial, best loss: -0.39421932736268556]
{'max_depth': 16, 'gamma': 1.6972744731109684, 'reg_alpha': 76, 'reg_lambda': 0.45263868449173283, 'colsample_bytree': 0.9928971279776679, 'min_child_weight': 9, 'n_estimators': 52}


In [23]:
le_take = LabelEncoder()

take_total_sc = total_sc[(total_sc['take']) & total_sc['pitch_name'].isin(fastballs)]
take_X = take_total_sc[features]
take_y = le_take.fit_transform(take_total_sc['type_take'])

X_train, X_test, y_train, y_test = train_test_split(take_X, take_y, test_size=0.2, random_state=np.random.seed())
best_params = fmin(fn=lambda params: objective(params, X_train, X_test, y_train, y_test),
                   space=space,
                   algo=tpe.suggest,
                   max_evals=10)
best_params = {
    'max_depth': int(best_params['max_depth']),
    'gamma': float(best_params['gamma']),
    'reg_alpha': int(best_params['reg_alpha']),
    'reg_lambda': float(best_params['reg_lambda']),
    'colsample_bytree': float(best_params['colsample_bytree']),
    'min_child_weight': int(best_params['min_child_weight']),
    'n_estimators': int(best_params['n_estimators'])
}
print(best_params)

take_model = XGBClassifier(**best_params)
take_model.fit(X_train, y_train)

100%|███████████████████████████████████████████████| 10/10 [07:19<00:00, 43.92s/trial, best loss: -0.6398352736200206]
{'max_depth': 18, 'gamma': 1.5046159624200932, 'reg_alpha': 121, 'reg_lambda': 0.9361658555820437, 'colsample_bytree': 0.7379903718211573, 'min_child_weight': 9, 'n_estimators': 77}


In [24]:
take_total_sc = total_sc[(total_sc['take']) & total_sc['pitch_name'].isin(breaking)]
take_X = take_total_sc[features_with_dif]
take_y = le_take.fit_transform(take_total_sc['type_take'])

X_train, X_test, y_train, y_test = train_test_split(take_X, take_y, test_size=0.2, random_state=np.random.seed())
best_params = fmin(fn=lambda params: objective(params, X_train, X_test, y_train, y_test),
                   space=space,
                   algo=tpe.suggest,
                   max_evals=10)
best_params = {
    'max_depth': int(best_params['max_depth']),
    'gamma': float(best_params['gamma']),
    'reg_alpha': int(best_params['reg_alpha']),
    'reg_lambda': float(best_params['reg_lambda']),
    'colsample_bytree': float(best_params['colsample_bytree']),
    'min_child_weight': int(best_params['min_child_weight']),
    'n_estimators': int(best_params['n_estimators'])
}
print(best_params)

take_bb_model = XGBClassifier(**best_params)
take_bb_model.fit(X_train, y_train)

100%|███████████████████████████████████████████████| 10/10 [06:12<00:00, 37.26s/trial, best loss: -0.6955796149112377]
{'max_depth': 14, 'gamma': 7.998365635824791, 'reg_alpha': 70, 'reg_lambda': 0.8576964521702992, 'colsample_bytree': 0.6827406557065885, 'min_child_weight': 1, 'n_estimators': 81}


In [25]:
take_total_sc = total_sc[(total_sc['take']) & total_sc['pitch_name'].isin(offspeed)]
take_X = take_total_sc[features_with_dif]
take_y = le_take.fit_transform(take_total_sc['type_take'])

X_train, X_test, y_train, y_test = train_test_split(take_X, take_y, test_size=0.2, random_state=np.random.seed())
best_params = fmin(fn=lambda params: objective(params, X_train, X_test, y_train, y_test),
                   space=space,
                   algo=tpe.suggest,
                   max_evals=10)
best_params = {
    'max_depth': int(best_params['max_depth']),
    'gamma': float(best_params['gamma']),
    'reg_alpha': int(best_params['reg_alpha']),
    'reg_lambda': float(best_params['reg_lambda']),
    'colsample_bytree': float(best_params['colsample_bytree']),
    'min_child_weight': int(best_params['min_child_weight']),
    'n_estimators': int(best_params['n_estimators'])
}
print(best_params)

take_offs_model = XGBClassifier(**best_params)
take_offs_model.fit(X_train, y_train)

100%|███████████████████████████████████████████████| 10/10 [01:24<00:00,  8.49s/trial, best loss: -0.8033187715980961]
{'max_depth': 17, 'gamma': 4.175757348871056, 'reg_alpha': 126, 'reg_lambda': 0.7519319482190004, 'colsample_bytree': 0.9577193747544183, 'min_child_weight': 5, 'n_estimators': 57}


In [26]:
le_woba = LabelEncoder()

woba_total_sc = total_sc[(total_sc['in_play']) & total_sc['pitch_name'].isin(fastballs)]
woba_X = woba_total_sc[features]
woba_y = le_woba.fit_transform(woba_total_sc['type_in_play'])

X_train, X_test, y_train, y_test = train_test_split(woba_X, woba_y, test_size=0.2, random_state=np.random.seed())
best_params = fmin(fn=lambda params: objective(params, X_train, X_test, y_train, y_test),
                   space=space,
                   algo=tpe.suggest,
                   max_evals=10)
best_params = {
    'max_depth': int(best_params['max_depth']),
    'gamma': float(best_params['gamma']),
    'reg_alpha': int(best_params['reg_alpha']),
    'reg_lambda': float(best_params['reg_lambda']),
    'colsample_bytree': float(best_params['colsample_bytree']),
    'min_child_weight': int(best_params['min_child_weight']),
    'n_estimators': int(best_params['n_estimators'])
}
print(best_params)

woba_model = XGBClassifier(**best_params)
woba_model.fit(X_train, y_train)

100%|███████████████████████████████████████████████| 10/10 [03:27<00:00, 20.71s/trial, best loss: -0.6669216975493126]
{'max_depth': 13, 'gamma': 2.04505664698567, 'reg_alpha': 79, 'reg_lambda': 0.30350013600727177, 'colsample_bytree': 0.6967775298767929, 'min_child_weight': 0, 'n_estimators': 171}


In [27]:
woba_total_sc = total_sc[(total_sc['in_play']) & total_sc['pitch_name'].isin(breaking)]
woba_X = woba_total_sc[features_with_dif]
woba_y = le_woba.fit_transform(woba_total_sc['type_in_play'])

X_train, X_test, y_train, y_test = train_test_split(woba_X, woba_y, test_size=0.2, random_state=np.random.seed())
best_params = fmin(fn=lambda params: objective(params, X_train, X_test, y_train, y_test),
                   space=space,
                   algo=tpe.suggest,
                   max_evals=10)
best_params = {
    'max_depth': int(best_params['max_depth']),
    'gamma': float(best_params['gamma']),
    'reg_alpha': int(best_params['reg_alpha']),
    'reg_lambda': float(best_params['reg_lambda']),
    'colsample_bytree': float(best_params['colsample_bytree']),
    'min_child_weight': int(best_params['min_child_weight']),
    'n_estimators': int(best_params['n_estimators'])
}
print(best_params)

woba_bb_model = XGBClassifier(**best_params)
woba_bb_model.fit(X_train, y_train)

100%|███████████████████████████████████████████████| 10/10 [03:13<00:00, 19.32s/trial, best loss: -0.6787956571507857]
{'max_depth': 5, 'gamma': 6.305448259388227, 'reg_alpha': 137, 'reg_lambda': 0.6543448282696338, 'colsample_bytree': 0.9099347087781734, 'min_child_weight': 4, 'n_estimators': 187}


In [28]:
woba_total_sc = total_sc[(total_sc['in_play']) & total_sc['pitch_name'].isin(offspeed)]
woba_X = woba_total_sc[features_with_dif]
woba_y = le_woba.fit_transform(woba_total_sc['type_in_play'])

X_train, X_test, y_train, y_test = train_test_split(woba_X, woba_y, test_size=0.2, random_state=np.random.seed())
best_params = fmin(fn=lambda params: objective(params, X_train, X_test, y_train, y_test),
                   space=space,
                   algo=tpe.suggest,
                   max_evals=10)
best_params = {
    'max_depth': int(best_params['max_depth']),
    'gamma': float(best_params['gamma']),
    'reg_alpha': int(best_params['reg_alpha']),
    'reg_lambda': float(best_params['reg_lambda']),
    'colsample_bytree': float(best_params['colsample_bytree']),
    'min_child_weight': int(best_params['min_child_weight']),
    'n_estimators': int(best_params['n_estimators'])
}
print(best_params)

woba_offs_model = XGBClassifier(**best_params)
woba_offs_model.fit(X_train, y_train)

100%|███████████████████████████████████████████████| 10/10 [00:47<00:00,  4.72s/trial, best loss: -0.6998256248443079]
{'max_depth': 8, 'gamma': 3.5480265372637545, 'reg_alpha': 165, 'reg_lambda': 0.4066451536879263, 'colsample_bytree': 0.7676352092738795, 'min_child_weight': 2, 'n_estimators': 107}


In [29]:
def will_swing_objective(trial, will_swing_X, will_swing_y):
    X_train, X_test, y_train, y_test = train_test_split(will_swing_X, will_swing_y, test_size=0.2, random_state=np.random.seed())
    
    params = {
        "iterations": trial.suggest_int("iterations", 1000, 2000),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 10),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", .05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100)
    }

    model = CatBoostClassifier(**params, silent=True, thread_count=-1)
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_test)[:, 1]  # Probability of class 1 (swing)
    logloss = log_loss(y_test, y_pred)
    return logloss

will_swing_total_sc = total_sc[total_sc['pitch_name'].isin(fastballs)]
will_swing_X = will_swing_total_sc[features]
will_swing_y = will_swing_total_sc['swing']

study = optuna.create_study(direction='minimize')
study.optimize(lambda trial: will_swing_objective(trial, will_swing_X, will_swing_y), n_trials=5)
best_params = study.best_params
best_ll = study.best_value

print('Best hyperparameters:', best_params)
print('Best logloss:', best_ll)
X_train, X_test, y_train, y_test = train_test_split(will_swing_X, will_swing_y, test_size=0.2, random_state=np.random.seed())

will_swing_model = CatBoostClassifier(**best_params, silent=True)
will_swing_model.fit(X_train, y_train)

[I 2023-12-05 20:25:15,856] A new study created in memory with name: no-name-f3a91d25-0ee8-4bee-95cd-6b1a8cff6bc0
[I 2023-12-05 20:27:03,346] Trial 0 finished with value: 0.6897973040011832 and parameters: {'iterations': 1058, 'learning_rate': 0.0011883310191036082, 'depth': 8, 'colsample_bylevel': 0.9852981154526754, 'min_data_in_leaf': 87}. Best is trial 0 with value: 0.6897973040011832.
[I 2023-12-05 20:28:48,827] Trial 1 finished with value: 0.6883469118826734 and parameters: {'iterations': 1122, 'learning_rate': 0.014061967019782877, 'depth': 10, 'colsample_bylevel': 0.27594471345969973, 'min_data_in_leaf': 62}. Best is trial 1 with value: 0.6883469118826734.
[I 2023-12-05 20:29:39,176] Trial 2 finished with value: 0.6893934765631662 and parameters: {'iterations': 1154, 'learning_rate': 0.047619295040858906, 'depth': 9, 'colsample_bylevel': 0.051787628814383646, 'min_data_in_leaf': 74}. Best is trial 1 with value: 0.6883469118826734.
[I 2023-12-05 20:31:08,202] Trial 3 finished wi

Best hyperparameters: {'iterations': 1122, 'learning_rate': 0.014061967019782877, 'depth': 10, 'colsample_bylevel': 0.27594471345969973, 'min_data_in_leaf': 62}
Best logloss: 0.6883469118826734


<catboost.core.CatBoostClassifier at 0x24563e3e190>

In [30]:
will_swing_total_sc = total_sc[total_sc['pitch_name'].isin(breaking)]
will_swing_X = will_swing_total_sc[features_with_dif]
will_swing_y = will_swing_total_sc['swing']

study = optuna.create_study(direction='minimize')
study.optimize(lambda trial: will_swing_objective(trial, will_swing_X, will_swing_y), n_trials=5)
best_params = study.best_params
best_ll = study.best_value

print('Best hyperparameters:', best_params)
print('Best logloss:', best_ll)
X_train, X_test, y_train, y_test = train_test_split(will_swing_X, will_swing_y, test_size=0.2, random_state=np.random.seed())

will_swing_bb_model = CatBoostClassifier(**best_params, silent=True)
will_swing_bb_model.fit(X_train, y_train)

[I 2023-12-05 20:35:30,688] A new study created in memory with name: no-name-dbeaad5b-2ab9-41e9-bd4e-c95ba06c26e6
[I 2023-12-05 20:36:45,786] Trial 0 finished with value: 0.6850654939790604 and parameters: {'iterations': 1125, 'learning_rate': 0.007445441860169853, 'depth': 6, 'colsample_bylevel': 0.29690503537684476, 'min_data_in_leaf': 34}. Best is trial 0 with value: 0.6850654939790604.
[I 2023-12-05 20:37:55,844] Trial 1 finished with value: 0.6851908415037405 and parameters: {'iterations': 1542, 'learning_rate': 0.011499015251675187, 'depth': 3, 'colsample_bylevel': 0.24487724988951154, 'min_data_in_leaf': 51}. Best is trial 0 with value: 0.6850654939790604.
[I 2023-12-05 20:40:14,242] Trial 2 finished with value: 0.6836111401372141 and parameters: {'iterations': 1372, 'learning_rate': 0.007849841721202708, 'depth': 9, 'colsample_bylevel': 0.409018551868205, 'min_data_in_leaf': 42}. Best is trial 2 with value: 0.6836111401372141.
[I 2023-12-05 20:41:25,993] Trial 3 finished with v

Best hyperparameters: {'iterations': 1162, 'learning_rate': 0.08381307068145152, 'depth': 7, 'colsample_bylevel': 0.976318470250456, 'min_data_in_leaf': 40}
Best logloss: 0.6825149572917193


<catboost.core.CatBoostClassifier at 0x244f257b910>

In [31]:
will_swing_total_sc = total_sc[total_sc['pitch_name'].isin(offspeed)]
will_swing_X = will_swing_total_sc[features_with_dif]
will_swing_y = will_swing_total_sc['swing']

study = optuna.create_study(direction='minimize')
study.optimize(lambda trial: will_swing_objective(trial, will_swing_X, will_swing_y), n_trials=5)
best_params = study.best_params
best_ll = study.best_value

print('Best hyperparameters:', best_params)
print('Best logloss:', best_ll)
X_train, X_test, y_train, y_test = train_test_split(will_swing_X, will_swing_y, test_size=0.2, random_state=np.random.seed())

will_swing_offs_model = CatBoostClassifier(**best_params, silent=True)
will_swing_offs_model.fit(X_train, y_train)

[I 2023-12-05 20:44:15,768] A new study created in memory with name: no-name-90613687-9c08-4ebd-ac00-b4551dd6e196
[I 2023-12-05 20:44:34,004] Trial 0 finished with value: 0.6916488217491829 and parameters: {'iterations': 1298, 'learning_rate': 0.0384869349436604, 'depth': 1, 'colsample_bylevel': 0.07855564587791239, 'min_data_in_leaf': 37}. Best is trial 0 with value: 0.6916488217491829.
[I 2023-12-05 20:45:09,001] Trial 1 finished with value: 0.6884224939237406 and parameters: {'iterations': 1352, 'learning_rate': 0.061758035838566205, 'depth': 6, 'colsample_bylevel': 0.9434781069344901, 'min_data_in_leaf': 99}. Best is trial 1 with value: 0.6884224939237406.
[I 2023-12-05 20:45:50,353] Trial 2 finished with value: 0.6882146713713385 and parameters: {'iterations': 1825, 'learning_rate': 0.02926841819403217, 'depth': 5, 'colsample_bylevel': 0.9621418346806617, 'min_data_in_leaf': 28}. Best is trial 2 with value: 0.6882146713713385.
[I 2023-12-05 20:46:18,447] Trial 3 finished with valu

Best hyperparameters: {'iterations': 1825, 'learning_rate': 0.02926841819403217, 'depth': 5, 'colsample_bylevel': 0.9621418346806617, 'min_data_in_leaf': 28}
Best logloss: 0.6882146713713385


<catboost.core.CatBoostClassifier at 0x2454a122100>

In [59]:
fl = pd.read_csv('../../../Desktop/Joliet Slammers/AutomatedPitchTagging/AutoTaggedCSVs/2023 Frontier League Autotagged.csv')

fl = fl.rename(columns={
    'PitcherThrows': 'p_throws',
    'HorzBreak': 'pfx_x',
    'InducedVertBreak': 'pfx_z',
    'yt_Efficiency': 'active_spin',
    'RelSpeed': 'release_speed',
    'Extension': 'release_extension',
    'SpinAxis': 'spin_axis',
    'RelSide': 'release_pos_x',
    'RelHeight': 'release_pos_z',
    'Pitcher': 'player_name',
    'AutoPitchType': 'pitch_name',
    
})

fl = fl.dropna(subset=features)

In [60]:
fl.loc[fl['p_throws'] == 'Left', 'pfx_x'] *= -1
fl.loc[fl['p_throws'] == 'Left', 'release_pos_x'] *= -1
fl.loc[fl['p_throws'] == 'Left', 'spin_axis'] = 360 - fl.loc[fl['p_throws'] == 'Left', 'spin_axis']

In [61]:
fastballs = ['Fastball', 'Sinker']
breaking = ['Curveball', 'Slider', 'Cutter']
offspeed = ['Splitter', 'Changeup']

In [62]:
all_players = fl['player_name'].unique()

fastball_data = get_pitch_data(fl, 'Fastball', all_players)
fastball_data = fastball_data.dropna(subset=['release_speed'])
fastball_velo_map = dict(zip(fastball_data['player_name'], fastball_data['release_speed']))
fastball_h_mov_map = dict(zip(fastball_data['player_name'], fastball_data['pfx_x']))
fastball_v_mov_map = dict(zip(fastball_data['player_name'], fastball_data['pfx_z']))

sinker_data = get_pitch_data(fl, 'Sinker', all_players)
sinker_data = sinker_data.dropna(subset=['release_speed'])
sinker_velo_map = dict(zip(sinker_data['player_name'], sinker_data['release_speed']))
sinker_h_mov_map = dict(zip(sinker_data['player_name'], sinker_data['pfx_x']))
sinker_v_mov_map = dict(zip(sinker_data['player_name'], sinker_data['pfx_z']))

cutter_data = get_pitch_data(fl, 'Cutter', all_players)
cutter_data = cutter_data.dropna(subset=['release_speed'])
cutter_velo_map = dict(zip(cutter_data['player_name'], cutter_data['release_speed']))
cutter_h_mov_map = dict(zip(cutter_data['player_name'], cutter_data['pfx_x']))
cutter_v_mov_map = dict(zip(cutter_data['player_name'], cutter_data['pfx_z']))

In [63]:
pitch_data = {
    'Fastball': {
        'velo_map': fastball_velo_map,
        'h_mov_map': fastball_h_mov_map,
        'v_mov_map': fastball_v_mov_map
    },
    'Sinker': {
        'velo_map': sinker_velo_map,
        'h_mov_map': sinker_h_mov_map,
        'v_mov_map': sinker_v_mov_map
    },
    'Cutter': {
        'velo_map': cutter_velo_map,
        'h_mov_map': cutter_h_mov_map,
        'v_mov_map': cutter_v_mov_map
    }
}

def calculate_differences_fl(row):
    player_name = row['player_name']
    for pitch_type in ['Fastball', 'Sinker', 'Cutter']:
        if player_name in pitch_data[pitch_type]['velo_map']:
            velo_dif = row['release_speed'] - pitch_data[pitch_type]['velo_map'][player_name]
            h_mov_dif = row['pfx_x'] - pitch_data[pitch_type]['h_mov_map'][player_name]
            v_mov_dif = row['pfx_z'] - pitch_data[pitch_type]['v_mov_map'][player_name]
            return velo_dif, h_mov_dif, v_mov_dif

    return np.nan, np.nan, np.nan

fl['velo_dif'], fl['h_mov_dif'], fl['v_mov_dif'] = zip(*fl.apply(calculate_differences_fl, axis=1))

In [64]:
fastball_df = fl[fl['pitch_name'].isin(fastballs)]
bb_df = fl[fl['pitch_name'].isin(breaking)]
offs_df = fl[fl['pitch_name'].isin(offspeed)]

In [65]:
values = {
    'home_run': 1.374328827219,
    'triple': 1.05755624961515,
    'double': 0.766083123898271,
    'single': 0.467292970729251,
    'ball': 0.0636883289483747,
    'hit_by_pitch': 0.0636883289483747,
    'blocked_ball': 0.0636883289483747,
    'foul': -0.0380502742575014,
    'foul_tip': -0.0380502742575014,
    'bunt_foul': -0.0380502742575014,
    'bunt_foul_tip': -0.0380502742575014,
    'called_strike': -0.065092516089806,
    'swinging_strike': -0.118124935770601,
    'swinging_strike_blocked': -0.118124935770601,
    'force_out': -0.1955687665555,
    'grounded_into_double_play': -0.1955687665555,
    'fielders_choice_out': -0.1955687665555,
    'fielders_choice': -0.1955687665555,
    'field_out': -0.1955687665555,
    'double_play': -0.1955687665555,
    'sac_fly': -0.236889645519856,
    'field_error': -0.236889645519856,
    'catcher_interf': -0.789788814378052,
    'sac_fly_double_play': -0.789788814378052,
    'triple_play': -0.789788814378052
}

In [66]:
swing_probs = swing_model.predict_proba(fastball_df[features])
take_probs = take_model.predict_proba(fastball_df[features])
ws_probs = will_swing_model.predict_proba(fastball_df[features])
woba_probs = woba_model.predict_proba(fastball_df[features])

fastball_df['whiff_prob'] = swing_probs[:, list(le_swing.inverse_transform(swing_model.classes_)).index('whiff')]
fastball_df['in_play_prob'] = swing_probs[:, list(le_swing.inverse_transform(swing_model.classes_)).index('in_play')]
fastball_df['foul_prob'] = swing_probs[:, list(le_swing.inverse_transform(swing_model.classes_)).index('foul')]

fastball_df['strike_prob'] = take_probs[:, list(le_take.inverse_transform(take_model.classes_)).index('strike')]
fastball_df['ball_prob'] = take_probs[:, list(le_take.inverse_transform(take_model.classes_)).index('ball')]
fastball_df['hbp_prob'] = take_probs[:, list(le_take.inverse_transform(take_model.classes_)).index('hbp')]

fastball_df['single_prob'] = woba_probs[:, list(le_woba.inverse_transform(woba_model.classes_)).index('single')]
fastball_df['double_prob'] = woba_probs[:, list(le_woba.inverse_transform(woba_model.classes_)).index('double')]
fastball_df['triple_prob'] = woba_probs[:, list(le_woba.inverse_transform(woba_model.classes_)).index('triple')]
fastball_df['hr_prob'] = woba_probs[:, list(le_woba.inverse_transform(woba_model.classes_)).index('home_run')]
fastball_df['fo_prob'] = woba_probs[:, list(le_woba.inverse_transform(woba_model.classes_)).index('field_out')]
fastball_df['xwOBAcon'] = values['single'] * fastball_df['single_prob'] + values['double'] * fastball_df['double_prob'] + values['triple'] * fastball_df['triple_prob'] + values['home_run'] * fastball_df['hr_prob'] + values['field_out'] * fastball_df['fo_prob']

fastball_df['swing_prob'] = ws_probs[:, list(will_swing_model.classes_).index('True')]
fastball_df['take_prob'] = 1 - fastball_df['swing_prob'] 

fastball_df['val_swing'] = values['swinging_strike'] * fastball_df['whiff_prob'] + values['foul'] * fastball_df['foul_prob']
+ fastball_df['xwOBAcon'] * fastball_df['in_play_prob']
fastball_df['val_take'] = values['called_strike'] * fastball_df['strike_prob'] + values['ball'] * fastball_df['ball_prob'] + values['hit_by_pitch'] * fastball_df['hbp_prob']
fastball_df['xRV'] = fastball_df['val_swing'] * fastball_df['swing_prob'] + fastball_df['val_take'] * fastball_df['take_prob']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fastball_df['whiff_prob'] = swing_probs[:, list(le_swing.inverse_transform(swing_model.classes_)).index('whiff')]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fastball_df['in_play_prob'] = swing_probs[:, list(le_swing.inverse_transform(swing_model.classes_)).index('in_play')]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#return

In [67]:
swing_bb_probs = swing_bb_model.predict_proba(bb_df[features_with_dif])
take_bb_probs = take_bb_model.predict_proba(bb_df[features_with_dif])
ws_bb_probs = will_swing_bb_model.predict_proba(bb_df[features_with_dif])
woba_bb_probs = woba_bb_model.predict_proba(bb_df[features_with_dif])

bb_df['whiff_prob'] = swing_bb_probs[:, list(le_swing.inverse_transform(swing_bb_model.classes_)).index('whiff')]
bb_df['in_play_prob'] = swing_bb_probs[:, list(le_swing.inverse_transform(swing_bb_model.classes_)).index('in_play')]
bb_df['foul_prob'] = swing_bb_probs[:, list(le_swing.inverse_transform(swing_bb_model.classes_)).index('foul')]

bb_df['strike_prob'] = take_bb_probs[:, list(le_take.inverse_transform(take_bb_model.classes_)).index('strike')]
bb_df['ball_prob'] = take_bb_probs[:, list(le_take.inverse_transform(take_bb_model.classes_)).index('ball')]
bb_df['hbp_prob'] = take_bb_probs[:, list(le_take.inverse_transform(take_bb_model.classes_)).index('hbp')]

bb_df['single_prob'] = woba_bb_probs[:, list(le_woba.inverse_transform(woba_bb_model.classes_)).index('single')]
bb_df['double_prob'] = woba_bb_probs[:, list(le_woba.inverse_transform(woba_bb_model.classes_)).index('double')]
bb_df['triple_prob'] = woba_bb_probs[:, list(le_woba.inverse_transform(woba_bb_model.classes_)).index('triple')]
bb_df['hr_prob'] = woba_bb_probs[:, list(le_woba.inverse_transform(woba_bb_model.classes_)).index('home_run')]
bb_df['fo_prob'] = woba_bb_probs[:, list(le_woba.inverse_transform(woba_bb_model.classes_)).index('field_out')]
bb_df['xwOBAcon'] = values['single'] * bb_df['single_prob'] + values['double'] * bb_df['double_prob'] + values['triple'] * bb_df['triple_prob'] + values['home_run'] * bb_df['hr_prob'] + values['field_out'] * bb_df['fo_prob']

bb_df['swing_prob'] = ws_bb_probs[:, list(will_swing_bb_model.classes_).index('True')]
bb_df['take_prob'] = 1 - bb_df['swing_prob'] 

bb_df['val_swing'] = values['swinging_strike'] * bb_df['whiff_prob'] + values['foul'] * bb_df['foul_prob']
+ bb_df['xwOBAcon'] * bb_df['in_play_prob']
bb_df['val_take'] = values['called_strike'] * bb_df['strike_prob'] + values['ball'] * bb_df['ball_prob'] + values['hit_by_pitch'] * bb_df['hbp_prob']
bb_df['xRV'] = bb_df['val_swing'] * bb_df['swing_prob'] + bb_df['val_take'] * bb_df['take_prob']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bb_df['whiff_prob'] = swing_bb_probs[:, list(le_swing.inverse_transform(swing_bb_model.classes_)).index('whiff')]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bb_df['in_play_prob'] = swing_bb_probs[:, list(le_swing.inverse_transform(swing_bb_model.classes_)).index('in_play')]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#return

In [68]:
swing_offs_probs = swing_offs_model.predict_proba(offs_df[features_with_dif])
take_offs_probs = take_offs_model.predict_proba(offs_df[features_with_dif])
ws_offs_probs = will_swing_offs_model.predict_proba(offs_df[features_with_dif])
woba_offs_probs = woba_offs_model.predict_proba(offs_df[features_with_dif])

offs_df['whiff_prob'] = swing_offs_probs[:, list(le_swing.inverse_transform(swing_offs_model.classes_)).index('whiff')]
offs_df['in_play_prob'] = swing_offs_probs[:, list(le_swing.inverse_transform(swing_offs_model.classes_)).index('in_play')]
offs_df['foul_prob'] = swing_offs_probs[:, list(le_swing.inverse_transform(swing_offs_model.classes_)).index('foul')]

offs_df['strike_prob'] = take_offs_probs[:, list(le_take.inverse_transform(take_offs_model.classes_)).index('strike')]
offs_df['ball_prob'] = take_offs_probs[:, list(le_take.inverse_transform(take_offs_model.classes_)).index('ball')]
offs_df['hbp_prob'] = take_offs_probs[:, list(le_take.inverse_transform(take_offs_model.classes_)).index('hbp')]

offs_df['single_prob'] = woba_offs_probs[:, list(le_woba.inverse_transform(woba_offs_model.classes_)).index('single')]
offs_df['double_prob'] = woba_offs_probs[:, list(le_woba.inverse_transform(woba_offs_model.classes_)).index('double')]
offs_df['triple_prob'] = woba_offs_probs[:, list(le_woba.inverse_transform(woba_offs_model.classes_)).index('triple')]
offs_df['hr_prob'] = woba_offs_probs[:, list(le_woba.inverse_transform(woba_offs_model.classes_)).index('home_run')]
offs_df['fo_prob'] = woba_offs_probs[:, list(le_woba.inverse_transform(woba_offs_model.classes_)).index('field_out')]
offs_df['xwOBAcon'] = values['single'] * offs_df['single_prob'] + values['double'] * offs_df['double_prob'] + values['triple'] * offs_df['triple_prob'] + values['home_run'] * offs_df['hr_prob'] + values['field_out'] * offs_df['fo_prob']

offs_df['swing_prob'] = ws_offs_probs[:, list(will_swing_offs_model.classes_).index('True')]
offs_df['take_prob'] = 1 - offs_df['swing_prob'] 

offs_df['val_swing'] = values['swinging_strike'] * offs_df['whiff_prob'] + values['foul'] * offs_df['foul_prob']
+ offs_df['xwOBAcon'] * offs_df['in_play_prob']
offs_df['val_take'] = values['called_strike'] * offs_df['strike_prob'] + values['ball'] * offs_df['ball_prob'] + values['hit_by_pitch'] * offs_df['hbp_prob']
offs_df['xRV'] = offs_df['val_swing'] * offs_df['swing_prob'] + offs_df['val_take'] * offs_df['take_prob']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  offs_df['whiff_prob'] = swing_offs_probs[:, list(le_swing.inverse_transform(swing_offs_model.classes_)).index('whiff')]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  offs_df['in_play_prob'] = swing_offs_probs[:, list(le_swing.inverse_transform(swing_offs_model.classes_)).index('in_play')]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

In [69]:
total_df = pd.concat([fastball_df, bb_df, offs_df])

In [70]:
def calculate_stuff_plus(pitch_sc, min_num_pitches=100):
    agg_stuff_plus = pitch_sc.groupby(['player_name', 'pitch_name']).agg(
        mean_xrv=('xRV', 'mean'),
        num_pitches=('player_name', 'count'),
        velo=('release_speed', 'mean'),
        vert_break=('pfx_z', 'mean'),
        horz_break=('pfx_x', 'mean')
    ).reset_index()
    
    agg_stuff_plus['xrv_100'] = agg_stuff_plus['mean_xrv'] * 100
    agg_stuff_plus['xrv_100_adj'] = abs(agg_stuff_plus['xrv_100'] - agg_stuff_plus['xrv_100'].max())
    agg_stuff_plus['stuff_plus'] = (agg_stuff_plus['xrv_100_adj'] / agg_stuff_plus['xrv_100_adj'].mean()) * 100
    agg_stuff_plus = agg_stuff_plus.sort_values('stuff_plus', ascending=False)

    agg_stuff_plus = agg_stuff_plus[agg_stuff_plus['num_pitches'] > min_num_pitches]

    return agg_stuff_plus

def calculate_agg_stuff_plus(pitch_sc, min_num_pitches=100):
    agg_stuff_plus = pitch_sc.groupby(['player_name']).agg(
        mean_xrv=('xRV', 'mean'),  
        swing_prob=('swing_prob', 'mean'),
        whiff_prob=('whiff_prob', 'mean'),
        take_prob=('take_prob', 'mean'),
        xwobacon=('xwOBAcon', 'mean'),
        fo_prob=('fo_prob', 'mean'),
        num_pitches=('player_name', 'count'),
        velo=('release_speed', 'mean'),
        vert_break=('pfx_z', 'mean'),
        horz_break=('pfx_x', 'mean')
    ).reset_index()

    agg_stuff_plus['xrv_100'] = agg_stuff_plus['mean_xrv'] * 100
    agg_stuff_plus['xrv_100_adj'] = abs(agg_stuff_plus['xrv_100'] - agg_stuff_plus['xrv_100'].max())
    agg_stuff_plus['stuff_plus'] = (agg_stuff_plus['xrv_100_adj'] / agg_stuff_plus['xrv_100_adj'].mean()) * 100
    agg_stuff_plus = agg_stuff_plus.sort_values('stuff_plus', ascending=False)

    agg_stuff_plus = agg_stuff_plus[agg_stuff_plus['num_pitches'] > min_num_pitches]

    return agg_stuff_plus

agg_total = calculate_agg_stuff_plus(total_df, 500)
agg_fb = calculate_stuff_plus(fastball_df, 50)
agg_bb = calculate_stuff_plus(bb_df, 50)
agg_offs = calculate_stuff_plus(offs_df, 50)

In [71]:
agg_offs

Unnamed: 0,player_name,pitch_name,mean_xrv,num_pitches,velo,vert_break,horz_break,xrv_100,xrv_100_adj,stuff_plus
259,John Baker,Changeup,-0.020205,81,80.618275,12.066410,8.162151,-2.020471,5.126512,150.720708
204,Griffin Baker,Changeup,-0.019666,99,77.695276,11.807158,9.195112,-1.966566,5.072607,149.135892
118,Christian Cosby,Changeup,-0.019494,54,84.734505,10.121728,6.512684,-1.949354,5.055395,148.629864
276,Justin Ferrell,Changeup,-0.019103,83,77.540591,9.686784,15.509793,-1.910337,5.016378,147.482744
135,Cole Davis,Changeup,-0.018244,87,78.758480,7.856818,11.872292,-1.824407,4.930448,144.956391
...,...,...,...,...,...,...,...,...,...,...
280,Justin Showalter,Changeup,0.002685,269,83.927413,4.543962,8.702485,0.268499,2.837542,83.424427
417,Stephen Knapp,Changeup,0.002714,411,76.665637,0.976059,13.190083,0.271363,2.834679,83.340245
168,Derrick Edington,Changeup,0.004833,57,79.993538,5.266897,4.898900,0.483280,2.622761,77.109815
24,Andres Rodriguez,Changeup,0.005346,110,82.677070,4.633478,-2.272826,0.534563,2.571478,75.602098


In [72]:
leaderboard = pd.DataFrame(columns=['Pitcher', 'FB+', 'CB+', 'SI+', 'CUT+', 'SPL+', 'CH+', 'SL+', 'Stuff+'])

for pitcher in agg_total.player_name.unique():
    p = pitcher
    stuff = agg_total.loc[agg_total['player_name'] == pitcher, 'stuff_plus'].values[0]
    
    fb_d = agg_fb[agg_fb['player_name'] == pitcher]
    bb_d = agg_bb[agg_bb['player_name'] == pitcher]
    offs_d = agg_offs[agg_offs['player_name'] == pitcher]
    
    fb_sp = np.nan
    si_sp = np.nan
    
    sl_sp = np.nan
    cb_sp = np.nan
    cut_sp = np.nan
    
    ch_sp = np.nan
    spl_sp = np.nan
    
    if 'Fastball' in fb_d.pitch_name.unique():
        fb_sp = fb_d[fb_d['pitch_name'] == 'Fastball']['stuff_plus'].values[0]
    if 'Sinker' in fb_d.pitch_name.unique():
        si_sp = fb_d[fb_d['pitch_name'] == 'Sinker']['stuff_plus'].values[0]
    
    if 'Curveball' in bb_d.pitch_name.unique():
        cb_sp = bb_d[bb_d['pitch_name'] == 'Curveball']['stuff_plus'].values[0]
    if 'Cutter' in bb_d.pitch_name.unique():
        cut_sp = bb_d[bb_d['pitch_name'] == 'Cutter']['stuff_plus'].values[0]
    if 'Slider' in bb_d.pitch_name.unique():
        sl_sp = bb_d[bb_d['pitch_name'] == 'Slider']['stuff_plus'].values[0]
    
    if 'Changeup' in offs_d.pitch_name.unique():
        ch_sp = offs_d[offs_d['pitch_name'] == 'Changeup']['stuff_plus'].values[0]
    if 'Splitter' in offs_d.pitch_name.unique():
        spl_sp = offs_d[offs_d['pitch_name'] == 'Splitter']['stuff_plus'].values[0]
    
    leaderboard = leaderboard.append({
        'Pitcher': p,
        'FB+': fb_sp,
        'CB+': cb_sp,
        'SI+': si_sp,
        'CUT+': cut_sp,
        'SPL+': spl_sp,
        'CH+': ch_sp,
        'SL+': sl_sp,
        'Stuff+': stuff
    }, ignore_index=True)


  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
 

  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
 

  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
 

  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({


In [73]:
leaderboard.to_csv('../../../Documents/GitHub/Frontier-League-Savant/csvs/stuff+.csv')