In [1]:
import pandas as pd
import numpy as np
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from pybaseball import statcast, cache
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import joblib
import math
from pybaseball import statcast
import scipy.stats as stats
from catboost import Pool
import optuna
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score, precision_score
import sqlite3
cache.enable()

In [2]:
with sqlite3.connect("../../../Desktop/MLB Statcast.db") as conn:
    sc_23 = pd.read_sql_query("SELECT * FROM statcast_data_2023", conn)
    sc_22 = pd.read_sql_query("SELECT * FROM statcast_data_2022", conn)
    sc_21 = pd.read_sql_query("SELECT * FROM statcast_data_2021", conn)
    sc_20 = pd.read_sql_query("SELECT * FROM statcast_data_2020", conn)
conn.close()

In [3]:
spin_23 = pd.read_csv('../../../Documents/spin_dir_2023.csv').drop(columns=['release_speed'])
spin_22 = pd.read_csv('../../../Documents/spin_dir_2022.csv').drop(columns=['release_speed'])
spin_21 = pd.read_csv('../../../Documents/spin_dir_2021.csv').drop(columns=['release_speed'])
spin_20 = pd.read_csv('../../../Documents/spin_dir_2020.csv').drop(columns=['release_speed'])

In [4]:
sc_23 = pd.merge(sc_23, spin_23, left_on=['player_name', 'pitch_type'], right_on=['last_name, first_name', 'api_pitch_type'], how='left')
sc_22 = pd.merge(sc_22, spin_22, left_on=['player_name', 'pitch_type'], right_on=['last_name, first_name', 'api_pitch_type'], how='left')
sc_21 = pd.merge(sc_21, spin_21, left_on=['player_name', 'pitch_type'], right_on=['last_name, first_name', 'api_pitch_type'], how='left')
sc_20 = pd.merge(sc_20, spin_20, left_on=['player_name', 'pitch_type'], right_on=['last_name, first_name', 'api_pitch_type'], how='left')

In [5]:
total_sc = pd.concat([sc_23, sc_22, sc_21, sc_20])

In [6]:
features = ['release_speed', 'spin_axis', 'active_spin', 'pfx_x', 'pfx_z', 'release_extension', 'release_pos_x', 'release_pos_z', 'balls', 'strikes', 'plate_x', 'plate_z', 'stand']
total_sc = total_sc[total_sc[features].notnull().all(axis=1)]

In [7]:
total_sc = total_sc[(total_sc['balls'] < 4) & (total_sc['strikes'] < 3)]

In [8]:
total_sc.loc[total_sc['p_throws'] == 'L', 'pfx_x'] *= -1
total_sc.loc[total_sc['p_throws'] == 'L', 'release_pos_x'] *= -1
total_sc.loc[total_sc['p_throws'] == 'L', 'spin_axis'] = 360 - total_sc.loc[total_sc['p_throws'] == 'L', 'spin_axis']

In [9]:
total_sc = total_sc[~total_sc['pitch_name'].isin(['Pitch Out', 'Eephus', 'Knuckleball'])]

In [10]:
fastballs = ['4-Seam Fastball', 'Sinker']
offspeed = ['Split-Finger', 'Changeup', 'Forkball']
breaking = ['Curveball', 'Slider', 'Cutter', 'Knuckle Curve', 'Sweeper', 'Screwball', 'Slow Curve']

In [11]:
def get_pitch_data(total_sc, pitch_name, players):
    pitch_data = total_sc.loc[(total_sc['pitch_name'] == pitch_name) & (total_sc['player_name'].isin(players))]
    grouped_data = pitch_data.groupby('player_name').agg({
        'release_speed': 'median',
        'pfx_x': 'median',
        'pfx_z': 'median'
    }).reset_index()
    
    return grouped_data

all_players = total_sc['player_name'].unique()

fastball_data = get_pitch_data(total_sc, '4-Seam Fastball', all_players)
fastball_data = fastball_data.dropna(subset=['release_speed'])
fastball_velo_map = dict(zip(fastball_data['player_name'], fastball_data['release_speed']))
fastball_h_mov_map = dict(zip(fastball_data['player_name'], fastball_data['pfx_x']))
fastball_v_mov_map = dict(zip(fastball_data['player_name'], fastball_data['pfx_z']))

sinker_data = get_pitch_data(total_sc, 'Sinker', all_players)
sinker_data = sinker_data.dropna(subset=['release_speed'])
sinker_velo_map = dict(zip(sinker_data['player_name'], sinker_data['release_speed']))
sinker_h_mov_map = dict(zip(sinker_data['player_name'], sinker_data['pfx_x']))
sinker_v_mov_map = dict(zip(sinker_data['player_name'], sinker_data['pfx_z']))

cutter_data = get_pitch_data(total_sc, 'Cutter', all_players)
cutter_data = cutter_data.dropna(subset=['release_speed'])
cutter_velo_map = dict(zip(cutter_data['player_name'], cutter_data['release_speed']))
cutter_h_mov_map = dict(zip(cutter_data['player_name'], cutter_data['pfx_x']))
cutter_v_mov_map = dict(zip(cutter_data['player_name'], cutter_data['pfx_z']))

In [12]:
pitch_data = {
    '4-Seam Fastball': {
        'velo_map': fastball_velo_map,
        'h_mov_map': fastball_h_mov_map,
        'v_mov_map': fastball_v_mov_map
    },
    'Sinker': {
        'velo_map': sinker_velo_map,
        'h_mov_map': sinker_h_mov_map,
        'v_mov_map': sinker_v_mov_map
    },
    'Cutter': {
        'velo_map': cutter_velo_map,
        'h_mov_map': cutter_h_mov_map,
        'v_mov_map': cutter_v_mov_map
    }
}

def calculate_differences(row):
    player_name = row['player_name']
    for pitch_type in ['4-Seam Fastball', 'Sinker', 'Cutter']:
        if player_name in pitch_data[pitch_type]['velo_map']:
            velo_dif = row['release_speed'] - pitch_data[pitch_type]['velo_map'][player_name]
            h_mov_dif = row['pfx_x'] - pitch_data[pitch_type]['h_mov_map'][player_name]
            v_mov_dif = row['pfx_z'] - pitch_data[pitch_type]['v_mov_map'][player_name]
            return velo_dif, h_mov_dif, v_mov_dif

    return np.nan, np.nan, np.nan

total_sc['velo_dif'], total_sc['h_mov_dif'], total_sc['v_mov_dif'] = zip(*total_sc.apply(calculate_differences, axis=1))

In [13]:
total_sc = total_sc[(total_sc['velo_dif'] <= 0) | (total_sc['pitch_name'].isin(['Sinker', '4-Seam Fastball']))]

In [14]:
features_with_dif = features + ['h_mov_dif', 'v_mov_dif', 'velo_dif']

In [15]:
total_sc['description'] = np.where(total_sc['description'] == 'hit_into_play', total_sc['events'], total_sc['description'])
field_outs = ['force_out', 'grounded_into_double_play', 'fielders_choice_out', 'fielders_choice', 'field_out', 'double_play', 'sac_fly', 'field_error', 'sac_fly_double_play', 'triple_play']
total_sc['description'] = total_sc['description'].replace(field_outs, 'field_out')

In [16]:
total_sc['whiff'] = total_sc['description'].isin(['swinging_strike', 'swinging_strike_blocked'])
total_sc['foul'] = total_sc['description'].isin(['foul', 'foul_tip'])
total_sc['in_play'] = total_sc['description'].isin(['single', 'double', 'triple', 'home_run', 'field_out'])
total_sc['swing'] = (total_sc['whiff'] | total_sc['foul'] | total_sc['in_play'])

total_sc['take'] = (~total_sc['swing'] & (total_sc['description'].isin(['hit_by_pitch', 'ball', 'called_strike', 'blocked_ball'])))
total_sc['hbp'] = total_sc['description'] == 'hit_by_pitch'
total_sc['ball'] = total_sc['description'].isin(['blocked_ball', 'ball'])
total_sc['strike'] = total_sc['description'] == 'called_strike'

total_sc['single'] = total_sc['description'] == 'single'
total_sc['double'] = total_sc['description'] == 'double'
total_sc['triple'] = total_sc['description'] == 'triple'
total_sc['home_run'] = total_sc['description'] == 'home_run'
total_sc['field_out'] = total_sc['description'] == 'field_out'

In [17]:
total_sc.loc[total_sc['swing'] & total_sc['foul'], 'type_swing'] = 'foul'
total_sc.loc[total_sc['swing'] & total_sc['in_play'], 'type_swing'] = 'in_play'
total_sc.loc[total_sc['swing'] & total_sc['whiff'], 'type_swing'] = 'whiff'

total_sc.loc[total_sc['take'] & total_sc['hbp'], 'type_take'] = 'hbp'
total_sc.loc[total_sc['take'] & total_sc['ball'], 'type_take'] = 'ball'
total_sc.loc[total_sc['take'] & total_sc['strike'], 'type_take'] = 'strike'

total_sc.loc[total_sc['in_play'] & total_sc['single'], 'type_in_play'] = 'single'
total_sc.loc[total_sc['in_play'] & total_sc['double'], 'type_in_play'] = 'double'
total_sc.loc[total_sc['in_play'] & total_sc['triple'], 'type_in_play'] = 'triple'
total_sc.loc[total_sc['in_play'] & total_sc['home_run'], 'type_in_play'] = 'home_run'
total_sc.loc[total_sc['in_play'] & total_sc['field_out'], 'type_in_play'] = 'field_out'

In [18]:
total_sc['stand'] = total_sc['stand'].replace({'R': 0, 'L': 1})

In [19]:
total_sc = total_sc.dropna(subset=['swing', 'take'])
total_sc = total_sc[total_sc['swing'] != total_sc['take']]

In [20]:
from sklearn.preprocessing import LabelEncoder
from hyperopt import hp, fmin, tpe

def objective(space, X_train, X_test, y_train, y_test):
    model = XGBClassifier(
        max_depth=int(space['max_depth']),
        gamma=space['gamma'],
        reg_alpha=int(space['reg_alpha']),
        reg_lambda=space['reg_lambda'],
        colsample_bytree=space['colsample_bytree'],
        min_child_weight=int(space['min_child_weight']),
        n_estimators=int(space['n_estimators']))
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return -accuracy

le_swing = LabelEncoder()

swing_df = total_sc[(total_sc['swing']) & total_sc['pitch_name'].isin(fastballs)]
swing_X = swing_df[features]
swing_y = le_swing.fit_transform(swing_df['type_swing'])

X_train, X_test, y_train, y_test = train_test_split(swing_X, swing_y, test_size=0.2, random_state=np.random.seed())

space = {
    'max_depth': hp.quniform("max_depth", 3, 18, 1),
    'gamma': hp.uniform('gamma', 1, 9),
    'reg_alpha': hp.quniform('reg_alpha', 40, 180, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'min_child_weight': hp.quniform('min_child_weight', 0, 10, 1),
    'n_estimators': hp.quniform('n_estimators', 50, 200, 1),
    'seed': 12
}

best_params = fmin(fn=lambda params: objective(params, X_train, X_test, y_train, y_test),
                   space=space,
                   algo=tpe.suggest,
                   max_evals=10)
best_params = {
    'max_depth': int(best_params['max_depth']),
    'gamma': float(best_params['gamma']),
    'reg_alpha': int(best_params['reg_alpha']),
    'reg_lambda': float(best_params['reg_lambda']),
    'colsample_bytree': float(best_params['colsample_bytree']),
    'min_child_weight': int(best_params['min_child_weight']),
    'n_estimators': int(best_params['n_estimators'])
}
print("Best parameters:", best_params)

swing_model = XGBClassifier(**best_params)
swing_model.fit(X_train, y_train)

100%|███████████████████████████████████████████████| 10/10 [08:04<00:00, 48.44s/trial, best loss: -0.5086674853613825]
Best parameters: {'max_depth': 10, 'gamma': 4.623678551692081, 'reg_alpha': 83, 'reg_lambda': 0.08049627182903285, 'colsample_bytree': 0.5843414516101195, 'min_child_weight': 4, 'n_estimators': 132}


In [21]:
swing_df = total_sc[(total_sc['swing']) & total_sc['pitch_name'].isin(breaking)]
swing_X = swing_df[features_with_dif]
swing_y = le_swing.fit_transform(swing_df['type_swing'])

X_train, X_test, y_train, y_test = train_test_split(swing_X, swing_y, test_size=0.2, random_state=np.random.seed())

best_params = fmin(fn=lambda params: objective(params, X_train, X_test, y_train, y_test),
                   space=space,
                   algo=tpe.suggest,
                   max_evals=10)
best_params = {
    'max_depth': int(best_params['max_depth']),
    'gamma': float(best_params['gamma']),
    'reg_alpha': int(best_params['reg_alpha']),
    'reg_lambda': float(best_params['reg_lambda']),
    'colsample_bytree': float(best_params['colsample_bytree']),
    'min_child_weight': int(best_params['min_child_weight']),
    'n_estimators': int(best_params['n_estimators'])
}
print(best_params)

swing_bb_model = XGBClassifier(**best_params)
swing_bb_model.fit(X_train, y_train)

100%|███████████████████████████████████████████████| 10/10 [06:46<00:00, 40.64s/trial, best loss: -0.5307673832112405]
{'max_depth': 17, 'gamma': 1.00662241560602, 'reg_alpha': 70, 'reg_lambda': 0.44090234088909774, 'colsample_bytree': 0.6429557705601436, 'min_child_weight': 8, 'n_estimators': 76}


In [22]:
swing_df = total_sc[(total_sc['swing']) & total_sc['pitch_name'].isin(offspeed)]
swing_X = swing_df[features_with_dif]
swing_y = le_swing.fit_transform(swing_df['type_swing'])

X_train, X_test, y_train, y_test = train_test_split(swing_X, swing_y, test_size=0.2, random_state=np.random.seed())
best_params = fmin(fn=lambda params: objective(params, X_train, X_test, y_train, y_test),
                   space=space,
                   algo=tpe.suggest,
                   max_evals=10)
best_params = {
    'max_depth': int(best_params['max_depth']),
    'gamma': float(best_params['gamma']),
    'reg_alpha': int(best_params['reg_alpha']),
    'reg_lambda': float(best_params['reg_lambda']),
    'colsample_bytree': float(best_params['colsample_bytree']),
    'min_child_weight': int(best_params['min_child_weight']),
    'n_estimators': int(best_params['n_estimators'])
}
print(best_params)

swing_offs_model = XGBClassifier(**best_params)
swing_offs_model.fit(X_train, y_train)

100%|███████████████████████████████████████████████| 10/10 [02:01<00:00, 12.15s/trial, best loss: -0.5052921650904991]
{'max_depth': 6, 'gamma': 3.410078896045137, 'reg_alpha': 49, 'reg_lambda': 0.737991619815394, 'colsample_bytree': 0.9827819121641161, 'min_child_weight': 4, 'n_estimators': 179}


In [23]:
le_take = LabelEncoder()

take_total_sc = total_sc[(total_sc['take']) & total_sc['pitch_name'].isin(fastballs)]
take_X = take_total_sc[features]
take_y = le_take.fit_transform(take_total_sc['type_take'])

X_train, X_test, y_train, y_test = train_test_split(take_X, take_y, test_size=0.2, random_state=np.random.seed())
best_params = fmin(fn=lambda params: objective(params, X_train, X_test, y_train, y_test),
                   space=space,
                   algo=tpe.suggest,
                   max_evals=10)
best_params = {
    'max_depth': int(best_params['max_depth']),
    'gamma': float(best_params['gamma']),
    'reg_alpha': int(best_params['reg_alpha']),
    'reg_lambda': float(best_params['reg_lambda']),
    'colsample_bytree': float(best_params['colsample_bytree']),
    'min_child_weight': int(best_params['min_child_weight']),
    'n_estimators': int(best_params['n_estimators'])
}
print(best_params)

take_model = XGBClassifier(**best_params)
take_model.fit(X_train, y_train)

100%|███████████████████████████████████████████████| 10/10 [05:50<00:00, 35.02s/trial, best loss: -0.9215167618338627]
{'max_depth': 18, 'gamma': 2.804248969594032, 'reg_alpha': 97, 'reg_lambda': 0.9155841176972604, 'colsample_bytree': 0.6932162834009113, 'min_child_weight': 4, 'n_estimators': 126}


In [24]:
take_total_sc = total_sc[(total_sc['take']) & total_sc['pitch_name'].isin(breaking)]
take_X = take_total_sc[features_with_dif]
take_y = le_take.fit_transform(take_total_sc['type_take'])

X_train, X_test, y_train, y_test = train_test_split(take_X, take_y, test_size=0.2, random_state=np.random.seed())
best_params = fmin(fn=lambda params: objective(params, X_train, X_test, y_train, y_test),
                   space=space,
                   algo=tpe.suggest,
                   max_evals=10)
best_params = {
    'max_depth': int(best_params['max_depth']),
    'gamma': float(best_params['gamma']),
    'reg_alpha': int(best_params['reg_alpha']),
    'reg_lambda': float(best_params['reg_lambda']),
    'colsample_bytree': float(best_params['colsample_bytree']),
    'min_child_weight': int(best_params['min_child_weight']),
    'n_estimators': int(best_params['n_estimators'])
}
print(best_params)

take_bb_model = XGBClassifier(**best_params)
take_bb_model.fit(X_train, y_train)

100%|████████████████████████████████████████████████| 10/10 [10:38<00:00, 63.83s/trial, best loss: -0.935957611031145]
{'max_depth': 18, 'gamma': 1.381340143622201, 'reg_alpha': 57, 'reg_lambda': 0.03398594378047526, 'colsample_bytree': 0.9919986225879914, 'min_child_weight': 6, 'n_estimators': 156}


In [25]:
take_total_sc = total_sc[(total_sc['take']) & total_sc['pitch_name'].isin(offspeed)]
take_X = take_total_sc[features_with_dif]
take_y = le_take.fit_transform(take_total_sc['type_take'])

X_train, X_test, y_train, y_test = train_test_split(take_X, take_y, test_size=0.2, random_state=np.random.seed())
best_params = fmin(fn=lambda params: objective(params, X_train, X_test, y_train, y_test),
                   space=space,
                   algo=tpe.suggest,
                   max_evals=10)
best_params = {
    'max_depth': int(best_params['max_depth']),
    'gamma': float(best_params['gamma']),
    'reg_alpha': int(best_params['reg_alpha']),
    'reg_lambda': float(best_params['reg_lambda']),
    'colsample_bytree': float(best_params['colsample_bytree']),
    'min_child_weight': int(best_params['min_child_weight']),
    'n_estimators': int(best_params['n_estimators'])
}
print(best_params)

take_offs_model = XGBClassifier(**best_params)
take_offs_model.fit(X_train, y_train)

100%|███████████████████████████████████████████████| 10/10 [01:56<00:00, 11.68s/trial, best loss: -0.9481645693421138]
{'max_depth': 9, 'gamma': 8.307509067373891, 'reg_alpha': 50, 'reg_lambda': 0.36314336624075627, 'colsample_bytree': 0.8320062056820141, 'min_child_weight': 9, 'n_estimators': 158}


In [26]:
le_woba = LabelEncoder()

woba_total_sc = total_sc[(total_sc['in_play']) & total_sc['pitch_name'].isin(fastballs)]
woba_X = woba_total_sc[features]
woba_y = le_woba.fit_transform(woba_total_sc['type_in_play'])

X_train, X_test, y_train, y_test = train_test_split(woba_X, woba_y, test_size=0.2, random_state=np.random.seed())
best_params = fmin(fn=lambda params: objective(params, X_train, X_test, y_train, y_test),
                   space=space,
                   algo=tpe.suggest,
                   max_evals=10)
best_params = {
    'max_depth': int(best_params['max_depth']),
    'gamma': float(best_params['gamma']),
    'reg_alpha': int(best_params['reg_alpha']),
    'reg_lambda': float(best_params['reg_lambda']),
    'colsample_bytree': float(best_params['colsample_bytree']),
    'min_child_weight': int(best_params['min_child_weight']),
    'n_estimators': int(best_params['n_estimators'])
}
print(best_params)

woba_model = XGBClassifier(**best_params)
woba_model.fit(X_train, y_train)

100%|███████████████████████████████████████████████| 10/10 [03:55<00:00, 23.56s/trial, best loss: -0.6657023311416617]
{'max_depth': 17, 'gamma': 5.577792349392124, 'reg_alpha': 137, 'reg_lambda': 0.9567898790770637, 'colsample_bytree': 0.6814810509040078, 'min_child_weight': 9, 'n_estimators': 89}


In [27]:
woba_total_sc = total_sc[(total_sc['in_play']) & total_sc['pitch_name'].isin(breaking)]
woba_X = woba_total_sc[features_with_dif]
woba_y = le_woba.fit_transform(woba_total_sc['type_in_play'])

X_train, X_test, y_train, y_test = train_test_split(woba_X, woba_y, test_size=0.2, random_state=np.random.seed())
best_params = fmin(fn=lambda params: objective(params, X_train, X_test, y_train, y_test),
                   space=space,
                   algo=tpe.suggest,
                   max_evals=10)
best_params = {
    'max_depth': int(best_params['max_depth']),
    'gamma': float(best_params['gamma']),
    'reg_alpha': int(best_params['reg_alpha']),
    'reg_lambda': float(best_params['reg_lambda']),
    'colsample_bytree': float(best_params['colsample_bytree']),
    'min_child_weight': int(best_params['min_child_weight']),
    'n_estimators': int(best_params['n_estimators'])
}
print(best_params)

woba_bb_model = XGBClassifier(**best_params)
woba_bb_model.fit(X_train, y_train)

100%|███████████████████████████████████████████████| 10/10 [02:26<00:00, 14.64s/trial, best loss: -0.6798015886780672]
{'max_depth': 6, 'gamma': 3.5195667743256003, 'reg_alpha': 159, 'reg_lambda': 0.015469437272467856, 'colsample_bytree': 0.8925867572394106, 'min_child_weight': 6, 'n_estimators': 88}


In [28]:
woba_total_sc = total_sc[(total_sc['in_play']) & total_sc['pitch_name'].isin(offspeed)]
woba_X = woba_total_sc[features_with_dif]
woba_y = le_woba.fit_transform(woba_total_sc['type_in_play'])

X_train, X_test, y_train, y_test = train_test_split(woba_X, woba_y, test_size=0.2, random_state=np.random.seed())
best_params = fmin(fn=lambda params: objective(params, X_train, X_test, y_train, y_test),
                   space=space,
                   algo=tpe.suggest,
                   max_evals=10)
best_params = {
    'max_depth': int(best_params['max_depth']),
    'gamma': float(best_params['gamma']),
    'reg_alpha': int(best_params['reg_alpha']),
    'reg_lambda': float(best_params['reg_lambda']),
    'colsample_bytree': float(best_params['colsample_bytree']),
    'min_child_weight': int(best_params['min_child_weight']),
    'n_estimators': int(best_params['n_estimators'])
}
print(best_params)

woba_offs_model = XGBClassifier(**best_params)
woba_offs_model.fit(X_train, y_train)

100%|███████████████████████████████████████████████| 10/10 [01:09<00:00,  6.96s/trial, best loss: -0.6982479448642365]
{'max_depth': 12, 'gamma': 3.7756761438305704, 'reg_alpha': 53, 'reg_lambda': 0.6282231009914321, 'colsample_bytree': 0.7639917066609407, 'min_child_weight': 6, 'n_estimators': 191}


In [29]:
def will_swing_objective(trial, will_swing_X, will_swing_y):
    X_train, X_test, y_train, y_test = train_test_split(will_swing_X, will_swing_y, test_size=0.2, random_state=np.random.seed())
    
    params = {
        "iterations": trial.suggest_int("iterations", 1000, 2000),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 10),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", .05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100)
    }

    model = CatBoostClassifier(**params, silent=True, thread_count=-1)
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_test)[:, 1]  # Probability of class 1 (swing)
    logloss = log_loss(y_test, y_pred)
    return logloss

will_swing_total_sc = total_sc[total_sc['pitch_name'].isin(fastballs)]
will_swing_X = will_swing_total_sc[features]
will_swing_y = will_swing_total_sc['swing']

study = optuna.create_study(direction='minimize')
study.optimize(lambda trial: will_swing_objective(trial, will_swing_X, will_swing_y), n_trials=5)
best_params = study.best_params
best_ll = study.best_value

print('Best hyperparameters:', best_params)
print('Best logloss:', best_ll)
X_train, X_test, y_train, y_test = train_test_split(will_swing_X, will_swing_y, test_size=0.2, random_state=np.random.seed())

will_swing_model = CatBoostClassifier(**best_params, silent=True)
will_swing_model.fit(X_train, y_train)

[I 2023-12-01 22:02:37,574] A new study created in memory with name: no-name-151aeb93-bcdc-439c-80c9-0f463e530079
[I 2023-12-01 22:07:49,865] Trial 0 finished with value: 0.4467876916097434 and parameters: {'iterations': 1940, 'learning_rate': 0.0036261389080529376, 'depth': 10, 'colsample_bylevel': 0.7865074206714129, 'min_data_in_leaf': 90}. Best is trial 0 with value: 0.4467876916097434.
[I 2023-12-01 22:11:31,746] Trial 1 finished with value: 0.44421830666057216 and parameters: {'iterations': 1806, 'learning_rate': 0.007599162309251593, 'depth': 10, 'colsample_bylevel': 0.2582168369957746, 'min_data_in_leaf': 48}. Best is trial 1 with value: 0.44421830666057216.
[I 2023-12-01 22:14:06,217] Trial 2 finished with value: 0.4950922373798052 and parameters: {'iterations': 1411, 'learning_rate': 0.0011956365063188876, 'depth': 7, 'colsample_bylevel': 0.7807943274193013, 'min_data_in_leaf': 18}. Best is trial 1 with value: 0.44421830666057216.
[I 2023-12-01 22:17:16,276] Trial 3 finished 

Best hyperparameters: {'iterations': 1749, 'learning_rate': 0.08361746174126924, 'depth': 7, 'colsample_bylevel': 0.5890565174542125, 'min_data_in_leaf': 84}
Best logloss: 0.43499456556855126


<catboost.core.CatBoostClassifier at 0x24d55bb31f0>

In [30]:
will_swing_total_sc = total_sc[total_sc['pitch_name'].isin(breaking)]
will_swing_X = will_swing_total_sc[features_with_dif]
will_swing_y = will_swing_total_sc['swing']

study = optuna.create_study(direction='minimize')
study.optimize(lambda trial: will_swing_objective(trial, will_swing_X, will_swing_y), n_trials=5)
best_params = study.best_params
best_ll = study.best_value

print('Best hyperparameters:', best_params)
print('Best logloss:', best_ll)
X_train, X_test, y_train, y_test = train_test_split(will_swing_X, will_swing_y, test_size=0.2, random_state=np.random.seed())

will_swing_bb_model = CatBoostClassifier(**best_params, silent=True)
will_swing_bb_model.fit(X_train, y_train)

[I 2023-12-01 22:24:45,291] A new study created in memory with name: no-name-63349519-0220-4786-a0b1-f75069f26109
[I 2023-12-01 22:25:47,595] Trial 0 finished with value: 0.49560413646007534 and parameters: {'iterations': 1180, 'learning_rate': 0.03260370146673072, 'depth': 6, 'colsample_bylevel': 0.10351417817771218, 'min_data_in_leaf': 65}. Best is trial 0 with value: 0.49560413646007534.
[I 2023-12-01 22:26:56,586] Trial 1 finished with value: 0.6104941953375257 and parameters: {'iterations': 1432, 'learning_rate': 0.0011735148764163506, 'depth': 3, 'colsample_bylevel': 0.1705946904532205, 'min_data_in_leaf': 60}. Best is trial 0 with value: 0.49560413646007534.
[I 2023-12-01 22:28:57,336] Trial 2 finished with value: 0.479211757856249 and parameters: {'iterations': 1399, 'learning_rate': 0.02568889590787668, 'depth': 7, 'colsample_bylevel': 0.8159834864890854, 'min_data_in_leaf': 39}. Best is trial 2 with value: 0.479211757856249.
[I 2023-12-01 22:31:29,484] Trial 3 finished with v

Best hyperparameters: {'iterations': 1836, 'learning_rate': 0.05477836027996815, 'depth': 7, 'colsample_bylevel': 0.48254004512509613, 'min_data_in_leaf': 57}
Best logloss: 0.4778402172596971


<catboost.core.CatBoostClassifier at 0x24d4fee30a0>

In [31]:
will_swing_total_sc = total_sc[total_sc['pitch_name'].isin(offspeed)]
will_swing_X = will_swing_total_sc[features_with_dif]
will_swing_y = will_swing_total_sc['swing']

study = optuna.create_study(direction='minimize')
study.optimize(lambda trial: will_swing_objective(trial, will_swing_X, will_swing_y), n_trials=5)
best_params = study.best_params
best_ll = study.best_value

print('Best hyperparameters:', best_params)
print('Best logloss:', best_ll)
X_train, X_test, y_train, y_test = train_test_split(will_swing_X, will_swing_y, test_size=0.2, random_state=np.random.seed())

will_swing_offs_model = CatBoostClassifier(**best_params, silent=True)
will_swing_offs_model.fit(X_train, y_train)

[I 2023-12-01 22:35:50,533] A new study created in memory with name: no-name-8ed1cf6a-af2b-4422-b1a3-7aa7df5c486f
[I 2023-12-01 22:36:27,224] Trial 0 finished with value: 0.45442804449523255 and parameters: {'iterations': 1865, 'learning_rate': 0.029628228876652327, 'depth': 3, 'colsample_bylevel': 0.4989042153167008, 'min_data_in_leaf': 42}. Best is trial 0 with value: 0.45442804449523255.
[I 2023-12-01 22:37:08,029] Trial 1 finished with value: 0.5173980370301069 and parameters: {'iterations': 1845, 'learning_rate': 0.0010371891771879282, 'depth': 4, 'colsample_bylevel': 0.6281980884377292, 'min_data_in_leaf': 37}. Best is trial 0 with value: 0.45442804449523255.
[I 2023-12-01 22:37:48,712] Trial 2 finished with value: 0.47986399063207286 and parameters: {'iterations': 1471, 'learning_rate': 0.0019976328284288187, 'depth': 6, 'colsample_bylevel': 0.8558296132290387, 'min_data_in_leaf': 87}. Best is trial 0 with value: 0.45442804449523255.
[I 2023-12-01 22:38:25,904] Trial 3 finished 

Best hyperparameters: {'iterations': 1389, 'learning_rate': 0.009829126008006988, 'depth': 6, 'colsample_bylevel': 0.9555922113991216, 'min_data_in_leaf': 26}
Best logloss: 0.4513335919479442


<catboost.core.CatBoostClassifier at 0x24d4efae460>

In [32]:
fl = pd.read_csv('../AutomatedPitchTagging/AutoTaggedCSVs/2023 Frontier League Autotagged.csv')

In [33]:
fl = fl.rename(columns={
    'PitcherThrows': 'p_throws',
    'HorzBreak': 'pfx_x',
    'InducedVertBreak': 'pfx_z',
    'yt_Efficiency': 'active_spin',
    'RelSpeed': 'release_speed',
    'Extension': 'release_extension',
    'SpinAxis': 'spin_axis',
    'RelSide': 'release_pos_x',
    'RelHeight': 'release_pos_z',
    'Pitcher': 'player_name',
    'AutoPitchType': 'pitch_name',
    'Balls': 'balls', 
    'Strikes': 'strikes',
    'PlateLocSide': 'plate_x',
    'PlateLocHeight': 'plate_z',
    'BatterSide': 'stand',
})

fl = fl[(fl['balls'] < 4) & (fl['strikes'] < 3)]
fl['stand'] = fl['stand'].replace({'Right': 0, 'Left': 1, 'Switch': np.where(('p_throws' == 'Right'), 1, 0)})
fl = fl.dropna(subset=features)

In [34]:
fl.loc[fl['p_throws'] == 'Left', 'pfx_x'] *= -1
fl.loc[fl['p_throws'] == 'Left', 'release_pos_x'] *= -1
fl.loc[fl['p_throws'] == 'Left', 'spin_axis'] = 360 - fl.loc[fl['p_throws'] == 'L', 'spin_axis']

In [35]:
fastballs = ['Fastball', 'Sinker']
breaking = ['Curveball', 'Slider', 'Cutter']
offspeed = ['Splitter', 'Changeup']

In [36]:
all_players = fl['player_name'].unique()

fastball_data = get_pitch_data(fl, 'Fastball', all_players)
fastball_data = fastball_data.dropna(subset=['release_speed'])
fastball_velo_map = dict(zip(fastball_data['player_name'], fastball_data['release_speed']))
fastball_h_mov_map = dict(zip(fastball_data['player_name'], fastball_data['pfx_x']))
fastball_v_mov_map = dict(zip(fastball_data['player_name'], fastball_data['pfx_z']))

sinker_data = get_pitch_data(fl, 'Sinker', all_players)
sinker_data = sinker_data.dropna(subset=['release_speed'])
sinker_velo_map = dict(zip(sinker_data['player_name'], sinker_data['release_speed']))
sinker_h_mov_map = dict(zip(sinker_data['player_name'], sinker_data['pfx_x']))
sinker_v_mov_map = dict(zip(sinker_data['player_name'], sinker_data['pfx_z']))

cutter_data = get_pitch_data(fl, 'Cutter', all_players)
cutter_data = cutter_data.dropna(subset=['release_speed'])
cutter_velo_map = dict(zip(cutter_data['player_name'], cutter_data['release_speed']))
cutter_h_mov_map = dict(zip(cutter_data['player_name'], cutter_data['pfx_x']))
cutter_v_mov_map = dict(zip(cutter_data['player_name'], cutter_data['pfx_z']))

In [37]:
fl.stand.value_counts()

0.0    104517
1.0     53217
Name: stand, dtype: int64

In [38]:
pitch_data = {
    'Fastball': {
        'velo_map': fastball_velo_map,
        'h_mov_map': fastball_h_mov_map,
        'v_mov_map': fastball_v_mov_map
    },
    'Sinker': {
        'velo_map': sinker_velo_map,
        'h_mov_map': sinker_h_mov_map,
        'v_mov_map': sinker_v_mov_map
    },
    'Cutter': {
        'velo_map': cutter_velo_map,
        'h_mov_map': cutter_h_mov_map,
        'v_mov_map': cutter_v_mov_map
    }
}

def calculate_differences_fl(row):
    player_name = row['player_name']
    for pitch_type in ['Fastball', 'Sinker', 'Cutter']:
        if player_name in pitch_data[pitch_type]['velo_map']:
            velo_dif = row['release_speed'] - pitch_data[pitch_type]['velo_map'][player_name]
            h_mov_dif = row['pfx_x'] - pitch_data[pitch_type]['h_mov_map'][player_name]
            v_mov_dif = row['pfx_z'] - pitch_data[pitch_type]['v_mov_map'][player_name]
            return velo_dif, h_mov_dif, v_mov_dif

    return np.nan, np.nan, np.nan

fl['velo_dif'], fl['h_mov_dif'], fl['v_mov_dif'] = zip(*fl.apply(calculate_differences_fl, axis=1))

In [39]:
fastball_df = fl[fl['pitch_name'].isin(fastballs)]
bb_df = fl[fl['pitch_name'].isin(breaking)]
offs_df = fl[fl['pitch_name'].isin(offspeed)]

In [40]:
values = {
    'home_run': 1.374328827219,
    'triple': 1.05755624961515,
    'double': 0.766083123898271,
    'single': 0.467292970729251,
    'ball': 0.0636883289483747,
    'hit_by_pitch': 0.0636883289483747,
    'blocked_ball': 0.0636883289483747,
    'foul': -0.0380502742575014,
    'foul_tip': -0.0380502742575014,
    'bunt_foul': -0.0380502742575014,
    'bunt_foul_tip': -0.0380502742575014,
    'called_strike': -0.065092516089806,
    'swinging_strike': -0.118124935770601,
    'swinging_strike_blocked': -0.118124935770601,
    'force_out': -0.1955687665555,
    'grounded_into_double_play': -0.1955687665555,
    'fielders_choice_out': -0.1955687665555,
    'fielders_choice': -0.1955687665555,
    'field_out': -0.1955687665555,
    'double_play': -0.1955687665555,
    'sac_fly': -0.236889645519856,
    'field_error': -0.236889645519856,
    'catcher_interf': -0.789788814378052,
    'sac_fly_double_play': -0.789788814378052,
    'triple_play': -0.789788814378052
}

In [41]:
swing_probs = swing_model.predict_proba(fastball_df[features])
take_probs = take_model.predict_proba(fastball_df[features])
ws_probs = will_swing_model.predict_proba(fastball_df[features])
woba_probs = woba_model.predict_proba(fastball_df[features])

fastball_df['whiff_prob'] = swing_probs[:, list(le_swing.inverse_transform(swing_model.classes_)).index('whiff')]
fastball_df['in_play_prob'] = swing_probs[:, list(le_swing.inverse_transform(swing_model.classes_)).index('in_play')]
fastball_df['foul_prob'] = swing_probs[:, list(le_swing.inverse_transform(swing_model.classes_)).index('foul')]

fastball_df['strike_prob'] = take_probs[:, list(le_take.inverse_transform(take_model.classes_)).index('strike')]
fastball_df['ball_prob'] = take_probs[:, list(le_take.inverse_transform(take_model.classes_)).index('ball')]
fastball_df['hbp_prob'] = take_probs[:, list(le_take.inverse_transform(take_model.classes_)).index('hbp')]

fastball_df['single_prob'] = woba_probs[:, list(le_woba.inverse_transform(woba_model.classes_)).index('single')]
fastball_df['double_prob'] = woba_probs[:, list(le_woba.inverse_transform(woba_model.classes_)).index('double')]
fastball_df['triple_prob'] = woba_probs[:, list(le_woba.inverse_transform(woba_model.classes_)).index('triple')]
fastball_df['hr_prob'] = woba_probs[:, list(le_woba.inverse_transform(woba_model.classes_)).index('home_run')]
fastball_df['fo_prob'] = woba_probs[:, list(le_woba.inverse_transform(woba_model.classes_)).index('field_out')]
fastball_df['xwOBAcon'] = values['single'] * fastball_df['single_prob'] + values['double'] * fastball_df['double_prob'] + values['triple'] * fastball_df['triple_prob'] + values['home_run'] * fastball_df['hr_prob'] + values['field_out'] * fastball_df['fo_prob']

fastball_df['swing_prob'] = ws_probs[:, list(will_swing_model.classes_).index('True')]
fastball_df['take_prob'] = 1 - fastball_df['swing_prob'] 

fastball_df['val_swing'] = values['swinging_strike'] * fastball_df['whiff_prob'] + values['foul'] * fastball_df['foul_prob']
+ fastball_df['xwOBAcon'] * fastball_df['in_play_prob']
fastball_df['val_take'] = values['called_strike'] * fastball_df['strike_prob'] + values['ball'] * fastball_df['ball_prob'] + values['hit_by_pitch'] * fastball_df['hbp_prob']
fastball_df['xRV'] = fastball_df['val_swing'] * fastball_df['swing_prob'] + fastball_df['val_take'] * fastball_df['take_prob']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fastball_df['whiff_prob'] = swing_probs[:, list(le_swing.inverse_transform(swing_model.classes_)).index('whiff')]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fastball_df['in_play_prob'] = swing_probs[:, list(le_swing.inverse_transform(swing_model.classes_)).index('in_play')]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#return

In [42]:
swing_bb_probs = swing_bb_model.predict_proba(bb_df[features_with_dif])
take_bb_probs = take_bb_model.predict_proba(bb_df[features_with_dif])
ws_bb_probs = will_swing_bb_model.predict_proba(bb_df[features_with_dif])
woba_bb_probs = woba_bb_model.predict_proba(bb_df[features_with_dif])

bb_df['whiff_prob'] = swing_bb_probs[:, list(le_swing.inverse_transform(swing_bb_model.classes_)).index('whiff')]
bb_df['in_play_prob'] = swing_bb_probs[:, list(le_swing.inverse_transform(swing_bb_model.classes_)).index('in_play')]
bb_df['foul_prob'] = swing_bb_probs[:, list(le_swing.inverse_transform(swing_bb_model.classes_)).index('foul')]

bb_df['strike_prob'] = take_bb_probs[:, list(le_take.inverse_transform(take_bb_model.classes_)).index('strike')]
bb_df['ball_prob'] = take_bb_probs[:, list(le_take.inverse_transform(take_bb_model.classes_)).index('ball')]
bb_df['hbp_prob'] = take_bb_probs[:, list(le_take.inverse_transform(take_bb_model.classes_)).index('hbp')]

bb_df['single_prob'] = woba_bb_probs[:, list(le_woba.inverse_transform(woba_bb_model.classes_)).index('single')]
bb_df['double_prob'] = woba_bb_probs[:, list(le_woba.inverse_transform(woba_bb_model.classes_)).index('double')]
bb_df['triple_prob'] = woba_bb_probs[:, list(le_woba.inverse_transform(woba_bb_model.classes_)).index('triple')]
bb_df['hr_prob'] = woba_bb_probs[:, list(le_woba.inverse_transform(woba_bb_model.classes_)).index('home_run')]
bb_df['fo_prob'] = woba_bb_probs[:, list(le_woba.inverse_transform(woba_bb_model.classes_)).index('field_out')]
bb_df['xwOBAcon'] = values['single'] * bb_df['single_prob'] + values['double'] * bb_df['double_prob'] + values['triple'] * bb_df['triple_prob'] + values['home_run'] * bb_df['hr_prob'] + values['field_out'] * bb_df['fo_prob']

bb_df['swing_prob'] = ws_bb_probs[:, list(will_swing_bb_model.classes_).index('True')]
bb_df['take_prob'] = 1 - bb_df['swing_prob'] 

bb_df['val_swing'] = values['swinging_strike'] * bb_df['whiff_prob'] + values['foul'] * bb_df['foul_prob']
+ bb_df['xwOBAcon'] * bb_df['in_play_prob']
bb_df['val_take'] = values['called_strike'] * bb_df['strike_prob'] + values['ball'] * bb_df['ball_prob'] + values['hit_by_pitch'] * bb_df['hbp_prob']
bb_df['xRV'] = bb_df['val_swing'] * bb_df['swing_prob'] + bb_df['val_take'] * bb_df['take_prob']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bb_df['whiff_prob'] = swing_bb_probs[:, list(le_swing.inverse_transform(swing_bb_model.classes_)).index('whiff')]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bb_df['in_play_prob'] = swing_bb_probs[:, list(le_swing.inverse_transform(swing_bb_model.classes_)).index('in_play')]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#return

In [43]:
swing_offs_probs = swing_offs_model.predict_proba(offs_df[features_with_dif])
take_offs_probs = take_offs_model.predict_proba(offs_df[features_with_dif])
ws_offs_probs = will_swing_offs_model.predict_proba(offs_df[features_with_dif])
woba_offs_probs = woba_offs_model.predict_proba(offs_df[features_with_dif])

offs_df['whiff_prob'] = swing_offs_probs[:, list(le_swing.inverse_transform(swing_offs_model.classes_)).index('whiff')]
offs_df['in_play_prob'] = swing_offs_probs[:, list(le_swing.inverse_transform(swing_offs_model.classes_)).index('in_play')]
offs_df['foul_prob'] = swing_offs_probs[:, list(le_swing.inverse_transform(swing_offs_model.classes_)).index('foul')]

offs_df['strike_prob'] = take_offs_probs[:, list(le_take.inverse_transform(take_offs_model.classes_)).index('strike')]
offs_df['ball_prob'] = take_offs_probs[:, list(le_take.inverse_transform(take_offs_model.classes_)).index('ball')]
offs_df['hbp_prob'] = take_offs_probs[:, list(le_take.inverse_transform(take_offs_model.classes_)).index('hbp')]

offs_df['single_prob'] = woba_offs_probs[:, list(le_woba.inverse_transform(woba_offs_model.classes_)).index('single')]
offs_df['double_prob'] = woba_offs_probs[:, list(le_woba.inverse_transform(woba_offs_model.classes_)).index('double')]
offs_df['triple_prob'] = woba_offs_probs[:, list(le_woba.inverse_transform(woba_offs_model.classes_)).index('triple')]
offs_df['hr_prob'] = woba_offs_probs[:, list(le_woba.inverse_transform(woba_offs_model.classes_)).index('home_run')]
offs_df['fo_prob'] = woba_offs_probs[:, list(le_woba.inverse_transform(woba_offs_model.classes_)).index('field_out')]
offs_df['xwOBAcon'] = values['single'] * offs_df['single_prob'] + values['double'] * offs_df['double_prob'] + values['triple'] * offs_df['triple_prob'] + values['home_run'] * offs_df['hr_prob'] + values['field_out'] * offs_df['fo_prob']

offs_df['swing_prob'] = ws_offs_probs[:, list(will_swing_offs_model.classes_).index('True')]
offs_df['take_prob'] = 1 - offs_df['swing_prob'] 

offs_df['val_swing'] = values['swinging_strike'] * offs_df['whiff_prob'] + values['foul'] * offs_df['foul_prob']
+ offs_df['xwOBAcon'] * offs_df['in_play_prob']
offs_df['val_take'] = values['called_strike'] * offs_df['strike_prob'] + values['ball'] * offs_df['ball_prob'] + values['hit_by_pitch'] * offs_df['hbp_prob']
offs_df['xRV'] = offs_df['val_swing'] * offs_df['swing_prob'] + offs_df['val_take'] * offs_df['take_prob']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  offs_df['whiff_prob'] = swing_offs_probs[:, list(le_swing.inverse_transform(swing_offs_model.classes_)).index('whiff')]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  offs_df['in_play_prob'] = swing_offs_probs[:, list(le_swing.inverse_transform(swing_offs_model.classes_)).index('in_play')]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

In [44]:
total_df = pd.concat([fastball_df, bb_df, offs_df])

In [45]:
def calculate_pitching_plus(pitch_sc, min_num_pitches=100):
    agg_pitching_plus = pitch_sc.groupby(['player_name', 'pitch_name']).agg(
        mean_xrv=('xRV', 'mean'),
        num_pitches=('player_name', 'count'),
        velo=('release_speed', 'mean'),
        vert_break=('pfx_z', 'mean'),
        horz_break=('pfx_x', 'mean')
    ).reset_index()
    
    agg_pitching_plus['xrv_100'] = agg_pitching_plus['mean_xrv'] * 100
    agg_pitching_plus['xrv_100_adj'] = abs(agg_pitching_plus['xrv_100'] - agg_pitching_plus['xrv_100'].max())
    agg_pitching_plus['pitching_plus'] = (agg_pitching_plus['xrv_100_adj'] / agg_pitching_plus['xrv_100_adj'].mean()) * 100
    agg_pitching_plus = agg_pitching_plus.sort_values('pitching_plus', ascending=False)

    agg_pitching_plus = agg_pitching_plus[agg_pitching_plus['num_pitches'] > min_num_pitches]

    return agg_pitching_plus

def calculate_agg_pitching_plus(pitch_sc, min_num_pitches=100):
    agg_pitching_plus = pitch_sc.groupby(['player_name']).agg(
        mean_xrv=('xRV', 'mean'),  
        swing_prob=('swing_prob', 'mean'),
        whiff_prob=('whiff_prob', 'mean'),
        take_prob=('take_prob', 'mean'),
        xwobacon=('xwOBAcon', 'mean'),
        fo_prob=('fo_prob', 'mean'),
        num_pitches=('player_name', 'count'),
        velo=('release_speed', 'mean'),
        vert_break=('pfx_z', 'mean'),
        horz_break=('pfx_x', 'mean')
    ).reset_index()

    agg_pitching_plus['xrv_100'] = agg_pitching_plus['mean_xrv'] * 100
    agg_pitching_plus['xrv_100_adj'] = abs(agg_pitching_plus['xrv_100'] - agg_pitching_plus['xrv_100'].max())
    agg_pitching_plus['pitching_plus'] = (agg_pitching_plus['xrv_100_adj'] / agg_pitching_plus['xrv_100_adj'].mean()) * 100
    agg_pitching_plus = agg_pitching_plus.sort_values('pitching_plus', ascending=False)

    agg_pitching_plus = agg_pitching_plus[agg_pitching_plus['num_pitches'] > min_num_pitches]

    return agg_pitching_plus

agg_total = calculate_agg_pitching_plus(total_df, 500)
agg_fb = calculate_pitching_plus(fastball_df, 50)
agg_bb = calculate_pitching_plus(bb_df, 50)
agg_offs = calculate_pitching_plus(offs_df, 50)

In [46]:
agg_total

Unnamed: 0,player_name,mean_xrv,swing_prob,whiff_prob,take_prob,xwobacon,fo_prob,num_pitches,velo,vert_break,horz_break,xrv_100,xrv_100_adj,pitching_plus
349,Parker Brahms,-0.023280,0.642700,0.315128,0.357300,0.082602,0.690831,524,84.996999,11.001383,4.993150,-2.327979,8.114615,120.265471
7,Abdiel Saldana,-0.021748,0.645252,0.307291,0.354748,0.088970,0.684935,949,88.028554,8.552360,0.062278,-2.174832,7.961468,117.995695
54,Brac Warren,-0.019527,0.601462,0.377879,0.398538,0.083678,0.690542,943,90.988504,9.974550,5.012292,-1.952667,7.739303,114.703028
372,Ruben Ramirez,-0.019429,0.609881,0.328113,0.390119,0.083516,0.689294,1037,85.889197,6.856246,2.306872,-1.942920,7.729556,114.558566
417,Turner Larkins,-0.018835,0.612003,0.325718,0.387997,0.082148,0.690954,1264,83.651108,10.557290,2.928001,-1.883518,7.670154,113.678186
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154,Edgar Martinez,-0.001031,0.431641,0.372832,0.568359,0.077108,0.692410,1513,85.491474,8.536699,7.029612,-0.103054,5.889691,87.290204
205,Jack Dellinger,0.000221,0.474627,0.343436,0.525373,0.080106,0.694498,812,84.475831,8.730953,5.499259,0.022087,5.764549,85.435500
251,Jose Ledesma jr.,0.000418,0.455488,0.341888,0.544512,0.081080,0.690948,719,89.696082,7.622582,5.454237,0.041780,5.744857,85.143641
193,Griffin Baker,0.000812,0.394474,0.312380,0.605526,0.080997,0.693103,562,83.845650,13.660748,6.200636,0.081210,5.705426,84.559252


In [47]:
agg_offs

Unnamed: 0,player_name,pitch_name,mean_xrv,num_pitches,velo,vert_break,horz_break,xrv_100,xrv_100_adj,pitching_plus
88,Brooks Walton,Changeup,-0.021960,110,77.336493,8.622382,9.355163,-2.195971,8.516127,144.263677
482,Zac Westcott,Changeup,-0.015237,422,75.575884,6.727359,15.258970,-1.523684,7.843839,132.875082
205,Harley Gollert,Changeup,-0.012399,288,77.789406,7.405029,7.733486,-1.239941,7.560096,128.068462
118,Christian Cosby,Changeup,-0.012209,54,84.734505,10.121728,6.512684,-1.220940,7.541095,127.746584
213,Hunter Hoopes,Changeup,-0.010427,71,79.869784,4.373005,11.040000,-1.042703,7.362859,124.727245
...,...,...,...,...,...,...,...,...,...,...
219,Jack Dellinger,Changeup,0.017932,141,79.910829,5.406671,10.278883,1.793189,4.526967,76.687070
109,Carter Poiry,Changeup,0.018302,120,81.061181,1.685963,9.076248,1.830190,4.489965,76.060264
257,Joey Gonzalez,Changeup,0.019291,281,82.100226,5.878463,14.390047,1.929138,4.391017,74.384080
426,Taylor Sugg,Changeup,0.020307,61,82.783427,7.832408,12.753259,2.030682,4.289473,72.663921


In [48]:
import pandas as pd

leaderboard = pd.DataFrame(columns=['Pitcher', 'FB+', 'CB+', 'SI+', 'CUT+', 'SPL+', 'CH+', 'SL+', 'Pitching+'])

for pitcher in agg_total.player_name.unique():
    p = pitcher
    pitching = agg_total.loc[agg_total['player_name'] == pitcher, 'pitching_plus'].values[0]
    
    fb_d = agg_fb[agg_fb['player_name'] == pitcher]
    bb_d = agg_bb[agg_bb['player_name'] == pitcher]
    offs_d = agg_offs[agg_offs['player_name'] == pitcher]
    
    fb_sp = np.nan
    si_sp = np.nan
    
    sl_sp = np.nan
    cb_sp = np.nan
    cut_sp = np.nan
    
    ch_sp = np.nan
    spl_sp = np.nan
    
    if 'Fastball' in fb_d.pitch_name.unique():
        fb_sp = fb_d[fb_d['pitch_name'] == 'Fastball']['pitching_plus'].values[0]
    if 'Sinker' in fb_d.pitch_name.unique():
        si_sp = fb_d[fb_d['pitch_name'] == 'Sinker']['pitching_plus'].values[0]
    
    if 'Curveball' in bb_d.pitch_name.unique():
        cb_sp = bb_d[bb_d['pitch_name'] == 'Curveball']['pitching_plus'].values[0]
    if 'Cutter' in bb_d.pitch_name.unique():
        cut_sp = bb_d[bb_d['pitch_name'] == 'Cutter']['pitching_plus'].values[0]
    if 'Slider' in bb_d.pitch_name.unique():
        sl_sp = bb_d[bb_d['pitch_name'] == 'Slider']['pitching_plus'].values[0]
    
    if 'Changeup' in offs_d.pitch_name.unique():
        ch_sp = offs_d[offs_d['pitch_name'] == 'Changeup']['pitching_plus'].values[0]
    if 'Splitter' in offs_d.pitch_name.unique():
        spl_sp = offs_d[offs_d['pitch_name'] == 'Splitter']['pitching_plus'].values[0]
    
    leaderboard = leaderboard.append({
        'Pitcher': p,
        'FB+': fb_sp,
        'CB+': cb_sp,
        'SI+': si_sp,
        'CUT+': cut_sp,
        'SPL+': spl_sp,
        'CH+': ch_sp,
        'SL+': sl_sp,
        'Pitching+': pitching
    }, ignore_index=True)

  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
 

  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
 

  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
  leaderboard = leaderboard.append({
 

In [49]:
leaderboard.to_csv('../../../Documents/GitHub/Frontier-League-Savant/csvs/pitching+.csv')

In [50]:
leaderboard

Unnamed: 0,Pitcher,FB+,CB+,SI+,CUT+,SPL+,CH+,SL+,Pitching+
0,Parker Brahms,123.641989,118.945542,,,,105.154344,,120.265471
1,Abdiel Saldana,117.383393,,,,,,118.367991,117.995695
2,Brac Warren,117.731336,,,,,,99.469038,114.703028
3,Ruben Ramirez,114.950469,,,,,112.206966,124.221966,114.558566
4,Turner Larkins,124.96954,,,,,104.5957,99.324275,113.678186
...,...,...,...,...,...,...,...,...,...
113,Edgar Martinez,92.625432,93.840438,,,,84.641012,,87.290204
114,Jack Dellinger,92.461375,,,,,76.68707,87.017372,85.4355
115,Jose Ledesma jr.,90.054751,88.043131,,,,88.713494,,85.143641
116,Griffin Baker,86.636382,,,,,104.790232,,84.559252
