In [21]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from pybaseball import statcast, cache
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import joblib
import math
import scipy.stats as stats
cache.enable()

In [22]:
# CSV from call to pybaseball('2021-04-01', '2022-10-31')
sc = pd.read_csv('../statcast 21-22.csv')

In [23]:
sc = sc[(sc['strikes'] < 3) & (sc['balls'] < 4)]

In [24]:
columns = ['player_name', 'p_throws', 'pitch_name', 'stand', 'delta_run_exp', 'release_speed', 'spin_axis', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z', 'balls', 'strikes', 'release_pos_x', 'release_pos_z', 'release_extension', 'description', 'release_spin_rate', 'events']
df = sc[columns]
df = df[df['pitch_name'].isin(['Slider', 'Curveball', '4-Seam Fastball', 'Changeup', 'Sinker',
       'Cutter', 'Knuckle Curve', 'Split-Finger', 'Sweeper'])]

In [25]:
non_features = ['player_name', 'delta_run_exp', 'stand', 'pitch_name', 'p_throws', 'balls', 'strikes', 'spin_axis', 'plate_x', 'plate_z', 'description', 'events', 'release_extension', 'release_pos_x', 'release_pos_z']
features = [column for column in columns if column not in non_features]

In [26]:
sc = sc.dropna(subset=features)

In [27]:
average_extension = df['release_extension'].mean()
# Found this using the added Carter Capps perceived velo divided by his added extension over average
extension_constant = (3.5/.5)
def calculate_perceived_velocity(row):
    adjusted_extension = row['release_extension'] - average_extension
    perceived_velocity = row['release_speed'] + (adjusted_extension * extension_constant)
    return perceived_velocity

df['perceived_velocity'] = df.apply(calculate_perceived_velocity, axis=1)

In [28]:
df['perceived_velocity'].min()

20.37671757482117

In [29]:
player_name_counts = df['player_name'].value_counts()
# Dropping all position players pitching from data
valid_player_names = player_name_counts[player_name_counts >= 50].index.tolist()
df = df[df['player_name'].isin(valid_player_names)]

In [30]:
df['perceived_velocity'].min()

53.87671757482117

In [31]:
df.loc[df['p_throws'] == 'L', 'pfx_x'] *= -1
df.loc[df['p_throws'] == 'L', 'release_pos_x'] *= -1

In [32]:
arm_angles = np.degrees(np.arctan2(df['release_pos_z'], df['release_pos_x']))
df['arm_angle'] = arm_angles
features.append('arm_angle')

In [33]:
features.append('perceived_velocity')
features

['release_speed',
 'pfx_x',
 'pfx_z',
 'release_spin_rate',
 'arm_angle',
 'perceived_velocity']

In [34]:
df['spin_axis_rad'] = df['spin_axis'] * np.pi / 180

df['TSM'] = df['release_spin_rate'] * np.sin(df['spin_axis_rad'])
df['G'] = df['release_spin_rate'] * np.cos(df['spin_axis_rad'])

df['spin_efficiency'] = (df['TSM'] / df['release_spin_rate']) * 100

min_spin_efficiency = df['spin_efficiency'].min(skipna=True)
max_spin_efficiency = df['spin_efficiency'].max(skipna=True)

df['normalized_spin_efficiency'] = (df['spin_efficiency'] - min_spin_efficiency) / (max_spin_efficiency - min_spin_efficiency)

In [35]:
features.append('normalized_spin_efficiency')

In [36]:
df['description'] = np.where(df['description'] == 'hit_into_play', df['events'], df['description'])

In [37]:
run_values = {
    'home_run': 1.374328827219,
    'triple': 1.05755624961515,
    'double': 0.766083122898271,
    'single': 0.467292970729251,
    'ball': 0.0636883289483747,
    'hit_by_pitch': 0.0636883289483747,
    'blocked_ball': 0.0636883289483747,
    'foul': -0.0380502742575014,
    'foul_tip': -0.0380502742575014,
    'bunt_foul': -0.0380502742575014,
    'bunt_foul_tip': -0.0380502742575014,
    'called_strike': -0.065092516089806,
    'swinging_strike': -0.118124935770601,
    'swinging_strike_blocked': -0.118124935770601,
    'force_out': -0.1955687665555,
    'grounded_into_double_play': -0.1955687665555,
    'fielders_choice_out': -0.1955687665555,
    'fielders_choice': -0.1955687665555,
    'field_out': -0.1955687665555,
    'double_play': -0.1955687665555,
    'sac_fly': -0.236889645519856,
    'field_error': -0.236889645519856,
    'catcher_interf': -0.789788814378052,
    'sac_fly_double_play': -0.789788814378052,
    'triple_play': -0.789788814378052
}

df['RV'] = df['description'].map(run_values)

In [38]:
df = df.dropna(subset=['delta_run_exp'])
df = df.dropna(subset=features)

In [40]:
# Pitch breakdowns as per Driveline's model
pitch_types = [['4-Seam Fastball', 'Sinker'], ['Curveball', 'Slider', 'Cutter', 'Knuckle Curve', 'Sweeper'], ['Split-Finger', 'Changeup', 'Forkball']]

param_dist = {
    'n_estimators': stats.randint(100, 1000),
    'max_depth': stats.randint(1, 10),
    'learning_rate': stats.uniform(0.01, 0.3),
    'subsample': stats.uniform(0.6, 0.4),
    'colsample_bytree': stats.uniform(0.6, 0.4),
}

for pitch_type in pitch_types:
    pitch_df = df[df['pitch_name'].isin(pitch_type)]
    
    X = pitch_df[features]
    y = pitch_df['delta_run_exp']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    model = XGBRegressor()
    random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist, n_iter=10, scoring='neg_mean_squared_error', cv=2, random_state=42, n_jobs=-1)
    random_search.fit(X_train, y_train)
    
    best_model = random_search.best_estimator_
    
    y_pred = best_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"Best Parameters: {random_search.best_params_}")
    print(f"RMSE on Test Set: {rmse:.4f}")
    joblib.dump(best_model, f'models/{pitch_type}_optimized_random_stuff.joblib')

Best Parameters: {'colsample_bytree': 0.8832290311184181, 'learning_rate': 0.016175348288740735, 'max_depth': 2, 'n_estimators': 443, 'subsample': 0.9329770563201687}
RMSE on Test Set: 0.2404
Best Parameters: {'colsample_bytree': 0.8832290311184181, 'learning_rate': 0.016175348288740735, 'max_depth': 2, 'n_estimators': 443, 'subsample': 0.9329770563201687}
RMSE on Test Set: 0.2401
Best Parameters: {'colsample_bytree': 0.8832290311184181, 'learning_rate': 0.016175348288740735, 'max_depth': 2, 'n_estimators': 443, 'subsample': 0.9329770563201687}
RMSE on Test Set: 0.2446
