In [27]:
# Import packages
import pandas as pd
import numpy as np
import xgboost as xgb
from lightgbm import LGBMRegressor
import optuna
import seaborn as sns
import matplotlib.pyplot as plt

In [28]:
# To Do
# -Load in data
# -Create training/cv/test split. Test should likely be in chronological order to avoid any data leakage
# -List of variables for model, both pitch char and non pitch char models
# -Fit initial XGBoost model and take a peek at results to see if things are looking okay
# -Implememt optuna

In [47]:
full_pitch_df = pd.read_csv("final_pitch_data_new.csv").drop(columns = ["Unnamed: 0"])

In [48]:
full_pitch_df.head()

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,events_in_play,batted_ball_type,fast_pitch_avg_velo,delta_diff,rv,tto,batter_right,p_throws_right,description3,rv2
0,FF,2022-04-26,90.6,-1.94,6.53,"McKenzie, Triston",621493,663474,,called_strike,...,out,,92.251064,0.002177,-0.083518,1,1,1,strike,-0.083516
1,FF,2022-04-26,92.6,-1.9,6.39,"McKenzie, Triston",621493,663474,,called_strike,...,out,,92.251064,0.018665,-0.083518,1,1,1,strike,-0.083516
2,FF,2022-04-26,93.0,-2.04,6.42,"McKenzie, Triston",621493,663474,,foul,...,out,fb,92.251064,0.0,0.0,1,1,1,fb,0.025442
3,FF,2022-04-26,92.2,-1.97,6.43,"McKenzie, Triston",621493,663474,,foul,...,out,pu,92.251064,0.0,0.0,1,1,1,pu,-0.100858
4,CU,2022-04-26,78.8,-2.14,6.48,"McKenzie, Triston",621493,663474,field_out,hit_into_play,...,out,fb,92.251064,0.020345,-0.226302,1,1,1,fb,0.025442


In [49]:
pitch_char_list = ["release_speed", "release_pos_x", "release_pos_z", "release_extension", "pfx_x", "pfx_z", 
                   "release_spin_rate", "fast_pitch_avg_velo"]
location_list = ["plate_x", "plate_z"]
context_list = ["balls", "strikes", "batter_right", "p_throws_right", "sz_top", "sz_bot", "tto"]
# context_list = ["balls", "strikes", "outs_when_up", "batter_right", "p_throws_right", "runner1b", "runner2b", "runner3b", 
#                 "sz_top", "sz_bot", "tto"]

In [50]:
# # Turn categorical variables (stand and p_throws) into binary variables
# full_pitch_df["batter_right"] = (full_pitch_df["stand"] == "R").replace({True: 1, False: 0})
# full_pitch_df["p_throws_right"] = (full_pitch_df["p_throws"] == "R").replace({True: 1, False: 0})
# #full_pitch_df[context_list]

In [51]:
# # One last data filter. No 3 strike or 4 ball pitches
# full_pitch_df = full_pitch_df[(full_pitch_df["balls"] != 4) & (full_pitch_df["strikes"] != 3)]

In [52]:
# Sort for train/cv/test split
full_pitch_df = full_pitch_df.sort_values(["game_date", "game_pk", "pitcher", "at_bat_number", "pitch_number"])

In [53]:
# Split by pitch grouping

# Fastball pitches
fb_pitch_type = ["SI", "FC", "FF"]

# Offspeed pitches
ofs_pitch_type = ["FS", "CH", "EP", "FA", "KN"]

# Breaking ball pitches
bb_pitch_type = ["SL", "ST", "CU", "KC", "CS"]

fb_df = full_pitch_df[full_pitch_df["pitch_type"].isin(fb_pitch_type) & (full_pitch_df["p_throws_right"] == 1) & (full_pitch_df["batter_right"] == 1)]
ofs_df = full_pitch_df[full_pitch_df["pitch_type"].isin(ofs_pitch_type)]
bb_df = full_pitch_df[full_pitch_df["pitch_type"].isin(bb_pitch_type)]


In [54]:
# Start with fastball model
fb_train_df = fb_df.iloc[0:round(len(fb_df) * 0.64), :]
fb_cv_df = fb_df.iloc[(round(len(fb_df) * 0.64)):round(len(fb_df) * 0.8), :]
fb_test_df = fb_df.iloc[(round(len(fb_df) * 0.8)):, :]
(len(fb_train_df) + len(fb_cv_df) + len(fb_test_df)) == len(fb_df)

True

In [71]:
# Establish hyperparameter range
def fb_objective(trial):
    
    xgb_fit = xgb.XGBRegressor(n_estimators = int(trial.suggest_float("n_estimators", 10, 500, step = 1)), 
                               max_depth = int(trial.suggest_float("max_depth", 3, 10, step = 1)),
                               learning_rate = trial.suggest_float("learning_rate", 0.005, 0.3), 
                               subsample = trial.suggest_float("subsample", 0.6, 1),
                               tree_method = "hist",
                               alpha = trial.suggest_float("alpha", 0, 1))
    
    xgb_fit.fit(fb_train_df[pitch_char_list + location_list + context_list], fb_train_df["rv2"])
    
    pred = xgb_fit.predict(fb_cv_df[pitch_char_list + location_list + context_list])
    fb_cv_df.loc[:,"pred"] = pred
    return((np.mean((pred - fb_cv_df["rv2"])**2))**0.5)
    

In [72]:
(np.mean((fb_cv_df["rv2"].mean() - fb_cv_df["rv"])**2))**0.5


0.20244387332178093

In [73]:
fb_train_df["rv2"].isna().sum()

0

In [74]:
import time
start_time = time.time()
fb_study = optuna.create_study()
 # The optimization finishes after evaluating 1000 times or 3 seconds.
fb_study.optimize(fb_objective, n_trials = 100)
print(time.time() - start_time)


[I 2024-01-05 11:31:15,040] A new study created in memory with name: no-name-386b2b20-0181-497b-ae77-1d6c44935656
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fb_cv_df.loc[:,"pred"] = pred
[I 2024-01-05 11:31:23,151] Trial 0 finished with value: 0.07368556801451731 and parameters: {'n_estimators': 464.0, 'max_depth': 3.0, 'learning_rate': 0.2653239806895287, 'subsample': 0.8010216849977206, 'alpha': 0.875344741142457}. Best is trial 0 with value: 0.07368556801451731.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fb_cv_df.loc[:,"pred"] = pred
[I 2024-01-05

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fb_cv_df.loc[:,"pred"] = pred
[I 2024-01-05 11:33:36,119] Trial 12 finished with value: 0.07330777861474197 and parameters: {'n_estimators': 105.0, 'max_depth': 5.0, 'learning_rate': 0.12223310597225667, 'subsample': 0.9973827464882663, 'alpha': 0.6949673112919974}. Best is trial 12 with value: 0.07330777861474197.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fb_cv_df.loc[:,"pred"] = pred
[I 2024-01-05 11:33:37,955] Trial 13 finished with value: 0.07354184625648381 and parameters: {'n_estimators': 55.0, 'max_d

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fb_cv_df.loc[:,"pred"] = pred
[I 2024-01-05 11:35:07,017] Trial 25 finished with value: 0.07325097471426249 and parameters: {'n_estimators': 151.0, 'max_depth': 9.0, 'learning_rate': 0.07119217155370111, 'subsample': 0.849486984809957, 'alpha': 0.4521057465391088}. Best is trial 18 with value: 0.07314362435895878.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fb_cv_df.loc[:,"pred"] = pred
[I 2024-01-05 11:35:18,121] Trial 26 finished with value: 0.07349070180577065 and parameters: {'n_estimators': 121.0, 'max_d

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fb_cv_df.loc[:,"pred"] = pred
[I 2024-01-05 11:36:22,630] Trial 38 finished with value: 0.07355505609644987 and parameters: {'n_estimators': 75.0, 'max_depth': 7.0, 'learning_rate': 0.05855763691858459, 'subsample': 0.9077949244704917, 'alpha': 0.22120335954460954}. Best is trial 18 with value: 0.07314362435895878.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fb_cv_df.loc[:,"pred"] = pred
[I 2024-01-05 11:36:28,739] Trial 39 finished with value: 0.1944883597722777 and parameters: {'n_estimators': 166.0, 'max_d

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fb_cv_df.loc[:,"pred"] = pred
[I 2024-01-05 12:54:51,027] Trial 51 finished with value: 0.07320939617591482 and parameters: {'n_estimators': 227.0, 'max_depth': 6.0, 'learning_rate': 0.04424486467284672, 'subsample': 0.8973680708298072, 'alpha': 0.26633798137956427}. Best is trial 18 with value: 0.07314362435895878.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fb_cv_df.loc[:,"pred"] = pred
[I 2024-01-05 12:55:02,787] Trial 52 finished with value: 0.07359232795115014 and parameters: {'n_estimators': 248.0, 'max

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fb_cv_df.loc[:,"pred"] = pred
[I 2024-01-05 12:57:18,502] Trial 64 finished with value: 0.07325039135478208 and parameters: {'n_estimators': 415.0, 'max_depth': 6.0, 'learning_rate': 0.017446964328776068, 'subsample': 0.8097971414013451, 'alpha': 0.13510279806582426}. Best is trial 60 with value: 0.07310990729462755.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fb_cv_df.loc[:,"pred"] = pred
[I 2024-01-05 12:57:34,308] Trial 65 finished with value: 0.07315621695224415 and parameters: {'n_estimators': 493.0, 'ma

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fb_cv_df.loc[:,"pred"] = pred
[I 2024-01-05 13:15:01,109] Trial 77 finished with value: 0.07319283010261086 and parameters: {'n_estimators': 429.0, 'max_depth': 7.0, 'learning_rate': 0.013494456564347614, 'subsample': 0.8410531442213997, 'alpha': 0.11508827024927644}. Best is trial 60 with value: 0.07310990729462755.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fb_cv_df.loc[:,"pred"] = pred
[I 2024-01-05 13:15:14,380] Trial 78 finished with value: 0.07315128156718126 and parameters: {'n_estimators': 379.0, 'ma

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fb_cv_df.loc[:,"pred"] = pred
[I 2024-01-05 13:18:25,179] Trial 90 finished with value: 0.07310333616647832 and parameters: {'n_estimators': 498.0, 'max_depth': 8.0, 'learning_rate': 0.021502412832311003, 'subsample': 0.8296687592271749, 'alpha': 0.11843707466783035}. Best is trial 89 with value: 0.0730947532352412.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fb_cv_df.loc[:,"pred"] = pred
[I 2024-01-05 13:18:46,134] Trial 91 finished with value: 0.07312079476774999 and parameters: {'n_estimators': 497.0, 'max

8585.192097187042


In [76]:
test = fb_cv_df.groupby(["player_name", "pitch_type"]).aggregate({"rv2":"mean", "pred": "mean", "description": "count"})
test2 = test[test["description"] > 50]
print(test2["rv2"].corr(test2["pred"]))
test2.sort_values("pred").sort_values("pred").head(20)

0.5126793493895159


Unnamed: 0_level_0,Unnamed: 1_level_0,rv2,pred,description
player_name,pitch_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Alcantara, Sandy",FF,-0.023626,-0.009678,53
"Foley, Jason",SI,-0.021184,-0.008466,84
"Bassitt, Chris",SI,-0.001775,-0.007397,152
"Clevinger, Mike",FF,-0.004368,-0.00733,53
"Zeuch, T.J.",SI,0.004052,-0.007128,53
"Bundy, Dylan",FF,-0.012432,-0.006473,78
"García, Luis",SI,-0.010232,-0.006435,61
"Webb, Logan",SI,-0.01286,-0.006202,66
"Wheeler, Zack",FF,-0.01952,-0.005829,58
"Mahle, Tyler",FF,-0.014207,-0.005664,80


In [131]:
full_pitch_df.columns.tolist()

['pitch_type',
 'game_date',
 'release_speed',
 'release_pos_x',
 'release_pos_z',
 'player_name',
 'batter',
 'pitcher',
 'events',
 'description',
 'spin_dir',
 'game_type',
 'stand',
 'p_throws',
 'home_team',
 'away_team',
 'type',
 'balls',
 'strikes',
 'game_year',
 'pfx_x',
 'pfx_z',
 'plate_x',
 'plate_z',
 'on_3b',
 'on_2b',
 'on_1b',
 'outs_when_up',
 'inning',
 'inning_topbot',
 'hc_x',
 'hc_y',
 'vx0',
 'vy0',
 'vz0',
 'ax',
 'ay',
 'az',
 'sz_top',
 'sz_bot',
 'hit_distance_sc',
 'launch_speed',
 'launch_angle',
 'release_spin_rate',
 'release_extension',
 'game_pk',
 'release_pos_y',
 'estimated_ba_using_speedangle',
 'estimated_woba_using_speedangle',
 'woba_value',
 'woba_denom',
 'iso_value',
 'launch_speed_angle',
 'at_bat_number',
 'pitch_number',
 'home_score',
 'away_score',
 'bat_score',
 'fld_score',
 'post_away_score',
 'post_home_score',
 'spin_axis',
 'delta_run_exp',
 'inning_top',
 'inning_id',
 'runner1b',
 'runner2b',
 'runner3b',
 'state',
 'runs_end_inni

In [147]:
# Let's do some exploration!
full_pitch_df.groupby("pitch_type").aggregate({"rv":"mean", "pitch_number":"count", "delta_run_exp": "mean", "pfx_x": "mean",
                                               "pfx_z": "mean", "release_speed": "mean"}).sort_values("rv")



Unnamed: 0_level_0,rv,pitch_number,delta_run_exp,pfx_x,pfx_z,release_speed
pitch_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SL,-3.4e-05,145876,-0.002264,3.669776,1.872354,84.610533
ST,0.000205,1273,0.000441,12.833119,-0.880534,81.19348
FS,0.001196,10322,-0.003693,-10.868115,3.904022,87.199341
CU,0.001426,58696,-0.000111,3.787476,-8.977667,79.058815
SI,0.001624,111008,0.000382,-6.37423,8.708303,93.35174
KC,0.002408,13945,-0.000128,5.692027,-10.232941,81.239104
FC,0.00305,50535,0.000133,1.70508,8.036616,88.998148
FF,0.003238,231994,0.000522,-3.127531,16.11736,93.919448
CH,0.00501,79995,0.002219,-4.223877,6.199781,85.330018
CS,0.027299,95,0.030379,6.544421,-12.865263,67.990526


In [134]:
# Fastball pitches
fb_pitch_type = ["SI", "FC", "FF"]

# Offspeed pitches
ofs_pitch_type = ["FS", "CH", "EP", "FA", "KN"]

# Breaking ball pitches
bb_pitch_type = ["SL", "ST", "CU", "KC", "CS"]

full_pitch_df["rv"].corr(full_pitch_bdf["delta_run_exp2"])

0.8122587256811388

In [146]:
test = full_pitch_df.groupby(["player_name", "pitch_type"]).aggregate({"rv":"mean", "pfx_x":"count"}).reset_index().sort_values("rv")
test[(test["pfx_x"] > 500) & (test["pitch_type"] == "SL")].head(50)

Unnamed: 0,player_name,pitch_type,rv,pfx_x
830,"Díaz, Edwin O",SL,-0.02737,534
2139,"Muñoz, Andrés",SL,-0.0269,661
2234,"Ohtani, Shohei",SL,-0.022211,984
700,"Darvish, Yu",SL,-0.020545,566
2669,"Scherzer, Max",SL,-0.019808,507
517,"Cease, Dylan",SL,-0.018303,1338
2123,"Murfee, Penn",SL,-0.015831,511
1459,"Jax, Griffin",SL,-0.015327,560
2636,"Sandoval, Patrick",SL,-0.013621,711
191,"Barlow, Scott",SL,-0.013607,534


In [108]:
#train_df =
train_df = full_pitch_df.iloc[0:round(len(full_pitch_df) * 0.64), :]
cv_df = full_pitch_df.iloc[(round(len(full_pitch_df) * 0.64)):round(len(full_pitch_df) * 0.8), :]
test_df = full_pitch_df.iloc[(round(len(full_pitch_df) * 0.8)):, :]
(len(train_df) + len(cv_df) + len(test_df)) == len(full_pitch_df)

True

In [100]:
# Initial test model 
xgb_fit = xgb.XGBRegressor(n_estimators = 300, learning_rate = 0.01, max_depth = 5, tree_method = "hist")
xgb_fit.fit(train_df[pitch_char_list + location_list + context_list], train_df["rv"])

In [None]:
SEARCH_PARAMS = {'learning_rate': trial.suggest_uniform("learning_rate", 0.005, 0.3),
                 'max_depth': int(trial.suggest_discrete_uniform("max_depth", 3, 10, 1)),
                 'n_estimators': int(trial.suggest_discrete_uniform("n_estimators", 50, 500, 1)),
                 'lambda_l1':
                 'num_leaves': int(trial.suggest_discrete_uniform("max_depth", 3, 10, 1)),
                 'feature_fraction': trial.suggest_uniform("subsample", 0.5, 0.8),
                 'subsample': trial.suggest_uniform("subsample", 0.5, 0.8)}

FIXED_PARAMS={'objective': 'binary',
             'metric': 'auc',
             'is_unbalance':True,
             'bagging_freq':5,
             'boosting':'dart',
             'early_stopping_rounds':30}


In [None]:
# Establish hyperparameter range
def objective(trial):
     
    xgb_fit.fit(train_df[pitch_char_list + location_list + context_list], train_df["rv"])
    
    pred = xgb_fit.predict(cv_df[pitch_char_list + location_list + context_list])
    
    return((np.mean((pred - cv_df["rv"])**2))**0.5)
    

In [118]:
# Establish hyperparameter range
def objective(trial):
    
    xgb_fit = xgb.XGBRegressor(n_estimators = int(trial.suggest_float("n_estimators", 50, 1000, step = 1)), 
                               max_depth = int(trial.suggest_float("max_depth", 3, 10, step = 1)),
                               learning_rate = trial.suggest_float("learning_rate", 0.005, 0.3), 
                               subsample = trial.suggest_float("subsample", 0.6, 1),
                               tree_method = "hist")
    
    xgb_fit.fit(train_df[pitch_char_list + location_list + context_list], train_df["rv"])
    
    pred = xgb_fit.predict(cv_df[pitch_char_list + location_list + context_list])
    
    return((np.mean((pred - cv_df["rv"])**2))**0.5)
    

In [119]:
context_list

['balls',
 'strikes',
 'batter_right',
 'p_throws_right',
 'sz_top',
 'sz_bot',
 'tto']

In [120]:
import time
start_time = time.time()
study = optuna.create_study()
 # The optimization finishes after evaluating 1000 times or 3 seconds.
study.optimize(objective, n_trials = 100)
print(time.time() - start_time)


[I 2023-10-23 22:20:28,660] A new study created in memory with name: no-name-127efd14-bdea-4314-a515-09efe2038b05
[I 2023-10-23 22:21:51,565] Trial 0 finished with value: 0.22149429503091442 and parameters: {'n_estimators': 653.0, 'max_depth': 10.0, 'learning_rate': 0.2885033377933189, 'subsample': 0.7115808180522478}. Best is trial 0 with value: 0.22149429503091442.
[I 2023-10-23 22:22:56,487] Trial 1 finished with value: 0.1991002454304466 and parameters: {'n_estimators': 604.0, 'max_depth': 10.0, 'learning_rate': 0.05800037986589545, 'subsample': 0.9781486570259706}. Best is trial 1 with value: 0.1991002454304466.
[I 2023-10-23 22:23:33,999] Trial 2 finished with value: 0.19727358357684313 and parameters: {'n_estimators': 475.0, 'max_depth': 8.0, 'learning_rate': 0.045423481726739885, 'subsample': 0.9250261214736584}. Best is trial 2 with value: 0.19727358357684313.
[I 2023-10-23 22:23:51,892] Trial 3 finished with value: 0.20481540232052137 and parameters: {'n_estimators': 170.0, '

[W 2023-10-24 17:17:01,204] Trial 30 failed with value None.


KeyboardInterrupt: 

In [None]:
def objective(hyperparam):
    cv_rmse = np.array([0, 0, 0, 0, 0], dtype = "float")
    # Initialize XGBoost
    xgb_fit = xgb.XGBRegressor(n_estimators = int(hyperparam["n_estimators"]), max_depth = int(hyperparam["max_depth"]), 
                               learning_rate = hyperparam["learning_rate"], subsample = hyperparam["subsample"])
    for i in range(5):
        train_iter_df = train_df[train_df["fold"] != (i + 1)]
        cv_df = train_df[train_df["fold"] == (i + 1)]
        
        # Fit on training data, predict on the left out fold, and calculate RMSE
        xgb_fit.fit(train_iter_df[["x1", "x2"]], train_iter_df["y"])
        pred = xgb_fit.predict(cv_df[["x1", "x2"]])
        cv_rmse[i] = (np.mean((pred - cv_df["y"])**2))**0.5
    
    print ("RMSE:", cv_rmse.mean())
    return {"loss": cv_rmse.mean(), "status": STATUS_OK }

In [60]:
# Look at predictions
print(np.mean((xgb_fit.predict(cv_df[pitch_char_list + location_list + context_list]) - cv_df["rv"])**2)**0.5)
print(cv_df["rv"].corr(pd.Series(xgb_fit.predict(cv_df[pitch_char_list + location_list + context_list]))))

0.1989549236771565
0.01181250166912025


In [122]:
train_df["rv"].std()

0.20066377712764996

In [79]:
cv_df["pred"] = xgb_fit.predict(cv_df[pitch_char_list + location_list + context_list])
test = cv_df.groupby(["player_name", "pitch_type"]).aggregate({"pred":"mean", "rv":"count"}).reset_index().sort_values("pred")

test[test["rv"] > 50]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df["pred"] = xgb_fit.predict(cv_df[pitch_char_list + location_list + context_list])


Unnamed: 0,player_name,pitch_type,pred,rv
202,"Bleier, Richard",SI,0.013171,70
1176,"Mantiply, Joe",SI,0.013698,66
664,"García, Jarlín",FF,0.015448,86
445,"Darvish, Yu",FC,0.015784,172
1104,"Loup, Aaron",SI,0.015851,65
...,...,...,...,...
1819,"Suter, Brent",CH,0.041027,57
1103,"Loup, Aaron",FC,0.041812,55
47,"Alvarado, José",FC,0.042267,58
619,"Freeland, Kyle",KC,0.042730,82


In [74]:
cv_df.groupby(["player_name", "pitch_type"])["rv"].mean().reset_index().sort_values("rv").head(n = 25)

Unnamed: 0,player_name,pitch_type,rv
1493,"Plassmeyer, Michael",CH,-0.259186
1997,"Weems, Jordan",CU,-0.259186
1082,"Liberatore, Matthew",SI,-0.243833
553,"Estrada, Jeremiah",CH,-0.233658
614,"Foster, Matt",CU,-0.233658
116,"Bard, Luke",CH,-0.233658
1008,"Knizner, Andrew",SL,-0.233658
1424,"Ortega, Oliver",FF,-0.233658
1322,"Moreta, Dauri",CH,-0.233658
2078,"Yates, Kirby",SL,-0.233658
