# M02. Stat Imputations
- This imputes model inputs using Steamer projections
- Type: Model
- Run Frequency: Irregular
- Sources:
    - MLB API
    - Steamer
- Created: 1/28/2024
- Updated: 12/17/2024

Note:
- Stat imputations have been phased out, replaced with the simpler approach of assigning imputed players median values for API-derived inputs, an imputation flag, and their Steamer projected rates
- Imputation models proved unreliable and difficult to evaluate, given the significant number of required imputations
- Code remains to train imputation models in case improved methods can generate more reliable imputations in the future
- Scalers trained here remain necessary

### Imports

In [1]:
%run "U1. Imports.ipynb"
%run "U2. Functions.ipynb"
%run "U3. Classes.ipynb"
%run "U4. Datasets.ipynb"
%run "U5. Models.ipynb"

### Data

##### Plate Appearances

Hitters

In [2]:
# Load the full dataset from "Final Dataset.csv". baseball_path is not defined in this
# notebook; presumably it comes from the utility notebooks run above - confirm.
hitters_df = pd.read_csv(os.path.join(baseball_path, "Final Dataset.csv"))

Pitchers

In [3]:
# Pitchers start from an independent (deep) copy of the frame loaded for hitters,
# so the two cleaning paths below cannot affect each other.
pitchers_df = hitters_df.copy(deep=True)

##### Steamer

Hitters

In [4]:
# Weekly log of Steamer hitter projections; the file is decoded as ISO-8859-1.
steamer_hitters_df = pd.read_csv(os.path.join(baseball_path, "A03. Steamer", "steamer_hitters_weekly_log.csv"), encoding='iso-8859-1')

Pitchers

In [5]:
# Weekly log of Steamer pitcher projections; the file is decoded as ISO-8859-1.
steamer_pitchers_df = pd.read_csv(os.path.join(baseball_path, "A03. Steamer", "steamer_pitchers_weekly_log.csv"), encoding='iso-8859-1')

### Clean

##### Plate Appearances

Hitters

Keep one instance of each batter in each game vs. each side

In [6]:
# Collapse to one row per (gamePk, batter, b_L, p_L); the last occurrence is the one kept.
hitters_df = hitters_df.drop_duplicates(subset=['gamePk', 'batter', 'b_L', 'p_L'], keep='last')

Keep those with sufficient sample size

In [7]:
# Keep rows with more than 40 batter plate appearances and no infinite value
# in any of the batter input columns.
hitters_df = hitters_df[
    (hitters_df['pa_b'] > 40)
    & ~hitters_df[batter_inputs].isin([np.inf, -np.inf]).any(axis=1)
]

Keep relevant columns

In [8]:
# Narrow to the merge keys, the side flags, the imputation flag and the model inputs.
hitters_df = hitters_df.loc[:, ['batter', 'date', 'b_L', 'p_L', 'imp_b'] + batter_inputs]

Format for merge

In [9]:
# Give the merge keys their final dtypes: dates parsed from YYYYMMDD, ids as integers.
hitters_df = hitters_df.assign(
    date=pd.to_datetime(hitters_df['date'], format='%Y%m%d'),
    batter=hitters_df['batter'].astype(int),
)

Pitchers

Keep one instance of each pitcher in each game vs. each side

In [10]:
# Collapse to one row per (gamePk, pitcher, b_L, p_L); the last occurrence is the one kept.
pitchers_df = pitchers_df.drop_duplicates(subset=['gamePk', 'pitcher', 'b_L', 'p_L'], keep='last')

Keep those with sufficient sample size

In [11]:
# Keep rows with more than 40 pitcher plate appearances and no infinite value
# in any of the pitcher input columns.
pitchers_df = pitchers_df[
    (pitchers_df['pa_p'] > 40)
    & ~pitchers_df[pitcher_inputs].isin([np.inf, -np.inf]).any(axis=1)
]

Keep relevant columns

In [12]:
# Narrow to the merge keys, the side flags, the imputation flag and the model inputs.
pitchers_df = pitchers_df.loc[:, ['pitcher', 'date', 'b_L', 'p_L', 'imp_p'] + pitcher_inputs]

Format for merge

In [13]:
# Give the merge keys their final dtypes: dates parsed from YYYYMMDD, ids as integers.
pitchers_df = pitchers_df.assign(
    date=pd.to_datetime(pitchers_df['date'], format='%Y%m%d'),
    pitcher=pitchers_df['pitcher'].astype(int),
)

##### Steamer

Hitters

In [14]:
# Clean the raw Steamer hitter log (helper defined outside this notebook), then
# discard rows missing any of the projected stats in batter_stats_fg.
steamer_hitters_df2 = clean_steamer_hitters(steamer_hitters_df).dropna(subset=batter_stats_fg)

Format for merge

In [15]:
# Prepare the Steamer hitter projections for the as-of merge.
# Rows without an MLBAM id are dropped before the integer cast: NaN cannot be cast to
# int (astype would raise), and such rows could never match a batter in the merge.
# This mirrors the handling already applied to the pitcher projections.
steamer_hitters_df2 = (
    steamer_hitters_df2
    .assign(date=lambda df: pd.to_datetime(df['date'], format='%Y%m%d'))
    .loc[lambda df: df['mlbamid'].notna()]
    .assign(mlbamid=lambda df: df['mlbamid'].astype(int))
)

Pitchers

In [16]:
# Clean the raw Steamer pitcher log (helper defined outside this notebook), then
# discard rows missing any of the projected stats in pitcher_stats_fg.
steamer_pitchers_df2 = clean_steamer_pitchers(steamer_pitchers_df).dropna(subset=pitcher_stats_fg)

Format for merge

In [17]:
# Prepare the Steamer pitcher projections for the as-of merge: parse dates, drop rows
# with no MLBAM id (they cannot be cast to int), then cast the id to int.
steamer_pitchers_df2 = (
    steamer_pitchers_df2
    .assign(date=lambda df: pd.to_datetime(df['date'], format='%Y%m%d'))
    .loc[lambda df: ~df['mlbamid'].isna()]
    .assign(mlbamid=lambda df: df['mlbamid'].astype(int))
)

### Merge

##### Hitters

Sort

In [18]:
# Order both frames by player id, then date. The merge cell sorts each side by
# 'date' again immediately before calling merge_asof.
hitters_df = hitters_df.sort_values(by=['batter', 'date'])
steamer_hitters_df2 = steamer_hitters_df2.sort_values(by=['mlbamid', 'date'])

Merge

In [19]:
# As-of join on date within each player (batter <-> mlbamid). With merge_asof's default
# settings each row receives the most recent projection dated on or before it;
# rows with no such projection get NaN in the Steamer columns.
hitters_df = pd.merge_asof(
    left=hitters_df.sort_values(by='date'),
    right=steamer_hitters_df2.loc[:, ['mlbamid', 'steamerid', 'date'] + batter_stats_fg].sort_values(by='date'),
    on='date',
    left_by='batter',
    right_by='mlbamid',
)

##### Pitchers

Sort

In [20]:
# Order both frames by player id, then date. The merge cell sorts each side by
# 'date' again immediately before calling merge_asof.
pitchers_df = pitchers_df.sort_values(by=['pitcher', 'date'])
steamer_pitchers_df2 = steamer_pitchers_df2.sort_values(by=['mlbamid', 'date'])

Merge

In [21]:
# As-of join on date within each player (pitcher <-> mlbamid). With merge_asof's default
# settings each row receives the most recent projection dated on or before it;
# rows with no such projection get NaN in the Steamer columns.
pitchers_df = pd.merge_asof(
    left=pitchers_df.sort_values(by='date'),
    right=steamer_pitchers_df2.loc[:, ['mlbamid', 'steamerid', 'date'] + pitcher_stats_fg].sort_values(by='date'),
    on='date',
    left_by='pitcher',
    right_by='mlbamid',
)

### Sample

In [22]:
# Restrict the sample to dates after the 2015 calendar year.
hitters_df = hitters_df.loc[hitters_df['date'].dt.year.gt(2015)]

In [23]:
# Restrict the sample to dates after the 2015 calendar year.
pitchers_df = pitchers_df.loc[pitchers_df['date'].dt.year.gt(2015)]

### Scaler A. Batter - Plate Appearances

##### Inputs

batter_inputs defined in U1. Imports

##### Train

Create scaler

In [24]:
# MedianCenterer is a project class not defined in this notebook (presumably in
# "U3. Classes"); its name suggests per-column median centering - confirm there.
batter_stats_scaler = MedianCenterer()

Fit

In [25]:
# Fit the scaler on the batter inputs and overwrite those columns with the transformed
# values, so everything downstream in this notebook sees scaled batter_inputs.
hitters_df[batter_inputs] = batter_stats_scaler.fit_transform(hitters_df[batter_inputs])

Create directory

In [26]:
# Output folder for this run, keyed by todaysdate (defined outside this notebook);
# exist_ok=True makes re-running the cell safe. The scaler "Save" cells write here.
os.makedirs(os.path.join(model_path, "M02. Stat Imputations", todaysdate), exist_ok=True)

Save

In [27]:
# Persist the fitted batter plate-appearance scaler into this run's dated folder.
with open(os.path.join(model_path, "M02. Stat Imputations", todaysdate, "scale_batter_stats.pkl"), "wb") as scaler_file:
    pickle.dump(batter_stats_scaler, scaler_file)

### Scaler B. Pitcher - Plate Appearances

##### Inputs

pitcher_inputs defined in U1. Imports

##### Train

Create scaler

In [28]:
# MedianCenterer is a project class not defined in this notebook (presumably in
# "U3. Classes"); its name suggests per-column median centering - confirm there.
pitcher_stats_scaler = MedianCenterer()

Fit

In [29]:
# Fit the scaler on the pitcher inputs and overwrite those columns with the transformed
# values, so everything downstream in this notebook sees scaled pitcher_inputs.
pitchers_df[pitcher_inputs] = pitcher_stats_scaler.fit_transform(pitchers_df[pitcher_inputs])

Save

In [30]:
# Persist the fitted pitcher plate-appearance scaler into this run's dated folder.
with open(os.path.join(model_path, "M02. Stat Imputations", todaysdate, "scale_pitcher_stats.pkl"), "wb") as scaler_file:
    pickle.dump(pitcher_stats_scaler, scaler_file)

### Scaler C. Batter - Steamer

##### Inputs

batter_stats_fg defined in U1. Imports

##### Train

Create scaler

In [31]:
# Separate scaler for the Steamer hitter projection columns (batter_stats_fg).
# MedianCenterer is a project class defined outside this notebook.
batter_stats_fg_scaler = MedianCenterer()

Fit

In [32]:
# Fit on the cleaned Steamer hitter log itself (not on the merged hitters_df) and
# overwrite its projection columns with the transformed values.
steamer_hitters_df2[batter_stats_fg] = batter_stats_fg_scaler.fit_transform(steamer_hitters_df2[batter_stats_fg])

Save

In [33]:
# Persist the fitted Steamer hitter scaler into this run's dated folder.
with open(os.path.join(model_path, "M02. Stat Imputations", todaysdate, "scale_batter_stats_steamer.pkl"), "wb") as scaler_file:
    pickle.dump(batter_stats_fg_scaler, scaler_file)

### Scaler D. Pitcher - Steamer

##### Inputs

pitcher_stats_fg defined in U1. Imports

##### Train

Create scaler

In [34]:
# Separate scaler for the Steamer pitcher projection columns (pitcher_stats_fg).
# MedianCenterer is a project class defined outside this notebook.
pitcher_stats_fg_scaler = MedianCenterer()

Fit

In [35]:
# Fit on the cleaned Steamer pitcher log itself (not on the merged pitchers_df) and
# overwrite its projection columns with the transformed values.
steamer_pitchers_df2[pitcher_stats_fg] = pitcher_stats_fg_scaler.fit_transform(steamer_pitchers_df2[pitcher_stats_fg])

Save

In [36]:
# Persist the fitted Steamer pitcher scaler into this run's dated folder.
with open(os.path.join(model_path, "M02. Stat Imputations", todaysdate, "scale_pitcher_stats_steamer.pkl"), "wb") as scaler_file:
    pickle.dump(pitcher_stats_fg_scaler, scaler_file)

### Note:
Imputation models below are deprecated

### Impute

Hitters

Stat inputs

In [37]:
# Features for the batter imputation model: Steamer projections plus the two side
# flags and the imputation flag.
batter_stats_fg_imp = [*batter_stats_fg, 'b_L', 'p_L', 'imp_b']

Train/Test Split

Split

In [38]:
# Assign each row to train (0) or test (1); drawing from [0, 0, 1] gives roughly a
# 2:1 train/test ratio. A seeded generator makes the split, and therefore the trained
# model and the MSEs below, reproducible across runs (the seed mirrors the
# random_state used for the batter model).
hitters_df['split'] = np.random.default_rng(seed=100).choice([0, 0, 1], size=len(hitters_df))

Create masks to identify training and testing datasets

In [39]:
# Boolean row selectors derived from the 'split' column: 0 = train, 1 = test.
training_mask = hitters_df['split'].eq(0)
testing_mask = hitters_df['split'].eq(1)

In [40]:
# Peek at the last five merged rows.
hitters_df.tail(5)

Unnamed: 0,batter,date,b_L,p_L,imp_b,b1_b,b2_b,b3_b,bb_b,fo_b,go_b,hbp_b,hr_b,lo_b,po_b,so_b,estimated_woba_using_speedangle_b,to_left_b,to_middle_b,to_right_b,hard_hit_b,barrel_b,iso_b,slg_b,obp_b,woba_b,b1_b_long,b2_b_long,b3_b_long,bb_b_long,fo_b_long,go_b_long,hbp_b_long,hr_b_long,lo_b_long,po_b_long,so_b_long,estimated_woba_using_speedangle_b_long,to_left_b_long,to_middle_b_long,to_right_b_long,hard_hit_b_long,barrel_b_long,iso_b_long,slg_b_long,obp_b_long,woba_b_long,mlbamid,steamerid,b1_rate,b2_rate,b3_rate,hr_rate,bb_rate,hbp_rate,so_rate,woba,slg,obp,split
657833,671739,2025-09-28,True,True,0,0.048466,-0.002323,0.000308,-0.08,-0.040594,0.155632,-0.0,0.013274,-0.012417,0.018893,-0.06,-0.053286,-0.126016,0.034632,0.108466,0.04,0.02,0.01972,0.047175,-0.04291,-0.008562,0.013792,0.00143,0.002118,-0.053333,-0.022234,0.082128,0.004713,0.007547,-0.002948,-0.002704,-0.01,-0.04642,-0.133158,-0.006272,0.156086,0.046667,0.016667,0.018835,0.027084,-0.033197,-0.008654,671739,25931,0.171735,0.04986,0.00598,0.039773,0.0569,0.01,0.184828,0.351297,0.48531,0.335077,0
657834,695657,2025-09-28,True,False,0,-0.06549,-0.007413,0.000605,0.08,-0.05499,-0.085868,-0.0,0.014691,-0.034376,0.01408,0.18,-0.058106,-0.201774,0.015152,0.203704,-0.14,0.02,0.051418,-0.008717,-0.000241,-0.002026,-0.05063,-0.007001,0.000934,0.02,-0.018261,-0.085674,-0.003621,0.057067,-0.017288,0.011648,0.113333,0.011609,-0.172981,-0.006272,0.195909,0.003333,0.048333,0.185562,0.184995,0.007286,0.068895,695657,sa3017170,0.11797,0.033476,0.003095,0.024917,0.0944,0.017,0.290308,0.278901,0.334469,0.291715,1
657835,676044,2025-09-28,True,False,0,0.010035,0.01178,0.000848,-0.02,-0.019971,-0.009638,0.02,-0.024353,0.00871,0.003828,0.06,-0.06457,-0.011433,-0.175189,0.203704,0.02,-0.04,-0.071868,-0.087388,-0.024324,-0.045977,-0.012293,0.003934,0.001125,-0.018462,-0.01866,-0.033572,0.006764,-0.025623,0.005347,0.005538,0.10641,-0.094741,-0.066106,-0.113601,0.196363,0.035641,-0.046667,-0.081355,-0.12543,-0.054018,-0.07329,676044,sa3017852,0.133142,0.038612,0.001753,0.038772,0.069,0.009004,0.328434,0.302256,0.406462,0.291137,0
657836,701358,2025-09-28,False,True,0,-0.033628,0.000717,-0.000884,0.08,0.01907,-0.045,0.02,-0.006437,-0.028852,-0.003747,0.04,-0.06504,0.028746,-0.072511,0.060847,-0.04,-0.02,-0.014729,-0.049142,0.037134,0.004732,-0.016359,0.007261,0.000454,0.044088,0.003509,-0.081288,0.013277,0.01033,-0.018238,-0.003231,0.060706,-0.009174,-0.04371,0.08536,-0.024994,-0.030389,0.011727,0.053418,0.066244,0.049587,0.048511,701358,sa3025358,0.131667,0.027906,0.005215,0.028493,0.0639,0.011,0.240899,0.267951,0.346331,0.268886,0
657837,683737,2025-09-28,True,True,0,-0.041962,0.00515,0.003638,-0.02,-0.003609,0.007183,-0.0,0.040909,-0.009212,-0.020858,0.08,-0.003796,-0.136433,-0.112689,0.266204,0.02,0.06,0.133671,0.123273,-0.034899,0.024777,-0.03113,0.00104,0.002405,-0.008571,0.038652,-0.0344,0.005665,0.007034,-0.013036,-0.00191,0.054762,-0.053461,-0.104282,-0.010467,0.131406,-0.009048,0.005714,0.026166,-0.002604,-0.033022,-0.017934,683737,26319,0.123585,0.045158,0.00352,0.034559,0.1043,0.01,0.27687,0.32119,0.413161,0.321547,1


In [41]:
# Drop rows missing any model input or any Steamer projection (the latter are NaN for
# rows that found no projection in the as-of merge).
hitters_df = hitters_df.dropna(subset=batter_inputs).dropna(subset=batter_stats_fg)

# Rebuild the masks from the surviving rows. Masks created before the dropna carry the
# pre-drop index, so indexing with them relies on pandas silently re-aligning a stale
# mask (with a UserWarning) instead of selecting with one that matches the frame.
training_mask = (hitters_df['split'] == 0)
testing_mask = (hitters_df['split'] == 1)

Train

In [42]:
%%time
# Define the architecture of the neural network
layers = (50,50)

# Create the MLPRegressor model
batter_imputation_model = MLPRegressor(hidden_layer_sizes=layers, activation='relu', random_state=100, learning_rate_init=0.001, max_iter=100)

# Train the model
batter_imputation_model.fit(hitters_df[training_mask][batter_stats_fg_imp], hitters_df[training_mask][batter_inputs])


# Save the model
os.makedirs(os.path.join(model_path, "M02. Stat Imputations", todaysdate), exist_ok=True)
pickle.dump(batter_imputation_model, open(os.path.join(model_path, "M02. Stat Imputations", todaysdate, "impute_batter_stats.sav"), 'wb'))

CPU times: total: 4min 48s
Wall time: 36.8 s


Predict

In [43]:
# Predict the batter inputs for the held-out rows and place the predictions
# ("<stat>_pred") next to the actual values. reset_index() keeps the old row labels
# in an 'index' column so both frames line up positionally for the concat.
y_test_pred = pd.DataFrame(
    batter_imputation_model.predict(hitters_df[testing_mask][batter_stats_fg_imp]),
    columns=[f"{col}_pred" for col in batter_inputs],
)
batter_pred_df = pd.concat(
    [hitters_df[testing_mask].reset_index(), y_test_pred],
    axis=1,
)

Evaluate

In [44]:
### Evaluate the batter imputation model on the held-out rows.
# Prints three numbers:
#   MSE All     - mean over stats of the per-row squared error, all test rows
#   MSE Imputed - same, restricted to rows flagged imp_b == 1
#   MSE Stats   - squared gap between mean actual and mean predicted, imputed rows only

# Identify predicted columns
pred_columns = [col + "_pred" for col in batter_inputs]

# Check that all pred_columns exist BEFORE they are used in the loop below
missing_columns = [col for col in pred_columns if col not in batter_pred_df.columns]
if missing_columns:
    raise ValueError(f"Missing expected columns: {missing_columns}")

# Rows flagged as imputed; computed once instead of once per stat
imputed_mask = batter_pred_df['imp_b'] == 1
if not imputed_mask.any():
    # Without this note the two imputed metrics just print nan with no explanation
    print("Note: no test rows with imp_b == 1; imputed metrics below are nan")

### MSEs - Player-level
all_list, imp_list = [], []

for stat in batter_inputs:
    batter_pred_df[f'{stat}_square_error'] = (batter_pred_df[stat] - batter_pred_df[f'{stat}_pred']) ** 2
    all_list.append(batter_pred_df[f'{stat}_square_error'].mean())
    imp_list.append(batter_pred_df.loc[imputed_mask, f'{stat}_square_error'].mean())

print("MSE All:    ", np.mean(all_list))
print("MSE Imputed:", np.mean(imp_list))


### MSEs - Stat-level
# Compute means for actual and predicted columns over the imputed rows
imputed_pred_df = batter_pred_df[imputed_mask]
means_actual = imputed_pred_df[batter_inputs].mean()
means_pred = imputed_pred_df[pred_columns].mean()

# Align indices to ensure proper subtraction
means_pred.index = means_actual.index

# Compute squared errors
squared_errors = (means_actual - means_pred) ** 2

# Compute final MSE
mse = squared_errors.mean()

print("MSE Stats:  ", mse)

MSE All:     0.0022707981710573303
MSE Imputed: nan
MSE Stats:   nan


Pitchers

Stat inputs

In [45]:
# Features for the pitcher imputation model: Steamer projections plus the two side
# flags and the imputation flag.
pitcher_stats_fg_imp = [*pitcher_stats_fg, 'b_L', 'p_L', 'imp_p']

Train/Test Split

Split

In [46]:
# Assign each row to train (0) or test (1); drawing from [0, 0, 1] gives roughly a
# 2:1 train/test ratio. A seeded generator makes the split, and therefore the trained
# model and the MSEs below, reproducible across runs (the seed mirrors the
# random_state used for the pitcher model).
pitchers_df['split'] = np.random.default_rng(seed=1).choice([0, 0, 1], size=len(pitchers_df))

Create masks to identify training and testing datasets

In [47]:
# Boolean row selectors derived from the 'split' column: 0 = train, 1 = test.
# These reuse (and overwrite) the mask names from the hitter section.
training_mask = pitchers_df['split'].eq(0)
testing_mask = pitchers_df['split'].eq(1)

In [48]:
# Drop rows missing any model input or any Steamer projection (the latter are NaN for
# rows that found no projection in the as-of merge).
pitchers_df = pitchers_df.dropna(subset=pitcher_inputs).dropna(subset=pitcher_stats_fg)

# Rebuild the masks from the surviving rows. Masks created before the dropna carry the
# pre-drop index, so indexing with them relies on pandas silently re-aligning a stale
# mask (with a UserWarning) instead of selecting with one that matches the frame.
training_mask = (pitchers_df['split'] == 0)
testing_mask = (pitchers_df['split'] == 1)

Train

In [49]:
%%time
# Define the architecture of the neural network
layers = (50,50)

# Create the MLPRegressor model
pitcher_imputation_model = MLPRegressor(hidden_layer_sizes=layers, activation='relu', random_state=1, learning_rate_init=0.001, max_iter=100)

# Train the model
pitcher_imputation_model.fit(pitchers_df[training_mask][pitcher_stats_fg_imp], pitchers_df[training_mask][pitcher_inputs])


# Save the model
os.makedirs(os.path.join(model_path, "M02. Stat Imputations", todaysdate), exist_ok=True)
pickle.dump(pitcher_imputation_model, open(os.path.join(model_path, "M02. Stat Imputations", todaysdate, "impute_pitcher_stats.sav"), 'wb'))

CPU times: total: 1min 44s
Wall time: 13.6 s


Predict

In [50]:
# Predict the pitcher inputs for the held-out rows and place the predictions
# ("<stat>_pred") next to the actual values. reset_index() keeps the old row labels
# in an 'index' column so both frames line up positionally for the concat.
y_test_pred = pd.DataFrame(
    pitcher_imputation_model.predict(pitchers_df[testing_mask][pitcher_stats_fg_imp]),
    columns=[f"{col}_pred" for col in pitcher_inputs],
)
pitcher_pred_df = pd.concat(
    [pitchers_df[testing_mask].reset_index(), y_test_pred],
    axis=1,
)

Evaluate

In [51]:
### Evaluate the pitcher imputation model on the held-out rows.
# Prints three numbers:
#   MSE All     - mean over stats of the per-row squared error, all test rows
#   MSE Imputed - same, restricted to rows flagged imp_p == 1
#   MSE Stats   - squared gap between mean actual and mean predicted, imputed rows only

# Identify predicted columns
pred_columns = [col + "_pred" for col in pitcher_inputs]

# Check that all pred_columns exist BEFORE they are used in the loop below
missing_columns = [col for col in pred_columns if col not in pitcher_pred_df.columns]
if missing_columns:
    raise ValueError(f"Missing expected columns: {missing_columns}")

# Rows flagged as imputed; computed once instead of once per stat
imputed_mask = pitcher_pred_df['imp_p'] == 1
if not imputed_mask.any():
    # Without this note the two imputed metrics just print nan with no explanation
    print("Note: no test rows with imp_p == 1; imputed metrics below are nan")

### MSEs - Player-level
all_list, imp_list = [], []

for stat in pitcher_inputs:
    pitcher_pred_df[f'{stat}_square_error'] = (pitcher_pred_df[stat] - pitcher_pred_df[f'{stat}_pred']) ** 2
    all_list.append(pitcher_pred_df[f'{stat}_square_error'].mean())
    imp_list.append(pitcher_pred_df.loc[imputed_mask, f'{stat}_square_error'].mean())

print("MSE All:    ", np.mean(all_list))
print("MSE Imputed:", np.mean(imp_list))


### MSEs - Stat-level
# Compute means for actual and predicted columns over the imputed rows
imputed_pred_df = pitcher_pred_df[imputed_mask]
means_actual = imputed_pred_df[pitcher_inputs].mean()
means_pred = imputed_pred_df[pred_columns].mean()

# Align indices to ensure proper subtraction
means_pred.index = means_actual.index

# Compute squared errors
squared_errors = (means_actual - means_pred) ** 2

# Compute final MSE
mse = squared_errors.mean()

print("MSE Stats:  ", mse)

MSE All:     0.002296019059247391
MSE Imputed: nan
MSE Stats:   nan


In [52]:
# Peek at the first five rows of actuals, predictions and squared errors.
pitcher_pred_df.head(5)

Unnamed: 0,index,pitcher,date,b_L,p_L,imp_p,b1_p,b2_p,b3_p,bb_p,fo_p,go_p,hbp_p,hr_p,lo_p,po_p,so_p,estimated_woba_using_speedangle_p,to_left_p,to_middle_p,to_right_p,hard_hit_p,barrel_p,iso_p,slg_p,obp_p,woba_p,b1_p_long,b2_p_long,b3_p_long,bb_p_long,fo_p_long,go_p_long,hbp_p_long,hr_p_long,lo_p_long,po_p_long,so_p_long,estimated_woba_using_speedangle_p_long,to_left_p_long,to_middle_p_long,to_right_p_long,hard_hit_p_long,barrel_p_long,iso_p_long,slg_p_long,obp_p_long,woba_p_long,mlbamid,steamerid,H9,HR9,K9,BB9,GBrate,FBrate,LDrate,SIERA,split,b1_p_pred,b2_p_pred,b3_p_pred,bb_p_pred,fo_p_pred,go_p_pred,hbp_p_pred,hr_p_pred,lo_p_pred,po_p_pred,so_p_pred,estimated_woba_using_speedangle_p_pred,to_left_p_pred,to_middle_p_pred,to_right_p_pred,hard_hit_p_pred,barrel_p_pred,iso_p_pred,slg_p_pred,obp_p_pred,woba_p_pred,b1_p_long_pred,b2_p_long_pred,b3_p_long_pred,bb_p_long_pred,fo_p_long_pred,go_p_long_pred,hbp_p_long_pred,hr_p_long_pred,lo_p_long_pred,po_p_long_pred,so_p_long_pred,estimated_woba_using_speedangle_p_long_pred,to_left_p_long_pred,to_middle_p_long_pred,to_right_p_long_pred,hard_hit_p_long_pred,barrel_p_long_pred,iso_p_long_pred,slg_p_long_pred,obp_p_long_pred,woba_p_long_pred,b1_p_square_error,b2_p_square_error,b3_p_square_error,bb_p_square_error,fo_p_square_error,go_p_square_error,hbp_p_square_error,hr_p_square_error,lo_p_square_error,po_p_square_error,so_p_square_error,estimated_woba_using_speedangle_p_square_error,to_left_p_square_error,to_middle_p_square_error,to_right_p_square_error,hard_hit_p_square_error,barrel_p_square_error,iso_p_square_error,slg_p_square_error,obp_p_square_error,woba_p_square_error,b1_p_long_square_error,b2_p_long_square_error,b3_p_long_square_error,bb_p_long_square_error,fo_p_long_square_error,go_p_long_square_error,hbp_p_long_square_error,hr_p_long_square_error,lo_p_long_square_error,po_p_long_square_error,so_p_long_square_error,estimated_woba_using_speedangle_p_long_square_error,to_left_p_long_square_error,to_middle_p_long_square_
error,to_right_p_long_square_error,hard_hit_p_long_square_error,barrel_p_long_square_error,iso_p_long_square_error,slg_p_long_square_error,obp_p_long_square_error,woba_p_long_square_error
0,21710,451584,2016-04-03,True,False,0,-0.035792,-0.012658,-0.000625,0.0,-0.025963,-0.029541,-0.0,0.015267,-0.010873,0.018707,0.14,0.061964,0.007937,-0.143939,0.151786,-0.02,0.02,0.027415,-0.022389,-0.060259,-0.040209,-0.016097,-0.014336,-0.001195,0.002374,-0.045832,0.036087,-0.01,-0.012569,-0.002633,0.01682,0.064521,-0.039473,-0.066667,-0.051595,0.130159,-0.068767,-0.022785,-0.0623,-0.11568,-0.05596,-0.070186,451584.0,7441,7.184896,0.7286,10.371,2.9705,0.43,0.363,0.206964,2.95,1,-0.013387,-3.8e-05,-0.001219,0.008673,-0.00124,-0.007911,0.009833,0.001793,-0.004577,-0.003431,0.060505,-0.019583,-0.064722,-0.030691,0.106176,-0.021486,0.000181,-0.00153,-0.023248,-0.023,-0.022048,-0.010463,-0.003605,0.003855,0.006395,-0.004704,-0.006167,-0.008007,-0.002463,-0.00093,-0.004277,0.041701,-0.017702,-0.063918,-0.019321,0.097366,-0.029758,-0.010082,-0.015785,-0.035261,-0.022345,-0.027039,0.000502,0.000159,0.0,7.5e-05,0.000611,0.000468,9.7e-05,0.000182,4e-05,0.00049,0.006319,0.00665,0.005279,0.012825,0.00208,2e-06,0.000393,0.000838,1e-06,0.001388,0.00033,3.2e-05,0.000115,2.6e-05,1.6e-05,0.001691,0.001785,4e-06,0.000102,3e-06,0.000445,0.000521,0.000474,8e-06,0.001042,0.001075,0.001522,0.000161,0.002164,0.006467,0.00113,0.001862
1,21714,595307,2016-04-03,True,False,0,0.084778,-0.010825,5.5e-05,0.06,-0.039298,0.092282,0.0,-0.009826,-0.020148,0.001504,-0.1,0.050721,-0.115616,-0.042588,0.173986,0.0,-0.02,-0.044595,0.032626,0.097731,0.059354,0.091093,0.001226,0.000159,-0.005271,-0.059644,0.139652,-0.01,-0.017522,-0.00625,-0.002582,-0.113721,0.013429,-0.089372,-0.077199,0.178468,0.004186,-0.031705,-0.059272,0.015529,0.055548,0.030957,595307.0,12235,9.327462,0.7407,6.3189,1.8908,0.551,0.243,0.20622,3.4,1,0.036245,0.00858,0.00135,-0.018295,-0.022757,0.118631,0.005791,0.001598,0.005155,-0.019794,-0.069655,-0.012384,-0.078912,-0.038393,0.129378,0.043952,0.003716,0.006715,0.039159,0.001971,0.011143,0.036825,0.006329,0.006274,-0.024933,-0.027855,0.121147,-0.008909,-0.005155,0.011057,-0.024061,-0.080529,-0.017303,-0.079368,-0.027674,0.121772,0.039258,-0.006676,-0.006891,0.016124,0.00062,0.002117,0.002355,0.000377,2e-06,0.00613,0.000274,0.000694,3.4e-05,0.000131,0.00064,0.000454,0.000921,0.003982,0.001347,1.8e-05,0.00199,0.001932,0.000562,0.002633,4.3e-05,0.00917,0.002324,0.002945,2.6e-05,3.7e-05,0.000387,0.001011,0.000342,1e-06,0.000153,0.0003,0.000461,0.001102,0.000944,0.0001,0.002453,0.003214,0.00123,0.000626,0.002744,0.0,0.003017,0.000832
2,21715,465657,2016-04-03,False,False,0,0.026845,-0.023644,-0.001777,0.04,-0.044667,0.007624,0.0,-0.017371,-0.007927,-0.000561,0.08,0.033,-0.123932,0.029138,0.110577,-0.06,-0.02,-0.091256,-0.11224,-0.002398,-0.040445,0.011106,-0.018198,-0.001415,0.0175,-0.019405,-0.009426,0.003889,-0.01619,-0.00638,-0.012953,0.068611,0.007987,-0.018519,0.011368,0.019048,-0.031667,-0.015556,-0.076782,-0.102004,-0.007445,-0.038381,465657.0,6941,7.869088,0.7458,8.9701,2.5869,0.462,0.333,0.204536,3.2,1,0.013542,0.003905,-0.001846,-0.027839,-0.013634,0.039026,0.01186,-0.000543,-0.004647,-0.003485,0.033806,-0.024041,0.052893,0.003118,-0.044356,0.00185,0.00011,-0.011346,-0.015146,-0.031039,-0.024454,0.012063,0.000729,0.00353,-0.031147,-0.013186,0.037215,-0.006146,-0.003857,0.000756,-0.004739,0.017465,-0.023416,0.050375,0.01166,-0.046277,-0.003656,-0.00791,-0.022732,-0.024133,-0.032196,-0.029938,0.000177,0.000759,0.0,0.004602,0.000963,0.000986,0.000141,0.000283,1.1e-05,9e-06,0.002134,0.003254,0.031267,0.000677,0.024004,0.003825,0.000404,0.006386,0.009427,0.00082,0.000256,1e-06,0.000358,2.4e-05,0.002367,3.9e-05,0.002175,0.000101,0.000152,5.1e-05,6.7e-05,0.002616,0.000986,0.004746,0.0,0.004267,0.000785,5.8e-05,0.002921,0.006064,0.000613,7.1e-05
3,21721,465657,2016-04-03,True,False,0,0.046636,0.019474,0.003622,-0.06,0.007986,0.009036,-0.0,-0.009638,0.033172,0.008235,-0.0,-0.022162,0.01634,-0.040998,0.040441,-0.1,-0.02,-0.017736,0.016797,-0.026358,-0.012865,0.034001,0.006044,0.001411,-0.041891,0.005435,0.057094,-0.01,-0.009218,0.017492,0.022592,-0.065821,-0.051651,-0.022876,-0.043098,0.077871,-0.023582,-0.020945,-0.028803,-0.010683,-0.023789,-0.019562,465657.0,6941,7.869088,0.7458,8.9701,2.5869,0.462,0.333,0.204536,3.2,1,0.004637,0.003237,-0.000966,-0.0009,-0.004811,0.030476,0.008844,0.002811,-0.000679,-0.007761,0.013655,-0.013564,-0.058062,-0.032048,0.100631,0.004568,0.002925,0.005202,0.005763,-0.011099,-0.006111,0.006306,0.000102,0.004074,-0.003338,-0.007894,0.032568,-0.00891,-0.001655,0.00413,-0.008583,-0.004089,-0.0137,-0.056411,-0.020265,0.091931,-0.004248,-0.007663,-0.009519,-0.007285,-0.011147,-0.012253,0.001764,0.000264,2.1e-05,0.003493,0.000164,0.00046,7.8e-05,0.000155,0.001146,0.000256,0.000186,7.4e-05,0.005536,8e-05,0.003623,0.010935,0.000526,0.000526,0.000122,0.000233,4.6e-05,0.000767,3.5e-05,7e-06,0.001486,0.000178,0.000602,1e-06,5.7e-05,0.000179,0.000972,0.003811,0.00144,0.001125,0.000521,0.000198,0.000374,0.000176,0.000372,1.2e-05,0.00016,5.3e-05
4,21723,502042,2016-04-03,False,False,0,-0.025789,0.00284,0.000235,0.06,-0.059488,-0.061825,-0.0,-0.002926,0.006047,-0.000571,0.14,0.062773,0.122222,-0.073939,-0.0325,0.0,-0.02,-0.004215,-0.031438,0.007909,-0.006681,0.002495,-0.006956,-0.000282,0.006667,-0.022182,-0.018873,-0.01,-0.008957,-0.000178,0.005406,0.07,0.023578,0.049906,-0.006209,-0.0318,-0.003333,-0.01,-0.039805,-0.058679,-0.02117,-0.032518,502042.0,6345,7.635047,0.7985,9.6535,3.0398,0.46,0.337,0.202972,3.36,1,0.005236,0.002442,-0.002551,-0.013328,-0.015958,0.021,0.013372,-0.000207,-0.007049,-0.004366,0.049931,-0.01307,0.053018,0.004194,-0.042773,-0.008306,-7.5e-05,-0.009507,-0.017494,-0.0219,-0.021436,0.003212,-0.000375,0.003365,-0.016103,-0.012964,0.018783,-0.004691,-0.003089,-0.000716,-0.004373,0.032406,-0.015613,0.050465,0.009497,-0.044334,-0.014594,-0.008261,-0.022728,-0.026792,-0.025022,-0.027478,0.000963,0.0,8e-06,0.005377,0.001895,0.00686,0.000179,7e-06,0.000172,1.4e-05,0.008112,0.005752,0.004789,0.006105,0.000106,6.9e-05,0.000397,2.8e-05,0.000194,0.000889,0.000218,1e-06,4.3e-05,1.3e-05,0.000518,8.5e-05,0.001418,2.8e-05,3.4e-05,0.0,9.6e-05,0.001413,0.001536,0.0,0.000247,0.000157,0.000127,3e-06,0.000292,0.001017,1.5e-05,2.5e-05


In [54]:
# DISABLED CELL - everything below is commented out and does not run.
# When uncommented it draws a decile calibration plot for one long-window pitcher stat
# ("<event>_p_long"): imputed rows (imp_p == 1) are binned into ten quantiles of the
# predicted value, and the mean actual is plotted against the mean prediction per bin,
# with a dashed 45-degree reference line.
# NOTE(review): the recorded evaluation output shows nan for the imputed metrics,
# which suggests the imputed subset was empty; pd.qcut would fail on an empty frame.

# event = "woba"  # change this as needed

# pred_col = f"{event}_p_long_pred"
# actual_col = f"{event}_p_long"

# # Filter for imputed pitcher rows
# imp_pitcher_pred_df = pitcher_pred_df[pitcher_pred_df['imp_p'] == 1].copy()

# # Create decile bins
# imp_pitcher_pred_df['quantile'] = pd.qcut(imp_pitcher_pred_df[pred_col], q=10, labels=False)

# # Group by quantile and calculate means
# quantile_means = imp_pitcher_pred_df.groupby('quantile')[[pred_col, actual_col]].mean().reset_index()

# # Plot
# plt.figure(figsize=(6, 6))  # Square plot
# plt.plot(quantile_means[pred_col], quantile_means[actual_col], marker='o')
# plt.plot([-1, 1], [-1, 1], 'r--')  # 45-degree reference line
# plt.xlabel('Avg Predicted Probability')
# plt.ylabel('Avg Actual Probability')
# plt.title(f'Calibration Plot by Decile for {event}')
# plt.xlim(-1, 1)
# plt.ylim(-1, 1)
# plt.gca().set_aspect('equal', adjustable='box')  # Force square axes
# plt.grid(True)
# plt.show()


### Required Follow-Ups:
- M03. Plate Appearances