# M03. Predict PAs
- This predicts the outcome of plate appearances
- Type: Model
- Run Frequency: Irregular
- Sources:
    - MLB API
    - Steamer
- Created: 4/19/2024
- Updated: 6/4/2025

Consider: 
- imputed starter, imputed reliever, unimputed starter, unimputed reliever variables
- Using batter woba and pitcher woba to determine quantiles, not projected

### Imports

In [22]:
%run "U1. Imports.ipynb"
%run "U2. Functions.ipynb"
%run "U3. Classes.ipynb"
%run "U4. Datasets.ipynb"
%run "U5. Models.ipynb"

In [23]:
# Set option to display numbers without scientific notation
pd.set_option('display.float_format', '{:.6f}'.format)

### Data

##### Park x Weather Factors

In [24]:
multiplier_df = pd.read_csv(os.path.join(baseball_path, "Park and Weather Factors.csv"))

Choose WFX
- _unadj: predicted based on weather / predicted based on batted ball <br>
- _adj: average of actual rates in similarly predicted games / predicted based on batted ball

In [25]:
wfx_type = 'adj'
for event in events_list:
    multiplier_df[f'{event}_wfx_l'] = multiplier_df[f'{event}_wfx_{wfx_type}_l'].copy()
    multiplier_df[f'{event}_wfx_r'] = multiplier_df[f'{event}_wfx_{wfx_type}_r'].copy()

##### Plate Appearances

In [26]:
complete_dataset = pd.read_csv(os.path.join(baseball_path, "Final Dataset.csv"))

##### Steamer

In [27]:
steamer_hitters_df = pd.read_csv(os.path.join(baseball_path, "A03. Steamer", "steamer_hitters_weekly_log.csv"), encoding='iso-8859-1')

In [28]:
steamer_pitchers_df = pd.read_csv(os.path.join(baseball_path, "A03. Steamer", "steamer_pitchers_weekly_log.csv"), encoding='iso-8859-1')

### Clean

##### MLB Stats API

Remove missings

In [29]:
complete_dataset = complete_dataset[~complete_dataset[batter_inputs].isin([np.inf, -np.inf]).any(axis=1)]
complete_dataset = complete_dataset[~complete_dataset[pitcher_inputs].isin([np.inf, -np.inf]).any(axis=1)]

Scale

In [30]:
%%time
complete_dataset[batter_inputs] = scale_batter_stats.transform(complete_dataset[batter_inputs])
complete_dataset[pitcher_inputs] = scale_pitcher_stats.transform(complete_dataset[pitcher_inputs])

CPU times: total: 3.78 s
Wall time: 1.06 s


Set data types

In [31]:
complete_dataset['date_time'] = pd.to_datetime(complete_dataset['date'], format='%Y%m%d')
complete_dataset['date_time_copy'] = complete_dataset['date_time'].copy()

complete_dataset['batter'] = complete_dataset['batter'].astype(int).astype(str)
complete_dataset['pitcher'] = complete_dataset['pitcher'].astype(int).astype(str)

Sort to prep for merge

In [32]:
complete_dataset.sort_values('date_time', inplace=True)

##### Steamer

Clean

In [33]:
steamer_hitters_df2 = clean_steamer_hitters(steamer_hitters_df).dropna(subset=batter_stats_fg, inplace=True)
steamer_pitchers_df2 = clean_steamer_pitchers(steamer_pitchers_df).dropna(subset=pitcher_stats_fg, inplace=True)

Scale

In [34]:
steamer_hitters_df2[batter_stats_fg] = scale_batter_stats_steamer.transform(steamer_hitters_df2[batter_stats_fg])
steamer_pitchers_df2[pitcher_stats_fg] = scale_pitcher_stats_steamer.transform(steamer_pitchers_df2[pitcher_stats_fg])

Remove missing pitchers (occurs occassionally in 2014)

In [37]:
steamer_pitchers_df2 = steamer_pitchers_df2[~steamer_pitchers_df2['mlbamid'].isna()].reset_index(drop=True)

Set data types

In [38]:
steamer_hitters_df2['date_time'] = pd.to_datetime(steamer_hitters_df2['date'], format='%Y%m%d')
steamer_pitchers_df2['date_time'] = pd.to_datetime(steamer_pitchers_df2['date'], format='%Y%m%d')

steamer_hitters_df2['mlbamid'] = steamer_hitters_df2['mlbamid'].astype(int).astype(str)
steamer_pitchers_df2['mlbamid'] = steamer_pitchers_df2['mlbamid'].astype(int).astype(str)

Rename for compatibility with MLB Stats API data

In [39]:
steamer_hitters_df2.rename(columns={'mlbamid': 'batter'}, inplace=True)
steamer_pitchers_df2.rename(columns={'mlbamid': 'pitcher'}, inplace=True)

Drop unnecessary columns

In [40]:
steamer_hitters_df2.drop(columns=['date', 'firstname', 'lastname', 'steamerid'], inplace=True)
steamer_pitchers_df2.drop(columns=['date', 'firstname', 'lastname', 'steamerid'], inplace=True)

Sort to prep for merge

In [41]:
steamer_hitters_df2.sort_values('date_time', inplace=True)
steamer_pitchers_df2.sort_values('date_time', inplace=True)

### Merge

##### Merge #1. Plate Appearances and Steamer Batters

In [44]:
complete_merged_df = pd.merge_asof(
    complete_dataset,
    steamer_hitters_df2,
    on='date_time',
    by='batter',
    direction='backward'
)

##### Merge #2. Add Steamer Pitchers 

In [48]:
complete_merged_df = pd.merge_asof(
    complete_merged_df,
    steamer_pitchers_df2,
    on='date_time',
    by='pitcher',
    direction='backward'  
)

##### Merge #3. Add WFX

In [49]:
complete_merged_df = pd.merge(complete_merged_df, multiplier_df, on=['gamePk', 'date', 'venue_id'], how='left')

### Impute

For players with insufficient sample sizes, stats are imputed

##### Option 1: Steamer

In [50]:
# # First, remove from dataset if ever missing FG/Steamer stats
# complete_merged_df = complete_merged_df[~complete_merged_df['b1_rate'].isna()]
# complete_merged_df = complete_merged_df[~complete_merged_df['H9'].isna()]

# # Add hands to use in imputation
# batter_stats_fg_imp = batter_stats_fg + ['b_L', 'p_L', 'imp_b']
# pitcher_stats_fg_imp = pitcher_stats_fg + ['b_L', 'p_L', 'imp_p']

# ### Batters
# # Use Steamer stats to predict API/Statcast stats for those with limited samples
# batter_predictions = impute_batter_stats.predict(complete_merged_df.loc[complete_merged_df['pa_b'] < 40, batter_stats_fg_imp])

# # Impute inputs with limited sample size with predicted values
# complete_merged_df.loc[complete_merged_df['pa_b'] < 40, batter_inputs] = batter_predictions

# ### Pitchers
# # Use Steamer stats to predict API/Statcast stats for those with limited samples
# pitcher_predictions = impute_pitcher_stats.predict(complete_merged_df.loc[complete_merged_df['pa_p'] < 40, pitcher_stats_fg_imp])

# # Impute inputs with limited sample size with predicted values
# complete_merged_df.loc[complete_merged_df['pa_p'] < 40, pitcher_inputs] = pitcher_predictions

##### Option 2: Middle

In [51]:
# # First, remove from dataset if ever missing FG/Steamer stats
# complete_merged_df = complete_merged_df[~complete_merged_df['b1_rate'].isna()]
# complete_merged_df = complete_merged_df[~complete_merged_df['H9'].isna()]

# # Instead of imputing, just weighting with 0s
# complete_merged_df[batter_inputs].fillna(0.0, inplace=True)
# complete_merged_df[pitcher_inputs].fillna(0.0, inplace=True)

# # Calculate the weighted average for each column in pitcher_stats
# # Could be simplified, but I wanted to show the steps
# # Weighted average of provided value and 0. PAs and 50-PAs are weights. 
# for col in batter_inputs:
#     complete_merged_df[col] = (complete_merged_df[col] * complete_merged_df['pa_b'] + 0.0 * (50-complete_merged_df['pa_b']))/50

# # Calculate the weighted average for each column in pitcher_stats
# for col in pitcher_inputs:
#     complete_merged_df[col] = (complete_merged_df[col] * complete_merged_df['pa_p'] + 0.0 * (50-complete_merged_df['pa_p']))/50

##### Option 3: 0s

In [52]:
complete_merged_df.loc[complete_merged_df['pa_b'] < 40, batter_inputs] = 0
complete_merged_df.loc[complete_merged_df['pa_p'] < 40, pitcher_inputs] = 0

# # Batter tendencies
# complete_merged_df.loc[(complete_merged_df['pa_b'] < 40) & (complete_merged_df['b_L'] == False), ['to_left_b', 'to_left_b_long']] = -0.283467
# complete_merged_df.loc[(complete_merged_df['pa_b'] < 40) & (complete_merged_df['b_L'] == True), ['to_left_b', 'to_left_b_long']] = -0.543105

# complete_merged_df.loc[(complete_merged_df['pa_b'] < 40) & (complete_merged_df['b_L'] == False), ['to_middle_b', 'to_middle_b_long']] = -0.171325
# complete_merged_df.loc[(complete_merged_df['pa_b'] < 40) & (complete_merged_df['b_L'] == True), ['to_middle_b', 'to_middle_b_long']] = -0.222317

# complete_merged_df.loc[(complete_merged_df['pa_b'] < 40) & (complete_merged_df['b_L'] == False), ['to_right_b', 'to_right_b_long']] = -0.499793
# complete_merged_df.loc[(complete_merged_df['pa_b'] < 40) & (complete_merged_df['b_L'] == True), ['to_right_b', 'to_right_b_long']] = -0.191897

# # Pitcher tendencies
# complete_merged_df.loc[(complete_merged_df['pa_p'] < 40) & (complete_merged_df['p_L'] == False), ['to_left_p', 'to_left_p_long']] = -0.399969
# complete_merged_df.loc[(complete_merged_df['pa_p'] < 40) & (complete_merged_df['p_L'] == True), ['to_left_p', 'to_left_p_long']] = -0.331084

# complete_merged_df.loc[(complete_merged_df['pa_p'] < 40) & (complete_merged_df['p_L'] == False), ['to_middle_p', 'to_middle_p_long']] = -0.188469
# complete_merged_df.loc[(complete_merged_df['pa_p'] < 40) & (complete_merged_df['p_L'] == True), ['to_middle_p', 'to_middle_p_long']] = -0.186767

# complete_merged_df.loc[(complete_merged_df['pa_p'] < 40) & (complete_merged_df['p_L'] == False), ['to_right_p', 'to_right_p_long']] = -0.341691
# complete_merged_df.loc[(complete_merged_df['pa_p'] < 40) & (complete_merged_df['p_L'] == True), ['to_right_p', 'to_right_p_long']] = -0.423187


complete_merged_df[batter_stats_fg] = complete_merged_df[batter_stats_fg].fillna(0)
complete_merged_df[pitcher_stats_fg] = complete_merged_df[pitcher_stats_fg].fillna(0)

### Sample

Drop early observations

In [54]:
complete_merged_df = complete_merged_df[complete_merged_df['game_date'] > '2015-07-01']

Drop atypical events

In [55]:
complete_merged_df = complete_merged_df.query('eventsModel != "Cut"')

Drop observations from inactive parks

In [56]:
active_parks = list(team_map['VENUE_ID'].astype(int))
complete_merged_df = complete_merged_df[complete_merged_df['venue_id'].astype(int).isin(active_parks)]

### Shift

Many batter and pitcher stats are calculated at the end of the plate appearance. For prediction purposes, we need these stats coming into the plate appearance.

##### Batter Inputs

Sort

In [57]:
complete_merged_df.sort_values(['date', 'gamePk', 'atBatIndex'], ascending=True, inplace=True)

Shift

In [58]:
complete_merged_df[batter_inputs + ['ab_b', 'pa_b', 'imp_b']] = complete_merged_df.groupby(['batter', 'pitchHand'])[batter_inputs + ['ab_b', 'pa_b', 'imp_b']].shift(1)

##### Pitcher Inputs

Sort

In [59]:
complete_merged_df.sort_values(['date', 'gamePk', 'atBatIndex'], ascending=True, inplace=True)

Shift

In [60]:
complete_merged_df[pitcher_inputs + ['ab_p', 'pa_p', 'imp_p']] = complete_merged_df.groupby(['pitcher', 'batSide'])[pitcher_inputs + ['ab_p', 'pa_p', 'imp_p']].shift(1)

##### Inning Sums

Sort

In [61]:
complete_merged_df.sort_values(['date', 'gamePk', 'atBatIndex'], ascending=True, inplace=True)

Shift

In [62]:
cumulative_inning_input_list = [col for col in complete_merged_df.columns if col.endswith("_inning")]

complete_merged_df[cumulative_inning_input_list] = complete_merged_df.groupby(['gamePk', 'inning', 'pitcher'])[cumulative_inning_input_list].shift(1)
complete_merged_df[cumulative_inning_input_list] = complete_merged_df[cumulative_inning_input_list].fillna(0)

##### Game Sums

Sort

In [63]:
complete_merged_df.sort_values(['date', 'gamePk', 'atBatIndex'], ascending=True, inplace=True)

Shift

In [64]:
cumulative_game_input_list = [col for col in complete_merged_df.columns if col.endswith("_game")]
cumulative_game_input_list.remove('rbi_game')

complete_merged_df[cumulative_game_input_list + ['times_faced']] = complete_merged_df.groupby(['gamePk', 'pitcher'])[cumulative_game_input_list + ['times_faced']].shift(1)
complete_merged_df[cumulative_game_input_list + ['times_faced']] = complete_merged_df[cumulative_game_input_list + ['times_faced']].fillna(0)

### Train/Test Split

Split

In [68]:
np.random.seed(1)
complete_merged_df['split'] = np.random.choice([0, 0, 1], size=len(complete_merged_df))

Create masks to identify training and testing datasets

Note: to train on the entire dataset, you can simply set split = 0 for the entire dataset

In [69]:
training_mask = (complete_merged_df['split'] == 0)

### Evaluations

##### Constructed Stats

This builds stats used for evaluating model performance (actual event rates, FP, wOBA, outs)

In [70]:
def constructed_stats(model_dataset):
    # Actual Stats
    for event in events_list:
        model_dataset[f'{event}_act'] = (model_dataset['eventsModel'] == event).astype(int)

    # FP - Pitchers
    pitcher_weights = {'fo': 1.0460, 'go': 1.0460, 'po': 1.0460, 'lo': 1.0460, 'so': 3.0408, 'bb': -1.3508, 'b1': -1.7427, 'b2': -1.7427, 'b3': -1.7427, 'hr': -3.6639}
    for suffix in ['act', 'pred']:
        model_dataset.loc[~training_mask, f'FP_P_{suffix}'] = sum(
            model_dataset.loc[~training_mask, f'{col}_{suffix}'] * w
            for col, w in pitcher_weights.items()
        )
    
    # FP - Batters
    batter_weights = {'b1': 4.3665, 'b2': 6.8271, 'b3': 10.8503, 'hr': 15.2611, 'bb':  2.8725, 'hbp': 2.9639}
    for suffix in ['act', 'pred']:
        model_dataset.loc[~training_mask, f'FP_B_{suffix}'] = sum(
            model_dataset.loc[~training_mask, f'{col}_{suffix}'] * w
            for col, w in batter_weights.items()
        )

    # wOBA (roughly)
    woba_weights = {'b1': 0.882, 'b2': 1.254, 'b3': 1.590, 'hr': 2.050, 'bb': 0.689, 'hbp': 0.720}
    for suffix in ['act', 'pred']:
        model_dataset.loc[~training_mask, f'wOBA_{suffix}'] = sum(
            model_dataset.loc[~training_mask, f'{col}_{suffix}'] * w
            for col, w in woba_weights.items()
        )
    
    # Out
    model_dataset['is_out_act'] = model_dataset['is_out'].copy()
    model_dataset.loc[~training_mask, 'is_out_pred'] = model_dataset.loc[~training_mask, ['fo_pred','go_pred','po_pred','lo_pred','so_pred']].sum(axis=1)
    

    return model_dataset

##### Summary Statistics

In [71]:
def summary_statistics(model_dataset, year, parameters, filename, model):
    print("\nFigure 1: Pitchers by Starter and Imputation Status")
    print(model_dataset[~training_mask].query(f'year == {year}').groupby(['imp_p', 'starter'])[['FP_P_pred', 'FP_P_act', 
                                                                                                'wOBA_pred', 'wOBA_act', 'so_pred', 'so_act']].mean())
    
    print("\nFigure 2: Pitchers by Imputation Status")
    print(model_dataset[~training_mask].query(f'year == {year}').groupby(['imp_p'])[['FP_P_pred', 'FP_P_act', 
                                                                                     'wOBA_pred', 'wOBA_act', 'so_pred', 'so_act']].mean())
    
    print("\nFigure 3: Batters by Imputation Status")
    print(model_dataset[~training_mask].query(f'year == {year}').groupby(['imp_b'])[['FP_B_pred', 'FP_B_act', 
                                                                                     'wOBA_pred', 'wOBA_act', 'hr_pred', 'hr_act']].mean())

    print("\nFigure 4: FP by Venue")
    print(model_dataset[~training_mask].query(f'year == {year}').groupby('venue_id')[['FP_B_pred', 'FP_B_act', 'FP_P_pred', 'FP_P_act']].mean())
    means = (model_dataset[~training_mask].query(f'year == {year}').groupby('venue_id')[['FP_B_pred', 'FP_B_act', 'FP_P_pred', 'FP_P_act']].mean())
    print(f"FP_B MSE: {np.mean((means['FP_B_pred'] - means['FP_B_act'])**2):.4f}")
    print(f"FP_P MSE: {np.mean((means['FP_P_pred'] - means['FP_P_act'])**2):.4f}")

    print("\nFigure 5: HRs by Quantile")
    model_dataset['hr_wfx_quantile'] = pd.qcut(model_dataset['hr_wfx'], q=quantiles, labels=False) + 1
    print(model_dataset[~training_mask].groupby('hr_wfx_quantile')[['hr_pred', 'hr_act']].mean())
    
    print("\nFigure 6: Stat Performance")
    # Loop over metrics
    for var in list(model.classes_) + ['is_out', 'wOBA', 'FP_B', 'FP_P']:        
        # Calculate quantiles
        model_dataset.loc[~training_mask, f'{var}_quantile'] = pd.qcut(model_dataset.loc[~training_mask, f'{var}_pred'], quantiles, labels=False, duplicates='drop')
        
        # Create aggregated dataframe
        globals()[f"{var}_df"] = model_dataset.loc[~training_mask].groupby(f'{var}_quantile')[[f'{var}_act', f'{var}_pred']].mean().reset_index()
        globals()[f"{var}_year_df"] = model_dataset.query(f'year >= {year}').loc[~training_mask].groupby(f'{var}_quantile')[[f'{var}_act', f'{var}_pred']].mean().reset_index()
        globals()[f"{var}_venue_df"] = model_dataset.query(f'venue_id == {venue}').loc[~training_mask].groupby(f'{var}_quantile')[[f'{var}_act', f'{var}_pred']].mean().reset_index()
    
    # Stat Performance - All Years
    all_df_list = []
    for var in list(model.classes_) + ['is_out', 'wOBA', 'FP_B', 'FP_P']:
        actual = model_dataset.loc[~training_mask][f'{var}_act'].mean()
        predicted = model_dataset.loc[~training_mask][f'{var}_pred'].mean()
        mult = actual/predicted
        stdev = model_dataset.loc[~training_mask][f'{var}_pred'].std()
        globals()[f"{var}_df"]['se'] = (globals()[f"{var}_df"][f'{var}_act'] - globals()[f"{var}_df"][f'{var}_pred']) ** 2
        mse = globals()[f"{var}_df"]['se'].mean()
        all_df = pd.DataFrame(["All", var, actual, predicted, mult, stdev, mse])
        all_df_list.append(all_df)
    
    all_dfs = pd.concat(all_df_list, axis=1).T
    
    # Stat Performance - Specific Year
    recent_df_list = []
    for var in list(model.classes_) + ['is_out', 'wOBA', 'FP_B', 'FP_P']:
        actual = model_dataset.query(f'year == {year}').loc[~training_mask][f'{var}_act'].mean()
        predicted = model_dataset.query(f'year == {year}').loc[~training_mask][f'{var}_pred'].mean()
        mult = actual/predicted
        stdev = model_dataset.query(f'year == {year}').loc[~training_mask][f'{var}_pred'].std()
        globals()[f"{var}_year_df"]['se'] = (globals()[f"{var}_year_df"][f'{var}_act'] - globals()[f"{var}_year_df"][f'{var}_pred']) ** 2
        mse = globals()[f"{var}_year_df"]['se'].mean()
        recent_df = pd.DataFrame([str(int(year)), var, actual, predicted, mult, stdev, mse])
        recent_df_list.append(recent_df)
    
    recent_dfs = pd.concat(recent_df_list, axis=1).T
    
    all_stat_df = pd.concat([all_dfs, recent_dfs], axis=0, ignore_index=True)
    all_stat_df.columns = ['Year', 'Output', 'Actual', 'Predicted', "Multiplier", 'Std. Dev', 'MSE']
    all_stat_df['File'] = filename
    all_stat_df['Layers'] = str(parameters['hidden_layer_sizes'])
    all_stat_df['Models'] = num_models
    all_stat_df['State'] = random_state + i
    all_stat_df[['Year', 'File', 'Layers', 'Models', 'State', 'Output', 'Actual', 'Predicted', "Multiplier", 'Std. Dev', 'MSE']]
    print(all_stat_df[['Year', 'Output', 'Actual', 'Predicted', 'Multiplier', 'Std. Dev', 'MSE']])
    

    return all_stat_df

##### Plots

In [72]:
def graph_by_quantile(graph, model):
    # Subplots
    rows, columns = 5, 3
    fig, axs = plt.subplots(rows, columns, figsize=(9, 15))
    
    # Ensure that the number of plots doesn't exceed available subplots
    total_plots = rows * columns
    output_vars = list(model.classes_) + ['is_out', 'wOBA', 'FP_B', 'FP_P']
    
    # Limit the number of plots to the available subplots
    output_vars = output_vars[:total_plots]

    # Loop over outputs, adding them as subplots
    # Note: variable dataframes are defined in summary_statistics()
    for i, var in enumerate(output_vars):
        row = i // columns  # Calculate the row index based on the iteration
        col = i % columns   # Calculate the column index based on the iteration
        axs[row, col].plot(globals()[f"{var}{graph}_df"][f'{var}_quantile'], globals()[f"{var}{graph}_df"][f'{var}_pred'], color='red')
        axs[row, col].plot(globals()[f"{var}{graph}_df"][f'{var}_quantile'], globals()[f"{var}{graph}_df"][f'{var}_act'], color='black')
        axs[row, col].set_title(var)
    
    # Add some space between subplots to prevent overlapping
    fig.tight_layout(pad=.0)
    
    # Show the figure
    plt.show()

### Model A. All - Unadjusted

##### Inputs

Batter Inputs

In [73]:
batter_input_list = batter_inputs

Pitcher Inputs

In [74]:
pitcher_input_list = pitcher_inputs

Hand Inputs

In [75]:
hand_input_list = ['p_L', 'b_L']

Imputation Inputs

In [76]:
imp_input_list = ['imp_b', 'imp_p']

Starter Input(s)

In [77]:
starter_input_list = ['starter']

Cumulative Inning Inputs

In [78]:
cumulative_inning_input_list = [col for col in complete_merged_df.columns if col.endswith("_inning")]

In [79]:
cumulative_inning_input_list.remove('rbi_inning')

Cumulative Game Inputs

In [80]:
cumulative_game_input_list = [col for col in complete_merged_df.columns if col.endswith("_game")]

In [81]:
cumulative_game_input_list.remove('rbi_game')

Game State Inputs

In [82]:
complete_merged_df['winning'] = (complete_merged_df['preBatterScore'] > complete_merged_df['prePitcherScore']).astype(int)
complete_merged_df['winning_big'] = (complete_merged_df['preBatterScore'] > complete_merged_df['prePitcherScore'] + 3).astype(int)

In [83]:
game_state_input_list = ['onFirst', 'onSecond', 'onThird', 'top', 'score_diff', 'prePitcherScore', 'preBatterScore', 'winning', 'winning_big', 'times_faced']

Inning Inputs

In [84]:
for inning in range(1, 12):
    complete_merged_df[f'inning_{inning}'] = (complete_merged_df['inning'] == inning).astype(int)
complete_merged_df['inning_11'] = (complete_merged_df['inning'] >= 11).astype(int)

In [85]:
inning_input_list = [col for col in complete_merged_df.columns if col.startswith("inning_")]

Out Inputs

In [86]:
for out in range(0, 3):
    complete_merged_df[f'outs_{out}'] = (complete_merged_df['outs_pre'] == out).astype(int)

In [87]:
out_input_list = ['outs_0', 'outs_1', 'outs_2']

Venue Inputs

In [None]:
Note: venue inputs are not preferred following integrating into WFX

In [88]:
complete_merged_df['venue_id2'] = complete_merged_df['venue_id'].copy()
complete_merged_df = pd.get_dummies(complete_merged_df, columns=['venue_id2'], prefix='venue')

MemoryError: Unable to allocate 4.70 GiB for an array with shape (365, 1728035) and data type float64

In [None]:
venue_input_list = [col for col in complete_merged_df.columns if col.startswith("venue_") and col != "venue_id" and col != "venue_name"]
venue_input_list = list(dict.fromkeys(venue_input_list))

Multiplier Inputs

In [None]:
for event in events_list:
    # Assign multiplier for their batSide
    complete_merged_df[f'{event}_wfx'] = np.where(complete_merged_df['batSide'] == "L", complete_merged_df[f'{event}_wfx_l'], 
                                                                                        complete_merged_df[f'{event}_wfx_r'])

In [None]:
multiplier_input_list = [f'{event}_wfx' for event in events_list]

Imputation and starter interactions

In [None]:
complete_merged_df['imputed_starter'] = complete_merged_df['imp_p'] * complete_merged_df['starter']
complete_merged_df['imputed_reliever'] = complete_merged_df['imp_p'] * (complete_merged_df['starter'] == 0).astype(int)
complete_merged_df['unimputed_starter'] = (complete_merged_df['imp_p'] == 0).astype(int) * complete_merged_df['starter']
complete_merged_df['unimputed_reliever'] = (complete_merged_df['imp_p'] == 0).astype(int) * (complete_merged_df['starter'] == 0).astype(int)

In [None]:
imp_starter_input_list = ['imputed_starter', 'imputed_reliever', 'unimputed_starter', 'unimputed_reliever']

Model Inputs

In [None]:
model_a_input_list = batter_input_list + pitcher_input_list + hand_input_list + imp_input_list + starter_input_list + cumulative_inning_input_list + cumulative_game_input_list + game_state_input_list + inning_input_list + out_input_list + imp_starter_input_list + batter_stats_fg + pitcher_stats_fg

In [None]:
n1 = len(model_a_input_list) + 1

Fill in missings

In [None]:
model_dataset[model_a_input_list] = model_dataset[model_a_input_list].fillna(0)

Outputs

In [None]:
output_list = ['is_out', 'eventsModel']

Other variables

In [None]:
additional_list = ['pa_b', 'pa_p', 'year', 'date', 'gamePk', 'atBatIndex', 'venue_id', 'batterName', 'pitcherName']

Variables to keep

In [None]:
keep_list = input_list + output_list + additional_list 

##### Memory

Create Model Dataset

In [None]:
model_dataset = complete_merged_df[keep_list]

model_dataset.dropna(subset=keep_list, inplace=True)
model_dataset.reset_index(drop=True, inplace=True)

Free up memory

In [None]:
del complete_merged_df, complete_dataset, steamer_hitters_df, steamer_hitters_df2, steamer_pitchers_df, steamer_pitchers_df2, multiplier_df

##### Settings

Model

In [None]:
num_classifiers = 3 # Ensemble size
num_models = 40 # Number of voting classifiers to run in loop
random_state = random.randint(10000,90000) 

all_stat_list = [] # List of dataframes with evaluation data

model_a_parameters = {
    'hidden_layer_sizes': (160,80),
    'activation': 'relu',
    'max_iter': 10,
    'alpha': 0.00001,
    'learning_rate_init': 0.001, 
    'batch_size': 1024,
    'random_state': random_state,
    # dropout = 0.1 # Need to switch to MLPDropout to use
    'early_stopping': True,
    'tol': 0.00001,
    'n_iter_no_change': 20,
    'validation_fraction': 0.05
}

Plots

In [None]:
quantiles = 10
year = 2024
venue = 19
graph = '_year' # options include '_year', '_venue', or '' (for all years and venues)

##### Train, Predict, and Evaluate

In [None]:
%%time
print(f"Ensemble Size: {num_classifiers}")
for i in range(num_models):
    # Set filename
    all_filename = f"predict_all_{''.join(str(x) for x in model_a_parameters['hidden_layer_sizes'])}_{random_state+i}_{todaysdate}.sav"
    print(f"Model {i}: {all_filename}")

    ### Train
    # Build list of MLP classifiers with varied random_state
    estimators = []
    for j in range(num_classifiers):
        # Determine random state
        model_a_parameters['random_state'] = random_state + 100 * j + i
        # Create model
        clf = SafeMLPClassifier(**model_a_parameters)
        estimators.append((f"mlp_{j}", clf))
    # Combine into a soft voting classifier
    predict_all = VotingClassifier(estimators=estimators, voting='soft', n_jobs=-1)

    # Fit
    predict_all.fit(model_dataset[training_mask][model_a_input_list], model_dataset[training_mask][['eventsModel']].values.ravel())

    # Save model
    pickle.dump(predict_all, open(os.path.join(model_path, "M03. Plate Appearances", all_filename), 'wb'))

    
    ### Predict
    all_outputs_pred = [x + "_pred" for x in list(predict_all.classes_)]
    model_dataset.loc[~training_mask, all_outputs_pred] = predict_all.predict_proba(model_dataset[~training_mask][model_a_input_list])


    ### Evaluate
    # Construct stats required for model evaluations
    model_dataset = constructed_stats(model_dataset)

    # Print summary statistics
    all_stat_df = summary_statistics(model_dataset, year, parameters=model_a_parameters, filename=all_filename, model=predict_all)

    # Add model statistics to a running dataframe list for later evaluation across models
    all_stat_list.append(all_stat_df)

    # Graph
    graph_by_quantile(graph, model=predict_all)

Pareto-Optimal Models

In [None]:
all_stat_df = pd.concat(all_stat_list, ignore_index=True)

pareto_optimal(all_stat_df.query(f'Year == "{year}"')   # Will accept local variable year and string "All"
                          .query('Output == "wOBA"')
                          .query('1.01 > Multiplier > 0.99').reset_index(drop=True), ['MSE', 'Std. Dev'], ['Minimize', 'Maximize']).sort_values('Std. Dev')

### Predict

Load model

Note: this will overwrite predict_all model from U5. Models.ipynb

In [None]:
all_filename = "predict_all_16080_56111_20251101.sav"

predict_all = pickle.load(open(os.path.join(model_path, "M03. Plate Appearances", all_filename), 'rb'))

Predict

In [None]:
all_outputs_pred = [x + "_pred1" for x in list(predict_all.classes_)]

model_dataset[all_outputs_pred] = predict_all.predict_proba(model_dataset[model_a_input_list])

### Model B. All - WFX Adjusted

##### Inputs

Calculate Predicted Rate x WFX Interactions

In [None]:
interactions_list = []

for event in events_list:
    model_dataset[f'{event}_int'] = model_dataset[f'{event}_pred1'] * model_dataset[f'{event}_wfx']
    interactions_list.append(f'{event}_int')

Model inputs

In [None]:
model_b_input_list = interactions_list + imp_starter_input_list

##### Settings

Model

In [None]:
num_classifiers = 3 # Ensemble size
num_models = 40 # Number of voting classifiers to run in loop
random_state = random.randint(10000,90000) 

all_adjusted_stat_list = [] # List of dataframes with evaluation data

model_b_parameters = {
    'hidden_layer_sizes': (16,),
    'activation': 'relu',
    'max_iter': 100,
    'alpha': 0.00001,
    'learning_rate_init': 0.001, 
    'batch_size': 1024,
    'random_state': random_state,
    # dropout = 0.1 # Need to switch to MLPDropout to use
    'early_stopping': True,
    'tol': 0.00001,
    'n_iter_no_change': 10,
    'validation_fraction': 0.05
}

Plots

In [None]:
quantiles = 10
year = 2024 
venue = 19
graph = '_year' # options include '_year', '_venue', or '' (for all years and venues)

##### Train, Predict, and Evaluate

In [None]:
%%time
print(f"Ensemble Size: {num_classifiers}")
for i in range(num_models):
    # Set filename
    all_adjusted_filename = f"predict_all_adjusted_{''.join(str(x) for x in model_b_parameters['hidden_layer_sizes'])}_{random_state+i}_{todaysdate}.sav"
    print(f"Model {i}: {all_adjusted_filename}")

    ### Train
    # Build list of MLP classifiers with varied random_state
    estimators = []
    for j in range(num_classifiers):
        # Determine random state
        model_b_parameters['random_state'] = random_state + 100 * j + i
        # Create model
        clf = SafeMLPClassifier(**model_b_parameters)
        estimators.append((f"mlp_{j}", clf))
    # Combine into a soft voting classifier
    predict_all_adjusted = VotingClassifier(estimators=estimators, voting='soft', n_jobs=-1)

    # Fit
    predict_all_adjusted.fit(model_dataset[training_mask][model_b_input_list], model_dataset[training_mask][['eventsModel']].values.ravel())

    # Save model
    pickle.dump(predict_all_adjusted, open(os.path.join(model_path, "M03. Plate Appearances", all_adjusted_filename), 'wb'))

    
    ### Predict
    all_outputs_pred = [x + "_pred" for x in list(predict_all_adjusted.classes_)]
    model_dataset.loc[~training_mask, all_outputs_pred] = predict_all_adjusted.predict_proba(model_dataset[~training_mask][model_b_input_list])


    ### Evaluate
    # Construct stats required for model evaluations
    model_dataset = constructed_stats(model_dataset)

    # Print summary statistics
    all_stat_df = summary_statistics(model_dataset, year, parameters=model_b_parameters, filename=all_adjusted_filename, model=predict_all_adjusted)

    # Add model statistics to a running dataframe list for later evaluation across models
    all_adjusted_stat_list.append(all_stat_df)

    # Graph
    graph_by_quantile(graph, model=predict_all_adjusted)

Pareto-Optimal Models

In [None]:
all_adjusted_stat_df = pd.concat(all_adjusted_stat_list, ignore_index=True)

pareto_optimal(all_adjusted_stat_df.query(f'Year == "{year}"')   # Will accept local variable year and string "All"
                                   .query('Output == "wOBA"')
                                   .query('1.01 > Multiplier > 0.99').reset_index(drop=True), ['MSE', 'Std. Dev'], ['Minimize', 'Maximize']).sort_values('Std. Dev')