# M03. Predict PAs
- This predicts the outcome of plate appearances
- Type: Model
- Run Frequency: Irregular
- Sources:
    - MLB API
    - Steamer
- Dates:
    - Created: 4/19/2024
    - Updated: 4/21/2024

### Imports

In [None]:
%run "U1. Imports.ipynb"
%run "U2. Utilities.ipynb"
%run "U3. Classes.ipynb"
%run "U4. Datasets.ipynb"
%run "U5. Models.ipynb"

In [None]:
# Render floats in fixed-point notation with six decimals instead of scientific notation
pd.options.display.float_format = '{:.6f}'.format

### Data

##### Park x Weather Factors

In [None]:
# Load the park x weather multiplier table from the baseball data directory
multiplier_path = os.path.join(baseball_path, "Multiplier Dataset.csv")
multiplier_df = pd.read_csv(multiplier_path)

##### Stats API and Statcast Data

Create dataset

In [None]:
# %%time
# complete_dataset = create_pa_inputs(multiplier_df, 2015, 2024, short=50, long=300, adjust=True)

Write to CSV

In [None]:
# complete_dataset.to_csv(os.path.join(baseball_path, "nn_dataset.csv"), index=False)

Read CSV

In [None]:
# Read the previously saved plate-appearance input dataset
nn_dataset_path = os.path.join(baseball_path, "nn_dataset.csv")
complete_dataset = pd.read_csv(nn_dataset_path)

Scale

In [None]:
%%time
complete_dataset[batter_inputs] = scale_batter_stats.transform(complete_dataset[batter_inputs])
complete_dataset[pitcher_inputs] = scale_pitcher_stats.transform(complete_dataset[pitcher_inputs])

##### Steamer

Read in hitters

In [None]:
# Load the weekly Steamer hitter log (read with the ISO-8859-1 encoding)
steamer_hitters_path = os.path.join(baseball_path, "A03. Steamer", "steamer_hitters_weekly_log.csv")
steamer_hitters_df = pd.read_csv(steamer_hitters_path, encoding='iso-8859-1')

Clean

In [None]:
# Clean the raw Steamer hitter log and drop rows missing any of the required stats.
steamer_hitters_df2 = clean_steamer_hitters(steamer_hitters_df)
steamer_hitters_df2.dropna(subset=batter_stats_fg, inplace=True)
# Scaling is deliberately NOT done here: the "Scale" cell that follows applies
# scale_batter_stats_steamer. Transforming in both cells scaled the hitter stats twice,
# whereas the pitcher stats are cleaned and scaled in separate cells, exactly once.

Scale

In [None]:
# Scale the Steamer hitter stats with the Steamer-specific batter scaler
scaled_hitter_stats = scale_batter_stats_steamer.transform(steamer_hitters_df2[batter_stats_fg])
steamer_hitters_df2[batter_stats_fg] = scaled_hitter_stats

Read in pitchers

In [None]:
# Load the weekly Steamer pitcher log (read with the ISO-8859-1 encoding)
steamer_pitchers_path = os.path.join(baseball_path, "A03. Steamer", "steamer_pitchers_weekly_log.csv")
steamer_pitchers_df = pd.read_csv(steamer_pitchers_path, encoding='iso-8859-1')

Clean

In [None]:
# Clean the raw Steamer pitcher log, then drop rows missing any required stat.
steamer_pitchers_df2 = clean_steamer_pitchers(steamer_pitchers_df)
# NOTE(review): rows are dropped on `pitcher_stats_fg2`, while the scaling cell below uses
# `pitcher_stats_fg` — confirm the two different column lists are intended.
steamer_pitchers_df2.dropna(subset=pitcher_stats_fg2, inplace=True)

Scale

In [None]:
# Scale the Steamer pitcher stats with the Steamer-specific pitcher scaler
scaled_pitcher_stats = scale_pitcher_stats_steamer.transform(steamer_pitchers_df2[pitcher_stats_fg])
steamer_pitchers_df2[pitcher_stats_fg] = scaled_pitcher_stats

##### Merge

Format dates

In [None]:
# Parse each frame's YYYYMMDD-formatted `date` column into a datetime merge key
for frame in (complete_dataset, steamer_hitters_df2, steamer_pitchers_df2):
    frame['date_time'] = pd.to_datetime(frame['date'], format='%Y%m%d')

# Keep a second copy of the plate-appearance timestamp (restored after the first merge)
complete_dataset['date_time_copy'] = complete_dataset['date_time'].copy()

# Rename the Steamer id column to the player key used by the plate-appearance data
steamer_hitters_df2.rename(columns={'mlbamid': 'batter'}, inplace=True)
steamer_pitchers_df2.rename(columns={'mlbamid': 'pitcher'}, inplace=True)

Sort to prep for merge

In [None]:
# pd.merge_asof requires both inputs to be sorted on the `on` key
for frame in (complete_dataset, steamer_hitters_df2, steamer_pitchers_df2):
    frame.sort_values(by='date_time', inplace=True)

Drop unnecessary columns

In [None]:
# Columns not needed once the Steamer frames carry `date_time` and the player key
steamer_drop_cols = ['date', 'firstname', 'lastname', 'steamerid']
for steamer_frame in (steamer_hitters_df2, steamer_pitchers_df2):
    steamer_frame.drop(columns=steamer_drop_cols, inplace=True)

Remove missing pitchers (occurs occasionally in 2014)

In [None]:
# Keep only Steamer pitcher rows that have a pitcher id, then renumber the index
has_pitcher_id = steamer_pitchers_df2['pitcher'].notna()
steamer_pitchers_df2 = steamer_pitchers_df2[has_pitcher_id].reset_index(drop=True)

Set data types

In [None]:
# Cast every player-id column to int and then to str so the merge keys share one dtype
for frame, id_col in (
    (complete_dataset, 'batter'),
    (complete_dataset, 'pitcher'),
    (steamer_hitters_df2, 'batter'),
    (steamer_pitchers_df2, 'pitcher'),
):
    frame[id_col] = frame[id_col].astype(int).astype(str)

Merge asof most recent date in Steamer

In [None]:
# Shared settings: match each plate appearance to the most recent Steamer row
# dated on or before it ('backward'; alternatives are 'forward' and 'nearest').
asof_settings = {'on': 'date_time', 'direction': 'backward'}

# Attach hitter projections, matching within each batter
complete_merged_df = pd.merge_asof(complete_dataset, steamer_hitters_df2, by='batter', **asof_settings)

# Restore the merge key from the saved copy (kept as a safeguard; may be unnecessary)
complete_merged_df['date_time'] = complete_merged_df['date_time_copy'].copy()

# Attach pitcher projections, matching within each pitcher
complete_merged_df = pd.merge_asof(complete_merged_df, steamer_pitchers_df2, by='pitcher', **asof_settings)

##### Impute

For players with insufficient sample sizes, stats are imputed

Option 1: Steamer

First, remove from dataset if ever missing FG/Steamer stats

In [None]:
# Keep rows where both `b1_rate` and `H9` are present
# (presumably one hitter-side and one pitcher-side Steamer column — confirm).
has_b1_rate = complete_merged_df['b1_rate'].notna()
has_h9 = complete_merged_df['H9'].notna()
complete_merged_df = complete_merged_df[has_b1_rate & has_h9]

In [None]:
# Players with fewer plate appearances than this get their inputs imputed
min_pa_sample = 40

# Add hands to use in imputation
batter_stats_fg_imp = batter_stats_fg + ['b_L', 'p_L']
pitcher_stats_fg_imp = pitcher_stats_fg + ['b_L', 'p_L']

### Batters
# Rows whose batter sample is too small (computed once and reused for read and write)
low_sample_batters = complete_merged_df['pa_b'] < min_pa_sample
# Always bound so later cleanup (`del batter_predictions`) works even when nothing is imputed
batter_predictions = None
if low_sample_batters.any():
    # Use Steamer stats to predict API/Statcast stats for those with limited samples
    batter_predictions = impute_batter_stats.predict(complete_merged_df.loc[low_sample_batters, batter_stats_fg_imp])
    # Impute inputs with limited sample size with predicted values
    complete_merged_df.loc[low_sample_batters, batter_inputs] = batter_predictions

### Pitchers
low_sample_pitchers = complete_merged_df['pa_p'] < min_pa_sample
pitcher_predictions = None
if low_sample_pitchers.any():
    # Use Steamer stats to predict API/Statcast stats for those with limited samples
    pitcher_predictions = impute_pitcher_stats.predict(complete_merged_df.loc[low_sample_pitchers, pitcher_stats_fg_imp])
    # Impute inputs with limited sample size with predicted values
    complete_merged_df.loc[low_sample_pitchers, pitcher_inputs] = pitcher_predictions

Option 2: 0s

In [None]:
# # Testing instead of imputing, just weighting with 0s
# complete_merged_df[batter_inputs].fillna(0, inplace=True)
# complete_merged_df[pitcher_inputs].fillna(0, inplace=True)

# # Calculate the weighted average for each column in pitcher_stats
# # Could be simplified, but I wanted to show the steps
# # Weighted average of provided value and 0. PAs and 50-PAs are weights. 
# for col in batter_inputs:
#     complete_merged_df[col] = (complete_merged_df[col] * complete_merged_df['pa_b'] + 0 * (50-complete_merged_df['pa_b']))/50

# # Calculate the weighted average for each column in pitcher_stats
# for col in pitcher_inputs:
#     complete_merged_df[col] = (complete_merged_df[col] * complete_merged_df['pa_p'] + 0 * (50-complete_merged_df['pa_p']))/50

### Select Data

Drop early observations

In [None]:
# Keep only plate appearances with a game_date after the cutoff (compared as given, against a string)
early_cutoff_date = '2015-07-01'
complete_merged_df = complete_merged_df.loc[complete_merged_df['game_date'] > early_cutoff_date]

Drop atypical events

In [None]:
# Remove plate appearances whose modelled event is labelled "Cut"
is_modelled_event = complete_merged_df['eventsModel'] != "Cut"
complete_merged_df = complete_merged_df[is_modelled_event]

Drop observations from inactive parks

In [None]:
# Venue ids listed in team_map are treated as the active parks
active_parks = list(team_map['VENUE_ID'].astype(int))
in_active_park = complete_merged_df['venue_id'].astype(int).isin(active_parks)
complete_merged_df = complete_merged_df[in_active_park]

Venue dummies

In [None]:
# One-hot encode the venue via a temporary duplicate column, so `venue_id` itself
# survives; get_dummies replaces `venue_id2` with `venue_<id>` columns.
complete_merged_df = pd.get_dummies(
    complete_merged_df.assign(venue_id2=complete_merged_df['venue_id']),
    columns=['venue_id2'],
    prefix='venue',
)

In [None]:
# All `venue_*` columns except the raw `venue_id`
venue_dummy_list = [
    name
    for name in complete_merged_df.columns
    if name.startswith("venue_") and name != "venue_id"
]

Park x weather multiplier

In [None]:
# The batter-side test does not depend on the event, so evaluate it once
bats_left = complete_merged_df['batSide'] == "L"
for event in events_list:
    # Pick the left- or right-handed multiplier according to the batter's side
    left_fx = complete_merged_df[f'{event}_wfx_l']
    right_fx = complete_merged_df[f'{event}_wfx_r']
    complete_merged_df[f'{event}_wfx'] = np.where(bats_left, left_fx, right_fx)

In [None]:
# Names of the per-event multiplier columns created above
multiplier_inputs = list(map('{}_wfx'.format, events_list))

Leading dummies

In [None]:
# Score-state flags from the batting team's point of view
batter_score = complete_merged_df['preBatterScore']
pitcher_score = complete_merged_df['prePitcherScore']
# 1 when the batting team leads
complete_merged_df['winning'] = batter_score.gt(pitcher_score).astype(int)
# 1 when the batting team leads by more than 3 runs
complete_merged_df['winning_big'] = batter_score.gt(pitcher_score + 3).astype(int)

Inning dummies

In [None]:
# Innings 1-10 each get their own indicator
for inning_number in range(1, 11):
    complete_merged_df[f'inning_{inning_number}'] = complete_merged_df['inning'].eq(inning_number).astype(int)
# The last indicator pools the 11th inning and everything after it
complete_merged_df['inning_11'] = complete_merged_df['inning'].ge(11).astype(int)

In [None]:
# Collect the inning indicator columns by their shared prefix
inning_dummy_list = [
    name
    for name in complete_merged_df.columns
    if name.startswith("inning_")
]

Out dummies

In [None]:
# complete_merged_df[['description', 'outs', 'outs_pre', 'outs_pa', 'outs_pa_inning', 'outs_0', 'outs_1', 'outs_2']].head()

In [None]:
# One indicator per number of outs before the plate appearance (0, 1 or 2)
for outs_before in (0, 1, 2):
    complete_merged_df[f'outs_{outs_before}'] = complete_merged_df['outs_pre'].eq(outs_before).astype(int)

In [None]:
# Names of the three out-count indicator columns
out_dummy_list = [f'outs_{n_outs}' for n_outs in range(3)]

Cumulative variables

In [None]:
# Within-inning cumulative columns are identified by their `_inning` suffix
cumulative_inning_list = [
    name
    for name in complete_merged_df.columns
    if name.endswith("_inning")
]

In [None]:
# Exclude the cumulative RBI column from the inning inputs.
# Guarded so that re-running this cell does not raise ValueError once the entry is gone.
if 'rbi_inning' in cumulative_inning_list:
    cumulative_inning_list.remove('rbi_inning')

In [None]:
# Within-game cumulative columns are identified by their `_game` suffix
cumulative_game_list = [
    name
    for name in complete_merged_df.columns
    if name.endswith("_game")
]

In [None]:
# Exclude the cumulative RBI column from the game inputs.
# Guarded so that re-running this cell does not raise ValueError once the entry is gone.
if 'rbi_game' in cumulative_game_list:
    cumulative_game_list.remove('rbi_game')

All Test Inputs

In [None]:
# Extra game-state inputs: score state, cumulative sums, and the venue/inning/out dummies
score_state_inputs = ['prePitcherScore', 'preBatterScore', 'winning', 'winning_big', 'times_faced']
test_inputs = (
    score_state_inputs
    + cumulative_inning_list
    + cumulative_game_list
    + venue_dummy_list
    + inning_dummy_list
    + out_dummy_list
)

### Shift

Many batter and pitcher stats are calculated at the end of the plate appearance. For prediction purposes, we need these stats coming into the plate appearance.

##### Batter Inputs

Sort

In [None]:
# Chronological order (date, game, at-bat index) so the shift below takes the previous PA
pa_order = ['date', 'gamePk', 'atBatIndex']
complete_merged_df.sort_values(by=pa_order, inplace=True)

Shift

In [None]:
# Replace each batter stat with its value from the batter's previous PA against the same pitcher hand
batter_shift_cols = batter_inputs + ['ab_b', 'pa_b', 'imp_b']
batter_by_pitch_hand = complete_merged_df.groupby(['batter', 'pitchHand'])
complete_merged_df[batter_shift_cols] = batter_by_pitch_hand[batter_shift_cols].shift(1)

##### Pitcher Inputs

Sort

In [None]:
# Chronological order (date, game, at-bat index) so the shift below takes the previous PA
pa_order = ['date', 'gamePk', 'atBatIndex']
complete_merged_df.sort_values(by=pa_order, inplace=True)

Shift

In [None]:
# Replace each pitcher stat with its value from the pitcher's previous PA against the same batter side
pitcher_shift_cols = pitcher_inputs + ['ab_p', 'pa_p', 'imp_p']
pitcher_by_bat_side = complete_merged_df.groupby(['pitcher', 'batSide'])
complete_merged_df[pitcher_shift_cols] = pitcher_by_bat_side[pitcher_shift_cols].shift(1)

##### Inning Sums

Sort

In [None]:
# Chronological order (date, game, at-bat index) so the shift below takes the previous PA
pa_order = ['date', 'gamePk', 'atBatIndex']
complete_merged_df.sort_values(by=pa_order, inplace=True)

Shift

In [None]:
# Shift the inning totals back one PA within (game, inning, pitcher);
# the first PA of each group has no predecessor and is filled with 0.
pitcher_inning_groups = complete_merged_df.groupby(['gamePk', 'inning', 'pitcher'])
shifted_inning_sums = pitcher_inning_groups[cumulative_inning_list].shift(1)
complete_merged_df[cumulative_inning_list] = shifted_inning_sums.fillna(0)

##### Game Sums

Shift

In [None]:
# Shift the game totals (and times_faced) back one PA within (game, pitcher)
game_shift_cols = cumulative_game_list + ['times_faced']
pitcher_game_groups = complete_merged_df.groupby(['gamePk', 'pitcher'])
complete_merged_df[game_shift_cols] = pitcher_game_groups[game_shift_cols].shift(1)

Fill missing values with 0

In [None]:
# Missing game totals (e.g. the first PA of each shifted group) become 0
game_fill_cols = cumulative_game_list + ['times_faced']
complete_merged_df[game_fill_cols] = complete_merged_df[game_fill_cols].fillna(0)

In [None]:
# complete_merged_df.query('date == 20240403').query('halfInning == "top"')[['gamePk', 'inning', 'inning_1', 'inning_2', 'outs_0', 'outs_1', 'outs_2', 'pitcherName', 'batterName', 'eventsModel', 'times_faced', 'faced_inning', 'faced_game', 'preBatterScore']].head(50)

### Select Variables

In [None]:
# Full list of model input columns, in the order they are fed to the networks
pa_inputs2 = (
    batter_inputs
    + pitcher_inputs
    + hand_inputs
    + game_state_inputs
    + imp_inputs
    + starter_inputs
    + multiplier_inputs
    + test_inputs
)
# Inputs plus the sample sizes, identifiers and targets kept for evaluation
identifier_cols = ['pa_b', 'pa_p', 'year', 'venue_id', 'is_out', 'eventsModel', 'batterName', 'pitcherName']
keep_list = pa_inputs2 + identifier_cols

In [None]:
# Build the modelling frame as an independent object: keep the relevant variables,
# drop rows missing any model input, and renumber the index.
# Chaining non-inplace calls avoids mutating a slice of complete_merged_df in place,
# which is what triggers pandas' SettingWithCopyWarning here and on later column assignments.
model_dataset = (
    complete_merged_df[keep_list]
    .dropna(subset=pa_inputs2)
    .reset_index(drop=True)
)

### Train/Test Split

Split

In [None]:
# Fixed seed so the split is reproducible
np.random.seed(42)
# Drawing uniformly from [0, 0, 1] assigns roughly 2/3 of rows to training (0) and 1/3 to testing (1)
split_labels = np.random.choice([0, 0, 1], size=len(model_dataset))
model_dataset['split'] = split_labels

Create masks to identify training and testing datasets

Note: to train on the entire dataset, you can simply set split = 0 for the entire dataset

In [None]:
# Row masks for the training (split == 0) and testing (split == 1) sets
training_mask = (model_dataset['split'] == 0)
testing_mask = (model_dataset['split'] == 1)

# Outcome-specific masks. The "Outs" and "Safe" sections below reference these four names,
# so they must be defined here; leaving them commented out makes those cells raise NameError.
training_out_mask = (model_dataset['is_out'] == 1) & training_mask
testing_out_mask = (model_dataset['is_out'] == 1) & testing_mask

training_safe_mask = (model_dataset['is_out'] == 0) & training_mask
testing_safe_mask = (model_dataset['is_out'] == 0) & testing_mask

Free up memory

In [None]:
# Release the intermediate frames now that model_dataset has been built
del complete_merged_df, complete_dataset
del steamer_hitters_df, steamer_hitters_df2, steamer_pitchers_df, steamer_pitchers_df2, multiplier_df
del batter_predictions, pitcher_predictions

### Outs vs. Safe

In [None]:
# Number of input columns passed to the models
len(pa_inputs2)

In [None]:
# Train a single model
def train_model(i, training_dataset, pa_inputs2, target, layers, activation, alpha, learning_rate, early_stopping, random_state, iters, batch_size):
    """Fit one MLPClassifier on the training data and return the fitted model.

    Args:
        i: Index of the model; added to ``random_state`` so each model gets its own seed.
        training_dataset: DataFrame holding the input columns and the target column.
        pa_inputs2: List of input column names.
        target: Name of the target column.
        layers: Passed to MLPClassifier as ``hidden_layer_sizes``.
        activation: Passed to MLPClassifier as ``activation``.
        alpha: Passed to MLPClassifier as ``alpha``.
        learning_rate: Passed to MLPClassifier as ``learning_rate_init``.
        early_stopping: Passed to MLPClassifier as ``early_stopping``.
        random_state: Base seed; the model uses ``random_state + i``.
        iters: Passed to MLPClassifier as ``max_iter``.
        batch_size: Passed to MLPClassifier as ``batch_size``.

    Returns:
        The fitted MLPClassifier.
    """
    features = training_dataset[pa_inputs2]
    # Flatten the target to the 1-D array handed to fit
    labels = training_dataset[target].values.ravel()

    classifier = MLPClassifier(
        hidden_layer_sizes=layers,
        activation=activation,
        alpha=alpha,
        learning_rate_init=learning_rate,
        batch_size=batch_size,
        max_iter=iters,
        early_stopping=early_stopping,
        random_state=random_state + i,
        verbose=False,
    )
    classifier.fit(features, labels)
    return classifier

In [None]:
# Neural Network Layers
layers = (196,196,196)

# Layer sizes concatenated without a separator; used in the saved model's file name
layers_str = ''.join(map(str, layers))
binary_filename = f"predict_binary_{layers_str}_{todaysdate}.sav"
print(binary_filename)

# Training settings shared by every model in the loop below
activation = 'relu'  # Other models have been deemed worse than relu
iters = 100
learning_rate = 0.00001
alpha = 0.0001
early_stopping = True
random_state = 100
batch_size = 'auto'  # alternative tried: 32
number_of_models = 16

In [None]:
##### %%time
# Performance stats and fitted models, one entry per trained model
model_stats = []
model_list = []

# Number of quantile bins for the calibration check. The column is still named
# `model_{i}_decile` for compatibility, although 20 bins are used.
n_bins = 20

# Loop-invariant data: the training rows and the test inputs never change between
# models, so slice them once instead of re-copying the dataset on every iteration.
training_rows = model_dataset[training_mask]
testing_features = model_dataset.loc[testing_mask, pa_inputs2].astype(float)

for i in range(number_of_models):
    safe_col = f'model_{i}_is_safe_pred'
    pred_col = f'model_{i}_is_out_pred'
    bin_col = f'model_{i}_decile'

    # Train the model
    model = train_model(i, training_rows, pa_inputs2, 'is_out', layers, activation, alpha, learning_rate, early_stopping, random_state, iters, batch_size)
    model_list.append(model)

    # Predict probabilities. predict_proba columns follow model.classes_;
    # storing them as (safe, out) assumes the classes are ordered [0, 1].
    model_dataset.loc[testing_mask, [safe_col, pred_col]] = model.predict_proba(testing_features)

    # Assign each test row to a quantile bin of its predicted out probability
    model_dataset.loc[testing_mask, bin_col] = pd.qcut(model_dataset.loc[testing_mask, pred_col], n_bins, labels=False)

    # Mean predicted and actual out rate per bin
    df = model_dataset.loc[testing_mask, [bin_col, pred_col, 'is_out']].groupby(bin_col)[[pred_col, 'is_out']].mean().reset_index()

    # Calculate projected mean (mean of the bin means)
    pred_mean = df[pred_col].mean()

    # Calculate actual mean (mean of the bin means)
    act_mean = df['is_out'].mean()

    # Calculate mean squared error between predicted and actual bin means
    df['error_sq'] = (df[pred_col] - df['is_out']) ** 2
    mse = np.mean(df['error_sq'])

    # Total absolute deviation of the bin predictions from their mean
    # (a measure of how spread out the predictions are)
    df['diff_mean'] = abs(df[pred_col] - pred_mean)
    diff_mean = df['diff_mean'].sum()

    # Append model stats
    model_stats.append({'model': f'model_{i}', 'index': i, 'mean': act_mean, 'predicted': pred_mean, 'mse': mse, 'diff_mean': diff_mean})

    # Graph results
    plt.figure(figsize=(6, 6))  # Make the figure square
    plt.plot(df[pred_col], df['is_out'], marker='o', color='black', label='is_out')
    plt.plot(df[pred_col], df[pred_col], linestyle='--', color='red', label='Ideal')

    # Set limits for both axes
    plt.xlim(.6, .75)
    plt.ylim(.6, .75)

    # plt.xlim(0, 1)
    # plt.ylim(0, 1)

    # Make the aspect ratio equal
    plt.gca().set_aspect('equal', adjustable='box')

    plt.xlabel('Predicted Probability of Out')
    plt.ylabel('Actual Probability of Out')
    plt.title(f'Model {i}: Decile Analysis')
    plt.legend()
    plt.grid()
    plt.show()

    # Print out stats
    print(f"model_{i}", f"index: {i}", f"mean: {act_mean}", f"predicted: {pred_mean}", f"MSE: {mse}", f"Integral: {diff_mean}")
    print("\n")

In [None]:
# Identify Pareto-optimal models: lower `mse` is better, higher `diff_mean` is better
def _dominates(other, current):
    """Return True when `other` is no worse on both metrics and strictly better on at least one."""
    no_worse = other['mse'] <= current['mse'] and other['diff_mean'] >= current['diff_mean']
    strictly_better = other['mse'] < current['mse'] or other['diff_mean'] > current['diff_mean']
    return no_worse and strictly_better


# A model is Pareto-optimal when no other model dominates it
pareto_optimal = [
    candidate
    for candidate in model_stats
    if not any(_dominates(rival, candidate) for rival in model_stats)
]

# Output Pareto-optimal models
print("\nPareto-optimal models:")
for stats in pareto_optimal:
    print(f"Model: {stats['model']}, Mean: {stats['mean']}, Predicted: {stats['predicted']}, Multiplier : {stats['mean'] / stats['predicted']}, MSE: {stats['mse']}, Diff Mean: {stats['diff_mean']:.4f}")

In [None]:
# 10 relu
# Average spread (diff_mean) and calibration error (mse) across all trained models
diff_means = [stats['diff_mean'] for stats in model_stats]
mses = [stats['mse'] for stats in model_stats]

np.mean(diff_means), np.mean(mses)

In [None]:
# Save the first trained model. The context manager guarantees the file is flushed and
# closed; a bare open(...) passed to pickle.dump is never closed explicitly.
# NOTE(review): model_list[0] is saved regardless of the Pareto analysis above — confirm intended.
binary_model_path = os.path.join(model_path, "M03. Plate Appearances", binary_filename)
with open(binary_model_path, 'wb') as model_file:
    pickle.dump(model_list[0], model_file)
binary_filename

In [None]:
# Actual vs. predicted out rate on 2024 test rows, using the model with index 14
test_rows_2024 = testing_mask & (model_dataset['year'] == 2024)
model_dataset.loc[test_rows_2024, ['is_out', 'model_14_is_out_pred']].mean()

### Outs

In [None]:
%%time
# Neural network layerss
layers = (10,)
# To string
layers_str = ''.join(str(x) for x in layers)
# Activation method
activation = 'relu'
# Iterations
iters = 100

outs_filename = f"predict_outs_{layers_str}_{todaysdate}.sav"
print(outs_filename)

# Define the individual models in the ensemble
models = [
    MLPClassifier(hidden_layer_sizes=layers, activation='relu', random_state=1, early_stopping=True, learning_rate_init=0.00001, alpha=0.00001, max_iter=iters),
    MLPClassifier(hidden_layer_sizes=layers, activation='relu', random_state=2, early_stopping=True, learning_rate_init=0.00001, alpha=0.00001, max_iter=iters),
    MLPClassifier(hidden_layer_sizes=layers, activation='relu', random_state=3, early_stopping=True, learning_rate_init=0.00001, alpha=0.00001, max_iter=iters),

]

# Create the ensemble classifier using VotingClassifier
predict_outs = VotingClassifier(estimators=[('model'+str(i+1), model) for i, model in enumerate(models)], voting='soft', n_jobs=-2).fit(model_dataset[training_out_mask].reset_index(drop=True)[pa_inputs2], model_dataset[training_out_mask][['eventsModel']].values.ravel())

# Save model
pickle.dump(predict_outs, open(os.path.join(model_path, "M03. Plate Appearances", outs_filename), 'wb'))

In [None]:
# Predict out types: one `<event>_pred` column per class learned by the ensemble
outs_outputs = list(predict_outs.classes_)
outs_outputs_pred = [label + "_pred" for label in outs_outputs]

testing_out_features = model_dataset.loc[testing_out_mask, pa_inputs2]
model_dataset.loc[testing_out_mask, outs_outputs_pred] = predict_outs.predict_proba(testing_out_features)

In [None]:
# Build a per-decile calibration table for every out type
for var in outs_outputs:
    act_col, pred_col, decile_col = f'{var}_act', f'{var}_pred', f'{var}_decile'

    # Actual outcome indicator: 1 when the PA ended in this event
    model_dataset.loc[testing_out_mask, act_col] = model_dataset.loc[testing_out_mask, 'eventsModel'].eq(var).astype(int)

    # Decile (0-9) of the predicted probability
    model_dataset.loc[testing_out_mask, decile_col] = pd.qcut(model_dataset.loc[testing_out_mask, pred_col], 10, labels=False)

    # Mean actual and predicted rate per decile, published as the global `<event>_df`
    # because the plotting cell looks the tables up by that name.
    decile_means = model_dataset.loc[testing_out_mask].groupby(decile_col)[[act_col, pred_col]].mean().reset_index()
    globals()[f"{var}_df"] = decile_means
    # Alternative slices explored previously: only year >= 2022, or a single venue (venue_id == 19)

In [None]:
# Create figures: a 2x3 grid, one panel per out type
fig, axs = plt.subplots(2, 3, figsize=(12, 8))

for i, var in enumerate(outs_outputs):
    # divmod gives the (row, col) position of the i-th panel in a 3-column grid
    panel = axs[divmod(i, 3)]
    decile_table = globals()[f"{var}_df"]
    # Predicted rate in red, actual rate in black
    panel.plot(decile_table[f'{var}_decile'], decile_table[f'{var}_pred'], color='red')
    panel.plot(decile_table[f'{var}_decile'], decile_table[f'{var}_act'], color='black')
    panel.set_title(var)
    # panel.set_ylim(0,0.35)

# Lay the subplots out with zero padding
fig.tight_layout(pad=.0)

# Show the figure
plt.show()

### Safe

In [None]:
%%time
# Neural network layers
# layers = (174,174,174,174)
# layers = (10,)
# layers = (198,198,198,198,198,198)
layers = (196,196,196,196,196)
# To string
layers_str = ''.join(str(x) for x in layers)
# Activation method
activation = 'relu'
# Iterations
iters = 100
# Alpha
alpha = 0.0001
# Learning Rate
learning_rate = 0.00001
# Batch Size
batch_size='auto'
# batch_size=32
# Random state
random_state = 10
# Number of models
num_models = 3

safe_filename = f"predict_safe_{layers_str}_{todaysdate}.sav"
print(safe_filename)



# Define the individual models in the ensemble
models = [
    MLPClassifier(hidden_layer_sizes=layers, activation=activation, verbose=True, alpha=alpha, learning_rate_init=learning_rate, early_stopping=True, random_state=random_state+i, max_iter=iters, batch_size=batch_size)
    for i in range(num_models)
]

# Create the ensemble classifier using VotingClassifier
predict_safe = VotingClassifier(estimators=[('model'+str(i+1), model) for i, model in enumerate(models)], voting='soft', n_jobs=-2).fit(model_dataset[training_safe_mask][pa_inputs2], model_dataset[training_safe_mask][['eventsModel']].values.ravel())

# Save model
pickle.dump(predict_safe, open(os.path.join(model_path, "M03. Plate Appearances", safe_filename), 'wb'))

In [None]:
# Predict safe types: one `<event>_pred` column per class learned by the ensemble
safe_outputs = list(predict_safe.classes_)
safe_outputs_pred = [label + "_pred" for label in safe_outputs]

testing_safe_features = model_dataset.loc[testing_safe_mask, pa_inputs2]
model_dataset.loc[testing_safe_mask, safe_outputs_pred] = predict_safe.predict_proba(testing_safe_features)

In [None]:
# Build a per-decile calibration table for every safe (non-out) event
for var in safe_outputs:
    # Create actual outcome column: 1 when the PA ended in this event
    model_dataset.loc[testing_safe_mask, f'{var}_act'] = (model_dataset.loc[testing_safe_mask, 'eventsModel'] == var).astype(int)

    # Create deciles (0-9) of the predicted probability
    model_dataset.loc[testing_safe_mask, f'{var}_decile'] = pd.qcut(model_dataset.loc[testing_safe_mask, f'{var}_pred'], 10, labels=False)

    # Create aggregated dataframe, restricted to seasons 2022 and later.
    # An all-years table used to be computed first and was immediately overwritten
    # by this one; that dead computation has been removed (the result is unchanged).
    # NOTE(review): the Outs section aggregates over all years — confirm which is intended.
    globals()[f"{var}_df"] = model_dataset.query('year >= 2022').loc[testing_safe_mask].groupby(f'{var}_decile')[[f'{var}_act', f'{var}_pred']].mean().reset_index()
    # All-years alternative:
    # globals()[f"{var}_df"] = model_dataset.loc[testing_safe_mask].groupby(f'{var}_decile')[[f'{var}_act', f'{var}_pred']].mean().reset_index()
    # Single-venue alternative:
    # globals()[f"{var}_df"] = model_dataset.query('venue_id == 19').loc[testing_safe_mask].groupby(f'{var}_decile')[[f'{var}_act', f'{var}_pred']].mean().reset_index()

In [None]:
# Create figures: a 2x3 grid, one panel per safe event
fig, axs = plt.subplots(2, 3, figsize=(12, 8))

for i, var in enumerate(safe_outputs):
    # divmod gives the (row, col) position of the i-th panel in a 3-column grid
    panel = axs[divmod(i, 3)]
    decile_table = globals()[f"{var}_df"]
    # Actual rate in black, predicted rate in red
    panel.plot(decile_table[f'{var}_decile'], decile_table[f'{var}_act'], color='black')
    panel.plot(decile_table[f'{var}_decile'], decile_table[f'{var}_pred'], color='red')
    panel.set_title(var)
    # panel.set_ylim(decile_table[f'{var}_act'].min(), decile_table[f'{var}_act'].max())

# Lay the subplots out with zero padding
fig.tight_layout(pad=.0)

# Show the figure
plt.show()

### All

##### Voting

In [None]:
%%time
# Neural network layers
# layers = (196,196,196,196,196,196)
layers = (10,)
# To string
layers_str = ''.join(str(x) for x in layers)
# Activation method
activation = 'relu'
# Iterations
iters = 1000
# Alpha
alpha = 0.0001
# Learning Rate
learning_rate = 0.00001
# Batch Size
batch_size='auto'
# batch_size=32
# Random state
random_state = 1000000000
# Number of models
num_models = 1

all_filename = f"predict_all_{layers_str}_{todaysdate}.sav"
print(all_filename)


# Define the individual models in the ensemble
models = [
    MLPClassifier(hidden_layer_sizes=layers, activation=activation, verbose=True, alpha=alpha, learning_rate_init=learning_rate, early_stopping=True, 
                  random_state=random_state+i, max_iter=iters, batch_size=batch_size)
    for i in range(num_models)]

# Create the ensemble classifier using VotingClassifier
predict_all = VotingClassifier(estimators=[('model'+str(i+1), model) for i, model in enumerate(models)], voting='soft', n_jobs=-2).fit(model_dataset[training_mask][pa_inputs2], model_dataset[training_mask][['eventsModel']].values.ravel())

# Save model
pickle.dump(predict_all, open(os.path.join(model_path, "M03. Plate Appearances", all_filename), 'wb'))

##### Stacked

In [None]:
%%time
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Neural network layers
# layers = (196,196,196,196,196,196)
layers = (196,)
layers = (10,)
# To string
layers_str = ''.join(str(x) for x in layers)
# Activation method
activation = 'relu'
# Iterations
iters = 1000
# Alpha
alpha = 0.0001
# Learning Rate
learning_rate = 0.00001
# Batch Size
batch_size='auto'
# batch_size=32
# Random state
random_state = 1000
# Number of models
num_models = 5
cv = 5

all_filename = f"predict_all_{layers_str}_{todaysdate}.sav"
print(all_filename)


# Define the individual models in the ensemble
models = [
    MLPClassifier(hidden_layer_sizes=layers, activation=activation, verbose=True, alpha=alpha, learning_rate_init=learning_rate, early_stopping=True, 
                  random_state=random_state+i, max_iter=iters, batch_size=batch_size)
    for i in range(num_models)]

# Define the meta-model
meta_model = LogisticRegression()

# Create the ensemble classifier using StackingClassifier
predict_all = StackingClassifier(
    estimators=[('model'+str(i+1), model) for i, model in enumerate(models)], 
    final_estimator=meta_model, 
    n_jobs=2, cv=cv
).fit(model_dataset[training_mask][pa_inputs2], model_dataset[training_mask][['eventsModel']].values.ravel())

# Save model
pickle.dump(predict_all, open(os.path.join(model_path, "M03. Plate Appearances", all_filename), 'wb'))

In [None]:
# Predict all types: one `<event>_pred` column per class learned by the model
all_outputs = list(predict_all.classes_)
all_outputs_pred = [label + "_pred" for label in all_outputs]

testing_features = model_dataset.loc[testing_mask, pa_inputs2]
model_dataset.loc[testing_mask, all_outputs_pred] = predict_all.predict_proba(testing_features)

# Calculate predicted out rate as the sum of the five out-type probabilities
out_pred_cols = ['so_pred', 'fo_pred', 'go_pred', 'lo_pred', 'po_pred']
model_dataset['is_out_pred'] = model_dataset[out_pred_cols].sum(axis=1)

In [None]:
# Set quantiles
quantiles = 40

# Build a per-quantile calibration table for every event plus the combined `is_out`
for var in all_outputs + ['is_out']:
    test_events = model_dataset.loc[testing_mask, 'eventsModel']

    # Actual outcome indicator: any of the five out types for `is_out`, else the event itself
    if var == "is_out":
        actual = test_events.isin(['so', 'lo', 'po', 'go', 'fo']).astype(int)
    else:
        actual = (test_events == var).astype(int)
    model_dataset.loc[testing_mask, f'{var}_act'] = actual

    # Quantile bin of the predicted probability
    model_dataset.loc[testing_mask, f'{var}_quantile'] = pd.qcut(model_dataset.loc[testing_mask, f'{var}_pred'], quantiles, labels=False)

    # Mean actual and predicted rate per bin, published as the global `<event>_df`
    quantile_means = model_dataset.loc[testing_mask].groupby(f'{var}_quantile')[[f'{var}_act', f'{var}_pred']].mean().reset_index()
    globals()[f"{var}_df"] = quantile_means
    # globals()[f"{var}_df"] = model_dataset.query('year >= 2022').loc[testing_mask].groupby(f'{var}_quantile')[[f'{var}_act', f'{var}_pred']].mean().reset_index()
    # globals()[f"{var}_df"] = model_dataset.query('venue_id == 19').loc[testing_mask].groupby(f'{var}_quantile')[[f'{var}_act', f'{var}_pred']].mean().reset_index()

In [None]:
# Create figures: a 4x3 grid, one panel per event plus `is_out`
fig, axs = plt.subplots(4, 3, figsize=(12, 16))

for i, var in enumerate(all_outputs + ['is_out']):
    # divmod gives the (row, col) position of the i-th panel in a 3-column grid
    panel = axs[divmod(i, 3)]
    quantile_table = globals()[f"{var}_df"]
    # Actual rate in black, predicted rate in red
    panel.plot(quantile_table[f'{var}_quantile'], quantile_table[f'{var}_act'], color='black')
    panel.plot(quantile_table[f'{var}_quantile'], quantile_table[f'{var}_pred'], color='red')
    panel.set_title(var)
    # panel.set_ylim(quantile_table[f'{var}_act'].min(), quantile_table[f'{var}_act'].max())

# Lay the subplots out with zero padding
fig.tight_layout(pad=.0)

# Show the figure
plt.show()

In [None]:
# Weight applied to each safe event (walks and hit-by-pitches count as 1, like singles)
base_weights = [('b1', 1), ('bb', 1), ('hbp', 1), ('b2', 2), ('b3', 3), ('hr', 4)]

# Same weighted sum for the actual indicators and for the predicted probabilities
for suffix in ('act', 'pred'):
    model_dataset[f'total_bases_{suffix}'] = sum(
        model_dataset[f'{event}_{suffix}'] * weight for event, weight in base_weights
    )

In [None]:
# Average actual vs. predicted total bases on the test rows
model_dataset.loc[testing_mask, ['total_bases_act', 'total_bases_pred']].mean()

In [None]:
# Per-event summary on the test rows: actual mean, predicted mean,
# MSE between the binned means, and spread of the predictions.
test_rows = model_dataset[testing_mask]
stat_lists = []
for event in events_list + ['is_out']:
    act_col, pred_col = f'{event}_act', f'{event}_pred'
    bin_table = globals()[f"{event}_df"]
    # Squared gap between actual and predicted rate in each bin
    bin_table['se'] = (bin_table[act_col] - bin_table[pred_col]) ** 2
    stat_lists.append([event, test_rows[act_col].mean(), test_rows[pred_col].mean(), bin_table['se'].mean(), test_rows[pred_col].std()])
stat_df = pd.DataFrame(stat_lists, columns=['Event', 'Actual', 'Predicted', 'MSE', 'STDev'])
# Switch float display to scientific notation with three decimals
pd.options.display.float_format = '{:.3e}'.format

print(f"Layers: {layers}, num_models {num_models}, cv: {cv}, file_name: {all_filename}")
stat_df

In [None]:
# Per-event summary on the test rows: actual mean, predicted mean,
# MSE between the binned means, and spread of the predictions.
test_rows = model_dataset[testing_mask]
stat_lists = []
for event in events_list + ['is_out']:
    act_col, pred_col = f'{event}_act', f'{event}_pred'
    bin_table = globals()[f"{event}_df"]
    # Squared gap between actual and predicted rate in each bin
    bin_table['se'] = (bin_table[act_col] - bin_table[pred_col]) ** 2
    stat_lists.append([event, test_rows[act_col].mean(), test_rows[pred_col].mean(), bin_table['se'].mean(), test_rows[pred_col].std()])
stat_df = pd.DataFrame(stat_lists, columns=['Event', 'Actual', 'Predicted', 'MSE', 'STDev'])
# Switch float display to scientific notation with three decimals
pd.options.display.float_format = '{:.3e}'.format

print(f"Layers: {layers}, num_models {num_models}, cv: {cv}, file_name: {all_filename}")
stat_df

In [None]:
# Per-event summary on the test rows: actual mean, predicted mean,
# MSE between the binned means, and spread of the predictions.
test_rows = model_dataset[testing_mask]
stat_lists = []
for event in events_list + ['is_out']:
    act_col, pred_col = f'{event}_act', f'{event}_pred'
    bin_table = globals()[f"{event}_df"]
    # Squared gap between actual and predicted rate in each bin
    bin_table['se'] = (bin_table[act_col] - bin_table[pred_col]) ** 2
    stat_lists.append([event, test_rows[act_col].mean(), test_rows[pred_col].mean(), bin_table['se'].mean(), test_rows[pred_col].std()])
stat_df = pd.DataFrame(stat_lists, columns=['Event', 'Actual', 'Predicted', 'MSE', 'STDev'])
# Switch float display to scientific notation with three decimals
pd.options.display.float_format = '{:.3e}'.format

print(f"Layers: {layers}, num_models {num_models}, cv: {cv}, file_name: {all_filename}")
stat_df

In [None]:
# Distribution summary of actual vs. predicted total bases on the test rows
model_dataset.loc[testing_mask, ['total_bases_act', 'total_bases_pred']].describe()

In [None]:
# Per-event summary on the test rows: actual mean, predicted mean,
# MSE between the binned means, and spread of the predictions.
test_rows = model_dataset[testing_mask]
stat_lists = []
for event in events_list + ['is_out']:
    act_col, pred_col = f'{event}_act', f'{event}_pred'
    bin_table = globals()[f"{event}_df"]
    # Squared gap between actual and predicted rate in each bin
    bin_table['se'] = (bin_table[act_col] - bin_table[pred_col]) ** 2
    stat_lists.append([
        event,
        test_rows[act_col].mean(),
        test_rows[pred_col].mean(),
        bin_table['se'].mean(),
        test_rows[pred_col].std(),
    ])
stat_df = pd.DataFrame(stat_lists, columns=['Event', 'Actual', 'Predicted', 'MSE', 'STDev'])
# Switch float display to scientific notation with three decimals
pd.options.display.float_format = '{:.3e}'.format

print(f"Layers: {layers}, num_models {num_models}, cv: {cv}, file_name: {all_filename}")
stat_df

In [None]:
# Distribution summary of actual vs. predicted total bases on the test rows
model_dataset.loc[testing_mask, ['total_bases_act', 'total_bases_pred']].describe()

In [None]:
# Audible notification that the notebook has finished running.
try:
    import winsound  # standard-library module available on Windows only
except ImportError:
    # Other platforms: emit the terminal bell character instead of failing the final cell
    print('\a', end='')
else:
    winsound.MessageBeep()