# M00. Expected Events

### Imports

In [None]:
import sys
if not hasattr(sys.modules['__main__'], '__file__'):
    %run "C:\Users\james\Documents\MLB\Code\U1. Imports.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U2. Utilities.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U3. Classes.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U4. Datasets.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U5. Models.ipynb"

### Data

##### MLB Stats API

In [None]:
# First and last season handed to merge_datasets below
# (presumably both inclusive — confirm against merge_datasets)
start_year, end_year = 2015, 2025

Merge MLB Stats API and Statcast data

In [None]:
%%time
# Build the working DataFrame through helpers defined outside this notebook
# (presumably in the U-series utility notebooks — verify). Each step after the
# merge takes the DataFrame and returns the transformed DataFrame.
df = merge_datasets(start_year, end_year)
df = clean_weather(df)
df = create_events(df)
df = create_variables(df)
df = start_data(df)

##### Open Meteo

Read in Open Meteo weather data

In [None]:
%%time
# Columns kept from the Open Meteo extracts: game/venue identifiers, park geometry and weather readings
open_meteo_columns = [
    'game_id', 'year', 'venue_name',
    'location.defaultCoordinates.latitude', 'location.defaultCoordinates.longitude',
    'fieldInfo.leftLine', 'fieldInfo.center', 'fieldInfo.rightLine',
    'fieldInfo.leftCenter', 'fieldInfo.rightCenter',
    'location.elevation', 'location.azimuthAngle', 'fieldInfo.roofType', 'active',
    'temperature_2m', 'relative_humidity_2m', 'dew_point_2m', 'surface_pressure',
    'wind_speed_10m', 'wind_direction_10m', 'weather_code', 'precipitation_probability',
]

# Read every CSV in the folder, stack them with a fresh index, then keep only the columns above
open_meteo_files = glob.glob(r"C:\Users\james\Documents\MLB\Database\A06. Weather\1. Open Meteo\*.csv")
weather_df = pd.concat([pd.read_csv(csv_path) for csv_path in open_meteo_files], ignore_index=True)[open_meteo_columns]

Calculate wind vectors

In [None]:
def calculate_vectors(row, azimuth_column, wind_column, speed_column):
    """Return the x/y wind components of one record relative to a reference bearing.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) holding the three columns.
        azimuth_column: Key of the reference bearing, in degrees.
        wind_column: Key of the wind direction, in degrees.
        speed_column: Key of the wind speed.

    Returns:
        pd.Series indexed by ['x_vect', 'y_vect'].
    """
    relative_angle = math.radians(row[wind_column] - row[azimuth_column])
    wind_speed = row[speed_column]

    # The unit components are rounded to 5 decimals before scaling by speed;
    # the trailing -1 flips the sign of both components.
    unit_x = round(math.sin(relative_angle), 5)
    unit_y = round(math.cos(relative_angle), 5)
    components = [unit_x * wind_speed * -1, unit_y * wind_speed * -1]

    return pd.Series(components, index=['x_vect', 'y_vect'])

In [None]:
# Row-wise apply: each row's wind direction is taken relative to the park azimuth and the two
# returned components fill meteo_x_vect / meteo_y_vect.
# NOTE(review): apply(axis=1) runs in Python per row; a vectorized version may be worth it if this cell is slow.
weather_df[['meteo_x_vect', 'meteo_y_vect']] = weather_df.apply(lambda row: calculate_vectors(row, 'location.azimuthAngle', 'wind_direction_10m', 'wind_speed_10m'), axis=1)

##### Baserunning

In [None]:
steamer_hitters_df = pd.read_csv(os.path.join(baseball_path, "A03. Steamer", "steamer_hitters_weekly_log.csv"), encoding='iso-8859-1', usecols=['proj_date', 'mlbamid', 'PA', 'UBR'], dtype='str')

Convert data types

In [None]:
steamer_hitters_df[['PA', 'UBR']] = steamer_hitters_df[['PA', 'UBR']].astype(float)

In [None]:
steamer_hitters_df['proj_date'] = pd.to_datetime(steamer_hitters_df['proj_date'])

Calculate UBR per 600 Plate Appearances

In [None]:
# Scale projected UBR to a per-600-PA rate.
# NOTE(review): a row with PA == 0 would produce inf (or NaN for 0/0) here — confirm such rows cannot occur.
steamer_hitters_df['UBR600'] = steamer_hitters_df['UBR'] / steamer_hitters_df['PA'] * 600

##### Merge

Weather data

In [None]:
# Inner join on the game key (gamePk == game_id): rows from games with no Open Meteo record are dropped.
# 'year' is removed from the weather side first (presumably to avoid duplicating a column already in df — confirm).
complete_dataset = df.merge(weather_df.drop(columns=['year']), left_on=['gamePk'], right_on=['game_id'], how='inner')

Use weather column from MLB data to adjust for domes/roofs

In [None]:
mask = complete_dataset['weather'].str.contains('Roof|Dome', case=False, na=False)

In [None]:
# Games flagged as Roof/Dome: overwrite temperature with 70 and zero out both wind components
for column, indoor_value in (('temperature', 70), ('x_vect', 0), ('y_vect', 0)):
    complete_dataset.loc[mask, column] = indoor_value

In [None]:
# Same Roof/Dome override for the Open Meteo columns, plus fixed humidity and dew point values
meteo_indoor_values = {
    'temperature_2m': 70,
    'meteo_x_vect': 0,
    'meteo_y_vect': 0,
    'relative_humidity_2m': 60,
    'dew_point_2m': 57,
}
for column, indoor_value in meteo_indoor_values.items():
    complete_dataset.loc[mask, column] = indoor_value

Baserunning Data

In [None]:
complete_dataset['proj_date'] = pd.to_datetime(complete_dataset['date'], format='%Y%m%d')

In [None]:
complete_dataset['mlbamid'] = complete_dataset['batter'].astype(str)

In [None]:
# As-of join: within each mlbamid, attach the latest Steamer row whose proj_date is on or before
# the row's proj_date. merge_asof needs both inputs sorted by the `on` key.
pa_by_date = complete_dataset.sort_values('proj_date')
steamer_by_date = steamer_hitters_df.sort_values('proj_date')
complete_dataset = pd.merge_asof(pa_by_date, steamer_by_date, on='proj_date', by='mlbamid', direction='backward')

Note:
- if (y > 198.27 and x < 125.42), it's actually to left
- if (y > 198.27 and x > 125.42), it's actually to right

### Model #1. Expected Outcome

Probability of each event given how the baseball was launched, where it was launched to, and some information about the batter, including handedness and baserunning. Park and weather are notably excluded.

$ \hat{\text{eventsModel}} = launch\_angle + launch\_speed + to\_l + to\_lc + to\_c + to\_rc + to\_r + b\_L + UBR600 + bb + hbp + so $

##### Inputs

In [None]:
outcome_inputs = ['launch_angle', 'launch_speed', 'to_l', 'to_lc', 'to_c', 'to_rc', 'to_r', 'b_L', 'UBR600'] + ['bb', 'hbp', 'so']

##### Sample

Set launch data to 0 if not batted

In [None]:
complete_dataset[['launch_angle', 'launch_speed']] = complete_dataset[['launch_angle', 'launch_speed']].fillna(0)

Remove atypical events and rows with missing inputs

In [None]:
# Drop rows whose eventsModel is "Cut", then any row missing one of the model inputs
is_cut_event = complete_dataset['eventsModel'].isin(["Cut"])
complete_dataset = complete_dataset.loc[~is_cut_event].dropna(subset=outcome_inputs)

Define model input and outputs

In [None]:
X = complete_dataset[outcome_inputs].values
y = complete_dataset[['eventsModel']].values

##### Encode

In [None]:
if not hasattr(sys.modules['__main__'], '__file__'):
    # __main__ has no __file__ (e.g. an interactive notebook session): fit a fresh one-hot encoder
    encode_outcome = OneHotEncoder(sparse_output=False)
    # Fit and transform
    y_encoded = encode_outcome.fit_transform(y)
    # Save; the context manager guarantees the pickle is flushed and the handle closed
    with open(os.path.join(model_path, "M00. Expected Events", "encode_outcome.pkl"), 'wb') as encoder_file:
        pickle.dump(encode_outcome, encoder_file)
else:
    # Otherwise reuse an encode_outcome object that must already exist in the namespace
    y_encoded = encode_outcome.transform(y)

# Calculate number of classes (used for model inputs)
num_classes = y_encoded.shape[1]

##### Scale

In [None]:
if not hasattr(sys.modules['__main__'], '__file__'):
    # __main__ has no __file__ (e.g. an interactive notebook session): fit a fresh scaler
    scale_outcome = StandardScaler()
    # Fit and transform
    X_scaled = scale_outcome.fit_transform(X)
    # Save; the context manager guarantees the pickle is flushed and the handle closed
    with open(os.path.join(model_path, "M00. Expected Events", "scale_outcome.pkl"), 'wb') as scaler_file:
        pickle.dump(scale_outcome, scaler_file)
else:
    # Otherwise reuse a scale_outcome object that must already exist in the namespace
    X_scaled = scale_outcome.transform(X)

##### Train

In [None]:
if not hasattr(sys.modules['__main__'], '__file__'):
    # Fully connected classifier: ReLU hidden layers, softmax over the encoded event classes
    outcome_layers = [Dense(32, input_shape=(X_scaled.shape[1],), activation='relu')]
    outcome_layers += [Dense(width, activation='relu') for width in (256, 128, 64, 32)]
    outcome_layers.append(Dense(num_classes, activation='softmax'))
    predict_outcome = Sequential(outcome_layers)

    predict_outcome.compile(
        optimizer=Adam(learning_rate=0.00001),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    # Stop after 5 epochs without a val_loss improvement and restore the best weights seen
    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    # 20% of the rows are held out by Keras as the validation set
    predict_outcome.fit(
        X_scaled,
        y_encoded,
        epochs=100,
        batch_size=32,
        validation_split=0.2,
        callbacks=[early_stop],
    )

    predict_outcome.save(os.path.join(model_path, "M00. Expected Events", 'predict_outcome.keras'))

##### Predict

In [None]:
# One probability per encoded event class for every row of X_scaled
predictions = predict_outcome.predict(X_scaled)

# Name each probability column "<class>_pred", following the encoder's category order
probability_columns = [f"{label}_pred" for label in encode_outcome.categories_[0]]
probability_df = pd.DataFrame(predictions, columns=probability_columns)

# Place the probabilities next to the source rows (index reset so the two frames line up)
prediction_df = pd.concat([complete_dataset.reset_index(drop=True), probability_df], axis=1)

##### Evaluate

In [None]:
# Grid layout: one panel per event, three panels per row
n_events = len(events_list)
n_cols = 3
n_rows = -(-n_events // n_cols)  # ceiling division

# Each panel is 5x5 inches
fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows))
axes = axes.flatten()

for ax, event in zip(axes, events_list):
    pred_col = f"{event}_pred"

    # Events without a prediction column keep an empty panel
    if pred_col not in prediction_df.columns:
        continue

    # Split rows into (up to) 20 quantile buckets of the predicted probability
    prediction_df['bucket'] = pd.qcut(prediction_df[pred_col], q=20, duplicates='drop')

    # Mean predicted probability and mean observed outcome per bucket
    bucket_avg = (
        prediction_df
        .groupby('bucket')
        .agg(avg_pred=(pred_col, 'mean'), avg_actual=(event, 'mean'))
        .reset_index()
    )

    # Both lines are drawn against the bucket position
    ax.plot(bucket_avg['avg_pred'], label='Predicted')
    ax.plot(bucket_avg['avg_actual'], label='Actual')
    ax.set_title(f"{event.upper()} Prediction vs Actual")
    ax.set_xlabel("Quantile Bucket")
    ax.set_ylabel("Rate")
    ax.legend()
    ax.grid(True)

# Drop the panels beyond the last event
for spare_ax in axes[n_events:]:
    fig.delaxes(spare_ax)

plt.tight_layout()
plt.show()


### Calculate PFX

In [None]:
events_list_pred = [f"{event}_pred" for event in events_list]
pfx_list = [f"{event}_pfx" for event in events_list]

In [None]:
game_pfx_df = prediction_df.groupby(['venue_id', 'gamePk', 'batSide', 'date'])[events_list + events_list_pred].mean().reset_index()

In [None]:
# Window length (in rows per venue/batSide group) for the rolling park-factor averages below.
# NOTE(review): 243 presumably equals 3 seasons x 81 home games — confirm.
num_games = 243

In [None]:
# Make sure the data is sorted appropriately for rolling
game_pfx_df = game_pfx_df.sort_values(['venue_id', 'date', 'batSide'])

##### Unshifted

In [None]:
# Trailing mean over the last num_games rows of each venue/batSide group, current row included
rate_columns = events_list + events_list_pred


def _trailing_mean_inclusive(group):
    # shift(0) leaves the rows in place, so every window ends on the current row
    return group[rate_columns].shift(0).rolling(num_games, min_periods=1).mean()


rolling_avgs = game_pfx_df.groupby(['venue_id', 'batSide'], group_keys=False).apply(_trailing_mean_inclusive)

# Tag the columns as rolling averages
rolling_avgs.columns = [f'{col}_rolling' for col in rate_columns]

# Put the rolling columns next to the per-game rates (aligned on the shared index)
unshifted_game_pfx_df = pd.concat([game_pfx_df, rolling_avgs], axis=1)

# Park factor = rolling observed rate / rolling predicted rate
for event in events_list:
    observed_rate = unshifted_game_pfx_df[f'{event}_rolling']
    predicted_rate = unshifted_game_pfx_df[f'{event}_pred_rolling']
    unshifted_game_pfx_df[f'{event}_pfx'] = observed_rate / predicted_rate

In [None]:
# Keep the last row of every venue/batSide pair and write its park factors to "Park Latest.csv"
latest_pfx_columns = ['venue_id', 'batSide'] + [col for col in unshifted_game_pfx_df if col.endswith("pfx")]
latest_pfx_df = unshifted_game_pfx_df[latest_pfx_columns].drop_duplicates(subset=['venue_id', 'batSide'], keep='last')
latest_pfx_df.to_csv(os.path.join(baseball_path, "Park Latest.csv"), index=False)

##### Shifted

In [None]:
# Compute the rolling average of the previous num_games rows *excluding* the current row:
# shift(1) moves every value down one row within its venue/batSide group, so a row's
# window only contains earlier rows (the first row of each group is NaN).
rolling_avgs = (
    game_pfx_df
    .groupby(['venue_id', 'batSide'], group_keys=False)
    .apply(lambda group: group[events_list + events_list_pred].shift(1).rolling(num_games, min_periods=1).mean())
)

# Rename columns to indicate they are rolling averages
rolling_avgs.columns = [f'{col}_rolling' for col in events_list + events_list_pred]

# Concatenate with the original dataframe
shifted_game_pfx_df = pd.concat([game_pfx_df, rolling_avgs], axis=1)

# Park factor = rolling observed rate / rolling predicted rate
for event in events_list:
    shifted_game_pfx_df[f'{event}_pfx'] = shifted_game_pfx_df[f'{event}_rolling'] / shifted_game_pfx_df[f'{event}_pred_rolling']

### Model #2. Weather Factors

$ \hat{\text{eventsModel2}} = \hat{\text{eventsModel}} + pfx + meteo\_x\_vect + meteo\_y\_vect + temperature\_2m + relative\_humidity\_2m + dew\_point\_2m + surface\_pressure + venue\_id + b\_L $

##### Inputs

Meteo weather inputs

In [None]:
meteo_weather_list = ['meteo_x_vect', 'meteo_y_vect', 'temperature_2m', 'relative_humidity_2m', 'dew_point_2m', 'surface_pressure']

Parks with sufficient samples

In [None]:
# Venues with more than 20,000 rows in prediction_df get their own dummy input, in ascending id order
venue_row_counts = prediction_df['venue_id'].value_counts()
sampled_venue_ids = sorted(venue_row_counts[venue_row_counts > 20000].index.tolist())
venue_dummy_list = [f'venue_{venue}' for venue in sampled_venue_ids]

Select inputs

In [None]:
wfx_inputs = events_list_pred + pfx_list + meteo_weather_list + venue_dummy_list + ['b_L']

##### Sample

Merge in park factors

In [None]:
sample_df2 = prediction_df.merge(shifted_game_pfx_df[['gamePk', 'batSide'] + pfx_list], on=['gamePk', 'batSide'], how='left')

Create venue dummies

Note: not all venue dummies may be included in venue_dummy_list

In [None]:
sample_df2['venue_id2'] = sample_df2['venue_id'].copy()

In [None]:
sample_df2 = pd.get_dummies(sample_df2, columns=['venue_id2'], prefix='venue', drop_first=False)

Set pfx to 1 if not in venue sample

Note: we may want to set this in shifted_game_pfx_df and default to a rolling value

In [None]:
sample_df2['sample_venue'] = sample_df2[venue_dummy_list].sum(axis=1)

In [None]:
for pfx in pfx_list:
    sample_df2[pfx] = np.where(sample_df2['sample_venue'] == 0, 1, sample_df2[pfx])

Drop if missing inputs

In [None]:
sample_df2.dropna(subset=wfx_inputs, inplace=True)

Group by game

In [None]:
sample_df2 = sample_df2.groupby(['gamePk', 'date', 'venue_id', 'batSide'])[wfx_inputs + events_list].mean().reset_index()

In [None]:
sample_df2['b_L'] = (sample_df2['batSide'] == "L").astype(int)

In [None]:
sample_df2 = sample_df2[sample_df2['date'] > 20180101]

In [None]:
# Split features and target
X = sample_df2[wfx_inputs].values
y = sample_df2[events_list].values

# Number of classes
num_classes = y.shape[1]

In [None]:
sample_df2[wfx_inputs].head()

###### Scale

In [None]:
if not hasattr(sys.modules['__main__'], '__file__'):
    # __main__ has no __file__ (e.g. an interactive notebook session): fit a fresh scaler
    scale_wfx = StandardScaler()
    # Fit and transform
    X_scaled = scale_wfx.fit_transform(X)
    # Save; the context manager guarantees the pickle is flushed and the handle closed
    with open(os.path.join(model_path, "M00. Expected Events", "scale_wfx.pkl"), 'wb') as scaler_file:
        pickle.dump(scale_wfx, scaler_file)
else:
    # Otherwise reuse a scale_wfx object that must already exist in the namespace
    X_scaled = scale_wfx.transform(X)

##### Train

In [None]:
from tensorflow.keras.losses import KLDivergence

In [None]:
%%time
if not hasattr(sys.modules['__main__'], '__file__'):
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense
    from tensorflow.keras.optimizers import Adam
    from tensorflow.keras.callbacks import EarlyStopping
    from tensorflow.keras.layers import BatchNormalization, Dropout
    # Imported here so the loss below does not rely on a separately imported `keras` name
    from tensorflow.keras.losses import KLDivergence

    import numpy as np
    import os

    class VotingEnsemble:
        """Average the predictions of several fitted models."""

        def __init__(self, models):
            """Store the models; each must expose predict(X, verbose=0)."""
            self.models = models

        def predict(self, X):
            """Return the element-wise mean of every member's prediction for X."""
            predictions = np.array([model.predict(X, verbose=0) for model in self.models])
            return np.mean(predictions, axis=0)

    ensemble_size = 3
    ensemble_models = []
    model_dir = os.path.join(model_path, "M00. Expected Events")
    os.makedirs(model_dir, exist_ok=True)

    # Every member has the same architecture and sees the same data
    for i in range(ensemble_size):
        model = Sequential([
            Dense(128, input_shape=(X_scaled.shape[1],), activation='relu'),
            Dropout(0.3),
            Dense(256, activation='relu'),
            Dropout(0.3),
            Dense(128, activation='relu'),
            Dense(64, activation='relu'),
            Dense(32, activation='relu'),
            Dense(num_classes, activation='softmax')
        ])

        # Targets are per-game event rates, so KL divergence is used instead of
        # categorical cross-entropy; the same class is reported as the metric.
        model.compile(optimizer=Adam(learning_rate=0.001),
                      loss=KLDivergence(),
                      metrics=[KLDivergence()])

        # Stop after 5 epochs without a val_loss improvement and restore the best weights seen
        early_stop = EarlyStopping(
            monitor='val_loss',
            patience=5,
            restore_best_weights=True
        )

        model.fit(
            X_scaled, y,
            epochs=100,
            batch_size=32,
            validation_split=0.2,
            callbacks=[early_stop],
            verbose=1
        )

        # Each member is saved under its own index
        model_path_i = os.path.join(model_dir, f'predict_wfx_{i}.keras')
        model.save(model_path_i)
        ensemble_models.append(model)

    # Wrap ensemble in predict_wfx for compatibility
    predict_wfx = VotingEnsemble(ensemble_models)

##### Predict

Save event averages for use in predictions in A06. Weather

In [None]:
average_df = pd.DataFrame(sample_df2[events_list].mean()).T
# average_df.to_csv(os.path.join(baseball_path, "Event Averages.csv"), index=False)

Before predicting, replace with mean predicted event rates to determine how weather would affect an average batted-ball game

In [None]:
sample_df3 = sample_df2.copy()
for event in events_list:
    sample_df3[f'{event}_pred'] = sample_df3[event].mean()

Now actually predict

In [None]:
# Split features and target
X2 = sample_df3[wfx_inputs].values
y2 = sample_df3[events_list].values

# Scale the features
X2_scaled = scale_wfx.transform(X2)

In [None]:
# Ensemble output: one column per event, named "<event>_pred2"
predictions2 = predict_wfx.predict(X2_scaled)
pred2_columns = [f'{event_name}_pred2' for event_name in events_list]
wfx_prediction_df = pd.DataFrame(predictions2, columns=pred2_columns)

# Predictions first, then the sample rows (reset_index keeps the old index as an 'index' column)
prediction_df2 = pd.concat([wfx_prediction_df, sample_df3.reset_index()], axis=1)

Calculate WFX

In [None]:
prediction_df2.head()

In [None]:
for event in events_list:
    prediction_df2[f'{event}_wfx'] = prediction_df2[f'{event}_pred2'] / prediction_df2[f'{event}_pred']

In [None]:
prediction_df2[['hr_pred', 'hr_pred2', 'hr']].mean()

In [None]:
hr_df = prediction_df2.groupby('venue_id')[['hr_pred2', 'hr']].mean()
hr_df

In [None]:
hr_df['squared_error'] = (hr_df['hr_pred2'] - hr_df['hr']) ** 2
hr_df['squared_error'].mean()

In [None]:
# The 500 rows with the largest and the 500 with the smallest hr_wfx
top_500 = prediction_df2.nlargest(500, 'hr_wfx')
bottom_500 = prediction_df2.nsmallest(500, 'hr_wfx')

# Five most frequent venues in each extreme (fewer if fewer venues appear)
top_counts = top_500['venue_id'].value_counts().head(5)
bottom_counts = bottom_500['venue_id'].value_counts().head(5)

# Side-by-side table. Building it from Series (rather than raw arrays) lets pandas pad the
# shorter side with NaN instead of raising when the two sides have different lengths.
result_df = pd.DataFrame({
    'Top Venue': pd.Series(top_counts.index),
    'Top Count': pd.Series(top_counts.to_numpy()),
    'Bottom Venue': pd.Series(bottom_counts.index),
    'Bottom Count': pd.Series(bottom_counts.to_numpy()),
})

result_df

In [None]:
prediction_df2.sort_values('hr_wfx', ascending=False).head(100)[['meteo_y_vect', 'temperature_2m']].mean()

In [None]:
prediction_df2.sort_values('hr_wfx', ascending=False).tail(100)[['meteo_y_vect', 'temperature_2m']].mean()

In [None]:
# Calibration plots for the second model: mean predicted vs. mean observed rate per quantile bucket.
# Adjust the number of rows and columns
n_events = len(events_list)
n_cols = 3
n_rows = (n_events + n_cols - 1) // n_cols  # Ceiling division

# Set square plots: each subplot is 5x5 inches
fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows))
axes = axes.flatten()

for i, event in enumerate(events_list):
    ax = axes[i]
    pred_col = f"{event}_pred2"
    
    # Events without a prediction column keep an empty panel
    if pred_col not in prediction_df2.columns:
        continue

    # Bucket the predicted values into quantiles (adds/overwrites a 'bucket' column on prediction_df2)
    prediction_df2['bucket'] = pd.qcut(prediction_df2[pred_col], q=10, duplicates='drop')

    # Compute averages
    bucket_avg = prediction_df2.groupby('bucket').agg(
        avg_pred=(pred_col, 'mean'),
        avg_actual=(event, 'mean')
    ).reset_index()

    # Plot (both lines are drawn against the bucket position)
    ax.plot(bucket_avg['avg_pred'], label='Predicted')
    ax.plot(bucket_avg['avg_actual'], label='Actual')
    ax.set_title(f"{event.upper()} Prediction vs Actual")
    ax.set_xlabel("Quantile Bucket")
    ax.set_ylabel("Rate")
    ax.legend()
    ax.grid(True)

    # Set y-axis limits: from half of the max to the max
    # (values below y_max / 2 fall outside the visible range)
    y_max = max(bucket_avg['avg_pred'].max(), bucket_avg['avg_actual'].max())
    y_min = y_max / 2

    # Create 10 evenly spaced ticks from y_min to y_max
    ticks = np.linspace(y_min, y_max, 10)
    ax.set_ylim(y_min, y_max)
    ax.set_yticks(np.round(ticks, 5))  # round for cleaner labels

# Remove extra axes if any
for j in range(n_events, len(axes)):
    fig.delaxes(axes[j])

plt.tight_layout()
plt.show()


In [None]:
##### %run "C:\Users\james\Documents\MLB\Code\U4. Datasets.ipynb"

In [None]:
prediction_df[events_list_pred].describe()

In [None]:
prediction_df[events_list] = prediction_df[events_list_pred].copy()

In [None]:
# Remove the helper 'bucket' column when present; errors='ignore' makes this a no-op otherwise
prediction_df.drop(columns=['bucket'], errors='ignore', inplace=True)

In [None]:
### Rolling stats
short, long = 50, 300

# Short
start_time = time.time()
df_short = rolling_pas(prediction_df, short, events_list)
print(f"Short took {time.time() - start_time:.2f} seconds")

In [None]:
# Long
start_time = time.time()
df_long = rolling_pas(prediction_df, long, events_list)
df_long = df_long.add_suffix("_long")
print(f"Long took {time.time() - start_time:.2f} seconds")

In [None]:
df_short[batter_stats_short].describe()

In [None]:
   
# We only need the rolling stats 
long_stats = batter_stats_long + pitcher_stats_long
df_long = df_long[long_stats]

# Dataset
final_dataset = pd.concat([df_short, df_long], axis=1)


# Reset index
final_dataset.reset_index(drop=True, inplace=True)

# Sort
final_dataset.sort_values(['date', 'gamePk', 'atBatIndex'], ascending=True, inplace=True)



In [None]:
final_dataset.drop(columns=events_list + ['Cut'], inplace=True)

In [None]:
event_dummies = pd.get_dummies(final_dataset['eventsModel']).astype(int)

In [None]:
final_dataset = pd.concat([final_dataset, event_dummies], axis=1)

In [None]:
final_dataset = final_dataset.replace([float('inf'), float('-inf')], 0)

In [None]:
%%time
final_dataset.to_csv(os.path.join(baseball_path, "Final Dataset.csv"), index=False)

In [None]:
# Split the per-game predictions by batter side
l_shifted_game_wfx_df = prediction_df2[prediction_df2['batSide'] == "L"]
r_shifted_game_wfx_df = prediction_df2[prediction_df2['batSide'] == "R"]

# One row per game: left-side columns get the "_l" suffix, right-side columns "_r".
# NOTE(review): how='left' keeps only games that have an "L" row; a game with only an "R" row
# would be dropped — confirm this is intended.
wfx_df = pd.merge(l_shifted_game_wfx_df, r_shifted_game_wfx_df, on=['venue_id', 'gamePk', 'date'], how='left', suffixes=("_l", "_r"))

In [None]:
wfx_df[['venue_id', 'gamePk', 'date'] + [col for col in wfx_df if "wfx" in col]].to_csv(os.path.join(baseball_path, "Park and Weather Factors.csv"), index=False)

In [None]:
# Columns written to every daily file: the identifiers plus every column containing "wfx"
daily_wfx_columns = ['venue_id', 'gamePk', 'date'] + [col for col in wfx_df if "wfx" in col]
daily_wfx_dir = os.path.join(baseball_path, "A06. Weather", "3. Park and Weather Factors")

# A single groupby pass replaces re-filtering the whole frame once per date;
# sort=False keeps the dates in order of first appearance.
for date, daily_wfx_df in wfx_df[daily_wfx_columns].groupby('date', sort=False):
    daily_wfx_df.to_csv(os.path.join(daily_wfx_dir, f"Park and Weather Factors {date}.csv"), index=False)