# M01. Park and Weather Factors

### Imports

In [None]:
%run "C:\Users\james\Documents\MLB\Code\U1. Imports.ipynb"
%run "C:\Users\james\Documents\MLB\Code\U2. Functions.ipynb"
%run "C:\Users\james\Documents\MLB\Code\U3. Classes.ipynb"
%run "C:\Users\james\Documents\MLB\Code\U4. Datasets.ipynb"
%run "C:\Users\james\Documents\MLB\Code\U5. Models.ipynb"

### Data

##### MLB Stats API

In [None]:
# First and last seasons handed to merge_datasets when the dataset is built
start_year = 2015
end_year = 2025

Merge MLB Stats API and Statcast data

In [None]:
%%time
# Build the working dataset for start_year..end_year and pass it through the
# preparation helpers in a fixed order; each step receives the previous step's output.
# NOTE(review): the helpers are not defined in this notebook — presumably they come
# from the U-series notebooks run in the Imports cell; confirm their contracts there.
df = merge_datasets(start_year, end_year)
df = clean_weather(df)
df = create_events(df)
df = create_variables(df)
df = start_data(df)

##### Open Meteo

Read in Open Meteo weather data

In [None]:
%%time
# Columns kept from the Open Meteo extracts: game/venue identifiers, park geometry
# and orientation, and the weather readings used later in the notebook
weather_columns = [
    'game_id', 'year', 'venue_name',
    'location.defaultCoordinates.latitude', 'location.defaultCoordinates.longitude',
    'fieldInfo.leftLine', 'fieldInfo.center', 'fieldInfo.rightLine',
    'fieldInfo.leftCenter', 'fieldInfo.rightCenter',
    'location.elevation', 'location.azimuthAngle', 'fieldInfo.roofType', 'active',
    'temperature_2m', 'relative_humidity_2m', 'dew_point_2m', 'surface_pressure',
    'wind_speed_10m', 'wind_direction_10m', 'weather_code', 'precipitation_probability',
]

# Stack every CSV in the Open Meteo folder into one frame, then keep only the columns above
weather_files = glob.glob(r"C:\Users\james\Documents\MLB\Database\A06. Weather\1. Open Meteo\*.csv")
weather_df = pd.concat(map(pd.read_csv, weather_files), ignore_index=True)[weather_columns]

Calculate wind vectors

In [None]:
def calculate_vectors(row, azimuth_column, wind_column, speed_column):
    """Split a row's wind reading into x/y components relative to the park azimuth.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the three columns named below.
    azimuth_column : str
        Column holding the park's azimuth angle, in degrees.
    wind_column : str
        Column holding the wind direction, in degrees.
    speed_column : str
        Column holding the wind speed.

    Returns
    -------
    pd.Series
        Indexed ['x_vect', 'y_vect']. Each value is the sine (x) or cosine (y) of the
        wind angle relative to the azimuth, rounded to 5 decimals, times the wind
        speed, with the sign flipped.
        NOTE(review): the sign flip presumably converts a "blowing from" direction
        into a "blowing toward" vector — confirm against the data source.
    """
    relative_angle = math.radians(row[wind_column] - row[azimuth_column])
    wind_speed = row[speed_column]

    # Rounding the trig terms first keeps near-zero float noise (e.g. cos 90°) at exactly 0
    components = {
        'x_vect': round(math.sin(relative_angle), 5) * wind_speed * -1,
        'y_vect': round(math.cos(relative_angle), 5) * wind_speed * -1,
    }
    return pd.Series(components)

In [None]:
# Decompose each game's Open Meteo wind reading relative to the park's azimuth.
# The column names are forwarded to calculate_vectors through `args`.
weather_df[['meteo_x_vect', 'meteo_y_vect']] = weather_df.apply(
    calculate_vectors,
    axis=1,
    args=('location.azimuthAngle', 'wind_direction_10m', 'wind_speed_10m'),
)

##### Baserunning

In [None]:
# Steamer weekly hitter projections. Every column is read as text here;
# numeric and date conversions happen in the following cells.
steamer_log_path = os.path.join(baseball_path, "A03. Steamer", "steamer_hitters_weekly_log.csv")
steamer_hitters_df = pd.read_csv(
    steamer_log_path,
    encoding='iso-8859-1',
    usecols=['proj_date', 'mlbamid', 'PA', 'UBR'],
    dtype='str',
)

Convert data types

In [None]:
# PA and UBR were loaded as strings (dtype='str' on read); cast them to floats for arithmetic
steamer_hitters_df[['PA', 'UBR']] = steamer_hitters_df[['PA', 'UBR']].astype(float)

In [None]:
# Parse projection dates so they can be used as the time key in the as-of merge
steamer_hitters_df['proj_date'] = pd.to_datetime(steamer_hitters_df['proj_date'])

Calculate UBR per 600 Plate Appearances

In [None]:
# UBR600 = UBR per plate appearance, scaled to 600 PA.
# A zero PA is mapped to NaN first so the rate becomes NaN rather than +/-inf:
# the later dropna on the model inputs removes NaN rows, but would let inf through.
projected_pa = steamer_hitters_df['PA'].replace(0, np.nan)
steamer_hitters_df['UBR600'] = steamer_hitters_df['UBR'] / projected_pa * 600

##### Merge

Weather data

In [None]:
# Attach Open Meteo weather to every row by game id (gamePk on the left, game_id on the right).
# Inner join: rows whose game has no Open Meteo record are dropped.
# 'year' is removed from the weather side first (presumably to avoid a duplicate column — confirm).
complete_dataset = df.merge(weather_df.drop(columns=['year']), left_on=['gamePk'], right_on=['game_id'], how='inner')

Use weather column from MLB data to adjust for domes/roofs

In [None]:
# True where the MLB weather text mentions a roof or dome (case-insensitive); missing text counts as False
mask = complete_dataset['weather'].str.contains('Roof|Dome', case=False, na=False)

In [None]:
# Covered games get fixed indoor conditions in the MLB-sourced columns: 70 degrees and no wind
mlb_indoor_defaults = {'temperature': 70, 'x_vect': 0, 'y_vect': 0}
for column, neutral_value in mlb_indoor_defaults.items():
    complete_dataset.loc[mask, column] = neutral_value

In [None]:
# Same override for the Open Meteo columns, plus fixed humidity and dew point for covered games
meteo_indoor_defaults = {
    'temperature_2m': 70,
    'meteo_x_vect': 0,
    'meteo_y_vect': 0,
    'relative_humidity_2m': 60,
    'dew_point_2m': 57,
}
for column, neutral_value in meteo_indoor_defaults.items():
    complete_dataset.loc[mask, column] = neutral_value

Baserunning Data

In [None]:
# Game date parsed from YYYYMMDD; stored as proj_date so it lines up with the Steamer merge key
complete_dataset['proj_date'] = pd.to_datetime(complete_dataset['date'], format='%Y%m%d')

In [None]:
# Batter id as a string so it can match Steamer's mlbamid, which was read with dtype='str'
# NOTE(review): if 'batter' is stored as a float, str() gives e.g. "12345.0" and no rows will match — confirm dtype
complete_dataset['mlbamid'] = complete_dataset['batter'].astype(str)

In [None]:
# For every row, attach the latest Steamer projection for the same batter (by='mlbamid')
# dated on or before the game date (direction='backward').
# merge_asof needs both frames sorted on the 'on' key, hence the sort_values calls.
complete_dataset = pd.merge_asof(
    complete_dataset.sort_values('proj_date'),
    steamer_hitters_df.sort_values('proj_date'),
    by='mlbamid',
    on='proj_date',
    direction='backward'
)

Note:
- If y > 198.27 and x < 125.42, it's actually to left
- If y > 198.27 and x > 125.42, it's actually to right

### Model #1. Expected Outcome

Probability of events given how the baseball was launched, where it was launched to, and some information about the batter, including handedness and base running. Park and weather are notably excluded.

$ \hat{\text{eventsModel}} = launch\_angle + launch\_speed + to\_l + to\_lc + to\_c + to\_rc + to\_r + b\_L + UBR600 $

##### Inputs

In [None]:
# Model #1 inputs, grouped by kind; the concatenation order is the feature order
launch_inputs = ['launch_angle', 'launch_speed']
direction_inputs = ['to_l', 'to_lc', 'to_c', 'to_rc', 'to_r']
batter_inputs = ['b_L', 'UBR600']
non_batted_inputs = ['bb', 'hbp', 'so']

outcome_inputs = launch_inputs + direction_inputs + batter_inputs + non_batted_inputs

##### Sample

Set launch data to 0 if not batted

In [None]:
# Missing launch metrics become 0 so those rows are not removed by the dropna on model inputs
complete_dataset[['launch_angle', 'launch_speed']] = complete_dataset[['launch_angle', 'launch_speed']].fillna(0)

Remove atypical events and rows with missing values

In [None]:
# Discard rows labelled "Cut", then any row missing one of the Model #1 inputs
is_cut_event = complete_dataset['eventsModel'].isin(["Cut"])
complete_dataset = complete_dataset.loc[~is_cut_event].dropna(subset=outcome_inputs)

Define model input and outputs

In [None]:
# Feature matrix (one column per entry in outcome_inputs)
X = complete_dataset[outcome_inputs].values
# Target kept 2-D, shape (n, 1), via the double brackets — the encoder below is given this array as-is
y = complete_dataset[['eventsModel']].values

##### Encode

In [None]:
# Refit-and-save branch is deliberately disabled (`1 == 2`); re-enable by restoring the
# commented condition below. With the branch disabled, `encode_outcome` must already exist
# in the namespace (presumably loaded by the Models notebook run in the Imports cell — confirm).
# if not hasattr(sys.modules['__main__'], '__file__'): # Run if notebook is origin file
if 1 == 2: # Force not to run regardless of origin file
    # One-hot encode the target
    encode_outcome = OneHotEncoder(sparse_output=False)
    # Fit and transform
    y_encoded = encode_outcome.fit_transform(y)
    # Create folder
    outcome_model_dir = os.path.join(model_path, "M01. Park and Weather Factors", todaysdate)
    os.makedirs(outcome_model_dir, exist_ok=True)
    # Save; the context manager guarantees the file is flushed and closed
    with open(os.path.join(outcome_model_dir, "encode_outcome.pkl"), 'wb') as encoder_file:
        pickle.dump(encode_outcome, encoder_file)
else:
    y_encoded = encode_outcome.transform(y)

# Calculate number of classes (used for model inputs)
num_classes = y_encoded.shape[1]

##### Scale

In [None]:
# Refit-and-save branch is deliberately disabled (`1 == 2`); with it disabled, `scale_outcome`
# must already exist in the namespace (presumably loaded by the Models notebook — confirm).
# if not hasattr(sys.modules['__main__'], '__file__'): # Run if notebook is origin file
if 1 == 2: # Force not to run regardless of origin file
    # Scale
    scale_outcome = StandardScaler()
    # Fit and transform
    X_scaled = scale_outcome.fit_transform(X)
    # Create folder here too, so this cell does not depend on the encoder cell having created it
    outcome_scaler_dir = os.path.join(model_path, "M01. Park and Weather Factors", todaysdate)
    os.makedirs(outcome_scaler_dir, exist_ok=True)
    # Save; the context manager guarantees the file is flushed and closed
    with open(os.path.join(outcome_scaler_dir, "scale_outcome.pkl"), 'wb') as scaler_file:
        pickle.dump(scale_outcome, scaler_file)
else:
    X_scaled = scale_outcome.transform(X)

##### Train

In [None]:
# Training branch for Model #1. It is deliberately disabled (`1 == 2`), so on a normal run
# `predict_outcome` must already exist in the namespace for the Predict cell to work
# (presumably loaded by the Models notebook run in the Imports cell — confirm).
# if not hasattr(sys.modules['__main__'], '__file__'): # Run if notebook is origin file
if 1 == 2: # Force not to run regardless of origin file 
    # Fully connected classifier: one input per feature, one softmax output per event class
    predict_outcome = Sequential([
        Dense(32, input_shape=(X_scaled.shape[1],), activation='relu'),
        Dense(256, activation='relu'),
        Dense(128, activation='relu'),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(num_classes, activation='softmax')  # softmax for multi-class classification
    ])
    
    predict_outcome.compile(optimizer=Adam(learning_rate=0.00001),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    
    early_stop = EarlyStopping(
        monitor='val_loss',      # watch validation loss
        patience=5,              # stop if no improvement after 5 epochs
        restore_best_weights=True
    )
    
    # validation_split=0.2 holds out 20% of the rows passed to fit for the val_loss monitor
    predict_outcome.fit(
        X_scaled, y_encoded,
        epochs=100,
        batch_size=32,
        validation_split=0.2,
        callbacks=[early_stop]
    )

    # Saved into today's model folder; that folder is created in the Encode cell's save branch
    predict_outcome.save(os.path.join(model_path, "M01. Park and Weather Factors", todaysdate, 'predict_outcome.keras'))

##### Predict

Predicted rates of events based on batted-ball data

In [None]:
# Class probabilities for every row, one column per encoded event category
predictions = predict_outcome.predict(X_scaled)

# Label each probability column "<event>_pred_batted"
predicted_columns = [f'{category}_pred_batted' for category in encode_outcome.categories_[0]]

# Place the predictions beside the source rows; the index is reset so both sides align positionally
prediction_df = pd.concat(
    [
        complete_dataset.reset_index(drop=True),
        pd.DataFrame(predictions, columns=predicted_columns),
    ],
    axis=1,
)

##### Evaluate

In [None]:
# Spot-check the last rows, including the new *_pred_batted columns
prediction_df.tail()

In [None]:
# Panel grid: three plots per row, with enough rows for every event
n_events = len(events_list)
n_cols = 3
n_rows = -(-n_events // n_cols)  # ceiling division

# Each panel is a 5x5 inch square
fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows))
axes = axes.flatten()

for ax, event in zip(axes, events_list):
    pred_col = f"{event}_pred_batted"

    # Events without a prediction column keep an empty panel
    if pred_col in prediction_df.columns:
        # Assign each row to a quantile bucket of its predicted rate (stored on prediction_df)
        prediction_df['bucket'] = pd.qcut(prediction_df[pred_col], q=20, duplicates='drop')

        # Mean predicted and mean actual rate within each bucket
        bucket_avg = (
            prediction_df
            .groupby('bucket')
            .agg(avg_pred=(pred_col, 'mean'), avg_actual=(event, 'mean'))
            .reset_index()
        )

        # Predicted first, then actual, plotted against bucket position
        for series_name, series_label in (('avg_pred', 'Predicted'), ('avg_actual', 'Actual')):
            ax.plot(bucket_avg[series_name], label=series_label)

        ax.set_title(f"{event.upper()} Prediction vs Actual")
        ax.set_xlabel("Quantile Bucket")
        ax.set_ylabel("Rate")
        ax.legend()
        ax.grid(True)

# Remove the unused panels at the end of the grid
for unused_ax in axes[n_events:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

### Calculate PFX

In [None]:
# Column-name lists derived from events_list, in the same order
events_list_pred_batted = ["{}_pred_batted".format(event) for event in events_list]  # Model #1 predictions
pfx_list = ["{}_pfx".format(event) for event in events_list]                          # park factors

In [None]:
# One row per venue / game / batter side / date: mean of the actual event columns
# and mean of the batted-ball predicted probabilities
game_pfx_df = prediction_df.groupby(['venue_id', 'gamePk', 'batSide', 'date'])[events_list + events_list_pred_batted].mean().reset_index()

In [None]:
# Rolling window length, in games per venue and batter side
# NOTE(review): 243 = 3 x 81, presumably three seasons of home games — confirm
num_games = 243

In [None]:
# Make sure the data is sorted appropriately for rolling:
# within each venue, rows run in date order so a rolling window covers consecutive games
game_pfx_df = game_pfx_df.sort_values(['venue_id', 'date', 'batSide'])

In [None]:
# Preview the sorted game-level frame
game_pfx_df.head()

##### Unshifted

In [None]:
# Columns that get a trailing mean: actual event rates and their batted-ball predictions
rolling_cols = events_list + events_list_pred_batted

# Trailing mean over the last num_games games per venue and batter side, *including*
# the current game. shift(0) is a no-op kept to mirror the shifted version of this cell.
rolling_avgs = game_pfx_df.groupby(['venue_id', 'batSide'], group_keys=False).apply(
    lambda group: group[rolling_cols].shift(0).rolling(num_games, min_periods=1).mean()
)

# Mark the new columns as rolling averages
rolling_avgs.columns = [f'{col}_rolling' for col in rolling_cols]

# Place them beside the per-game rows (aligned on the index)
unshifted_game_pfx_df = pd.concat([game_pfx_df, rolling_avgs], axis=1)

# Park factor = rolling actual rate / rolling batted-ball predicted rate
for event in events_list:
    rolling_actual = unshifted_game_pfx_df[f'{event}_rolling']
    rolling_predicted = unshifted_game_pfx_df[f'{event}_pred_batted_rolling']
    unshifted_game_pfx_df[f'{event}_pfx'] = rolling_actual / rolling_predicted

In [None]:
# Latest park factors: keep the last row for each venue and batter side, then write to CSV
latest_pfx_columns = ['venue_id', 'batSide'] + [col for col in unshifted_game_pfx_df if col.endswith("pfx")]
latest_pfx_df = unshifted_game_pfx_df[latest_pfx_columns].drop_duplicates(subset=['venue_id', 'batSide'], keep='last')
latest_pfx_df.to_csv(os.path.join(baseball_path, "Park Latest.csv"), index=False)

##### Shifted

In [None]:
# Columns that get a trailing mean: actual event rates and their batted-ball predictions
rolling_cols = events_list + events_list_pred_batted

# Trailing mean over the previous num_games games per venue and batter side, *excluding*
# the current game: shift(1) moves every value down one row before the window is applied,
# so the first game in each group has no history and comes out as NaN.
rolling_avgs = game_pfx_df.groupby(['venue_id', 'batSide'], group_keys=False).apply(
    lambda group: group[rolling_cols].shift(1).rolling(num_games, min_periods=1).mean()
)

# Mark the new columns as rolling averages
rolling_avgs.columns = [f'{col}_rolling' for col in rolling_cols]

# Place them beside the per-game rows (aligned on the index)
shifted_game_pfx_df = pd.concat([game_pfx_df, rolling_avgs], axis=1)

# Park factor = rolling actual rate / rolling batted-ball predicted rate
for event in events_list:
    rolling_actual = shifted_game_pfx_df[f'{event}_rolling']
    rolling_predicted = shifted_game_pfx_df[f'{event}_pred_batted_rolling']
    shifted_game_pfx_df[f'{event}_pfx'] = rolling_actual / rolling_predicted

### Model #2. Weather Factors

$ \hat{\text{eventsModel2}} = \hat{\text{eventsModel}} + pfx + meteo\_x\_vect + meteo\_y\_vect + temperature\_2m + relative\_humidity\_2m + dew\_point\_2m + surface\_pressure + venue\_id $

The purpose of this model is to estimate rates of events in games based on weather and venue. This model is trained with expected rates based on the actual batted ball data. This allows us to control for differences in inherent batted ball data across games. The model then predicts with league average rates to determine how a game with typical batted ball data would differ in various weather and venue conditions. <br>
Ideally, we would then compare these predicted rates to league average rates to determine park x weather factors, multipliers that estimate how much more or less likely given events are on the game-level than under average conditions. <br>
However, this is hard. <br>
Instead, predicted rates are used to assign park-specific quantiles to games based on weather conditions. For instance, weather conditions that predict a 0.05 projected home run rate at Fenway may get a game assigned to the top quantile of games at that park. From this point, multipliers will be calculated by averaging actual home run rates at similar games and dividing by batted-ball predicted home run rates at those same games. The result can be interpreted as a multiplier that determines how much more or less likely home runs were at games with similar weather conditions compared to their batted ball likelihoods.

##### Inputs

Meteo weather inputs

In [None]:
# Open Meteo weather features fed to Model #2
meteo_weather_list = [
    'meteo_x_vect',           # wind component from calculate_vectors
    'meteo_y_vect',           # wind component from calculate_vectors
    'temperature_2m',
    'relative_humidity_2m',
    'dew_point_2m',
    'surface_pressure',
]

Parks with sufficient samples

In [None]:
# Venues with more than 20,000 rows in prediction_df get their own dummy input
venue_row_counts = prediction_df['venue_id'].value_counts()
sample_venue_ids = sorted(venue_row_counts[venue_row_counts > 20000].index.tolist())
venue_dummy_list = [f'venue_{venue}' for venue in sample_venue_ids]

Select inputs

In [None]:
# Model #2 inputs, in feature order: batted-ball predictions, park factors,
# Open Meteo weather, venue dummies, and the left-handed-batter flag
wfx_inputs = events_list_pred_batted + pfx_list + meteo_weather_list + venue_dummy_list + ['b_L']

##### Sample

Merge in park factors

In [None]:
# Left-join the shifted park factors (built from earlier games only) onto every row
# by game and batter side; rows without a match keep NaN park factors
sample_df2 = prediction_df.merge(shifted_game_pfx_df[['gamePk', 'batSide'] + pfx_list], on=['gamePk', 'batSide'], how='left')

Create venue dummies

Note: not all venue dummies may be included in venue_dummy_list

In [None]:
# Work on a copy of venue_id: get_dummies replaces the column it encodes,
# and the original venue_id is still needed for grouping later
sample_df2['venue_id2'] = sample_df2['venue_id'].copy()

In [None]:
# One 'venue_<id>' indicator column per venue; every level is kept (drop_first=False)
sample_df2 = pd.get_dummies(sample_df2, columns=['venue_id2'], prefix='venue', drop_first=False)

Set pfx to 1 if not in venue sample

Note: we may want to set this in shifted_game_pfx_df and default to a rolling value

In [None]:
# Row-wise sum of the sample-venue dummies: 0 means the row's venue is not in venue_dummy_list
sample_df2['sample_venue'] = sample_df2[venue_dummy_list].sum(axis=1)

In [None]:
# Rows at venues outside the sample get a neutral park factor of 1 for every event
outside_sample = sample_df2['sample_venue'] == 0
for pfx in pfx_list:
    sample_df2[pfx] = np.where(outside_sample, 1, sample_df2[pfx])

Drop if missing inputs

In [None]:
# Remove, in place, any row missing one of the Model #2 inputs
sample_df2.dropna(subset=wfx_inputs, inplace=True)

Group by game

In [None]:
# Collapse to one row per game and batter side by averaging the inputs and the actual event columns
sample_df2 = sample_df2.groupby(['gamePk', 'date', 'venue_id', 'batSide'])[wfx_inputs + events_list].mean().reset_index()

In [None]:
# Rebuild b_L from batSide as an integer flag (1 for "L", 0 otherwise), replacing the averaged value
sample_df2['b_L'] = (sample_df2['batSide'] == "L").astype(int)

In [None]:
# Keep games after the cutoff.
# NOTE(review): 'date' is compared with the integer 20180101, so this assumes a numeric YYYYMMDD column — confirm
sample_df2 = sample_df2[sample_df2['date'] > 20180101]

In [None]:
# Split features and target
# (this rebinds X and y, which previously held the Model #1 arrays)
X = sample_df2[wfx_inputs].values
y = sample_df2[events_list].values

# Number of classes = number of event columns in the target
num_classes = y.shape[1]

###### Scale

In [None]:
# Fit and save the Model #2 scaler when `__main__` has no `__file__` attribute;
# otherwise reuse a `scale_wfx` that must already exist in the namespace
# (presumably loaded by the Models notebook run in the Imports cell — confirm).
if not hasattr(sys.modules['__main__'], '__file__'):
    # Scale
    scale_wfx = StandardScaler()
    # Fit and transform
    X_scaled = scale_wfx.fit_transform(X)
    # Create directory
    wfx_scaler_dir = os.path.join(model_path, "M01. Park and Weather Factors", todaysdate)
    os.makedirs(wfx_scaler_dir, exist_ok=True)
    # Save; the context manager guarantees the file is flushed and closed
    with open(os.path.join(wfx_scaler_dir, "scale_wfx.pkl"), 'wb') as scaler_file:
        pickle.dump(scale_wfx, scaler_file)
else:
    X_scaled = scale_wfx.transform(X)

##### Train

In [None]:
%%time
# Train the Model #2 ensemble when `__main__` has no `__file__` attribute.
# When the branch is skipped, `predict_wfx` must already exist in the namespace for the
# Predict cells below (presumably loaded by the Models notebook — confirm).
if not hasattr(sys.modules['__main__'], '__file__'):
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense
    from tensorflow.keras.optimizers import Adam
    from tensorflow.keras.callbacks import EarlyStopping
    from tensorflow.keras.layers import BatchNormalization, Dropout
    from tensorflow.keras.losses import KLDivergence

    class VotingEnsemble:
        """Wraps several fitted models and predicts with the element-wise mean of their outputs."""

        def __init__(self, models):
            # models: list of objects exposing .predict(X, verbose=0)
            self.models = models

        def predict(self, X):
            """Return the average of every member model's prediction for X."""
            predictions = np.array([model.predict(X, verbose=0) for model in self.models])
            return np.mean(predictions, axis=0)

    ensemble_size = 5
    ensemble_models = []
    model_dir = os.path.join(model_path, "M01. Park and Weather Factors", todaysdate)
    os.makedirs(model_dir, exist_ok=True)

    for i in range(ensemble_size):
        # Same architecture for every member; members differ through their random
        # initialisation and dropout (no seed is set in this cell)
        model = Sequential([
            Dense(128, input_shape=(X_scaled.shape[1],), activation='relu'),
            Dropout(0.3),
            Dense(256, activation='relu'),
            Dropout(0.3),
            Dense(128, activation='relu'),
            # Dropout(0.3),
            Dense(64, activation='relu'),
            # Dropout(0.3),
            Dense(32, activation='relu'),
            Dense(num_classes, activation='softmax')
        ])

        # The target is a vector of per-game event rates, so KL divergence is used as the loss.
        # Uses the KLDivergence class imported above rather than relying on a separate
        # `keras` name being present in the namespace.
        model.compile(optimizer=Adam(learning_rate=0.001),
                      # loss='categorical_crossentropy',
                      loss=KLDivergence(),
                      metrics=[KLDivergence()])

        early_stop = EarlyStopping(
            monitor='val_loss',
            patience=5,
            restore_best_weights=True
        )

        model.fit(
            X_scaled, y,
            epochs=100,
            batch_size=32,
            validation_split=0.2,
            callbacks=[early_stop],
            verbose=1
        )

        # Save each member under its own file name inside model_dir
        model_path_i = os.path.join(model_dir, f'predict_wfx_{i}.keras')
        model.save(model_path_i)
        ensemble_models.append(model)

    # Wrap ensemble in predict_wfx for compatibility
    predict_wfx = VotingEnsemble(ensemble_models)

##### Predict

Save event averages for use in predictions in A06. Weather

In [None]:
# One-row frame holding the mean of each event column across the Model #2 sample
average_df = pd.DataFrame(sample_df2[events_list].mean()).T
# NOTE: the CSV export below is currently commented out, so nothing is written by this cell
# average_df.to_csv(os.path.join(baseball_path, "Event Averages.csv"), index=False)

Before predicting, replace with mean predicted event rates (based on batted ball data) to determine how weather would affect an average batted-ball game

In [None]:
# Copy the sample, then overwrite every batted-ball prediction column with a constant:
# the sample-wide mean of the corresponding actual event column
sample_df3 = sample_df2.copy()
for event in events_list:
    sample_df3[f'{event}_pred_batted'] = sample_df3[event].mean()

Now actually predict

In [None]:
# Split features and target
X2 = sample_df3[wfx_inputs].values
# y2 is not referenced again in the visible part of this notebook
y2 = sample_df3[events_list].values

# Scale the features with the scaler fitted on the training features
X2_scaled = scale_wfx.transform(X2)

In [None]:
# Ensemble prediction for every game row, one column per event
predictions2 = predict_wfx.predict(X2_scaled)
prediction_df2 = pd.DataFrame(predictions2, columns=events_list)

# Label the columns "<event>_pred_weather"
prediction_df2 = prediction_df2.add_suffix('_pred_weather')

# Put the predictions beside the rows they came from. reset_index() (without drop=True)
# realigns sample_df3 positionally and also keeps its old index as an 'index' column.
prediction_df2 = pd.concat([prediction_df2, sample_df3.reset_index()], axis=1)

In [None]:
# Preview the weather-based predictions next to their inputs
prediction_df2.head()

Calculate WFX

Predicted, based on weather, over predicted, based on batted-ball data

In [None]:
# Unadjusted factor = weather-based prediction / batted-ball baseline.
# In prediction_df2 the *_pred_batted columns hold the constant sample means set on sample_df3.
for event in events_list:
    prediction_df2[f'{event}_wfx_unadj'] = prediction_df2[f'{event}_pred_weather'] / prediction_df2[f'{event}_pred_batted']

Highlight outliers

In [None]:
# Sort and get top/bottom 500 rows by unadjusted HR factor
top_500 = prediction_df2.nlargest(500, 'hr_wfx_unadj')
bottom_500 = prediction_df2.nsmallest(500, 'hr_wfx_unadj')

# Get value counts: the five most frequent venues in each extreme
top_counts = top_500['venue_id'].value_counts().head(5)
bottom_counts = bottom_500['venue_id'].value_counts().head(5)

# Combine into a 5x4 DataFrame
# NOTE(review): this assumes both extremes contain at least five distinct venues;
# with fewer, the columns have unequal lengths and the constructor raises
result_df = pd.DataFrame({
    'Top Venue': top_counts.index,
    'Top Count': top_counts.values,
    'Bottom Venue': bottom_counts.index,
    'Bottom Count': bottom_counts.values
})

result_df

In [None]:
# Mean wind y-component and temperature for the 100 rows with the highest unadjusted HR factor
prediction_df2.sort_values('hr_wfx_unadj', ascending=False).head(100)[['meteo_y_vect', 'temperature_2m']].mean()

In [None]:
# Same summary for the other end: tail(100) of the descending sort, i.e. the 100 lowest unadjusted HR factors
prediction_df2.sort_values('hr_wfx_unadj', ascending=False).tail(100)[['meteo_y_vect', 'temperature_2m']].mean()

##### Plot

In [None]:
# Adjust the number of rows and columns
n_events = len(events_list)
n_cols = 3
n_rows = (n_events + n_cols - 1) // n_cols  # Ceiling division

# Set square plots: each subplot is 5x5 inches
fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows))
axes = axes.flatten()

for i, event in enumerate(events_list):
    ax = axes[i]
    pred_col = f"{event}_pred_weather"
    
    # Events without a weather prediction column keep an empty panel
    if pred_col not in prediction_df2.columns:
        continue

    # Bucket the predicted values into quantiles (deciles here; stored on prediction_df2)
    prediction_df2['bucket'] = pd.qcut(prediction_df2[pred_col], q=10, duplicates='drop')

    # Compute averages of predicted and actual rates within each bucket
    bucket_avg = prediction_df2.groupby('bucket').agg(
        avg_pred=(pred_col, 'mean'),
        avg_actual=(event, 'mean')
    ).reset_index()

    # Plot
    ax.plot(bucket_avg['avg_pred'], label='Predicted')
    ax.plot(bucket_avg['avg_actual'], label='Actual')
    ax.set_title(f"{event.upper()} Prediction vs Actual")
    ax.set_xlabel("Quantile Bucket")
    ax.set_ylabel("Rate")
    ax.legend()
    ax.grid(True)

    # Set y-axis limits: from half of the max to the max
    y_max = max(bucket_avg['avg_pred'].max(), bucket_avg['avg_actual'].max())
    y_min = y_max / 2

    # Create 10 evenly spaced ticks from y_min to y_max
    ticks = np.linspace(y_min, y_max, 10)
    ax.set_ylim(y_min, y_max)
    ax.set_yticks(np.round(ticks, 5))  # round for cleaner labels

# Remove extra axes if any
for j in range(n_events, len(axes)):
    fig.delaxes(axes[j])

plt.tight_layout()
plt.show()


Drop bucket column 

In [None]:
# Remove the plotting helper column before the Polars conversion below.
# A list is used for `columns`, and errors='ignore' makes the cell safe to re-run
# (or to run when no bucket column was created by the plot cell).
prediction_df2.drop(columns=['bucket'], errors='ignore', inplace=True)

##### Calibrate

In [None]:
%%time
n = 50  # number of last past games to consider (your desired threshold)

# Convert pandas DataFrame to Polars DataFrame
# IMPORTANT: Ensure prediction_df2 is sorted by your game time column (e.g., 'gamePk', 'date')
# BEFORE this cell if it's not already. For example:
# prediction_df2 = prediction_df2.sort_values(by='gamePk').copy()
# NOTE: this rebinds `df`, which earlier held the merged MLB dataset.
df = pl.from_pandas(prediction_df2.copy())

# Initialize lists to store results for new columns
results_mean = {f'{event}_mean': [] for event in events_list}
results_pred_batted_mean = {f'{event}_pred_batted_mean': [] for event in events_list}

# Iterate through each row of the Polars DataFrame
for i in range(len(df)):
    current_row = df.row(i, named=True)  # Get the current row data as a dictionary
    past_data = df.slice(0, i)  # All earlier rows (row order is treated as chronological)

    # The comparison set depends only on the current row, not on the event,
    # so it is built once per row instead of once per row-and-event.

    # Broad filter: earlier rows with the same batter side
    past_subset_broad_filter = past_data.filter(
        pl.col('batSide') == current_row['batSide']
    )
    # Strict filter: same batter side AND same venue
    past_subset_strict_filter = past_subset_broad_filter.filter(
        pl.col('venue_id') == current_row['venue_id']
    )

    # Use the strict subset only when it holds at least n games; otherwise fall back to batSide only
    if len(past_subset_strict_filter) < n:
        games_to_average = past_subset_broad_filter
    else:
        games_to_average = past_subset_strict_filter

    # No history at all (e.g. the first rows): every event gets NaN for this row
    has_history = not games_to_average.is_empty()

    # Take the last n games from the chosen subset
    final_games_for_avg = games_to_average.tail(n)

    for event in events_list:
        true_col = event                    # actual event rate -> '{event}_mean'
        model_col = f'{event}_pred_batted'  # batted-ball baseline -> '{event}_pred_batted_mean'

        if has_history:
            results_mean[f'{event}_mean'].append(final_games_for_avg[true_col].mean())
            results_pred_batted_mean[f'{event}_pred_batted_mean'].append(final_games_for_avg[model_col].mean())
        else:
            results_mean[f'{event}_mean'].append(np.nan)
            results_pred_batted_mean[f'{event}_pred_batted_mean'].append(np.nan)

# Combine all collected results into a single dictionary
final_results_combined = {}
final_results_combined.update(results_mean)
final_results_combined.update(results_pred_batted_mean)

# Add the newly computed columns to the Polars DataFrame
for col_name, values_list in final_results_combined.items():
    df = df.with_columns(pl.Series(name=col_name, values=values_list))

# Convert the final Polars DataFrame back to a pandas DataFrame
prediction_df2 = df.to_pandas()

In [None]:
# Inspect the last rows, which now include the *_mean and *_pred_batted_mean columns
prediction_df2.tail()

##### Calculate WFX

WFX = Actual for similar games / Predicted (using batted-ball data) in similar games 

In [None]:
# Adjusted factor = trailing mean of the actual rate / trailing mean of the batted-ball baseline,
# both taken over the comparison games chosen in the Calibrate cell
for event in events_list:
    prediction_df2[f'{event}_wfx_adj'] = prediction_df2[f'{event}_mean'] / prediction_df2[f'{event}_pred_batted_mean']

##### Evaluate

What questions are you trying to answer?
    - Do multipliers predict hr rates?
    - Are multipliers "fair" across venue and hand?

In [None]:
# hr_wfx_adj is a unitless multiplier (a ratio of two rates), so it cannot be subtracted
# from an HR rate directly. Convert it to an expected HR rate first by applying it to the
# batted-ball baseline, then take actual minus expected so that 0 means "as expected".
# NOTE(review): in prediction_df2 the hr_pred_batted column holds the sample-mean HR rate
# (it was overwritten before predicting) — confirm this is the intended baseline.
expected_hr_rate = prediction_df2['hr_wfx_adj'] * prediction_df2['hr_pred_batted']
prediction_df2['residual'] = prediction_df2['hr'] - expected_hr_rate


plt.figure(figsize=(14, 6))
sns.boxplot(x='venue_id', y='residual', hue='batSide', data=prediction_df2)
plt.axhline(0, color='gray', linestyle='--')  # zero-residual reference line
plt.xticks(rotation=90)
plt.title("Residuals by Venue and Bat Side")
plt.tight_layout()
plt.show()


##### WFX Dataframe

Convert from long to wide

In [None]:
# Split the long frame by batter side
l_shifted_game_wfx_df = prediction_df2[prediction_df2['batSide'] == "L"]
r_shifted_game_wfx_df = prediction_df2[prediction_df2['batSide'] == "R"]

# One row per game, with "_l" / "_r" suffixes on the side-specific columns.
# Left join: a game appears only if it has an "L" row; games with only "R" rows are not included.
wfx_df = pd.merge(l_shifted_game_wfx_df, r_shifted_game_wfx_df, on=['venue_id', 'gamePk', 'date'], how='left', suffixes=("_l", "_r"))

Write all game WFX to CSV

In [None]:
# Export layout: identifiers, every factor column, every prediction column,
# then the actual event rates for left- and right-handed batters
factor_columns = [col for col in wfx_df if "wfx" in col]
prediction_columns = [col for col in wfx_df if "pred" in col]
actual_rate_columns = [f'{event}_l' for event in events_list] + [f'{event}_r' for event in events_list]

export_columns = ['venue_id', 'gamePk', 'date'] + factor_columns + prediction_columns + actual_rate_columns
wfx_df[export_columns].to_csv(os.path.join(baseball_path, "Park and Weather Factors.csv"), index=False)

Write individual-game WFX to CSV

In [None]:
# One CSV per game date holding the identifiers and every factor column
daily_columns = ['venue_id', 'gamePk', 'date'] + [col for col in wfx_df if "wfx" in col]
daily_dir = os.path.join(baseball_path, "A06. Weather", "3. Park and Weather Factors")

for date in wfx_df['date'].unique():
    games_on_date = wfx_df.loc[wfx_df['date'] == date, daily_columns]
    games_on_date.to_csv(os.path.join(daily_dir, f"Park and Weather Factors {date}.csv"), index=False)

##### Player Stat DataFrame

This comes from Model #1 and is completely independent of Model #2. Placement is for convenience, not necessarily logic.

Replace actual event rates with predicted ones

In [None]:
# Overwrite the actual event columns with the Model #1 predicted probabilities.
# The two lists are in the same event order, so columns pair up one-to-one.
# This changes prediction_df in place: after this cell the event columns are no longer the actual outcomes.
prediction_df[events_list] = prediction_df[events_list_pred_batted].copy()

Drop unneeded columns

In [None]:
# Remove the plotting helper column when it is present; nothing happens otherwise
prediction_df.drop(columns=['bucket'], errors='ignore', inplace=True)

Calculate rolling stats

In [None]:
# Window sizes passed to rolling_pas for the short and long rolling stats
short = 50
long = 300

Short

In [None]:
# Short-window rolling stats, with wall-clock timing printed for reference
# NOTE(review): rolling_pas is defined outside this notebook — confirm its output layout there
start_time = time.time()
df_short = rolling_pas(prediction_df, short, events_list)
print(f"Short took {time.time() - start_time:.2f} seconds")

Long

In [None]:
# Long-window rolling stats; every column gets a "_long" suffix to distinguish it from the short set
start_time = time.time()
df_long = rolling_pas(prediction_df, long, events_list)
df_long = df_long.add_suffix("_long")
print(f"Long took {time.time() - start_time:.2f} seconds")

# We only need the rolling stats from long (the rest are in df_short)
# batter_stats_long / pitcher_stats_long are column-name lists defined outside this notebook
long_stats = batter_stats_long + pitcher_stats_long
df_long = df_long[long_stats]

Merge long stats onto rolling (and other) stats

In [None]:
# Place short and long rolling stats side by side (aligned on the index)
final_dataset = pd.concat([df_short, df_long], axis=1)
final_dataset.reset_index(drop=True, inplace=True)
# Order rows by date, game, and at-bat index
final_dataset.sort_values(['date', 'gamePk', 'atBatIndex'], ascending=True, inplace=True)
# Remove the event columns (and 'Cut'); event dummies are rebuilt from eventsModel in the next cells
final_dataset.drop(columns=events_list + ['Cut'], inplace=True)

Add event dummies

In [None]:
# 0/1 indicator column for each distinct eventsModel value
event_dummies = pd.get_dummies(final_dataset['eventsModel']).astype(int)

In [None]:
# Append the indicator columns; concat aligns on the index, so the row order set by the earlier sort is kept
final_dataset = pd.concat([final_dataset, event_dummies], axis=1)

Clean

In [None]:
# Replace +/- infinity with 0 everywhere in the frame
final_dataset = final_dataset.replace([float('inf'), float('-inf')], 0)

Write to CSV

In [None]:
%%time
# Write the final dataset to CSV without the index column
final_dataset.to_csv(os.path.join(baseball_path, "Final Dataset.csv"), index=False)

In [None]:
# Latest date present in the exported dataset (quick freshness check)
final_dataset['date'].max()

### Required Follow-Ups:
- Model #1. Expected Outcomes
    - Model #2. Park and Weather Factors
    - B01. Matchups
    - M02. Stat Imputations
    - M03. Plate Appearances
- Model #2. Park and Weather Factors
    - M03. Plate Appearances

Note: You should avoid rerunning Model #1. Expected Outcomes as much as possible. Likely shouldn't need many updates anyway.