# M01. Park and Weather Factors
- This calculates Park x Weather Factors
- Type: Model
- Run Frequency: Daily
- Sources:
    - MLB API
    - Steamer
- Created: 12/10/2024
- Updated: 12/17/2024

### Imports

In [None]:
import sys
if not hasattr(sys.modules['__main__'], '__file__'):
    %run "C:\Users\james\Documents\MLB\Code\U1. Imports.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U2. Utilities.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U3. Classes.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U4. Datasets.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U5. Models.ipynb"

### Data

Create Latest PA Data

In [None]:
%%time
# Load the unadjusted complete dataset from baseball_path.
complete_dataset = pd.read_csv(os.path.join(baseball_path, "Complete Dataset - Unadjusted.csv"))

In [None]:
# Columns removed from complete_dataset in the next cell.
drop_list = ['type', 'id', 'description', 'rbi', 'awayScore', 'homeScore', 'postOnFirst', 'postOnSecond', 'postOnThird', 'runner_id', 'start', 'end', 'movementReason', 'isScoringEvent', 'earned', 'pitch_number', 'pitch_name', 'hc_x', 'hc_y', 'hit_location', 'totalDistance', 'launchSpeed', 'launch_angle', 'launch_speed_angle', 'h', 'tb', 'reached', 'faced', 'outs_total', 'outs_pa', 'b1_inning', 'b2_inning', 'b3_inning', 'hr_inning', 'bb_inning', 'hbp_inning', 'so_inning', 'fo_inning', 'go_inning', 'lo_inning', 'po_inning', 'h_inning', 'tb_inning', 'reached_inning', 'faced_inning', 'rbi_inning', 'outs_pa_inning', 'b1_game', 'b2_game', 'b3_game', 'hr_game', 'bb_game', 'hbp_game', 'so_game', 'fo_game', 'go_game', 'lo_game', 'po_game', 'h_game', 'tb_game', 'reached_game', 'faced_game', 'rbi_game', 'outs_pa_game', 'bottom', 'atBatIndex_min', 'first_ab', 'atBatIndex_max', 'pulled', 'times_faced']

In [None]:
# Drop in place; with pandas' default errors='raise', a listed column that is absent raises KeyError.
complete_dataset.drop(columns=drop_list, inplace=True)

##### Open Meteo Weather Data

In [None]:
%%time
# Read every CSV in the Open Meteo folder, stack them into one frame with a fresh index,
# and keep only the listed columns (a missing column raises KeyError).
weather_df = pd.concat(map(pd.read_csv, glob.glob(r"C:\Users\james\Documents\MLB\Database\A06. Weather\1. Open Meteo\*.csv")), ignore_index=True)[
       ['game_id', 'year', 'venue_name', 'location.defaultCoordinates.latitude',
       'location.defaultCoordinates.longitude', 'fieldInfo.leftLine',
       'fieldInfo.center', 'fieldInfo.rightLine', 'fieldInfo.leftCenter',
       'fieldInfo.rightCenter', 'location.elevation', 'location.azimuthAngle',
       'fieldInfo.roofType', 'active', 'temperature_2m',
       'relative_humidity_2m', 'dew_point_2m', 'surface_pressure',
       'wind_speed_10m', 'wind_direction_10m', 'weather_code',
       'precipitation_probability']]

In [None]:
def calculate_vectors(row, azimuth_column, wind_column, speed_column):
    """Split one row's wind into x/y components relative to an azimuth.

    The angle is the wind-direction value minus the azimuth value, in
    degrees. Its sine (x) and cosine (y) are rounded to 5 decimals, scaled
    by the speed value, and negated.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) indexed by column name.
        azimuth_column: Key holding the azimuth angle in degrees.
        wind_column: Key holding the wind direction in degrees.
        speed_column: Key holding the wind speed.

    Returns:
        pd.Series with index ['x_vect', 'y_vect'].
    """
    relative_angle = math.radians(row[wind_column] - row[azimuth_column])
    speed = row[speed_column]

    # Sine gives the x component, cosine the y component; both are negated
    components = [
        round(trig(relative_angle), 5) * speed * -1
        for trig in (math.sin, math.cos)
    ]

    return pd.Series(components, index=['x_vect', 'y_vect'])

In [None]:
# Row by row, derive wind components from the Open Meteo wind direction and speed, measured relative to the park's azimuth angle.
weather_df[['meteo_x_vect', 'meteo_y_vect']] = weather_df.apply(lambda row: calculate_vectors(row, 'location.azimuthAngle', 'wind_direction_10m', 'wind_speed_10m'), axis=1)

##### Merge

In [None]:
# Inner join: rows whose gamePk has no matching game_id in weather_df are dropped.
# NOTE(review): assumes game_id is unique in weather_df; duplicates would multiply rows — confirm.
complete_dataset = complete_dataset.merge(weather_df, left_on=['gamePk'], right_on=['game_id'], how='inner')

### Base Rates

Calculate average stats in a given base year <br>
Note: This only has to be run once

In [None]:
def base_rates(df, base_year=2014):
    """Return the mean of each event column over one calendar year.

    Args:
        df: DataFrame with a 'game_date' column parseable by pd.to_datetime
            and one column per entry in events_list. It is not modified.
        base_year: Calendar year whose rows are averaged.

    Returns:
        One-row DataFrame with one column per entry in events_list.
    """
    # Parse into a local Series instead of writing back to df, so the
    # caller's 'game_date' column keeps its original dtype
    game_years = pd.to_datetime(df['game_date']).dt.year

    # Select period of interest
    base_period = df.loc[game_years == base_year]

    # Calculate averages over period of interest
    return pd.DataFrame(base_period[events_list].mean()).T

In [None]:
# base_rate_df = base_rates(complete_dataset, 2014)
# base_rate_df.to_csv(os.path.join(baseball_path, "Base Rates.csv"), index=False)

### Game Averages

Average rates within the game

In [None]:
def game_averages(df):
    """Average every events_list column within each game.

    Rows are grouped by game identifiers plus the game's wind vectors and
    temperature. Each group becomes one output row with the mean of every
    events_list column and a 'pas' count of the rows in that group.

    Returns:
        DataFrame with one row per group, ordered by 'game_date' ascending.
    """
    game_keys = ['gamePk', 'game_date', 'venue_id', 'away_name', 'home_name', 'x_vect', 'y_vect', 'temperature']
    by_game = df.groupby(game_keys)

    # Mean event rates per game
    per_game = by_game[events_list].mean().reset_index()

    # Number of observations behind each game's averages
    per_game['pas'] = by_game.size().values

    return per_game.sort_values(['game_date'], ascending=True)

In [None]:
# game_average_df = game_averages(complete_dataset)
# game_average_df.tail(1)

### Player Averages

Average stats of all the players in the game, coming into the game

In [None]:
def player_averages(df):
    """Average the batters' and pitchers' long-run stats within each game.

    For every (game, batter) and (game, pitcher) pair, the value from that
    player's last row in the game is applied to all of the player's rows.
    The '<event>_b_long' and '<event>_p_long' columns are then averaged per
    game.

    Args:
        df: DataFrame with 'gamePk', 'batter', 'pitcher' and the
            '<event>_b_long' / '<event>_p_long' columns for every entry in
            events_list. It is not modified.

    Returns:
        DataFrame with one row per gamePk: 'gamePk', then the batter
        averages, then the pitcher averages.
    """
    # Stats to average
    batter_inputs = [f"{event}_b_long" for event in events_list]
    pitcher_inputs = [f"{event}_p_long" for event in events_list]

    # Work on a copy of just the needed columns. Writing the last-row values
    # back into the caller's frame would overwrite its per-row player stats.
    work = df[['gamePk', 'batter', 'pitcher'] + batter_inputs + pitcher_inputs].copy()

    # Apply stats from last at bat to entire game
    work[batter_inputs] = work.groupby(['gamePk', 'batter'])[batter_inputs].transform('last')
    work[pitcher_inputs] = work.groupby(['gamePk', 'pitcher'])[pitcher_inputs].transform('last')

    # Calculate player averages by game
    batter_avgs = work.groupby(['gamePk'])[batter_inputs].mean().reset_index()
    pitcher_avgs = work.groupby(['gamePk'])[pitcher_inputs].mean().reset_index()

    # Both frames list the same games in the same order, so a column-wise concat lines up
    return pd.concat([batter_avgs, pitcher_avgs.drop(columns=['gamePk'])], axis=1)

In [None]:
# player_average_df = player_averages(complete_dataset)
# player_average_df.tail(1)

### League Averages

In [None]:
def league_average(complete_dataset, days=30):
    """Compute rolling league-wide event rates by game date.

    Events are summed per 'game_date', then summed again over a rolling
    window of `days` rows (one row per date, current date included). Each
    event's windowed sum is divided by the windowed total of all events.

    Args:
        complete_dataset: DataFrame with 'game_date' and the events_list columns.
        days: Rolling window length, in rows of the per-date table.

    Returns:
        DataFrame with 'game_date' and one '<event>_lg' column per event.
    """
    # One row per date holding that date's event totals
    daily = complete_dataset.groupby('game_date')[events_list].sum().reset_index()
    daily['pas'] = daily[events_list].sum(axis=1)

    # Rolling totals, current row included
    for name in [*events_list, 'pas']:
        daily[f'{name}_sum'] = daily[name].rolling(window=days, min_periods=1).sum()

    # Share of each event among all events in the window
    pas_window = daily['pas_sum']
    for name in events_list:
        daily[f'{name}_lg'] = daily[f'{name}_sum'] / pas_window

    lg_columns = [col for col in daily if "_lg" in col]
    return daily[["game_date"] + lg_columns]

In [None]:
# league_average_df = league_average(complete_dataset, 30)
# league_average_df.tail(1)

### Park Factors

##### Rolling Averages

Average of stats over the last rolling_window games, including the game of interest (that game is excluded later, by the shift in the Shift section)

In [None]:
def rolling_averages(game_avgs, rolling_window, column):
    """Compute PA-weighted rolling event rates within each value of `column`.

    Args:
        game_avgs: Per-game frame containing `column`, 'game_date', 'pas'
            and the events_list columns.
        rolling_window: Number of consecutive rows (games) in each window.
        column: Column to group by before rolling.

    Returns:
        Sorted copy of game_avgs in which every events_list column holds the
        weighted rolling average and 'pas_rolling' holds the windowed 'pas'
        total. The input frame is not modified.
    """
    # Sort by group column and date
    rolling_avgs = game_avgs.sort_values([column, 'game_date']).copy()

    # Compute rolling sum for `pas`
    # closed="right" keeps the current row inside its own window
    rolling_avgs['pas_rolling'] = rolling_avgs.groupby(column)['pas'].transform(
        lambda x: x.rolling(window=rolling_window, min_periods=1, closed="right").sum()
    )

    # Define function for rolling weighted average
    def weighted_avg(group):
        # For each window, weight every row's rate by that row's 'pas' and divide by the window's total 'pas'.
        # The 'pas' lookup goes through x.index, so it relies on index labels being unique
        # within the group. NOTE(review): confirm callers always pass a uniquely indexed frame.
        return (
            group[events_list]
            .rolling(window=rolling_window, min_periods=1, closed="right")
            .apply(lambda x: (x * group.loc[x.index, 'pas']).sum() / group.loc[x.index, 'pas'].sum(), raw=False)
        )

    # Apply rolling weighted average by the given column
    # group_keys=False keeps the original row index so the assignment aligns row for row
    rolling_avgs[events_list] = rolling_avgs.groupby(column, group_keys=False).apply(weighted_avg)

    
    return rolling_avgs

In [None]:
# Need to account for small sample parks

In [None]:
# park_average_df = rolling_averages(game_average_df, 243, 'venue_id')
# park_average_df = park_average_df[['gamePk', 'game_date', 'venue_id', 'away_name', 'home_name'] + events_list + ['pas_rolling']]
# park_average_df.tail(1)

In [None]:
# team_average_df = rolling_averages(game_average_df, 243, 'away_name')
# team_average_df = team_average_df[['gamePk', 'game_date', 'venue_id', 'away_name', 'home_name'] + events_list + ['pas_rolling']]
# team_average_df.tail(1)

##### Park Factors

In [None]:
def create_park_factors(park_avgs, team_avgs):
    """Divide each park's rolling rates by the home team's rolling away rates.

    Each park row is matched, by team, to the most recent team_avgs row
    whose 'game_date' is on or before the park row's date. The team is the
    park row's 'home_name' and the team row's 'away_name'.

    Args:
        park_avgs: Rolling averages by venue, with 'gamePk', 'game_date',
            'home_name' and the events_list columns.
        team_avgs: Rolling averages by away team, with 'gamePk',
            'game_date', 'away_name' and the events_list columns.

    Returns:
        DataFrame with 'gamePk' (the park row's game) and one '<event>_pfx'
        column per event. A zero team rate yields inf; an unmatched park row
        yields NaN. The inputs are not modified.
    """
    # Parse dates first, then sort. merge_asof needs both frames ordered
    # chronologically by the key, and sorting the raw strings would only be
    # chronological if every string shared one sortable format.
    park_avgs = park_avgs.assign(
        game_date=pd.to_datetime(park_avgs['game_date'], format='mixed'),
        team_name=park_avgs['home_name'],
    ).sort_values('game_date')
    team_avgs = team_avgs.assign(
        game_date=pd.to_datetime(team_avgs['game_date'], format='mixed'),
        team_name=team_avgs['away_name'],
    ).sort_values('game_date')

    # Perform merge_asof: latest team row on or before each park row's date
    park_factor_df = pd.merge_asof(park_avgs, team_avgs, left_on='game_date', right_on='game_date', by='team_name', direction='backward', suffixes=('_park', '_team'))

    # Calculate park factors
    for stat in events_list:
        park_factor_df[f'{stat}_pfx'] = park_factor_df[f'{stat}_park'] / park_factor_df[f'{stat}_team']

    # Keep the park-side game id as the key
    park_factor_df = park_factor_df.rename(columns={'gamePk_park': 'gamePk'})
    keep_columns = ['gamePk'] + [col for col in park_factor_df.columns if col.endswith('pfx')]

    return park_factor_df[keep_columns]

In [None]:
# park_factor_df = create_park_factors(park_average_df, team_average_df)

In [None]:
# park_factor_df.tail(1)

### Analysis 

Merge together game averages, player averages, and park factors

In [None]:
def create_analysis_df(complete_dataset, league_average_df, park_factor_df):
    """Assemble the modeling frame from PA data, league averages and park factors.

    League averages are joined on 'game_date' and park factors on 'gamePk',
    both as inner joins. One dummy column per venue is appended, the frame
    is reduced to identifier, weather, park, player, park-factor and
    league-average columns, and rows whose 'eventsModel' is "Cut" are removed.

    Returns:
        Tuple of (analysis DataFrame, list of venue dummy column names).
    """
    # Attach league averages, then park factors
    merged = complete_dataset.merge(league_average_df, on=['game_date'], how='inner')
    merged = merged.merge(park_factor_df, on='gamePk', how='inner')

    # One indicator column per venue
    venue_dummy_df = pd.get_dummies(merged['venue_id'].astype(int), prefix='venue')
    venue_dummies = venue_dummy_df.columns.tolist()
    merged = pd.concat([merged, venue_dummy_df], axis=1)

    # Column groups, in output order
    id_columns = ["eventsModel", 'gamePk', 'game_date', 'venue_id', 'away_name', 'home_name', 'batter', 'pitcher', 'batSide', 'pitchHand']
    mlb_variables = ['x_vect', 'y_vect', 'temperature', 'weather']
    meteo_variables = ['meteo_x_vect', 'meteo_y_vect',
                       'fieldInfo.leftLine', 'fieldInfo.center', 'fieldInfo.rightLine', 'fieldInfo.leftCenter', 'fieldInfo.rightCenter',
                       'location.elevation', 'fieldInfo.roofType', 'temperature_2m', 'relative_humidity_2m', 'dew_point_2m', 'surface_pressure']
    event_inputs = [
        f'{event}_{suffix}'
        for event in events_list
        for suffix in ('b_long', 'p_long', 'pfx')
    ]
    league_columns = [col for col in merged if col.endswith("_lg")]

    selected = merged[id_columns + mlb_variables + meteo_variables + venue_dummies + event_inputs + league_columns]

    # Remove rows flagged as "Cut"
    return selected[selected['eventsModel'] != "Cut"], venue_dummies

In [None]:
# analysis_df, venue_dummies = create_analysis_df(complete_dataset, league_average_df, park_factor_df)

### Create Datasets

Dome adjustments

In [None]:
# Rows whose 'weather' text contains "Roof" or "Dome" (case-insensitive); missing weather counts as no match.
mask = complete_dataset['weather'].str.contains('Roof|Dome', case=False, na=False)

In [None]:
# For matching rows, overwrite the MLB weather fields with fixed values: temperature 70 and no wind.
complete_dataset.loc[mask, 'temperature'] = 70
complete_dataset.loc[mask, 'x_vect'] = 0
complete_dataset.loc[mask, 'y_vect'] = 0

In [None]:
# Same for the Open Meteo fields, plus fixed humidity and dew point.
# NOTE(review): surface_pressure is left at its outdoor value — confirm intended.
complete_dataset.loc[mask, 'temperature_2m'] = 70
complete_dataset.loc[mask, 'meteo_x_vect'] = 0
complete_dataset.loc[mask, 'meteo_y_vect'] = 0
complete_dataset.loc[mask, 'relative_humidity_2m'] = 60
complete_dataset.loc[mask, 'dew_point_2m'] = 57

Generate or read base rates

In [None]:
### Generate base rates (base year = 2014)
# Only needs to be run once
# Generate:
# base_rate_df = base_rates(complete_dataset, 2014)
# base_rate_df.to_csv(os.path.join(baseball_path, "Base Rates.csv"), index=False)

# Read: 
# Expected to be the one-row file written by the commented-out lines; later cells read row 0 of each event column.
base_rate_df = pd.read_csv(os.path.join(baseball_path, "Base Rates.csv"))

In [None]:
# List of dataframes (index 0 = 'L', index 1 = 'R', following the loop order)
analysis_df_list = []
# Loop over batter sides
for batSide in ['L', 'R']:
    print(batSide)
    # Subset complete dataset
    complete_dataset_side = complete_dataset[complete_dataset['batSide'] == batSide]
    # Calculate game averages (average rates within a particular game)
    game_average_df = game_averages(complete_dataset_side)
    # # Calculate player averages (average rates of all players coming into the game) (deprecated? - player level is in complete_dataset, so it's unnecessary)
    # player_average_df = player_averages(complete_dataset_side)
    # Calculate league averages (rolling over the last 30 game dates, current date included; shifted later)
    league_average_df = league_average(complete_dataset_side, days=30)
    # Average rates at park over last n games (both teams)
    park_average_df = rolling_averages(game_average_df, 243, 'venue_id')
    # Average rates at away games over last n games (both teams)
    team_average_df = rolling_averages(game_average_df, 243, 'away_name')
    # Park factors
    park_factor_df = create_park_factors(park_average_df, team_average_df)
    # Create dataframe that can be used to train and analyze data
    # NOTE(review): venue_dummies is overwritten on every pass, so later cells see only the 'R' list;
    # this assumes both sides produce the same venue columns — confirm.
    analysis_df, venue_dummies = create_analysis_df(complete_dataset_side, league_average_df, park_factor_df)
    analysis_df_list.append(analysis_df)

Extract Dataframes

In [None]:
# Independent copies, so edits to these frames do not touch analysis_df_list.
# Position 0 was appended for batSide 'L' and position 1 for 'R'.
l_analysis_df = analysis_df_list[0].copy()
r_analysis_df = analysis_df_list[1].copy()

In [None]:
# del analysis_df_list

### Park Latest

This contains the latest data available at each park, used to create WFX <br>
Note: We can't just use multiplier dataset for this because it won't contain data at the end of the last game

Columns to Keep

In [None]:
# Game identifiers, venue dummies, park factors and league averages.
# The "_pfx" and "_lg" names come from l_analysis_df and are reused for the R frame.
park_latest_columns = ['gamePk', 'game_date', 'venue_id', 'away_name', 'home_name'] + venue_dummies + [col for col in l_analysis_df.columns if col.endswith("_pfx")] + [col for col in l_analysis_df.columns if col.endswith("_lg")] 

In [None]:
# Store venue_id as int in both frames (raises if any value cannot be converted).
l_analysis_df['venue_id'] = l_analysis_df['venue_id'].astype(int)
r_analysis_df['venue_id'] = r_analysis_df['venue_id'].astype(int)

Write Park's Last Values to CSV

In [None]:
# Keep the last row per venue after ordering by game_date, and write it out for left-handed batters.
# This runs before the Shift section, so the values still include each venue's most recent game.
l_analysis_df[park_latest_columns].sort_values('game_date').drop_duplicates('venue_id', keep='last').to_csv(os.path.join(baseball_path, "Park Latest - LHB.csv"), index=False)

In [None]:
# Same for right-handed batters.
r_analysis_df[park_latest_columns].sort_values('game_date').drop_duplicates('venue_id', keep='last').to_csv(os.path.join(baseball_path, "Park Latest - RHB.csv"), index=False)

### Shift

##### Park Factors

In [None]:
# Park factor columns; the names are taken from the L frame and reused for the R frame.
pfx_list = [col for col in l_analysis_df.columns if col.endswith("pfx")]

Previous game_date at venue_id

In [None]:
# Step 1: within each venue, move every park factor down one row.
# Step 2: give all rows of a venue/date the first non-missing value found on that date.
# NOTE(review): this yields the previous date's value only if rows are already in
# chronological order within each venue — confirm the frame's ordering at this point.
l_analysis_df[pfx_list] = l_analysis_df.groupby("venue_id")[pfx_list].shift(1)
l_analysis_df[pfx_list] = l_analysis_df.groupby(["venue_id", "game_date"])[pfx_list].transform("first")

In [None]:
# Same two steps for the R frame.
r_analysis_df[pfx_list] = r_analysis_df.groupby("venue_id")[pfx_list].shift(1)
r_analysis_df[pfx_list] = r_analysis_df.groupby(["venue_id", "game_date"])[pfx_list].transform("first")

##### League Averages

In [None]:
# League average columns; the names are taken from the L frame and reused for the R frame.
lg_list = [col for col in l_analysis_df.columns if col.endswith("lg")]

In [None]:
# Stable sort: rows sharing a game_date keep their existing relative order.
# The default sort algorithm does not guarantee that, and the row-based shifts
# here and in the batter/pitcher cells depend on within-date row order.
l_analysis_df = l_analysis_df.sort_values('game_date', ascending=True, kind='stable')
# Move every league average down one row, then give all rows of a date the
# first non-missing value found on that date.
l_analysis_df[lg_list] = l_analysis_df[lg_list].shift(1)
l_analysis_df[lg_list] = l_analysis_df.groupby("game_date")[lg_list].transform("first")

In [None]:
# Stable sort: rows sharing a game_date keep their existing relative order.
# The default sort algorithm does not guarantee that, and the row-based shifts
# here and in the batter/pitcher cells depend on within-date row order.
r_analysis_df = r_analysis_df.sort_values('game_date', ascending=True, kind='stable')
# Move every league average down one row, then give all rows of a date the
# first non-missing value found on that date.
r_analysis_df[lg_list] = r_analysis_df[lg_list].shift(1)
r_analysis_df[lg_list] = r_analysis_df.groupby(["game_date"])[lg_list].transform("first")

##### Batter Average

Note: You need to shift by batter and pitchHand to get the batter's last PA against that hand

In [None]:
# Batter long-run stat columns; the names are taken from the L frame and reused for the R frame.
b_long_list = [col for col in l_analysis_df.columns if col.endswith("b_long")]

In [None]:
# Replace each row's batter stats with those from the same batter's previous row against the same pitchHand.
# A batter's first row per pitchHand becomes NaN.
l_analysis_df[b_long_list] = l_analysis_df.groupby(['batter', 'pitchHand'])[b_long_list].shift(1)
r_analysis_df[b_long_list] = r_analysis_df.groupby(['batter', 'pitchHand'])[b_long_list].shift(1)

##### Pitcher Averages

In [None]:
# Pitcher long-run stat columns; the names are taken from the L frame and reused for the R frame.
p_long_list = [col for col in l_analysis_df.columns if col.endswith("p_long")]

Note: You don't strictly need to shift by batSide to get the pitcher's last PA against that hand, because every row in each dataframe already has the same batSide, but it does no harm.

In [None]:
# Replace each row's pitcher stats with those from the same pitcher's previous row against the same batSide.
# A pitcher's first row per batSide becomes NaN.
l_analysis_df[p_long_list] = l_analysis_df.groupby(['pitcher', 'batSide'])[p_long_list].shift(1)
r_analysis_df[p_long_list] = r_analysis_df.groupby(['pitcher', 'batSide'])[p_long_list].shift(1)

### Select Model Inputs

Inputs

In [None]:
# Candidate input groups. meteo_duplicates_variables is defined here but is not part of the input lists built below.
mlb_weather_variables = ['x_vect', 'y_vect', 'temperature'] # drop weather
meteo_duplicates_variables = ['meteo_x_vect', 'meteo_y_vect', 'temperature_2m']
meteo_weather_variables = ['relative_humidity_2m', 'dew_point_2m', 'surface_pressure']
mlb_park_variables = ['fieldInfo.leftLine', 'fieldInfo.center', 'fieldInfo.rightLine', 'fieldInfo.leftCenter', 'fieldInfo.rightCenter', 'location.elevation'] # drop roof type

In [None]:
# Per-event inputs used when fitting: batter stat, pitcher stat, park factor.
training_list = []
# Loop over events
for event in events_list: 
    # Three input features per event (these are model inputs, not the dependent variable)
    training_list += [f'{event}_b_long', f'{event}_p_long', f'{event}_pfx']

In [None]:
# Per-event inputs used when predicting. Same positions as training_list, but the league
# average stands in for both the batter and the pitcher stat.
testing_list = []
# Loop over events
for event in events_list: 
    # Order must match training_list, because predictions are made on positional values
    testing_list += [f'{event}_lg', f'{event}_lg', f'{event}_pfx']

Generic

This uses information about the park and weather to predict, but doesn't train on park-specific dummies

In [None]:
# Generic inputs: weather, park dimensions/elevation and per-event stats, with no venue dummies.
# Both lists share the same column positions.
generic_training_input_list = mlb_weather_variables + meteo_weather_variables + mlb_park_variables + training_list
generic_testing_input_list = mlb_weather_variables + meteo_weather_variables + mlb_park_variables + testing_list

Specific

In [None]:
# Specific inputs: the generic inputs plus one dummy column per venue, appended last.
specific_training_input_list = mlb_weather_variables + meteo_weather_variables + mlb_park_variables + training_list + venue_dummies
specific_testing_input_list = mlb_weather_variables + meteo_weather_variables + mlb_park_variables + testing_list + venue_dummies

In [None]:
# l_analysis_df['venue_id'] = l_analysis_df['venue_id'].astype(str)

### Select Data

Remove Infinite Values

Note: After setting them to missing historically, I'm going to set them to 0 now to ensure new parks aren't excluded. 

In [None]:
# Replace +/-inf with 0 in the training input columns only; columns that appear only in the testing lists are not touched.
l_analysis_df[specific_training_input_list] = l_analysis_df[specific_training_input_list].replace([np.inf, -np.inf], 0)
r_analysis_df[specific_training_input_list] = r_analysis_df[specific_training_input_list].replace([np.inf, -np.inf], 0)

Drop if Missing Data

In [None]:
# Drop every row with a missing value in ANY column, not just the model inputs.
# This also removes rows whose values were set to NaN by the shifts.
l_analysis_df = l_analysis_df.dropna()
r_analysis_df = r_analysis_df.dropna()

### Train/Test Split

Split

In [None]:
# Fixed seed so the assignment is repeatable. Drawing from [0, 0, 1] marks roughly one third of rows with 1.
# NOTE(review): 'split' is not referenced by the training cells visible in this notebook — confirm where it is used.
np.random.seed(42)
l_analysis_df['split'] = np.random.choice([0, 0, 1], size=len(l_analysis_df))
r_analysis_df['split'] = np.random.choice([0, 0, 1], size=len(r_analysis_df))

In [None]:
# Display the number of inputs for the specific model.
len(specific_training_input_list)

### Function

In [None]:
# def train_model(df, training_list, testing_list, filename, layers, activation, max_iter, alpha, learning_rate, batch_size, random_state):
#     # Define file name
#     model_filename = f"{filename}_{random_state}.pkl"
#     model_path_full = os.path.join(model_path, "M01. Park and Weather Factors", model_filename)

#     print(model_filename)
    
#     # Model Parameters
#     model = MLPClassifier(
#         hidden_layer_sizes=layers, 
#         activation=activation, 
#         verbose=False, 
#         alpha=alpha,
#         learning_rate_init=learning_rate, 
#         early_stopping=True, 
#         random_state=random_state,  # Modify random_state for each model
#         max_iter=max_iter, 
#         batch_size=batch_size
#     )

#     # Train model
#     model.fit(df[training_list], df[['eventsModel']].values.ravel())

#     # Save model
#     pickle.dump(model, open(model_path_full, 'wb'))

#     # Predict using the trained model (overwriting same columns each iteration)
#     output_list = list(model.classes_)
#     output_list_pred = [f"{x}_pred" for x in output_list]  

#     df[output_list_pred] = model.predict_proba(df[testing_list].values)

#     # Evaluate
#     # Get dummies
#     for event in events_list:
#         df[event] = (df['eventsModel'] == event).astype(int)

#     num_quantiles = 20  # Adjust if needed

#     # Create a figure with 3x4 subplots
#     fig, axes = plt.subplots(3, 4, figsize=(12, 9))  
#     axes = axes.flat  # Flatten to easily iterate

#     for j, event in enumerate(events_list):  
#         if j >= len(axes):  # Safety check if more events than subplots
#             break  

#         pred_col = f"{event}_pred"  
#         quantile_col = f"{event}_quantile"

#         # Create quantiles
#         df[quantile_col] = pd.qcut(
#             df[pred_col], num_quantiles, labels=False, duplicates='drop'
#         )

#         # Group by quantiles
#         plot_data = df.groupby(quantile_col).agg(
#             avg_pred=(pred_col, "mean"),  # X-axis
#             avg_event=(event, "mean")     # Y-axis
#         ).reset_index()

#         ax = axes[j]

#         # Plot data
#         sns.lineplot(x=plot_data["avg_pred"], y=plot_data["avg_event"], marker="o", ax=ax)

#         # 45-degree reference line
#         min_val, max_val = plot_data["avg_pred"].min(), plot_data["avg_pred"].max()
#         ax.plot([min_val, max_val], [min_val, max_val], linestyle="--", color="gray")

#         ax.set_xlabel("Avg Pred")
#         ax.set_ylabel("Avg Event")
#         ax.set_title(event)  

#     # Adjust layout for better spacing
#     plt.tight_layout() 
#     plt.show()

#     # Calculate WFX
#     for event in events_list:
#         df[f'{event}_wfx_l'] = df[f'{event}_pred'] / base_rate_df[event][0]
    

#     return df

In [None]:
def train_model(df, training_list, testing_list, filename, layers, activation, max_iter, alpha, learning_rate, batch_size, random_state):
    """Fit one MLP event classifier, pickle it, and plot its calibration.

    Args:
        df: DataFrame holding 'eventsModel' plus every column named in
            training_list and testing_list. Modified in place and returned.
        training_list: Columns the model is fitted on.
        testing_list: Columns used for prediction. They are passed as
            positional values, so they must line up with training_list.
        filename: Stem of the pickle file name; random_state is appended.
        layers: Passed as MLPClassifier hidden_layer_sizes.
        activation: Passed as MLPClassifier activation.
        max_iter: Passed as MLPClassifier max_iter.
        alpha: Passed as MLPClassifier alpha.
        learning_rate: Passed as MLPClassifier learning_rate_init.
        batch_size: Passed as MLPClassifier batch_size.
        random_state: Passed as MLPClassifier random_state.

    Returns:
        df with added columns: '<class>_pred' probabilities, one 0/1
        indicator and one '<event>_quantile' per event, and '<event>_wfx_l'.
    """
    model_filename = f"{filename}_{random_state}.pkl"
    model_path_full = os.path.join(model_path, "M01. Park and Weather Factors", model_filename)

    print(model_filename)

    model = MLPClassifier(
        hidden_layer_sizes=layers,
        activation=activation,
        verbose=False,
        alpha=alpha,
        learning_rate_init=learning_rate,
        early_stopping=True,
        random_state=random_state,
        max_iter=max_iter,
        batch_size=batch_size
    )

    model.fit(df[training_list], df[['eventsModel']].values.ravel())

    # Context manager closes the file handle even if pickling fails
    with open(model_path_full, 'wb') as model_file:
        pickle.dump(model, model_file)

    # One probability column per class, in the model's class order.
    # .values is used because testing_list has different column names than training_list.
    output_list = list(model.classes_)
    output_list_pred = [f"{x}_pred" for x in output_list]
    df[output_list_pred] = model.predict_proba(df[testing_list].values)

    # 0/1 indicator per event, used as the observed rate in the calibration plots
    for event in events_list:
        df[event] = (df['eventsModel'] == event).astype(int)

    num_quantiles = 20
    quantile_results = {}

    # Bucket rows by predicted probability and compare mean prediction to mean outcome
    for event in events_list:
        pred_col = f"{event}_pred"
        quantile_col = f"{event}_quantile"

        # duplicates='drop' merges tied bin edges, so there may be fewer than num_quantiles buckets
        df[quantile_col] = pd.qcut(
            df[pred_col], num_quantiles, labels=False, duplicates='drop'
        )

        plot_data = df.groupby(quantile_col).agg(
            avg_pred=(pred_col, "mean"),
            avg_event=(event, "mean")
        ).reset_index()

        quantile_results[event] = plot_data

    # 3x4 grid: at most 12 events are plotted
    _, axes = plt.subplots(3, 4, figsize=(12, 9))
    axes = axes.flat

    for j, event in enumerate(events_list):
        if j >= len(axes):
            break

        ax = axes[j]
        plot_data = quantile_results[event]

        sns.lineplot(x=plot_data["avg_pred"], y=plot_data["avg_event"], marker="o", ax=ax)

        # Shared limits for both axes so the 45-degree line is the diagonal
        min_val = min(plot_data["avg_pred"].min(), plot_data["avg_event"].min())
        max_val = max(plot_data["avg_pred"].max(), plot_data["avg_event"].max())

        # Add small padding to prevent dots on edges
        padding = (max_val - min_val) * 0.05
        min_val -= padding
        max_val += padding

        ax.set_xlim(min_val, max_val)
        ax.set_ylim(min_val, max_val)

        # Reference line: perfectly calibrated predictions fall on it
        ax.plot([min_val, max_val], [min_val, max_val], linestyle="--", color="gray")

        # Force square aspect ratio
        ax.set_aspect('equal', adjustable='box')

        # Let matplotlib auto-handle tick locations (nice round numbers)
        ax.xaxis.set_major_locator(plt.MaxNLocator(5))
        ax.yaxis.set_major_locator(plt.MaxNLocator(5))

        ax.set_xlabel("Avg Pred")
        ax.set_ylabel("Avg Event")
        ax.set_title(event)

    plt.tight_layout()
    plt.show()

    # Factor = predicted probability relative to the base-year rate (row 0 of base_rate_df).
    # NOTE(review): the suffix is always '_wfx_l', even when called for R or generic models — confirm intended.
    for event in events_list:
        df[f'{event}_wfx_l'] = df[f'{event}_pred'] / base_rate_df[event][0]

    return df


### Specific

### WFX - L

##### Settings

In [None]:
# Hyperparameters for the specific WFX - L models; train_model forwards them to MLPClassifier.
# Seven hidden layers of 89 units each
layers = (89,89,89,89,89,89,89)
# layers = (10,)
activation = 'relu'
max_iter = 100
alpha = 0.0001
learning_rate = 0.00001
batch_size='auto'
# batch_size=8
# Random base seed, so a rerun trains different models; model i uses random_state + i.
random_state = random.randint(1,99999)
# Number of models trained in the run cell
num_models = 10

##### Run

In [None]:
# Train only when __main__ has no __file__ attribute (presumably an interactive notebook session).
if not hasattr(sys.modules['__main__'], '__file__'):
    for i in range(num_models):
        # Each model gets its own seed and pickle file; the prediction columns on l_analysis_df are overwritten every pass.
        l_analysis_df = train_model(l_analysis_df, specific_training_input_list, specific_testing_input_list, "predict_wfx_l", layers, activation, max_iter, alpha, learning_rate, batch_size, random_state=random_state+i)

### WFX - R

##### Settings

In [None]:
# Hyperparameters for the specific WFX - R models; train_model forwards them to MLPClassifier.
# Seven hidden layers of 89 units each
layers = (89,89,89,89,89,89,89)
activation = 'relu'
max_iter = 100
alpha = 0.0001
learning_rate = 0.00001
batch_size='auto'
# Random base seed, so a rerun trains different models; model i uses random_state + i.
random_state = random.randint(1,99999)
# Number of models trained in the run cell
num_models = 10

##### Run

In [None]:
# Train only when __main__ has no __file__ attribute (presumably an interactive notebook session).
if not hasattr(sys.modules['__main__'], '__file__'):
    for i in range(num_models):
        # Each model gets its own seed and pickle file; the prediction columns on r_analysis_df are overwritten every pass.
        r_analysis_df = train_model(r_analysis_df, specific_training_input_list, specific_testing_input_list, "predict_wfx_r", layers, activation, max_iter, alpha, learning_rate, batch_size, random_state=random_state+i)

### Generic

### WFX - L

##### Settings

In [None]:
# Hyperparameters for the generic WFX - L models; train_model forwards them to MLPClassifier.
# Three hidden layers of 46 units each
layers = (46,46,46)
activation = 'relu'
max_iter = 100
alpha = 0.0001
learning_rate = 0.00001
batch_size='auto'
# NOTE(review): this immediately overrides the 'auto' above, so Generic L trains with
# batch_size 8 while the other settings cells use 'auto' — confirm intended.
batch_size = 8
# Random base seed, so a rerun trains different models; model i uses random_state + i.
random_state = random.randint(1,99999)
# Number of models trained in the run cell
num_models = 10

##### Run

In [None]:
# Train only when __main__ has no __file__ attribute (presumably an interactive notebook session).
if not hasattr(sys.modules['__main__'], '__file__'):
    for i in range(num_models):
        # Each model gets its own seed and pickle file; the prediction columns on l_analysis_df are overwritten every pass.
        l_analysis_df = train_model(l_analysis_df, generic_training_input_list, generic_testing_input_list, "predict_generic_wfx_l", layers, activation, max_iter, alpha, learning_rate, batch_size, random_state=random_state+i)

### WFX - R

##### Settings

In [None]:
# Hyperparameters for the generic WFX - R models; train_model forwards them to MLPClassifier.
# Three hidden layers of 46 units each
layers = (46,46,46,)
activation = 'relu'
max_iter = 100
alpha = 0.0001
learning_rate = 0.00001
batch_size='auto'
# Random base seed, so a rerun trains different models; model i uses random_state + i.
random_state = random.randint(1,99999)
# Number of models trained in the run cell
num_models = 10

##### Run

In [None]:
# Train only when __main__ has no __file__ attribute (presumably an interactive notebook session).
if not hasattr(sys.modules['__main__'], '__file__'):
    for i in range(num_models):
        # Each model gets its own seed and pickle file; the prediction columns on r_analysis_df are overwritten every pass.
        r_analysis_df = train_model(r_analysis_df, generic_training_input_list, generic_testing_input_list, "predict_generic_wfx_r", layers, activation, max_iter, alpha, learning_rate, batch_size, random_state=random_state+i)

### Model Dataset

##### 1. Select Models in U5. Models

Update notebook, if WFX models have changed, and rerun

In [None]:
# Re-run the models notebook in this session.
# NOTE(review): presumably this is what defines predict_wfx_l/r and predict_generic_wfx_l/r used below — confirm.
if not hasattr(sys.modules['__main__'], '__file__'):
    %run "C:\Users\james\Documents\MLB\Code\U5. Models.ipynb"

##### 2. Predict

In [None]:
# Prediction column names, in predict_wfx_l's class order. The same names are reused for all four
# models below, which assumes every model has the same classes in the same order.
output_list = list(predict_wfx_l.classes_)
output_list_pred = [f"{x}_pred" for x in output_list]  

Specific

WFX - L

In [None]:
# Predict with the specific L model on the testing inputs (league averages in place of player stats).
l_analysis_df[output_list_pred] = predict_wfx_l.predict_proba(l_analysis_df[specific_testing_input_list].values)
# Calculate WFX
# Factor = predicted probability divided by the base rate in row 0 of base_rate_df.
for event in events_list:
    l_analysis_df[f'{event}_wfx_l'] = l_analysis_df[f'{event}_pred'] / base_rate_df[event][0]

WFX - R

In [None]:
# Predict with the specific R model on the testing inputs (league averages in place of player stats).
r_analysis_df[output_list_pred] = predict_wfx_r.predict_proba(r_analysis_df[specific_testing_input_list].values)
# Calculate WFX
# Factor = predicted probability divided by the base rate in row 0 of base_rate_df.
for event in events_list:
    r_analysis_df[f'{event}_wfx_r'] = r_analysis_df[f'{event}_pred'] / base_rate_df[event][0]

Generic

WFX - L

In [None]:
# Predict with the generic L model. This overwrites the '<event>_pred' columns written by the specific model.
l_analysis_df[output_list_pred] = predict_generic_wfx_l.predict_proba(l_analysis_df[generic_testing_input_list].values)
# Calculate WFX
# Factor = predicted probability divided by the base rate in row 0 of base_rate_df.
for event in events_list:
    l_analysis_df[f'{event}_generic_wfx_l'] = l_analysis_df[f'{event}_pred'] / base_rate_df[event][0]

WFX - R

In [None]:
# Predict with the generic R model. This overwrites the '<event>_pred' columns written by the specific model.
r_analysis_df[output_list_pred] = predict_generic_wfx_r.predict_proba(r_analysis_df[generic_testing_input_list].values)
# Calculate WFX
# Factor = predicted probability divided by the base rate in row 0 of base_rate_df.
for event in events_list:
    r_analysis_df[f'{event}_generic_wfx_r'] = r_analysis_df[f'{event}_pred'] / base_rate_df[event][0]

Graph

In [None]:
# Compare the specific and generic HR factors for left-handed batters, one point per game.
# Drop duplicates and select relevant columns (keeps the first row of each gamePk)
scatter_df = l_analysis_df.drop_duplicates('gamePk')[['hr_wfx_l', 'hr_generic_wfx_l']]

# Determine axis limits (smallest and largest value across both columns)
min_val = min(scatter_df.min())
max_val = max(scatter_df.max())

# Set figure size to be square
plt.figure(figsize=(6,6))

# Create scatter plot with regression line
sns.regplot(data=scatter_df, x='hr_wfx_l', y='hr_generic_wfx_l', scatter=True, line_kws={"color": "red"})

# Set equal axis limits
plt.xlim(min_val, max_val)
plt.ylim(min_val, max_val)

# Force aspect ratio to be square
plt.gca().set_aspect('equal', adjustable='box')

# Add labels and title
plt.xlabel('HR WFX')
plt.ylabel('HR Generic WFX')
plt.title('Regression of HR WFX vs HR Generic WFX')

# Show plot
plt.show()


In [None]:
# Compare the specific and generic HR factors for right-handed batters, one point per game.
# Drop duplicates and select relevant columns (keeps the first row of each gamePk)
scatter_df = r_analysis_df.drop_duplicates('gamePk')[['hr_wfx_r', 'hr_generic_wfx_r']]

# Determine axis limits (smallest and largest value across both columns)
min_val = min(scatter_df.min())
max_val = max(scatter_df.max())

# Set figure size to be square
plt.figure(figsize=(6,6))

# Create scatter plot with regression line
sns.regplot(data=scatter_df, x='hr_wfx_r', y='hr_generic_wfx_r', scatter=True, line_kws={"color": "red"})

# Set equal axis limits
plt.xlim(min_val, max_val)
plt.ylim(min_val, max_val)

# Force aspect ratio to be square
plt.gca().set_aspect('equal', adjustable='box')

# Add labels and title
plt.xlabel('HR WFX')
plt.ylabel('HR Generic WFX')
plt.title('Regression of HR WFX vs HR Generic WFX')

# Show plot
plt.show()

##### 3. Merge

Convert datatypes

In [None]:
# Store venue_id as text in both frames so the merge keys share one dtype.
l_analysis_df['venue_id'] = l_analysis_df['venue_id'].astype(str)
r_analysis_df['venue_id'] = r_analysis_df['venue_id'].astype(str)

Select columns to merge

In [None]:
# Game-level columns used as the merge key and kept in the output.
descriptive_columns = ['gamePk', 'game_date', 'venue_id', 'away_name', 'home_name', 'meteo_x_vect', 'meteo_y_vect', 'temperature_2m', 'x_vect', 'y_vect', 'temperature', 'weather']
# endswith("_wfx_l") / endswith("_wfx_r") also matches the '<event>_generic_wfx_l' / '<event>_generic_wfx_r' columns.
wfx_l_columns = [col for col in l_analysis_df.columns if col.endswith("_wfx_l")]
wfx_r_columns = [col for col in r_analysis_df.columns if col.endswith("_wfx_r")]

Merge on common columns

In [None]:
# One row per game from each frame (the last row of each gamePk), joined on every descriptive column.
# how='left': games present only in r_analysis_df are not kept, and games present only in
# l_analysis_df get NaN for the R-side factors.
multiplier_dataset = pd.merge(l_analysis_df.drop_duplicates('gamePk', keep='last')[descriptive_columns + wfx_l_columns], 
                              r_analysis_df.drop_duplicates('gamePk', keep='last')[descriptive_columns + wfx_r_columns], on=descriptive_columns, how='left')

Create date column

In [None]:
# Build a YYYYMMDD text key by parsing the date and formatting it. Stripping "-" from the raw
# text would only give YYYYMMDD when every game_date is already written as YYYY-MM-DD, and the
# daily-file cell compares this key against "20220101" and puts it in file names.
multiplier_dataset['date'] = pd.to_datetime(multiplier_dataset['game_date'], format='mixed').dt.strftime('%Y%m%d')

##### 4. Write Multiplier Dataset

Write to CSV

In [None]:
# Overwrites any existing "Multiplier Dataset.csv" in baseball_path; the index is not written.
multiplier_dataset.to_csv(os.path.join(baseball_path, "Multiplier Dataset.csv"), index=False)

##### 5. Generate Historic Park and Weather Factors Files

In [None]:
# Select columns to keep: game identifiers, weather fields, and every column whose name contains "_wfx"
keep_columns = ['gamePk', 'game_date', 'date', 'venue_id', 'away_name', 'home_name', 'meteo_x_vect', 'meteo_y_vect', 'temperature_2m', 'x_vect', 'y_vect', 'temperature', 'weather'] + [col for col in multiplier_dataset.columns if "_wfx" in col]

In [None]:
# Order the whole multiplier dataset by date (in place, so later cells see the sorted frame).
multiplier_dataset.sort_values('date', inplace=True)
# Only games from 2022 onward get a daily file.
recent_games = multiplier_dataset[pd.to_datetime(multiplier_dataset['game_date'], format='mixed').dt.year >= 2022]
# A single groupby pass hands over each date's rows, in date order, instead of re-scanning the
# full dataset with a boolean mask once per date.
for date, daily_rows in recent_games.groupby('date'):
    # print(date)
    if date > "20220101":
        # Subset by date
        daily_weather_df = daily_rows[keep_columns]

        # Write to CSV (one file per date, overwriting any existing file)
        daily_weather_df.to_csv(os.path.join(baseball_path, "A06. Weather", "3. Park and Weather Factors", f"Park and Weather Factors {date}.csv"), index=False)

Note: new Park x Weather Factor models will change the historic Multiplier Dataset data. This requires rerunning:
- B01. Matchups
- M02. Stat Imputations
- M03. Plate Appearances