# M01. Park and Weather Factors
- This calculates Park x Weather Factors
- Type: Model
- Run Frequency: Daily
- Sources:
    - MLB API
    - Steamer
- Created: 12/10/2024
- Updated: 12/17/2024

### Imports

In [6]:
if not hasattr(sys.modules['__main__'], '__file__'):
    print("Running imports...")
    %run "C:\Users\james\Documents\MLB\Code\U1. Imports.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U2. Utilities.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U3. Classes.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U4. Datasets.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U5. Models.ipynb"
    print("Imports in.")
else:
    print("Imports already in.")

Running imports...
Imports in.


### Data

Create Latest PA Data

In [2]:
%%time
complete_dataset_unadjusted_latest = create_pa_inputs(None, start_year=2022, end_year=2024, short=50, long=300, adjust=False, generate=True, write=False)

CPU times: total: 8min 51s
Wall time: 8min 59s


Keep Only Most Recent Year

In [3]:
# Keep only plate appearances from the most recent season.
# NOTE(review): the frame above is generated with end_year=2024 while this filter keeps
# year >= 2025 -- confirm create_pa_inputs really returns 2025 rows, otherwise this is empty.
latest_season_mask = complete_dataset_unadjusted_latest['year'].astype(int) >= 2025
# drop=True discards the old row labels; a plain reset_index() would add an "index"
# column that gets written to the CSV and carried into the concatenated dataset.
complete_dataset_unadjusted_latest = complete_dataset_unadjusted_latest[latest_season_mask].reset_index(drop=True)
complete_dataset_unadjusted_latest.to_csv(os.path.join(baseball_path, "Complete Dataset - Unadjusted Latest.csv"), index=False)

Read in Earlier Data

In [4]:
complete_dataset_unadjusted_earlier = pd.read_csv(os.path.join(baseball_path, "Complete Dataset - Unadjusted through 2024.csv"))

Combine

In [5]:
# Stack the earlier seasons on top of the latest season.
# ignore_index=True: both inputs carry their own 0..n-1 row labels (one comes from read_csv,
# the other from reset_index), so a plain concat would leave duplicate index labels.
complete_dataset = pd.concat([complete_dataset_unadjusted_earlier, complete_dataset_unadjusted_latest], axis=0, ignore_index=True)

In [None]:
# Quick look at the combined dataset (the original cell held only the fragment `com`,
# which raises NameError and stops a Restart & Run All).
complete_dataset.tail()

### Base Rates

Calculate average stats in a given base year <br>
Note: This only has to be run once

In [None]:
def base_rates(df, base_year=2014):
    """Average every event column over the games played in ``base_year``.

    Parameters
    ----------
    df : DataFrame with a ``game_date`` column (anything ``pd.to_datetime`` accepts)
        and one column per entry of ``events_list``.
    base_year : calendar year whose rows are averaged.

    Returns
    -------
    One-row DataFrame with one column per event holding that event's mean.

    The caller's frame is not modified: the date conversion is done on a local Series,
    so ``game_date`` keeps its original dtype for later cells that use ``.str`` on it.
    """
    # Year of each row, computed without writing back into df
    game_years = pd.to_datetime(df['game_date']).dt.year

    # Select period of interest and keep only the event columns
    base_year_events = df.loc[game_years == base_year, events_list]

    # Calculate averages over period of interest (Series -> single-row frame)
    base_rate_df = pd.DataFrame(base_year_events.mean()).T

    return base_rate_df

In [None]:
# base_rate_df = base_rates(complete_dataset, 2014)
# base_rate_df.to_csv(os.path.join(baseball_path, "Base Rates.csv"), index=False)

### Game Averages

Average rates within the game

In [None]:
def game_averages(df):
    """Collapse plate-appearance rows to one row per game.

    Rows are grouped by the game identifiers plus the weather columns; each event
    column becomes its within-game mean and ``pas`` holds the number of rows in the group.
    The result is ordered by ``game_date`` ascending.
    """
    game_keys = ['gamePk', 'game_date', 'venue_id', 'away_name', 'home_name', 'x_vect', 'y_vect', 'temperature']

    # Build the grouping once and reuse it for both the means and the row counts
    per_game = df.groupby(game_keys)

    # Mean of every event within the game
    game_avgs = per_game[events_list].mean().reset_index()

    # Number of observations in each group (same group order as the means above)
    game_avgs['pas'] = per_game.size().values

    # Chronological order
    return game_avgs.sort_values(['game_date'], ascending=True)

In [None]:
# game_average_df = game_averages(complete_dataset)
# game_average_df.tail(1)

### Player Averages

Average stats of all the players in the game, coming into the game

In [None]:
def player_averages(df):
    """Average the players' long-window rates within each game.

    For every game, each batter's (and each pitcher's) ``*_long`` values from their last
    row in that game are applied to all of their rows, and the game-level mean of those
    values is returned.

    Returns a DataFrame with ``gamePk`` followed by the ``{event}_b_long`` and
    ``{event}_p_long`` columns, one row per game.

    The input frame is not modified; the original implementation overwrote the caller's
    per-PA ``*_long`` columns with the game-last values.
    """
    # Stats to average (long-window columns)
    batter_inputs_long = [f"{event}_b_long" for event in events_list]
    pitcher_inputs_long = [f"{event}_p_long" for event in events_list]

    # Apply stats from last at bat to entire game, on local frames only
    batter_last = df.groupby(['gamePk', 'batter'])[batter_inputs_long].transform('last')
    pitcher_last = df.groupby(['gamePk', 'pitcher'])[pitcher_inputs_long].transform('last')

    # Calculate player averages by game
    batter_avgs = batter_last.groupby(df['gamePk']).mean().reset_index()
    pitcher_avgs = pitcher_last.groupby(df['gamePk']).mean().reset_index(drop=True)

    # Concatenate together (both frames are ordered by gamePk)
    player_avgs = pd.concat([batter_avgs, pitcher_avgs], axis=1)

    return player_avgs

In [None]:
# player_average_df = player_averages(complete_dataset)
# player_average_df.tail(1)

### League Averages

In [None]:
def league_average(complete_dataset, days=30):
    """League-wide event rates over a trailing window of game dates.

    Events are summed per ``game_date``; ``pas`` is the per-date total across all events.
    Each ``{event}_lg`` is the trailing sum of that event divided by the trailing sum of
    ``pas``. The window spans the last ``days`` rows (dates present in the data), current
    row included, and shrinks at the start of the series (``min_periods=1``).

    Returns a DataFrame with ``game_date`` and one ``{event}_lg`` column per event.
    """
    # Daily event totals, one row per game_date
    daily_totals = complete_dataset.groupby('game_date')[events_list].sum().reset_index()
    daily_totals['pas'] = daily_totals[events_list].sum(axis=1)

    # Trailing sums for every event and for the PA total, current row included
    trailing = daily_totals[events_list + ['pas']].rolling(window=days, min_periods=1).sum()

    # Rate = trailing event count / trailing PA count
    league_rates = trailing[events_list].div(trailing['pas'], axis=0).add_suffix('_lg')

    return pd.concat([daily_totals[['game_date']], league_rates], axis=1)

In [None]:
# league_average_df = league_average(complete_dataset, 30)
# league_average_df.tail(1)

### Park Factors

##### Rolling Averages

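Average of stats over the last rolling_window games - the window includes the game of interest; the values are lagged later, in the Shift section, so that a game only sees earlier games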

In [None]:
def rolling_averages(game_avgs, rolling_window, column):
    """PA-weighted rolling event rates within each value of ``column``.

    Parameters
    ----------
    game_avgs : one row per game with ``game_date``, ``pas``, the grouping ``column`` and
        one rate column per entry of ``events_list``.
    rolling_window : number of most recent games (rows) per group in each window.
    column : column to group by (e.g. ``'venue_id'`` or ``'away_name'``).

    Returns
    -------
    Copy of ``game_avgs`` sorted by ``[column, 'game_date']`` in which each event column
    is replaced by sum(rate * pas) / sum(pas) over the window, plus ``pas_rolling``
    (the window's PA total). The window includes the current row.

    The weighted mean is computed as a ratio of two rolling sums. This is the same
    quantity the earlier per-window ``rolling.apply`` callback produced, without a Python
    call per window and without ``groupby.apply`` over the grouping column.
    """
    # Sort by group column and date
    rolling_avgs = game_avgs.sort_values([column, 'game_date']).copy()

    def _rolling_sum(values):
        # Trailing sum over the last `rolling_window` rows, current row included
        return values.rolling(window=rolling_window, min_periods=1, closed="right").sum()

    # Denominator: rolling PA total per group
    rolling_avgs['pas_rolling'] = rolling_avgs.groupby(column)['pas'].transform(_rolling_sum)

    # Numerator: rolling sum of (rate * pas), i.e. rolling event counts per group
    weighted_events = rolling_avgs[events_list].mul(rolling_avgs['pas'], axis=0)
    weighted_rolling = weighted_events.groupby(rolling_avgs[column]).transform(_rolling_sum)

    # Weighted average = rolling event count / rolling PA count
    rolling_avgs[events_list] = weighted_rolling.div(rolling_avgs['pas_rolling'], axis=0)

    return rolling_avgs

In [None]:
# Need to account for small sample parks

In [None]:
# park_average_df = rolling_averages(game_average_df, 243, 'venue_id')
# park_average_df = park_average_df[['gamePk', 'game_date', 'venue_id', 'away_name', 'home_name'] + events_list + ['pas_rolling']]
# park_average_df.tail(1)

In [None]:
# team_average_df = rolling_averages(game_average_df, 243, 'away_name')
# team_average_df = team_average_df[['gamePk', 'game_date', 'venue_id', 'away_name', 'home_name'] + events_list + ['pas_rolling']]
# team_average_df.tail(1)

##### Park Factors

In [None]:
def create_park_factors(park_avgs, team_avgs):
    """Divide each park's rolling rates by the home team's rolling away-game rates.

    Every row of ``park_avgs`` is keyed by its ``home_name`` and matched (``merge_asof``,
    backward) to the latest ``team_avgs`` row at or before the same ``game_date`` whose
    ``away_name`` is that team. ``{event}_pfx`` is the park rate over the team rate.

    Returns a DataFrame with the park row's ``gamePk`` and one ``{event}_pfx`` column per event.
    """
    def _prepare(frame, name_column):
        # Order by date, tag the team of interest, and make the date a datetime key
        ordered = frame.sort_values('game_date')
        ordered['team_name'] = ordered[name_column].copy()
        ordered['game_date'] = pd.to_datetime(ordered['game_date'])
        return ordered

    park_side = _prepare(park_avgs, 'home_name')
    team_side = _prepare(team_avgs, 'away_name')

    # Latest team row on or before each park row's date, per team
    merged = pd.merge_asof(
        park_side,
        team_side,
        left_on='game_date',
        right_on='game_date',
        by='team_name',
        direction='backward',
        suffixes=('_park', '_team'),
    )

    # Park factor per event: park rate relative to the team's away rate
    ratios = {f'{stat}_pfx': merged[f'{stat}_park'] / merged[f'{stat}_team'] for stat in events_list}
    merged = merged.assign(**ratios).rename(columns={'gamePk_park': 'gamePk'})

    pfx_columns = [col for col in merged.columns if col.endswith('pfx')]
    return merged[['gamePk'] + pfx_columns]

In [None]:
# park_factor_df = create_park_factors(park_average_df, team_average_df)

In [None]:
# park_factor_df.tail(1)

### Analysis 

Merge together game averages, player averages, and park factors

In [None]:
def create_analysis_df(complete_dataset, league_average_df, park_factor_df):
    """Assemble the plate-appearance frame used for training and analysis.

    League averages are joined on ``game_date`` and park factors on ``gamePk`` (both inner
    joins), one dummy column per ``venue_id`` is appended, the frame is cut down to the
    identifier, input and ``*_lg`` columns, rows with any missing value are dropped, and
    rows whose ``eventsModel`` is ``"Cut"`` are removed.

    Returns ``(analysis_df, venue_dummies)`` where ``venue_dummies`` lists the dummy column names.
    """
    # League averages by date, then park factors by game
    merged = (
        complete_dataset
        .merge(league_average_df, on=['game_date'], how='inner')
        .merge(park_factor_df, on='gamePk', how='inner')
    )

    # One indicator column per venue
    venue_dummy_df = pd.get_dummies(merged['venue_id'], prefix='venue')
    venue_dummies = venue_dummy_df.columns.tolist()
    merged = pd.concat([merged, venue_dummy_df], axis=1)

    # Column groups, in output order
    id_columns = ["eventsModel", 'gamePk', 'game_date', 'venue_id', 'away_name', 'home_name',
                  'batter', 'pitcher', 'batSide', 'pitchHand']
    weather_columns = ['x_vect', 'y_vect', 'temperature']
    event_columns = [
        f'{event}_{suffix}'
        for event in events_list
        for suffix in ('b_long', 'p_long', 'pfx')
    ]
    league_columns = [col for col in merged if col.endswith("_lg")]

    # Keep the relevant columns and only fully populated rows
    selected = merged[id_columns + weather_columns + venue_dummies + event_columns + league_columns].dropna()

    # Remove cut
    selected = selected[selected['eventsModel'] != "Cut"]

    return selected, venue_dummies

In [None]:
# analysis_df, venue_dummies = create_analysis_df(complete_dataset, league_average_df, park_factor_df)

### Create Datasets

In [None]:
# Games whose weather text mentions a roof or dome get a fixed temperature of 70;
# all other rows keep their recorded temperature.
# na=False: a missing weather description counts as "not indoor" instead of raising
# TypeError, which the row-wise `'Roof' in row['weather']` test did on NaN.
indoor_mask = complete_dataset['weather'].str.contains('Roof|Dome', na=False)
complete_dataset['temperature'] = complete_dataset['temperature'].mask(indoor_mask, 70)

Generate or read base rates

In [None]:
### Generate base rates (base year = 2014)
# Only needs to be run once
# Generate:
# base_rate_df = base_rates(complete_dataset, 2014)
# base_rate_df.to_csv(os.path.join(baseball_path, "Base Rates.csv"), index=False)

# Read: 
base_rate_df = pd.read_csv(os.path.join(baseball_path, "Base Rates.csv"))

In [None]:
# List of dataframes (index 0 = left-handed batters, index 1 = right-handed batters)
analysis_df_list = []
# Loop over batter sides
for batSide in ['L', 'R']:
    print(batSide)
    # Subset complete dataset
    complete_dataset_side = complete_dataset[complete_dataset['batSide'] == batSide]
    # Calculate game averages (average rates within a particular games)
    game_average_df = game_averages(complete_dataset_side)
    # # Calculate player averages (average rates of all players coming into the game) (deprecated? - player level is in complete_dataset, so it's unnecessary)
    # player_average_df = player_averages(complete_dataset_side)
    # Calculate league averages (average rates of all PAs over last n days coming into the day)
    league_average_df = league_average(complete_dataset_side, days=30)
    # Average rates at park over last n games (both teams)
    park_average_df = rolling_averages(game_average_df, 243, 'venue_id')
    # Average rates at away games over last n games (both teams)
    team_average_df = rolling_averages(game_average_df, 243, 'away_name')
    # Park factors
    park_factor_df = create_park_factors(park_average_df, team_average_df)
    # Create dataframe that can be used to train and analyze data.
    # Use the side-specific subset: the league averages and park factors above were built
    # from this batSide only, and passing the full dataset would put both batter hands
    # into each side's frame.
    analysis_df, venue_dummies = create_analysis_df(complete_dataset_side, league_average_df, park_factor_df)
    analysis_df_list.append(analysis_df)

Extract Dataframes

In [None]:
l_analysis_df = analysis_df_list[0].copy()
r_analysis_df = analysis_df_list[1].copy()

In [None]:
# del analysis_df_list

### Park Latest

This contains the latest data available at each park, used to create WFX <br>
Note: We can't just use multiplier dataset for this because it won't contain data at the end of the last game

Columns to Keep

In [None]:
park_latest_columns = ['gamePk', 'game_date', 'venue_id', 'away_name', 'home_name'] + venue_dummies + [col for col in l_analysis_df.columns if col.endswith("_pfx")] + [col for col in l_analysis_df.columns if col.endswith("_lg")] 

Write Park's Last Values to CSV

In [None]:
l_analysis_df[park_latest_columns].sort_values('game_date').drop_duplicates('venue_id', keep='last').to_csv(os.path.join(baseball_path, "Park Latest - LHB.csv"), index=False)

In [None]:
r_analysis_df[park_latest_columns].sort_values('game_date').drop_duplicates('venue_id', keep='last').to_csv(os.path.join(baseball_path, "Park Latest - RHB.csv"), index=False)

### Shift

##### Park Factors

In [None]:
pfx_list = [col for col in l_analysis_df.columns if col.endswith("pfx")]

Previous game_date at venue_id

In [None]:
# Lag the park factors by one row within each venue, so the first row of a date picks up
# the value from the last row of the previous date at that venue.
# NOTE(review): this assumes rows are in chronological order within each venue -- confirm.
l_analysis_df[pfx_list] = l_analysis_df.groupby("venue_id")[pfx_list].shift(1)
# Broadcast one value to every row of the same venue and date. GroupBy "first" takes the
# first non-null entry, so on a venue's first date (where the first row is NaN after the
# shift) this picks up a same-date value rather than staying missing.
l_analysis_df[pfx_list] = l_analysis_df.groupby(["venue_id", "game_date"])[pfx_list].transform("first")

In [None]:
# Same lag as for the left-handed frame: shift one row within each venue, then spread the
# first non-null value of each venue/date to all rows of that date.
# NOTE(review): relies on chronological row order within each venue -- confirm.
r_analysis_df[pfx_list] = r_analysis_df.groupby("venue_id")[pfx_list].shift(1)
r_analysis_df[pfx_list] = r_analysis_df.groupby(["venue_id", "game_date"])[pfx_list].transform("first")

##### League Averages

In [None]:
lg_list = [col for col in l_analysis_df.columns if col.endswith("lg")]

In [None]:
# Sort chronologically with a stable algorithm: the default quicksort may reorder rows that
# share a game_date, and the batter/pitcher shift(1) cells below depend on row order.
l_analysis_df = l_analysis_df.sort_values('game_date', ascending=True, kind='mergesort')
# Lag league averages by one row, then give every row of a date that date's first non-null
# lagged value (i.e. the value carried over from the previous date).
l_analysis_df[lg_list] = l_analysis_df[lg_list].shift(1)
l_analysis_df[lg_list] = l_analysis_df.groupby("game_date")[lg_list].transform("first")

In [None]:
# Sort chronologically with a stable algorithm: the default quicksort may reorder rows that
# share a game_date, and the batter/pitcher shift(1) cells below depend on row order.
r_analysis_df = r_analysis_df.sort_values('game_date', ascending=True, kind='mergesort')
# Lag league averages by one row, then give every row of a date that date's first non-null
# lagged value (i.e. the value carried over from the previous date).
r_analysis_df[lg_list] = r_analysis_df[lg_list].shift(1)
r_analysis_df[lg_list] = r_analysis_df.groupby(["game_date"])[lg_list].transform("first")

##### Batter Average

Note: You need to shift by batter and pitchHand to get the batter's last PA against that hand

In [None]:
b_long_list = [col for col in l_analysis_df.columns if col.endswith("b_long")]

In [None]:
l_analysis_df[b_long_list] = l_analysis_df.groupby(['batter', 'pitchHand'])[b_long_list].shift(1)
r_analysis_df[b_long_list] = r_analysis_df.groupby(['batter', 'pitchHand'])[b_long_list].shift(1)

##### Pitcher Averages

In [None]:
p_long_list = [col for col in l_analysis_df.columns if col.endswith("p_long")]

Note: You don't need to shift by batSide to get the pitcher's last PA against that hand because all hands are the same, but why not?

In [None]:
l_analysis_df[p_long_list] = l_analysis_df.groupby(['pitcher', 'batSide'])[p_long_list].shift(1)
r_analysis_df[p_long_list] = r_analysis_df.groupby(['pitcher', 'batSide'])[p_long_list].shift(1)

### Select Model Inputs

Inputs

In [None]:
# Identify inputs
# Model inputs for fitting: weather vector and temperature, venue indicators, then for
# every event the batter rate, pitcher rate and park factor (in that order).
training_input_list = ['x_vect', 'y_vect', 'temperature'] + venue_dummies + [
    f'{event}_{suffix}'
    for event in events_list
    for suffix in ('b_long', 'p_long', 'pfx')
]

In [None]:
# Identify inputs
# Identify inputs
# Same positional layout as training_input_list, but the batter and pitcher slots are both
# filled with the league-average column ({event}_lg appears twice on purpose), so the
# frame passed to predict_proba lines up column-for-column with the training inputs.
testing_input_list = ['x_vect', 'y_vect', 'temperature'] + venue_dummies
# Loop over events
for event in events_list: 
    # League average stands in for the batter and the pitcher; the park factor is kept
    testing_input_list += [f'{event}_lg', f'{event}_lg', f'{event}_pfx']

### Select Data

Remove Infinite Values

In [None]:
l_analysis_df[training_input_list] = l_analysis_df[training_input_list].replace([np.inf, -np.inf], np.nan)
r_analysis_df[training_input_list] = r_analysis_df[training_input_list].replace([np.inf, -np.inf], np.nan)

Drop if Missing Data (Maybe after shift?)

In [None]:
l_analysis_df = l_analysis_df.dropna()
r_analysis_df = r_analysis_df.dropna()

### Train/Test Split

Split

In [None]:
np.random.seed(42)
l_analysis_df['split'] = np.random.choice([0, 0, 1], size=len(l_analysis_df))
r_analysis_df['split'] = np.random.choice([0, 0, 1], size=len(r_analysis_df))

Create masks to identify training and testing datasets (Might not use this)

In [None]:
l_training_mask = (l_analysis_df['split'] == 0)
r_training_mask = (r_analysis_df['split'] == 0)

In [None]:
len(training_input_list)

### WFX - L

##### Settings

In [None]:
# Hyperparameters for the left-handed-batter model; they are passed to MLPClassifier in
# the Train cell below.
layers = (83,83,83,83,83)
# NOTE(review): layers_str is not used in any cell visible in this notebook
layers_str = ''.join(str(x) for x in layers)
activation = 'relu'
# NOTE(review): the right-handed settings cell uses max_iter = 100 -- confirm 10 is intended here
max_iter = 10
alpha = 0.0001
learning_rate = 0.00001
batch_size='auto'
# A fresh random seed on every run, so a retrained model is not reproducible run-to-run
random_state = random.randint(1,99999)
num_models = 1

quantiles = 10

# File name the fitted model is pickled to
wfx_l_filename = f"predict_wfx_l.pkl"
print(wfx_l_filename)

##### Train

In [None]:
%%time
# Train and save the left-handed-batter model only when __main__ has no __file__ attribute
# (the same guard used by the imports cell).
if not hasattr(sys.modules['__main__'], '__file__'):
    # Create folder
    model_dir = os.path.join(model_path, "M01. Park and Weather Factors", todaysdate)
    os.makedirs(model_dir, exist_ok=True)

    # Create Model
    predict_wfx_l = MLPClassifier(hidden_layer_sizes=layers, activation=activation, verbose=False, alpha=alpha,
                                  learning_rate_init=learning_rate, early_stopping=True, random_state=random_state, max_iter=max_iter, batch_size=batch_size)

    # Fit
    predict_wfx_l.fit(l_analysis_df[training_input_list], l_analysis_df[['eventsModel']].values.ravel())

    # Save model; the context manager closes the file handle even if pickling fails
    with open(os.path.join(model_dir, wfx_l_filename), 'wb') as model_file:
        pickle.dump(predict_wfx_l, model_file)

##### Predict

In [None]:
wfx_l_outputs = list(predict_wfx_l.classes_)
wfx_l_outputs_pred = [x + "_pred" for x in list(predict_wfx_l.classes_)]

l_analysis_df[wfx_l_outputs_pred] = predict_wfx_l.predict_proba(l_analysis_df[testing_input_list].values)

##### Evaluate

In [None]:
if not hasattr(sys.modules['__main__'], '__file__'):
    # Get dummies
    for event in events_list:
        l_analysis_df[event] = (l_analysis_df['eventsModel'] == event).astype(int)

##### Calculate WFX

In [None]:
for event in events_list:
    l_analysis_df[f'{event}_wfx_l'] = l_analysis_df[f'{event}_pred'] / base_rate_df[event][0]

### WFX - R

##### Settings

In [None]:
layers = (83,83,83,83,83)
layers_str = ''.join(str(x) for x in layers)
activation = 'relu'
max_iter = 100
alpha = 0.0001
learning_rate = 0.00001
batch_size='auto'
random_state = random.randint(1,99999)
num_models = 1

quantiles = 10

wfx_r_filename = f"predict_wfx_r.pkl"
print(wfx_r_filename)

##### Train

In [None]:
%%time
# Train and save the right-handed-batter model only when __main__ has no __file__ attribute
# (the same guard used by the imports cell).
if not hasattr(sys.modules['__main__'], '__file__'):
    # Create folder here as well, so this cell does not depend on the left-handed cell
    # having created it first
    model_dir = os.path.join(model_path, "M01. Park and Weather Factors", todaysdate)
    os.makedirs(model_dir, exist_ok=True)

    # Create Model
    predict_wfx_r = MLPClassifier(hidden_layer_sizes=layers, activation=activation, verbose=False, alpha=alpha,
                                  learning_rate_init=learning_rate, early_stopping=True, random_state=random_state, max_iter=max_iter, batch_size=batch_size)

    # Fit
    predict_wfx_r.fit(r_analysis_df[training_input_list], r_analysis_df[['eventsModel']].values.ravel())

    # Save model; the context manager closes the file handle even if pickling fails
    with open(os.path.join(model_dir, wfx_r_filename), 'wb') as model_file:
        pickle.dump(predict_wfx_r, model_file)

##### Predict

In [None]:
wfx_r_outputs = list(predict_wfx_r.classes_)
wfx_r_outputs_pred = [x + "_pred" for x in wfx_r_outputs]

r_analysis_df[wfx_r_outputs_pred] = predict_wfx_r.predict_proba(r_analysis_df[testing_input_list].values)

##### Evaluate

In [None]:
if not hasattr(sys.modules['__main__'], '__file__'):
    # Define the number of quantiles
    num_quantiles = 10  # Change as needed
    
    result_dict = {}
    
    for event in events_list:
        r_analysis_df[event] = (r_analysis_df['eventsModel'] == event).astype(int)

        # Create quantile bins based on event_pred
        r_analysis_df[f'{event}_quantile'] = pd.qcut(
            r_analysis_df[f'{event}_pred'], q=num_quantiles, duplicates='drop'
        )
    
        # Compute means for event and event_pred within each quantile
        summary_df = r_analysis_df.groupby(f'{event}_quantile')[[f'{event}_pred', event]].mean()
    
        # Store the result in a dictionary
        result_dict[event] = summary_df
        

##### Calculate WFX

In [None]:
for event in events_list:
    r_analysis_df[f'{event}_wfx_r'] = r_analysis_df[f'{event}_pred'] / base_rate_df[event][0]

### Multiplier Dataset

In [None]:
descriptive_columns = ['gamePk', 'game_date', 'venue_id', 'away_name', 'home_name', 'x_vect', 'y_vect', 'temperature']
wfx_l_columns = [col for col in l_analysis_df.columns if col.endswith("_wfx_l")]
wfx_r_columns = [col for col in r_analysis_df.columns if col.endswith("_wfx_r")]

In [None]:
multiplier_dataset = pd.merge(l_analysis_df.drop_duplicates('gamePk', keep='last')[descriptive_columns + wfx_l_columns], 
                              r_analysis_df.drop_duplicates('gamePk', keep='last')[descriptive_columns + wfx_r_columns], on=descriptive_columns, how='left')

Write to CSV

In [None]:
multiplier_dataset['date'] = multiplier_dataset['game_date'].str.replace("-", "")

In [None]:
multiplier_dataset.to_csv(os.path.join(baseball_path, "Multiplier Dataset.csv"), index=False)

In [None]:
# Deliberate stop so that "Run All" does not continue into the cells below, which are
# marked for deletion. Raised explicitly instead of relying on an undefined name.
raise RuntimeError("Intentional stop: the cells below are marked for deletion.")

In [None]:
# Delete below

In [None]:
%%time
# Legacy training loop (sits under the "Delete below" marker): fits on the rows selected by
# l_training_mask and writes predictions for the remaining rows.
# NOTE(review): `input_list` is not defined in any cell visible in this notebook; the cells
# above define training_input_list / testing_input_list instead -- confirm before reuse.
for i in range(num_models):
    
    # Create Model
    predict_wfx_l = MLPClassifier(hidden_layer_sizes=layers, activation=activation, verbose=False, alpha=alpha, 
                                  learning_rate_init=learning_rate, early_stopping=True, random_state=random_state+i, max_iter=max_iter, batch_size=batch_size)

    # Fit
    predict_wfx_l.fit(l_analysis_df[l_training_mask][input_list], l_analysis_df[l_training_mask][['eventsModel']].values.ravel())

    # Save model (every iteration writes to the same file name, so only the last model is kept)
    pickle.dump(predict_wfx_l, open(os.path.join(model_path, "M01. Park and Weather Factors", todaysdate, wfx_l_filename), 'wb'))

    # Predict all types
    wfx_l_outputs = list(predict_wfx_l.classes_)
    wfx_l_outputs_pred = [x + "_pred" for x in wfx_l_outputs]
    
    l_analysis_df.loc[~l_training_mask, wfx_l_outputs_pred] = predict_wfx_l.predict_proba(l_analysis_df[~l_training_mask][input_list])

In [None]:
l_analysis_df.tail()

In [None]:
complete_dataset.query('batterName == "Jarren Duran"').query('pitchHand == "R"')[['event', 'so_b', 'b1_b']].tail(3)

### Train Models

$\hat{event}$ = event_b_long + event_p_long + event_pfx + x_vect + y_vect + temperature + venue_dummies

In [None]:
def train_models(analysis_df, venue_dummies, batSide, layers):
    """Fit a soft-voting ensemble of three MLP classifiers and pickle it.

    Parameters
    ----------
    analysis_df : frame holding the input columns and the ``eventsModel`` target.
    venue_dummies : names of the venue indicator columns.
    batSide : batter side; lower-cased into the saved file name.
    layers : hidden layer sizes passed to each MLPClassifier.

    The model is written to
    ``model_path/M01. Park and Weather Factors/<todaysdate>/predict_wfx_<batSide>.pkl``.

    Features and target are filtered with one shared row mask, so ``X`` and ``y`` always
    have the same length (the earlier version filtered them with different column subsets).
    The caller's frame is not modified.
    """
    # Identify inputs
    variables = ['x_vect', 'y_vect', 'temperature'] + venue_dummies
    # Loop over events
    for event in events_list:
        # Batter rate, pitcher rate and park factor for this event
        variables += [f'{event}_b_long', f'{event}_p_long', f'{event}_pfx']

    # Prepare
    # Replace inf values with NaN on a local copy (unlikely to occur in large samples)
    features = analysis_df[variables].replace([np.inf, -np.inf], np.nan)

    # Keep rows where every input and the target are present
    usable_rows = (features.notna().all(axis=1) & analysis_df['eventsModel'].notna()).to_numpy()
    X = features.values[usable_rows]                      # Independent variables
    y = analysis_df['eventsModel'].values[usable_rows]    # Dependent variable

    # Three neural networks that differ only in their random seed
    estimators = [
        (f'nn{seed}', MLPClassifier(hidden_layer_sizes=layers, activation='relu', solver='adam', max_iter=10, random_state=seed))
        for seed in (1, 2, 3)
    ]

    # Create a Voting Classifier with the three models
    voting_model = VotingClassifier(estimators, voting='soft')

    # Train the Voting Classifier
    voting_model.fit(X, y)

    # Create directory for saving the model
    model_dir = os.path.join(model_path, "M01. Park and Weather Factors", todaysdate)
    os.makedirs(model_dir, exist_ok=True)

    # Save the Voting Classifier
    with open(os.path.join(model_dir, f"predict_wfx_{batSide.lower()}.pkl"), 'wb') as f:
        pickle.dump(voting_model, f)

    print(f"Voting model for {batSide}HB saved successfully.")

In [None]:
train_models(analysis_df, venue_dummies, "L", (2,))

### Run Predictions

In [None]:
def run_predictions(df, base_rate_df, model_date, batSide):
    """Predict event probabilities with a saved model and convert them to WFX multipliers.

    Parameters
    ----------
    df : frame with the weather, venue-dummy, ``{event}_pfx`` and ``{event}_lg`` columns.
    base_rate_df : one-row frame of base-year rates, one column per event.
    model_date : date folder the model was saved under.
    batSide : batter side; selects ``predict_wfx_<batSide>.pkl``.

    Returns
    -------
    Copy of ``df`` (index reset) with one ``{class}_pred`` column per model class and one
    ``{event}_wfx`` column per event (prediction divided by the base-year rate).

    League averages are substituted for the batter and pitcher inputs before predicting.
    Rows whose final inputs contain NaN or inf are not scored and get NaN predictions;
    predictions are aligned to their rows by index. The caller's frame is not modified.

    NOTE(review): ``venue_dummies`` is read from the enclosing namespace, not a parameter.
    """
    # Identify inputs, in the same order train_models uses
    variables = ['x_vect', 'y_vect', 'temperature'] + venue_dummies
    # Loop over events
    for event in events_list:
        variables += [f'{event}_b_long', f'{event}_p_long', f'{event}_pfx']

    # Path to the saved model
    saved_model_path = os.path.join(model_path, "M01. Park and Weather Factors", model_date, f"predict_wfx_{batSide.lower()}.pkl")

    # Load the model
    # NOTE(review): unpickling runs arbitrary code -- only load model files this project wrote
    with open(saved_model_path, 'rb') as f:
        model = pickle.load(f)

    # Fresh 0..n-1 index on a copy, so predictions can be joined back by index
    df = df.reset_index(drop=True)

    # Create input dataframe
    X = df[variables].copy()
    for event in events_list:
        # Use league averages to predict (NOT BASE RATES)
        league_rate = df[f'{event}_lg'].astype(float)
        X[f'{event}_b_long'] = league_rate
        X[f'{event}_p_long'] = league_rate

    # Drop rows with inf/NaN in the inputs the model will actually see
    X = X.replace([np.inf, -np.inf], np.nan).dropna()

    # Predict using the loaded model (plain array, matching how train_models fits)
    class_list = list(model.classes_)
    prediction_columns = [f"{event}_pred" for event in class_list]
    if len(X) > 0:
        probabilities = model.predict_proba(X.values)
    else:
        probabilities = np.empty((0, len(class_list)))
    # index=X.index keeps each prediction attached to the row it was computed from
    prediction_df = pd.DataFrame(probabilities, columns=prediction_columns, index=X.index)

    # Append (aligned on index; unscored rows get NaN)
    df = pd.concat([df, prediction_df], axis=1)

    # Calculate wfx
    for event in events_list:
        # Compare to base year (NOT LEAGUE AVERAGE)
        df[f'{event}_wfx'] = df[f'{event}_pred'] / base_rate_df[event].iloc[0]

    return df

### Multiplier Dataset

In [None]:
train = True
model_date = "20250319"

##### Prepare

Note: You only have to prepare once even if you retrain the models

Read in complete dataset

In [None]:
%%time
# complete_dataset = create_pa_inputs(None, 2013, 2024, short=50, long=300, adjust=False)
complete_dataset = create_pa_inputs(None, start_year=2020, end_year=2020, short=50, long=300, adjust=False, generate=True, write=False)

In [None]:
analysis_df.tail(5)

##### Train

Rerun this when you want to retrain models

In [None]:
%%time
# Collect one WFX frame per batter side (left first, then right)
wfx_df_list = []
for side_position, batSide in enumerate(['L', 'R']):
    # analysis_df_list holds the left-handed frame at position 0 and the right-handed at 1;
    # work on a copy with incomplete rows removed
    analysis_df = analysis_df_list[side_position].copy().dropna()

    # Retrain only when requested
    if train == True:
        train_models(analysis_df, venue_dummies, batSide, layers=(38,38,38,38,38))

    # Score the frame with the saved model and keep the result
    wfx_df_list.append(run_predictions(analysis_df, base_rate_df, model_date, batSide))

Separate dataframes

In [None]:
lhb_df = wfx_df_list[0].copy()
rhb_df = wfx_df_list[1].copy()

Scale predictions:
- Numerator: Predicted rate
- Denominator: Sum of all event predicted rates (should be close to one, but won't be exact)

In [None]:
# List of predictions
pred_list = ['b1_pred', 'b2_pred', 'b3_pred', 'hr_pred', 'bb_pred', 'hbp_pred', 'so_pred', 'fo_pred', 'go_pred', 'lo_pred', 'po_pred']

# Sum of prediction odds
lhb_df['pred_sum'] = lhb_df[pred_list].sum(axis=1)
rhb_df['pred_sum'] = rhb_df[pred_list].sum(axis=1)

# Scaled
for event in pred_list:
    print(event)
    lhb_df[event] = lhb_df[event] / lhb_df['pred_sum']
    rhb_df[event] = rhb_df[event] / rhb_df['pred_sum']

Columns to keep

In [None]:
keep_list = ['gamePk', 'game_date', 'venue_id', 'away_name', 'home_name', 'x_vect', 'y_vect', 'temperature']
pfx_list = [col for col in wfx_df_list[0].columns if col.endswith('pfx')]
wfx_list = [col for col in wfx_df_list[0].columns if col.endswith('wfx')]
pred_list = [col for col in wfx_df_list[0].columns if col.endswith('_pred')]

In [None]:
event_dummies = pd.get_dummies(lhb_df['eventsModel']).astype(int)
lhb_df2 = pd.concat([lhb_df, event_dummies], axis=1)
lhb_df2 = lhb_df2.groupby(keep_list)[events_list + pfx_list + wfx_list + pred_list].mean(numeric_only=True).reset_index()

event_dummies = pd.get_dummies(rhb_df['eventsModel']).astype(int)
rhb_df2 = pd.concat([rhb_df, event_dummies], axis=1)
rhb_df2 = rhb_df2.groupby(keep_list)[events_list + pfx_list + wfx_list + pred_list].mean(numeric_only=True).reset_index()

In [None]:
rhb_df2.tail()

Create dataset

In [None]:
multiplier_df = pd.merge(lhb_df2, rhb_df2, on=keep_list, how='inner', suffixes=('_l', '_r'))

Read in game_df

In [None]:
%%time
game_df = create_games("20200101", todaysdate, team_map)

In [None]:
game_df['date'] = game_df['date'].astype(int).astype(str)

Add date

Note: game_date currently in multiplier_df will have original date in cases of postponements. date in game_df will have the correct date.

In [None]:
multiplier_df = multiplier_df.merge(game_df[['game_id', 'date']], left_on='gamePk', right_on=['game_id'], how='left')

Write to CSV

In [None]:
multiplier_df.sort_values(['date', 'gamePk'], ascending=[True, True]).to_csv(os.path.join(baseball_path, "Multiplier Dataset.csv"), index=False)

### Evaluations

##### Rates by Quantile

In [None]:
# Create subplots
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(12, 9))  # 3 rows x 4 columns = 12 axes

# Flatten the axes array for easier iteration
axes = axes.flatten()

# Iterate through the events and their corresponding axes
# (any axes beyond len(events_list) are left empty)
for idx, event in enumerate(events_list):
    ax = axes[idx]  # Select the appropriate subplot
    
    # Step 1: Create quantile buckets for the current event
    # Note: this overwrites a 'quantile' column on lhb_df2 on every iteration
    lhb_df2['quantile'] = pd.qcut(lhb_df2[f'{event}_pred'], q=10, labels=False)  # 10 quantiles (adjust q as needed)
    
    # Step 2: Group by quantiles and calculate the mean
    quantile_means = lhb_df2.groupby('quantile').agg({f'{event}_pred': 'mean', event: 'mean'}).reset_index()
    
    # Step 3: Plot the predictions and actuals
    ax.plot(quantile_means['quantile'], quantile_means[f'{event}_pred'], label=f'Average {event}_pred', marker='o')
    ax.plot(quantile_means['quantile'], quantile_means[event], label=f'Average {event}', marker='x')
    
    # Add subplot details
    ax.set_title(f'{event} Predictions vs Actuals')
    ax.set_xlabel('Quantile')
    ax.set_ylabel('Average Value')
    ax.legend()
    ax.grid()

# Adjust layout to prevent overlap
plt.tight_layout()
plt.show()

In [None]:
# Create subplots
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(12, 9))  # 3 rows x 4 columns = 12 axes

# Flatten the axes array for easier iteration
axes = axes.flatten()

# Iterate through the events and their corresponding axes
# (any axes beyond len(events_list) are left empty)
for idx, event in enumerate(events_list):
    ax = axes[idx]  # Select the appropriate subplot
    
    # Step 1: Create quantile buckets for the current event
    # Note: this overwrites a 'quantile' column on rhb_df2 on every iteration
    rhb_df2['quantile'] = pd.qcut(rhb_df2[f'{event}_pred'], q=10, labels=False)  # 10 quantiles (adjust q as needed)
    
    # Step 2: Group by quantiles and calculate the mean
    quantile_means = rhb_df2.groupby('quantile').agg({f'{event}_pred': 'mean', event: 'mean'}).reset_index()
    
    # Step 3: Plot the predictions and actuals
    ax.plot(quantile_means['quantile'], quantile_means[f'{event}_pred'], label=f'Average {event}_pred', marker='o')
    ax.plot(quantile_means['quantile'], quantile_means[event], label=f'Average {event}', marker='x')
    
    # Add subplot details
    ax.set_title(f'{event} Predictions vs Actuals')
    ax.set_xlabel('Quantile')
    ax.set_ylabel('Average Value')
    ax.legend()
    ax.grid()

# Adjust layout to prevent overlap
plt.tight_layout()
plt.show()

##### Yearly Trends

In [None]:
lhb_df2['year'] = lhb_df2['game_date'].str[:4]
event = 'hr'
lhb_df2.groupby('year')[[event, f'{event}_pred',  f'{event}_pfx', f'{event}_wfx']].mean()

##### Park Differences

In [None]:
rhb_df2['safe'] = rhb_df2[['b1', 'b2', 'b3', 'hr', 'bb', 'hbp']].sum(axis=1)
rhb_df2['out'] = rhb_df2[['so', 'go', 'lo', 'po', 'fo']].sum(axis=1)
rhb_df2['safe_pred'] = rhb_df2[['b1_pred', 'b2_pred', 'b3_pred', 'hr_pred', 'bb_pred', 'hbp_pred']].sum(axis=1)
rhb_df2['out_pred'] = rhb_df2[['so_pred', 'go_pred', 'lo_pred', 'po_pred', 'fo_pred']].sum(axis=1)

lhb_df2['safe'] = lhb_df2[['b1', 'b2', 'b3', 'hr', 'bb', 'hbp']].sum(axis=1)
lhb_df2['out'] = lhb_df2[['so', 'go', 'lo', 'po', 'fo']].sum(axis=1)
lhb_df2['safe_pred'] = lhb_df2[['b1_pred', 'b2_pred', 'b3_pred', 'hr_pred', 'bb_pred', 'hbp_pred']].sum(axis=1)
lhb_df2['out_pred'] = lhb_df2[['so_pred', 'go_pred', 'lo_pred', 'po_pred', 'fo_pred']].sum(axis=1)


In [None]:
park_error = rhb_df2[rhb_df2['venue_id'].astype('int').isin(team_map['VENUE_ID'])].groupby('venue_id')[['safe', 'safe_pred']].mean()
park_error['diff'] = park_error['safe'] - park_error['safe_pred']
park_error.sort_values('diff')

##### Park and Park x Weather Effects

In [None]:
rhb_df2.drop_duplicates('venue_id', keep='last')[['venue_id'] + [col for col in rhb_df2.columns if col.endswith("_pfx")] + [col for col in rhb_df2.columns if col.endswith("_wfx")]].sort_values('venue_id')

### Generate Park and Weather Factors files

In [None]:
multiplier_dataset = pd.read_csv(os.path.join(baseball_path, "Multiplier Dataset.csv"))

In [None]:
# Select columns to keep
keep_columns = ['gamePk', 'game_date', 'date', 'venue_id', 'away_name', 'home_name', 'x_vect', 'y_vect', 'temperature'] + [col for col in multiplier_dataset.columns if "_wfx" in col]

multiplier_dataset = multiplier_dataset.sort_values('date')

# read_csv parses the all-digit 'date' column as a number, so comparing it with the string
# "20220101" would fail. Work with a numeric key and build the YYYYMMDD string explicitly.
date_keys = pd.to_numeric(multiplier_dataset['date'], errors='coerce')
recent_rows = pd.to_datetime(multiplier_dataset['game_date']).dt.year >= 2022

# One file per date that has at least one game from 2022 onward (rows without a date are skipped)
for date_value in date_keys[recent_rows].dropna().unique():
    date = str(int(date_value))
    print(date)
    if date > "20220101":
        # Subset by date
        daily_weather_df = multiplier_dataset.loc[date_keys == date_value, keep_columns]

        # Write to CSV
        daily_weather_df.to_csv(os.path.join(baseball_path, "A06. Weather", "3. Park and Weather Factors", f"{date} Park and Weather Factors.csv"), index=False)

In [None]:
multiplier_dataset.head()

### Note: Rerun B01. Matchups.ipynb if new historic Park x Weather Effects are generated