# M06. Park and Weather Factors
This notebook creates estimates of event-rate multipliers based on park and weather conditions.
- Type: Model
- Run Frequency: Irregular
- Sources:
    - MLB API
- Dates:
    - Created: 4/19/2024
    - Updated: 4/21/2024

### Imports

In [1]:
%run "U1. Imports.ipynb"
%run "U2. Utilities.ipynb"
%run "U3. Classes.ipynb"

In [2]:
%run "A02. MLB API.ipynb"
%run "A03. Steamer.ipynb"

### Dataset

Create dataset of plays with rolling stats

In [None]:
# Build the play-level dataset for the 2015 and 2024 bounds passed below.
# NOTE(review): short/long are presumably the rolling-window sizes and adjust the
# park-adjustment toggle -- confirm against create_pa_inputs.
complete_dataset = create_pa_inputs(
    park_factors,
    team_map,
    2015,
    2024,
    short=50,
    long=300,
    adjust=True,
)

In [None]:
# UPDATE IN IMPORTS
# One column name per season, 2015 through 2024 inclusive
year_inputs = list(map("year_{}".format, range(2015, 2025)))

Remove imputed players

In [None]:
# Keep only rows where neither the batter nor the pitcher inputs were imputed.
# The explicit copy makes unimputed_df an independent frame, so the column
# assignments performed on it later do not trigger SettingWithCopyWarning.
unimputed_df = complete_dataset.query('imp_b == 0 and imp_p == 0').copy()

Remove park adjustments from outcomes

In [None]:
# Collapse every event column to a 0/1 indicator (1 when the stored value is positive)
unimputed_df[events_list] = unimputed_df[events_list].gt(0).astype(int)

Create game averages

In [None]:
# Column names for the "_b" and "_p" variants of every event
events_list_b = [event + "_b" for event in events_list]
events_list_p = [event + "_p" for event in events_list]

In [None]:
# Group stats by game/venue/batSide to get model inputs
model_columns = events_list + events_list_b + events_list_p + ['x_vect', 'y_vect', 'temperature'] + year_inputs
game_df = unimputed_df.groupby(['gamePk', 'venue_id', 'batSide'])[model_columns].mean().reset_index()

# Add in venue dummies, only keeping active parks.
# reindex (instead of plain column selection) avoids a KeyError when an active park
# has no games in the data; such a park simply receives an all-zero dummy column.
# Venue ids are compared as strings so the dummy labels match active_parks.
active_parks = list(team_map['VENUE_ID'].astype(str))
venue_dummies = pd.get_dummies(game_df['venue_id'].astype(str)).astype(int)
venue_dummies = venue_dummies.reindex(columns=active_parks, fill_value=0)
game_df = pd.concat([game_df, venue_dummies], axis=1)

# Only keep games in active parks (exactly one active-park dummy is set)
game_df['active_park'] = game_df[active_parks].sum(axis=1)
game_df = game_df.query('active_park == 1')
game_df = game_df.dropna().reset_index(drop=True)

# Create a mapping dictionary for renaming columns: "<id>" -> "venue_<id>"
active_venues = [f"venue_{park}" for park in active_parks]
column_mapping = {col: 'venue_' + col for col in active_parks}

# Rename only the columns present in active_parks
game_df = game_df.rename(columns=column_mapping)

# Add lefty dummy
game_df['lefty'] = (game_df['batSide'] == "L").astype(int)

In [None]:
# Step 2: Create interaction terms
# Every interaction column is collected in a dict and attached with a single concat.
# Inserting hundreds of columns one at a time fragments the DataFrame and triggers
# pandas' PerformanceWarning. Insertion order is preserved, so interaction_terms
# lists the columns in the same order as before.
weather_inputs = ['x_vect', 'y_vect', 'temperature']
interaction_columns = {}

# Weather x Park
for col1 in weather_inputs:
    for col2 in active_venues:
        interaction_columns[col1 + '_' + col2] = game_df[col1] * game_df[col2]
# Weather x Park x batSide
for col1 in weather_inputs:
    for col2 in active_venues:
        for col3 in ['lefty']:
            interaction_columns[col1 + '_' + col2 + '_' + col3] = game_df[col1] * game_df[col2] * game_df[col3]
# Year x Park
for col1 in year_inputs:
    for col2 in active_venues:
        interaction_columns[col1 + '_' + col2] = game_df[col1] * game_df[col2]
# Year x Park x batSide
for col1 in year_inputs:
    for col2 in active_venues:
        for col3 in ['lefty']:
            interaction_columns[col1 + '_' + col2 + '_' + col3] = game_df[col1] * game_df[col2] * game_df[col3]
# batSide x Park
for col1 in ['lefty']:
    for col2 in active_venues:
        interaction_columns[col1 + '_' + col2] = game_df[col1] * game_df[col2]
# Weather x Weather (x_vect enters these terms by magnitude only)
interaction_columns['x_vect_temperature'] = np.abs(game_df['x_vect']) * game_df['temperature']
interaction_columns['y_vect_temperature'] = game_df['y_vect'] * game_df['temperature']
interaction_columns['x_vect_y_vect'] = np.abs(game_df['x_vect']) * game_df['y_vect']

interaction_terms = list(interaction_columns)

# Drop any stale copies first so re-running the cell replaces columns instead of duplicating them
game_df = pd.concat(
    [
        game_df.drop(columns=interaction_terms, errors='ignore'),
        pd.DataFrame(interaction_columns, index=game_df.index),
    ],
    axis=1,
)

park x weather
park x weather x lefty
park x year 
park x year x lefty
park x lefty
weather x weather

### Regressions

##### 1B

In [None]:
# Step 3: Define features and target variable
# Features: "_b"/"_p" rates, weather, handedness, year columns and all interaction terms
X = game_df[['b1_b', 'b1_p', 'x_vect', 'y_vect', 'temperature', 'lefty'] + year_inputs + interaction_terms]
y = game_df['b1']

# Step 4: Fit a linear regression model
b1_model = LinearRegression()
b1_model.fit(X, y)

# Save the fitted model; the context manager closes the file handle even if pickling fails
with open(os.path.join(model_path, f"Weather Model - b1 {todaysdate}"), 'wb') as model_file:
    pickle.dump(b1_model, model_file)

# Replace the "_b"/"_p" inputs with their dataset averages so that predictions
# vary only with the remaining (park, weather, year, handedness) inputs
X_copy = X.copy()
X_copy['b1_b'] = unimputed_df['b1_b'].mean()
X_copy['b1_p'] = unimputed_df['b1_p'].mean()

# Step 5: Make predictions
predictions = b1_model.predict(X_copy)

# Step 6: Calculate deciles based on predicted b1
game_df['predicted_b1'] = predictions
game_df['decile_b1'] = pd.qcut(game_df['predicted_b1'], 10, labels=False)

##### 2B

In [None]:
# Step 3: Define features and target variable
# Features: "_b"/"_p" rates, weather, handedness, year columns and all interaction terms
X = game_df[['b2_b', 'b2_p', 'x_vect', 'y_vect', 'temperature', 'lefty'] + year_inputs + interaction_terms]
y = game_df['b2']

# Step 4: Fit a linear regression model
b2_model = LinearRegression()
b2_model.fit(X, y)

# Save the fitted model; the context manager closes the file handle even if pickling fails
with open(os.path.join(model_path, f"Weather Model - b2 {todaysdate}"), 'wb') as model_file:
    pickle.dump(b2_model, model_file)

# Replace the "_b"/"_p" inputs with their dataset averages so that predictions
# vary only with the remaining (park, weather, year, handedness) inputs
X_copy = X.copy()
X_copy['b2_b'] = unimputed_df['b2_b'].mean()
X_copy['b2_p'] = unimputed_df['b2_p'].mean()

# Step 5: Make predictions
predictions = b2_model.predict(X_copy)

# Step 6: Calculate deciles based on predicted b2
game_df['predicted_b2'] = predictions
game_df['decile_b2'] = pd.qcut(game_df['predicted_b2'], 10, labels=False)

##### 3B

In [None]:
# Step 3: Define features and target variable
# Features: "_b"/"_p" rates, weather, handedness, year columns and all interaction terms
X = game_df[['b3_b', 'b3_p', 'x_vect', 'y_vect', 'temperature', 'lefty'] + year_inputs + interaction_terms]
y = game_df['b3']

# Step 4: Fit a linear regression model
b3_model = LinearRegression()
b3_model.fit(X, y)

# Save the fitted model; the context manager closes the file handle even if pickling fails
with open(os.path.join(model_path, f"Weather Model - b3 {todaysdate}"), 'wb') as model_file:
    pickle.dump(b3_model, model_file)

# Replace the "_b"/"_p" inputs with their dataset averages so that predictions
# vary only with the remaining (park, weather, year, handedness) inputs
X_copy = X.copy()
X_copy['b3_b'] = unimputed_df['b3_b'].mean()
X_copy['b3_p'] = unimputed_df['b3_p'].mean()

# Step 5: Make predictions
predictions = b3_model.predict(X_copy)

# Step 6: Calculate deciles based on predicted b3
game_df['predicted_b3'] = predictions
game_df['decile_b3'] = pd.qcut(game_df['predicted_b3'], 10, labels=False)

##### HR

In [None]:
# Step 3: Define features and target variable
# Features: "_b"/"_p" rates, weather, handedness, year columns and all interaction terms
X = game_df[['hr_b', 'hr_p', 'x_vect', 'y_vect', 'temperature', 'lefty'] + year_inputs + interaction_terms]
y = game_df['hr']

# Step 4: Fit a linear regression model
hr_model = LinearRegression()
hr_model.fit(X, y)

# Save the fitted model; the context manager closes the file handle even if pickling fails
with open(os.path.join(model_path, f"Weather Model - hr {todaysdate}"), 'wb') as model_file:
    pickle.dump(hr_model, model_file)

# Replace the "_b"/"_p" inputs with their dataset averages so that predictions
# vary only with the remaining (park, weather, year, handedness) inputs
X_copy = X.copy()
X_copy['hr_b'] = unimputed_df['hr_b'].mean()
X_copy['hr_p'] = unimputed_df['hr_p'].mean()

# Step 5: Make predictions
predictions = hr_model.predict(X_copy)

# Step 6: Calculate deciles based on predicted hr
game_df['predicted_hr'] = predictions
game_df['decile_hr'] = pd.qcut(game_df['predicted_hr'], 10, labels=False)

##### BB

In [None]:
# Step 3: Define features and target variable
# Features: "_b"/"_p" rates, weather, handedness, year columns and all interaction terms
X = game_df[['bb_b', 'bb_p', 'x_vect', 'y_vect', 'temperature', 'lefty'] + year_inputs + interaction_terms]
y = game_df['bb']

# Step 4: Fit a linear regression model
bb_model = LinearRegression()
bb_model.fit(X, y)

# Save the fitted model; the context manager closes the file handle even if pickling fails
with open(os.path.join(model_path, f"Weather Model - bb {todaysdate}"), 'wb') as model_file:
    pickle.dump(bb_model, model_file)

# Replace the "_b"/"_p" inputs with their dataset averages so that predictions
# vary only with the remaining (park, weather, year, handedness) inputs
X_copy = X.copy()
X_copy['bb_b'] = unimputed_df['bb_b'].mean()
X_copy['bb_p'] = unimputed_df['bb_p'].mean()

# Step 5: Make predictions
predictions = bb_model.predict(X_copy)

# Step 6: Calculate deciles based on predicted bb
game_df['predicted_bb'] = predictions
game_df['decile_bb'] = pd.qcut(game_df['predicted_bb'], 10, labels=False)

##### HBP

In [None]:
# Step 3: Define features and target variable
# Features: "_b"/"_p" rates, weather, handedness, year columns and all interaction terms
X = game_df[['hbp_b', 'hbp_p', 'x_vect', 'y_vect', 'temperature', 'lefty'] + year_inputs + interaction_terms]
y = game_df['hbp']

# Step 4: Fit a linear regression model
hbp_model = LinearRegression()
hbp_model.fit(X, y)

# Save the fitted model; the context manager closes the file handle even if pickling fails
with open(os.path.join(model_path, f"Weather Model - hbp {todaysdate}"), 'wb') as model_file:
    pickle.dump(hbp_model, model_file)

# Replace the "_b"/"_p" inputs with their dataset averages so that predictions
# vary only with the remaining (park, weather, year, handedness) inputs
X_copy = X.copy()
X_copy['hbp_b'] = unimputed_df['hbp_b'].mean()
X_copy['hbp_p'] = unimputed_df['hbp_p'].mean()

# Step 5: Make predictions
predictions = hbp_model.predict(X_copy)

# Step 6: Calculate deciles based on predicted hbp
game_df['predicted_hbp'] = predictions
game_df['decile_hbp'] = pd.qcut(game_df['predicted_hbp'], 10, labels=False)

##### SO

In [None]:
# Step 3: Define features and target variable
# Features: "_b"/"_p" rates, weather, handedness, year columns and all interaction terms
X = game_df[['so_b', 'so_p', 'x_vect', 'y_vect', 'temperature', 'lefty'] + year_inputs + interaction_terms]
y = game_df['so']

# Step 4: Fit a linear regression model
so_model = LinearRegression()
so_model.fit(X, y)

# Save the fitted model; the context manager closes the file handle even if pickling fails
with open(os.path.join(model_path, f"Weather Model - so {todaysdate}"), 'wb') as model_file:
    pickle.dump(so_model, model_file)

# Replace the "_b"/"_p" inputs with their dataset averages so that predictions
# vary only with the remaining (park, weather, year, handedness) inputs
X_copy = X.copy()
X_copy['so_b'] = unimputed_df['so_b'].mean()
X_copy['so_p'] = unimputed_df['so_p'].mean()

# Step 5: Make predictions
predictions = so_model.predict(X_copy)

# Step 6: Calculate deciles based on predicted so
game_df['predicted_so'] = predictions
game_df['decile_so'] = pd.qcut(game_df['predicted_so'], 10, labels=False)

##### FO

In [None]:
# Step 3: Define features and target variable
# Features: "_b"/"_p" rates, weather, handedness, year columns and all interaction terms
X = game_df[['fo_b', 'fo_p', 'x_vect', 'y_vect', 'temperature', 'lefty'] + year_inputs + interaction_terms]
y = game_df['fo']

# Step 4: Fit a linear regression model
fo_model = LinearRegression()
fo_model.fit(X, y)

# Save the fitted model; the context manager closes the file handle even if pickling fails
with open(os.path.join(model_path, f"Weather Model - fo {todaysdate}"), 'wb') as model_file:
    pickle.dump(fo_model, model_file)

# Replace the "_b"/"_p" inputs with their dataset averages so that predictions
# vary only with the remaining (park, weather, year, handedness) inputs
X_copy = X.copy()
X_copy['fo_b'] = unimputed_df['fo_b'].mean()
X_copy['fo_p'] = unimputed_df['fo_p'].mean()

# Step 5: Make predictions
predictions = fo_model.predict(X_copy)

# Step 6: Calculate deciles based on predicted fo
game_df['predicted_fo'] = predictions
game_df['decile_fo'] = pd.qcut(game_df['predicted_fo'], 10, labels=False)

##### GO

In [None]:
# Step 3: Define features and target variable
# Features: "_b"/"_p" rates, weather, handedness, year columns and all interaction terms
X = game_df[['go_b', 'go_p', 'x_vect', 'y_vect', 'temperature', 'lefty'] + year_inputs + interaction_terms]
y = game_df['go']

# Step 4: Fit a linear regression model
go_model = LinearRegression()
go_model.fit(X, y)

# Save the fitted model; the context manager closes the file handle even if pickling fails
with open(os.path.join(model_path, f"Weather Model - go {todaysdate}"), 'wb') as model_file:
    pickle.dump(go_model, model_file)

# Replace the "_b"/"_p" inputs with their dataset averages so that predictions
# vary only with the remaining (park, weather, year, handedness) inputs
X_copy = X.copy()
X_copy['go_b'] = unimputed_df['go_b'].mean()
X_copy['go_p'] = unimputed_df['go_p'].mean()

# Step 5: Make predictions
predictions = go_model.predict(X_copy)

# Step 6: Calculate deciles based on predicted go
game_df['predicted_go'] = predictions
game_df['decile_go'] = pd.qcut(game_df['predicted_go'], 10, labels=False)

##### LO

In [None]:
# Step 3: Define features and target variable
# Features: "_b"/"_p" rates, weather, handedness, year columns and all interaction terms
X = game_df[['lo_b', 'lo_p', 'x_vect', 'y_vect', 'temperature', 'lefty'] + year_inputs + interaction_terms]
y = game_df['lo']

# Step 4: Fit a linear regression model
lo_model = LinearRegression()
lo_model.fit(X, y)

# Save the fitted model; the context manager closes the file handle even if pickling fails
with open(os.path.join(model_path, f"Weather Model - lo {todaysdate}"), 'wb') as model_file:
    pickle.dump(lo_model, model_file)

# Replace the "_b"/"_p" inputs with their dataset averages so that predictions
# vary only with the remaining (park, weather, year, handedness) inputs
X_copy = X.copy()
X_copy['lo_b'] = unimputed_df['lo_b'].mean()
X_copy['lo_p'] = unimputed_df['lo_p'].mean()

# Step 5: Make predictions
predictions = lo_model.predict(X_copy)

# Step 6: Calculate deciles based on predicted lo
game_df['predicted_lo'] = predictions
game_df['decile_lo'] = pd.qcut(game_df['predicted_lo'], 10, labels=False)

##### PO

In [None]:
# Step 3: Define features and target variable
# Features: "_b"/"_p" rates, weather, handedness, year columns and all interaction terms
X = game_df[['po_b', 'po_p', 'x_vect', 'y_vect', 'temperature', 'lefty'] + year_inputs + interaction_terms]
y = game_df['po']

# Step 4: Fit a linear regression model
po_model = LinearRegression()
po_model.fit(X, y)

# Save the fitted model; the context manager closes the file handle even if pickling fails
with open(os.path.join(model_path, f"Weather Model - po {todaysdate}"), 'wb') as model_file:
    pickle.dump(po_model, model_file)

# Replace the "_b"/"_p" inputs with their dataset averages so that predictions
# vary only with the remaining (park, weather, year, handedness) inputs
X_copy = X.copy()
X_copy['po_b'] = unimputed_df['po_b'].mean()
X_copy['po_p'] = unimputed_df['po_p'].mean()

# Step 5: Make predictions
predictions = po_model.predict(X_copy)

# Step 6: Calculate deciles based on predicted po
game_df['predicted_po'] = predictions
game_df['decile_po'] = pd.qcut(game_df['predicted_po'], 10, labels=False)

### Graph

In [None]:
# Event whose predicted-vs-actual curve is drawn below
event = 'b2'

# Step 7: Average predicted and actual rate within each prediction decile (venue "3" only)
predicted_col = f'predicted_{event}'
venue_games = game_df[game_df['venue_id'] == "3"]
decile_means = venue_games.groupby(f'decile_{event}').agg({predicted_col: 'mean', event: 'mean'})

# Step 8: Plot predicted and actual rates against decile
plt.figure(figsize=(10, 6))
for column, series_label in ((predicted_col, f'Predicted {event}'), (event, f'Actual {event}')):
    plt.plot(decile_means.index, decile_means[column], label=series_label, marker='o')

plt.xlabel('Decile')
plt.ylabel(f'{event}')
plt.title(f'Actual vs Predicted {event} by Decile')
plt.legend()
plt.show()

In [None]:
# Total out rate = sum over the five out-type events, for predictions and for actuals
out_events = ['so', 'go', 'lo', 'fo', 'po']
game_df['is_out_pred'] = game_df[[f'predicted_{out_event}' for out_event in out_events]].sum(axis=1)
game_df['is_out'] = game_df[out_events].sum(axis=1)

In [None]:
# Label used in the plot legend, axis and title
event = 'out'

# All-parks alternative (kept for reference):
# game_df['decile_out'] = pd.qcut(game_df['is_out_pred'], 10, labels=False)
# decile_means = game_df.groupby('decile_out').agg({'is_out_pred': 'mean', 'is_out': 'mean'})

# Deciles of predicted out rate are computed within venue "3" only; the assignment
# aligns on index, so rows from every other venue receive NaN in decile_out.
venue_mask = game_df['venue_id'] == "3"
game_df['decile_out'] = pd.qcut(game_df.loc[venue_mask, 'is_out_pred'], 10, labels=False)
decile_means = game_df[venue_mask].groupby('decile_out')[['is_out_pred', 'is_out']].mean()

# Step 8: Plot predicted and actual out rate against decile
plt.figure(figsize=(10, 6))
for column, series_label in (('is_out_pred', f'Predicted {event}'), ('is_out', f'Actual {event}')):
    plt.plot(decile_means.index, decile_means[column], label=series_label, marker='o')

# Set the y-axis limits
plt.ylim(0.62, 0.72)

plt.xlabel('Decile')
plt.ylabel(f'{event}')
plt.title(f'Actual vs Predicted {event} by Decile')
plt.legend()
plt.show()

In [None]:
# Mean actual out rate and weather inputs within each out-decile for venue "3"
summary_columns = ['is_out', 'x_vect', 'y_vect', 'temperature']
game_df[game_df['venue_id'] == "3"].groupby('decile_out')[summary_columns].mean(numeric_only=True)

### Display Park/Weather Factors

In [None]:
# Event for which the park/weather factor table is built
event = 'hr'

# Calculate park factors (=predicted rate/average rate)
league_rate = game_df[event].mean()
game_df[f'{event}_factor'] = game_df[f'predicted_{event}'] / league_rate

# Team lookup with venue ids cast to str, the form used for game_df['venue_id'] filters.
# The explicit copy keeps the astype assignment from writing into a slice of team_map
# (which raises SettingWithCopyWarning).
merge_map = team_map[['BBREFTEAM', 'VENUE_ID']].copy()
merge_map['VENUE_ID'] = merge_map['VENUE_ID'].astype(str)

# Restrict to parks listed in team_map (computed once, used for both summaries)
active_games = game_df[game_df['venue_id'].isin(merge_map['VENUE_ID'])]

# Calculate averages by park
park_averages = active_games.groupby('venue_id')[[f'decile_{event}', f'{event}', f'predicted_{event}', f'{event}_b', f'{event}_p']].mean().sort_values(by=[f'decile_{event}'], ascending=False).reset_index()
# Calculate factor summary statistics by park
park_descriptions = active_games.groupby('venue_id')[f'{event}_factor'].describe().sort_values('mean', ascending=False)

# Merge on team information
park_descriptions = park_descriptions.merge(merge_map, left_on=['venue_id'], right_on=['VENUE_ID'], how='left')
# Merge on park averages
park_descriptions = park_descriptions.merge(park_averages, left_on=['VENUE_ID'], right_on=['venue_id'], how='left')

# Calculate park factors from the park-level mean prediction
park_descriptions[f'{event}_factor'] = park_descriptions[f'predicted_{event}'] / league_rate
park_descriptions = park_descriptions[['BBREFTEAM', 'VENUE_ID', f'{event}_factor', f'{event}', f'predicted_{event}', f'{event}_b', f'{event}_p', f'decile_{event}', 'mean', 'count', 'std', 'min', '25%', '50%', '75%', 'max']]

park_descriptions

Future Considerations:
- Consider rolling park factors to avoid using year
- Clean to remove certain anomalous games (wind at the Trop, for example)

In [None]:
# Mean of each event column over the unimputed rows (the columns were binarized to 0/1
# earlier in this notebook, so each mean is the share of rows with that event)
unimputed_df[events_list].mean()

In [None]:
# Re-display the park factor table built in the "Display Park/Weather Factors" cell
park_descriptions