# 006B. Park and Weather Factors

Note: averages are rolling over plate appearances (PAs) rather than calendar dates. It is not clear whether calendar-date windows would be better: they have advantages for seasonality, but better hitting weather should be reflected in both the numerator and the denominator even without that, so it might not matter.

### Imports

In [1]:
# Load the shared imports, utilities and classes when this notebook is run on its own.
# If `running_pipeline` is already defined in the namespace, the notebooks are assumed
# to have been run already and this step is skipped.
# NOTE(review): the paths below are absolute and machine-specific.
if "running_pipeline" not in globals():
    print("Running imports...")
    %run "C:\Users\james\Documents\MLB\Code\U1. Imports.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U2. Utilities.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U3. Classes.ipynb"
    print("Imports in.")
else:
    print("Imports already in.")

Running imports...
Imports in.


We want some functions from MLB API notebook

In [2]:
# Set before running the MLB API notebook; presumably tells that notebook to skip
# building its datasets so only its functions are loaded — TODO confirm
run_datasets = False

In [3]:
# Run the MLB API notebook so its functions are available in this namespace
%run "C:\Users\james\Documents\MLB\Code\002. MLB API.ipynb"

### Games

In [4]:
# Build game_df only when this notebook runs on its own
if "running_pipeline" in globals():
    print("game_df already generated.")
else:
    # Single-day range: yesterday only
    start_date = end_date = yesterdaysdate
    print(f"Creating game_df for {start_date} to {end_date}...")
    # Generate the full game list, then keep games inside the (inclusive) date range
    game_df = (
        read_and_save_games(team_map, generate=True)
        .loc[lambda games: (games['date'] >= start_date) & (games['date'] <= end_date)]
        .reset_index(drop=True)
    )
    print("game_df created.")

Creating game_df for 20240817 to 20240817...
game_df created.


### Notes

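Park and team rolling averages are NOT shifted (the value on a date includes that date's games). League averages are shifted by one date, so they reflect stats through the end of the prior date.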

Calculate Park Factors <br>
Park Factor = Rate at Park (both teams) / Rate in Home Team's Away Games (both teams) <br>
For example: Fenway Park HR Factor = HR rate at Fenway / HR rate in games where Red Sox are away team

### Functions

##### Period Averages

Average of stats league-wide over period of interest, used as base for calculating multipliers

In [5]:
def period_averages(df, start_date="2015-01-01"):
    """Return the mean of every event column over the period of interest.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a `game_date` column and one column per entry of `events_list`.
        The frame is not modified.
    start_date : str or datetime-like, default "2015-01-01"
        First date (inclusive) of the period of interest.

    Returns
    -------
    pandas.DataFrame
        One row (index 0) with one column per event, holding that event's mean.
    """
    # Parse dates into a separate Series so the caller's frame is left untouched
    game_dates = pd.to_datetime(df['game_date'])

    # Select period of interest
    in_period = game_dates >= pd.Timestamp(start_date)

    # Calculate averages over period of interest
    period_avgs = pd.DataFrame(df.loc[in_period, events_list].mean()).T

    return period_avgs

##### Game Averages

Averages of stats that occurred in the game

In [6]:
def game_averages(df):
    """Return one row per game with the mean of each event column in that game.

    Rows are grouped by game, date, venue and the game's weather values
    (`x_vect`, `y_vect`, `temperature`); the grouping keys come back as columns.
    """
    game_keys = ['gamePk', 'game_date', 'venue_id', 'x_vect', 'y_vect', 'temperature']

    # Mean of each event within a game
    per_game = df.groupby(game_keys)[events_list].mean()

    return per_game.reset_index()

##### Player Averages

Averages of stats for players in game, coming into the game

In [7]:
def player_averages(df):
    """Return per-game averages of the batters' and pitchers' pre-game long-run stats.

    For every (game, batter) and (game, pitcher) pair, the first available value of each
    `{event}_b_long` / `{event}_p_long` column is applied to all of that player's rows in
    the game, so nothing generated in-game is reflected. The values are then averaged
    over all rows of the game, which weights each player by their number of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `gamePk`, `batter`, `pitcher` and the `_b_long` / `_p_long` columns
        for every entry of `events_list`. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per `gamePk` with the averaged `_b_long` and `_p_long` columns.
    """
    # Note: these are already shifted if using create_pa_inputs from A02.
    # Stats to average
    batter_inputs = [f"{event}_b_long" for event in events_list]
    pitcher_inputs = [f"{event}_p_long" for event in events_list]

    # Entering-the-game value per player, broadcast to every row of that player's game.
    # Done instead of dropping duplicates so the game averages stay weighted by row count.
    # Kept in side frames rather than written back into df.
    batter_entering = df.groupby(['gamePk', 'batter'])[batter_inputs].transform('first')
    pitcher_entering = df.groupby(['gamePk', 'pitcher'])[pitcher_inputs].transform('first')

    # transform() keeps df's row order, so gamePk can be attached positionally
    game_ids = df['gamePk'].to_numpy()

    # Calculate player averages by game
    batter_avgs = batter_entering.assign(gamePk=game_ids).groupby('gamePk')[batter_inputs].mean().reset_index()
    pitcher_avgs = pitcher_entering.assign(gamePk=game_ids).groupby('gamePk')[pitcher_inputs].mean().reset_index()

    # Merge together
    player_avgs = pd.merge(batter_avgs, pitcher_avgs, on='gamePk', how='inner')

    return player_avgs

##### League Averages

Averages of stats league-wide, coming into the game

In [8]:
def league_averages(df, league_window, league_window_min, base_year=2014):
    """Return league-wide rolling event rates per date, expressed relative to a base year.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `game_date` and one column per entry of `events_list`.
    league_window : int
        Rolling window size, in rows of `df`.
    league_window_min : int
        Minimum number of rows required for a rolling value.
    base_year : int, default 2014
        Year whose same-month/same-day value is used as the denominator.

    Returns
    -------
    pandas.DataFrame
        One row per `game_date` with a `{event}_league` multiplier per event
        (rolling rate through the prior date / base-year rate; 1 where undefined).
    """
    # Calculate rolling average of stats
    rolling_rates = df[events_list].rolling(window=league_window, min_periods=league_window_min).mean()
    rolling_rates.columns = [f'{col}_league' for col in rolling_rates.columns]

    # Keep column names in a list
    column_names = list(rolling_rates.columns)

    # Add game date onto stats and keep the last row of each date
    league_avgs = pd.concat([df[['game_date']], rolling_rates], axis=1)
    league_avgs = league_avgs.drop_duplicates('game_date', keep='last').copy()

    # Shift so dates reflect stats through the end of the prior date
    league_avgs[column_names] = league_avgs[column_names].shift(1)

    # Create date variables
    league_avgs['game_date'] = pd.to_datetime(league_avgs['game_date'])
    league_avgs['month'] = league_avgs['game_date'].dt.month
    league_avgs['day'] = league_avgs['game_date'].dt.day

    # Subset base year
    base_year_df = league_avgs[league_avgs['game_date'].dt.year == base_year]

    # Merge on base year (same month and day)
    league_avgs = pd.merge(league_avgs, base_year_df, on=['month', 'day'], how='left', suffixes=("", "_base"))

    # Forward fill base-year columns for month/day combinations missing from the base year
    base_columns = [col for col in league_avgs.columns if '_base' in col]
    league_avgs[base_columns] = league_avgs[base_columns].ffill()

    # Create multipliers; assign the filled result back instead of a chained in-place fillna
    for column in column_names:
        multiplier = league_avgs[column] / league_avgs[f'{column}_base']
        league_avgs[column] = multiplier.fillna(1)

    league_avgs = league_avgs[['game_date'] + column_names]

    return league_avgs

##### Park Averages

Averages of stats at park

In [9]:
def park_averages(df, park_window, park_window_min, shift=False):
    """Return rolling event rates at each park, one row per (date, venue).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `venue_id`, `game_date`, `home_name` and one column per entry of
        `events_list`. Rows are assumed to be in chronological order.
    park_window : int
        Rolling window size, in rows of `df` at the venue.
    park_window_min : int
        Minimum number of rows required for a rolling value.
    shift : bool, default False
        If True, shift each venue's values by one date so a date reflects stats through
        the end of the venue's prior date. The default keeps the existing behaviour, in
        which the value on a date includes that date's games (the shift in the previous
        implementation never took effect).

    Returns
    -------
    pandas.DataFrame
        Columns `game_date`, `home_name`, `venue_id` and `{event}_park` per event.
    """
    # Calculate rolling averages by park
    rolling_rates = df.groupby('venue_id')[events_list].rolling(window=park_window, min_periods=park_window_min).mean()

    # Move venue_id out of the index; what remains is df's own row index
    rolling_rates = rolling_rates.reset_index(level=0, drop=False)

    # Rename columns to indicate they are park averages
    rolling_rates = rolling_rates.rename(columns={event: f"{event}_park" for event in events_list})

    # Sort to return to df's row ordering
    rolling_rates = rolling_rates.sort_index(ascending=True)

    # Add in date and home team
    park_avgs = pd.concat([df[['game_date', 'home_name']], rolling_rates], axis=1)

    # Only keep one observation per park and date (the last row of that date)
    park_avgs = park_avgs.drop_duplicates(['game_date', 'venue_id'], keep='last').copy()

    if shift:
        # Shift within each venue so dates reflect stats through the end of the prior date
        park_columns = [f"{event}_park" for event in events_list]
        park_avgs[park_columns] = park_avgs.groupby('venue_id')[park_columns].shift(1)

    return park_avgs

##### Team Averages

Rolling averages of stats in each team's away games

In [10]:
def team_averages(df, park_window, park_window_min):
    """Return rolling event rates over each team's away games, one row per (date, team).

    Rows are grouped by `away_name`, so each team's window covers the rows of games in
    which it is the away team. The window is counted in rows of `df`. The result has
    `game_date`, `away_name` and one `{event}_team` column per entry of `events_list`.
    """
    rolling_rates = (
        df.groupby('away_name')[events_list]
        .rolling(window=park_window, min_periods=park_window_min)
        .mean()
        # Move away_name out of the index; what remains is df's own row index
        .reset_index(level=0, drop=False)
        # Mark the columns as team averages
        .rename(columns={event: f"{event}_team" for event in events_list})
        # Back to df's row ordering
        .sort_index(ascending=True)
    )

    # Attach the date of each row
    team_avgs = pd.concat([df[['game_date']], rolling_rates], axis=1)

    # Keep the last row of each date for each away team
    team_avgs.drop_duplicates(['game_date', 'away_name'], keep='last', inplace=True)

    return team_avgs

##### Calculate Park Factors

In [11]:
# NOTE: this cell is entirely commented out. The same steps run inside create_dataset;
# kept here for reference only.
# # Identify last date in team_avgs before given dates in park_avgs
# latest_dates = []

# for index, row in park_avgs.iterrows():
#     # Filter team_avgs based on criteria
#     filtered_team_avgs = team_avgs[(team_avgs['away_name'] == row['home_name']) & (team_avgs['game_date'] < row['game_date'])]
    
#     # Find the latest date in the filtered dataframe
#     if not filtered_team_avgs.empty:
#         latest_date = filtered_team_avgs['game_date'].max()
#         latest_dates.append(latest_date)
#     else:
#         latest_dates.append(pd.NaT)  # Append NaT if no matching date found

# # Add the latest_dates to park_avgs
# park_avgs['last_road_date'] = latest_dates

# # Merge 
# factor_df = pd.merge(park_avgs, team_avgs, left_on=['home_name', 'last_road_date'], right_on=['away_name', 'game_date'], how='left', suffixes=("", "_"))

# # Loop over event rates and calculate factors
# for event in events_list:
#     factor_df[f'{event}_factor'] = factor_df[f'{event}_park'].astype(float) / factor_df[f'{event}_team'].astype(float)

# factor_df.tail()

##### Create Dataset

Calculates Park x Weather multipliers and all necessary components

In [12]:
def _last_road_dates(park_avgs, team_avgs):
    """For each park row, find the home team's latest away date strictly before the park date."""
    # Sorted, de-duplicated away dates per team, built once instead of re-filtering per row
    road_dates = {
        team: pd.DatetimeIndex(dates).dropna().unique().sort_values()
        for team, dates in team_avgs.groupby('away_name')['game_date']
    }

    latest_dates = []
    for home_name, game_date in zip(park_avgs['home_name'], park_avgs['game_date']):
        dates = road_dates.get(home_name)
        if dates is None or pd.isna(game_date):
            latest_dates.append(pd.NaT)  # No matching team or no usable date
            continue
        # Number of away dates strictly earlier than the park date
        position = dates.searchsorted(game_date, side='left')
        latest_dates.append(dates[position - 1] if position > 0 else pd.NaT)

    return latest_dates


def create_dataset(df, park_window, park_window_min, league_window, league_window_min, base_year, batSide="L"):
    """Build the per-game modelling dataset for one side of the plate.

    Combines game averages, player averages, league multipliers and park factors
    (park rolling rate / home team's away-game rolling rate) into one row per game.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain `game_type_x`, `batSide`, `away_name`, `home_name`,
        `game_date` and the columns required by the averaging functions.
        The caller's frame is not modified.
    park_window, park_window_min : int
        Rolling window and minimum rows for park and team averages.
    league_window, league_window_min : int
        Rolling window and minimum rows for league averages.
    base_year : int
        Base year passed to `league_averages`.
    batSide : str, default "L"
        Value of `batSide` to keep.

    Returns
    -------
    pandas.DataFrame
        One row per game, sorted by `game_date`.
    """
    # Only keep regular season games for one side of the plate.
    # reset_index returns a new frame, so the assignments below do not touch the caller's data.
    keep_rows = (df['game_type_x'] == "R") & (df['batSide'] == batSide)
    df = df[keep_rows].reset_index(drop=True)

    # Cleveland has two names in the data and needs to be treated as one. Normalising here
    # is enough: every frame built below takes its team names from df.
    for name_column in ['away_name', 'home_name']:
        df[name_column] = np.where(df[name_column] == "Cleveland Indians", "Cleveland Guardians", df[name_column])

    # Convert to datetime
    df['game_date'] = pd.to_datetime(df['game_date'])

    # Convert outputs to numeric (some are boolean)
    df[events_list] = df[events_list].astype('float64')

    ### Game, player, league, park and team averages
    game_avgs = game_averages(df)
    player_avgs = player_averages(df)
    league_avgs = league_averages(df, league_window, league_window_min, base_year)
    park_avgs = park_averages(df, park_window, park_window_min)
    team_avgs = team_averages(df, park_window, park_window_min)

    ### Calculate Park Factors from Park Averages and Team Averages
    # Identify the last date in team_avgs before each date in park_avgs, so we know what the
    # team was doing on the road leading up to its game at home
    park_avgs['last_road_date'] = _last_road_dates(park_avgs, team_avgs)

    # Merge the road averages as of that date onto the park averages
    factor_df = pd.merge(park_avgs, team_avgs, left_on=['home_name', 'last_road_date'], right_on=['away_name', 'game_date'], how='left', suffixes=("", "_"))

    # Calculate Park Factors
    for event in events_list:
        factor_df[f'{event}_factor'] = factor_df[f'{event}_park'].astype(float) / factor_df[f'{event}_team'].astype(float)

    # Keep relevant columns
    park_columns = [column for column in factor_df if "_factor" in column] + [column for column in factor_df if "_park" in column] + [column for column in factor_df if "_team" in column]
    keep_columns = ['home_name', 'game_date', 'venue_id'] + park_columns
    factor_df = factor_df[keep_columns].copy()

    factor_df['game_date'] = pd.to_datetime(factor_df['game_date'])

    ### Merge
    dataset = pd.merge(game_avgs, player_avgs, on='gamePk', how='inner')
    dataset = pd.merge(dataset, league_avgs, on='game_date', how='inner')
    dataset = pd.merge(dataset, factor_df, on=['game_date', 'venue_id'], how='inner')

    # Sort and reset index
    dataset = dataset.sort_values('game_date', ascending=True).reset_index(drop=True)

    return dataset

Subsets and cleans dataset

In [13]:
def clean_dataset(dataset):
    """Restrict the dataset to active ballparks from 2015 on and add park x weather terms.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain `venue_id`, `game_date`, `x_vect`, `y_vect` and `temperature`.
        `venue_id` must be castable to int.

    Returns
    -------
    tuple
        `(active_dataset, weather_interactions)`: the filtered frame with `venue_{id}`
        dummies and `venue_{id}_{weather}` interaction columns, and the list of
        interaction column names in `team_map['VENUE_ID']` order.
    """
    # Select active ballparks
    is_active = dataset['venue_id'].astype(int).isin(list(team_map['VENUE_ID']))
    active_dataset = dataset[is_active].reset_index(drop=True)

    # Restrict to 2015-; copy so the column assignments below act on an independent frame
    active_dataset = active_dataset[active_dataset['game_date'] >= '2015-01-01'].copy()

    # Create venue_id dummies. Cast to int so the dummy names match the f'venue_{venue}'
    # names built below even if venue_id is stored as float or string.
    active_dataset['venue_id_copy'] = active_dataset['venue_id'].astype(int)
    active_dataset = pd.get_dummies(active_dataset, columns=['venue_id_copy'], drop_first=False, prefix="venue", prefix_sep="_")

    # Create interactions of weather and park variables
    weather_interactions = []

    for venue in team_map['VENUE_ID']:
        venue_column = f'venue_{venue}'

        # A venue with no rows gets no dummy from get_dummies; add an all-zero one
        # so the interaction list stays complete instead of raising KeyError
        if venue_column not in active_dataset.columns:
            active_dataset[venue_column] = 0

        for weather in ['x_vect', 'y_vect', 'temperature']:
            active_dataset[f'{venue_column}_{weather}'] = active_dataset[venue_column] * active_dataset[weather]
            weather_interactions.append(f'{venue_column}_{weather}')

    # weather_interactions = weather_interactions + [f'venue_{venue}' for venue in list(team_map['VENUE_ID'].unique())] + ['x_vect', 'y_vect', 'temperature']

    return active_dataset, weather_interactions

##### Model

Runs model, training it if selected

In [14]:
def run_model(df, event, batSide, weather_interactions, train=False):
    """Build the regression inputs for one event and fit or look up its model.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `{event}`, `{event}_b_long`, `{event}_p_long`, `{event}_league`,
        `{event}_factor` and every column in `weather_interactions`.
    event : str
        Event name.
    batSide : str
        Side of the plate; lower-cased to build the model's global name.
    weather_interactions : list of str
        Park x weather interaction column names.
    train : bool, default False
        If True, fit a new OLS model. Otherwise look up the global
        `{event}_{batside}_model`.

    Returns
    -------
    tuple
        `(model, X)`. `model` is None when `train` is False and no such global exists.
        `X` holds the model inputs including a leading constant column.
    """
    # Drop rows with a missing value in ANY column of df (not only the model columns),
    # once, so y and X are guaranteed to cover the same rows
    complete_rows = df.dropna()

    # Define the dependent variable (y)
    y = complete_rows[f'{event}']

    # Select model inputs and convert them to numeric
    X_columns = [f'{event}_b_long', f'{event}_p_long', f'{event}_league', f'{event}_factor'] + weather_interactions
    X = complete_rows[X_columns].astype(float)

    # Add a constant to the independent variables matrix to include an intercept in the model.
    # has_constant='add' always adds it, matching how inputs are built at prediction time.
    X = sm.add_constant(X, has_constant='add')

    if train == True:
        # Fit the linear regression model
        model = sm.OLS(y, X).fit()
    else:
        # Select a model already loaded into the namespace
        model = globals().get(f'{event}_{str.lower(batSide)}_model')

    return model, X
# # Why divide by avg when it should be base year?
# # Also, should league be based on a moving target?

Apply predictions to dataset

In [15]:
def run_predictions(active_dataset, batSide, weather_interactions, period_avgs, train=False):
    """Add a predicted rate and a multiplier to `active_dataset` for every event.

    For each event the model from `run_model` predicts the rate with the batter and
    pitcher inputs replaced by the period average; the multiplier is that prediction
    divided by the period average.

    Parameters
    ----------
    active_dataset : pandas.DataFrame
        Output of `clean_dataset`. Modified in place and also returned.
    batSide : str
        Side of the plate, used for the model lookup and the saved model's file name.
    weather_interactions : list of str
        Park x weather interaction column names.
    period_avgs : pandas.DataFrame
        One-row frame with one column per event.
    train : bool, default False
        If True, fit new models and pickle each one under `model_path`.

    Returns
    -------
    pandas.DataFrame
        `active_dataset` with `{event}_pred` and `{event}_mult` columns added.
    """
    for event in events_list:
        print(event)
        # Train or look up the model
        model, X = run_model(active_dataset, event, batSide, weather_interactions, train)

        # If we trained a new model, save it (the file is closed even if pickling fails)
        if train == True:
            model_file = os.path.join(model_path, f"Weather Model - {event} {batSide} {todaysdate}")
            with open(model_file, 'wb') as model_handle:
                pickle.dump(model, model_handle)

        # Period average of this event (first and only row)
        period_avg = period_avgs[f'{event}'].iloc[0]

        # Replace player inputs with the average
        X[f'{event}_b_long'] = period_avg
        X[f'{event}_p_long'] = period_avg

        # Predict
        X[f'{event}_pred'] = model.predict(X)
        X[f'{event}_mult'] = X[f'{event}_pred'] / period_avg

        # Copy predicted rate and multiplier to active_dataset (aligned on index;
        # rows dropped by run_model are left missing)
        active_dataset[f'{event}_pred'] = X[f'{event}_pred'].copy()
        active_dataset[f'{event}_mult'] = X[f'{event}_mult'].copy()

    return active_dataset

In [31]:
def park_and_weather_factors(game_df, multiplier_dataset, period_avg_df, swish_df=None, date=None, overwrite_year=None, historic=False):
    """Return one row per game on `date` with its weather and park x weather multipliers.

    Parameters
    ----------
    game_df : pandas.DataFrame
        Games; must contain `date`, `game_datetime`, `game_id`, `venue_id`, `away_team`,
        `home_team` and `year`.
    multiplier_dataset : pandas.DataFrame
        Output of `create_multiplier_dataset`. Not modified.
    period_avg_df : pandas.DataFrame
        One-row frame with one column per event.
    swish_df : pandas.DataFrame, optional
        Swish Analytics weather, matched to games by `home_team` and game number.
        Used when the box score has no weather. Not modified.
    date : str
        Date to process; compared to `game_df['date']` as a string.
    overwrite_year : optional
        If given, replaces the `year` column.
    historic : bool, default False
        If True, multipliers are looked up in `multiplier_dataset` by game. Otherwise
        they are predicted with the global `{event}_{side}_model` models.

    Returns
    -------
    pandas.DataFrame
        Game identifiers, weather columns and `{event}_mult_l` / `{event}_mult_r`
        columns, with missing multipliers filled with 1.

    Raises
    ------
    NameError
        If `historic` is False and a required model is not defined.
    """
    print(date)
    # Extract daily games
    daily_game_df = game_df.query(f'date == "{date}"').reset_index(drop=True)
    daily_game_df.sort_values('game_datetime', inplace=True)
    daily_game_df['game_num'] = daily_game_df.groupby('home_team').cumcount() + 1

    # If we have Swish weather data, merge it in
    if swish_df is not None:
        # Add game num on a copy so the caller's frame is left untouched
        swish_games = swish_df.assign(game_num=swish_df.groupby('home_team').cumcount() + 1)
        daily_game_df = daily_game_df.merge(swish_games, on=['home_team', 'game_num'], how='left')

    daily_weather_list = []
    # Loop over games (by index label, as before)
    for i in range(len(daily_game_df)):
        # Extract relevant information
        game_id = daily_game_df.loc[i, 'game_id']
        venue_id = daily_game_df.loc[i, 'venue_id']
        away_team = daily_game_df.loc[i, 'away_team']
        home_team = daily_game_df.loc[i, 'home_team']

        # Extract weather from box score
        weather, wind, park, full_date, missing_weather = create_box(game_id)

        # Create game weather dataframe
        game_weather_dictionary = {
            'game_id': game_id,
            'away_team': away_team,
            'home_team': home_team,
            'venue_id': venue_id,
            'park': park,
            'datetime': daily_game_df.loc[i, 'game_datetime'],
            'date': daily_game_df.loc[i, 'date'],
            'year': daily_game_df.loc[i, 'year'],
            'weather': weather,
            'wind': wind
        }
        game_weather_df = pd.DataFrame(game_weather_dictionary, index=[0])
        # clean_weather presumably derives temperature, windSpeed, windDirection, x_vect
        # and y_vect, since they are selected from the result below — TODO confirm
        game_weather_df = clean_weather(game_weather_df)

        # If we do not have MLB Stats API data yet, use Swish Analytics
        if missing_weather == True:
            if swish_df is not None:
                print(f"{away_team}@{home_team} uses Swish Analytics weather.")
                game_weather_df['temperature'] = float(daily_game_df.loc[i, 'temperature'])
                game_weather_df['x_vect'] = float(daily_game_df.loc[i, 'x_vect'])
                game_weather_df['y_vect'] = float(daily_game_df.loc[i, 'y_vect'])
                game_weather_df['windDirection'] = "Predicted"
            else:
                # No fallback source: leave the weather missing rather than failing on
                # undefined Swish values
                print(f"{away_team}@{home_team} has no box score weather and no Swish Analytics data.")
                for column in ['temperature', 'x_vect', 'y_vect', 'windDirection']:
                    if column not in game_weather_df.columns:
                        game_weather_df[column] = np.nan

        daily_weather_list.append(game_weather_df)

    # Create day weather dataframe
    daily_weather_df = pd.concat(daily_weather_list, axis=0).reset_index(drop=True)

    # Overwrite year (good option if early in season, but may want to develop a rolling method)
    if overwrite_year is not None:
        daily_weather_df['year'] = overwrite_year

    # Add park dummies
    active_venues = [str(venue) for venue in team_map['VENUE_ID']]
    for venue in active_venues:
        daily_weather_df[f'venue_{venue}'] = (daily_weather_df['venue_id'].astype(str) == venue).astype(int)

    # Weather x Park interactions. These do not depend on the side of the plate, so they
    # are built once; the order (venue, then weather) is the order the models take.
    weather_interactions = []
    for venue in active_venues:
        for weather_column in ['x_vect', 'y_vect', 'temperature']:
            interaction_name = f'venue_{venue}' + '_' + weather_column
            daily_weather_df[interaction_name] = daily_weather_df[f'venue_{venue}'] * daily_weather_df[weather_column]
            weather_interactions.append(interaction_name)

    # If it's historic,
    if historic == True:
        # We already have the multipliers, so look them up by game.
        # rename returns a new frame, leaving the caller's multiplier_dataset as it was.
        historic_multipliers = multiplier_dataset.rename(columns={'gamePk': 'game_id'})
        multiplier_columns = [column for column in historic_multipliers.columns if "mult" in column]

        keep_columns = ['game_id', 'venue_id'] + multiplier_columns
        daily_weather_df = pd.merge(daily_weather_df, historic_multipliers[keep_columns], on=['game_id', 'venue_id'], how='left')

    # If it's the day of,
    else:
        # We need to calculate it from the league averages, park factors, and league environment (period averages)
        league_avg_columns = [column for column in multiplier_dataset.columns if "league" in column]
        factor_columns = [column for column in multiplier_dataset.columns if "factor" in column]

        keep_columns = ['venue_id'] + league_avg_columns + factor_columns

        # Keep most recent game at each venue (last row per venue in multiplier_dataset)
        last_game_df = multiplier_dataset.drop_duplicates('venue_id', keep='last')[keep_columns]

        # Merge that onto the weather
        daily_weather_df = pd.merge(daily_weather_df, last_game_df, on=['venue_id'], how='left')

        ### Calculate multipliers
        # Loop over events
        for event in events_list:
            period_avg = period_avg_df[event].iloc[0]

            # Loop over sides
            for side in ['l', 'r']:
                # Select model
                model = globals().get(f'{event}_{side}_model')
                if model is None:
                    raise NameError(f"Model '{event}_{side}_model' is not defined; load the weather models before calling with historic=False.")

                # Assign long-term average to average of team's stats for predicting a team-agnostic rate
                daily_weather_df[f'{event}_b_long'] = period_avg
                daily_weather_df[f'{event}_p_long'] = period_avg

                daily_weather_df[f'{event}_league'] = daily_weather_df[f'{event}_league_{side}']
                daily_weather_df[f'{event}_factor'] = daily_weather_df[f'{event}_factor_{side}']

                # Model prediction inputs
                X = daily_weather_df[[f'{event}_b_long', f'{event}_p_long', f'{event}_league', f'{event}_factor'] + weather_interactions]
                X = sm.add_constant(X, has_constant='add')

                # Predict probability of event
                daily_weather_df[f'predicted_{event}'] = model.predict(X)
                # Calculate multiplier
                daily_weather_df[f'{event}_mult_{side}'] = daily_weather_df[f'predicted_{event}'] / period_avg

    # Keep relevant variables
    multiplier_columns = [f'{event}_mult_l' for event in events_list] + [f'{event}_mult_r' for event in events_list]
    daily_weather_df = daily_weather_df[['game_id', 'away_team', 'home_team', 'venue_id', 'park', 'datetime', 'date', 'year',
                                         'weather', 'wind', 'temperature', 'windSpeed', 'windDirection', 'x_vect', 'y_vect'] +
                                        multiplier_columns].copy()

    # Fill missing multipliers with 1 (assigned back rather than filled in place on a slice)
    daily_weather_df[multiplier_columns] = daily_weather_df[multiplier_columns].fillna(1)

    return daily_weather_df

##### Create Multiplier Dataset

In [17]:
def create_multiplier_dataset(start_year, end_year, short, long, adjust, park_window, park_window_min, league_window, league_window_min, base_year, train=None):
    """Build the park x weather multiplier dataset for both sides of the plate and save it.

    Parameters
    ----------
    start_year, end_year, short, long, adjust
        Passed through to `create_pa_inputs`.
    park_window, park_window_min, league_window, league_window_min, base_year
        Passed through to `create_dataset`.
    train : bool, optional
        If True, recompute and save the period averages and fit new models. If False,
        read the saved period averages and use already-loaded models. When omitted, the
        notebook-level `train` variable is used (False if it is not defined).

    Writes "Multiplier Dataset.csv" (and "Period Averages.csv" when training) under
    `baseball_path`. Returns None.
    """
    # Fall back to the notebook-level setting, which is what this function used to read
    if train is None:
        train = globals().get('train', False)

    ### Create PA dataset (using the arguments rather than hard-coded values)
    complete_dataset = create_pa_inputs(park_factors, team_map, start_year, end_year, short=short, long=long, adjust=adjust)

    ### Create or read period averages
    period_avgs_file = os.path.join(baseball_path, "Period Averages.csv")
    if train == True:
        # Calculate averages of each stat
        period_avgs = period_averages(complete_dataset)
        period_avgs.to_csv(period_avgs_file, index=False)
    else:
        period_avgs = pd.read_csv(period_avgs_file)

    ### Create multipliers for left-handed (L) and right-handed (R) batters
    active_datasets = {}
    for batSide in ["L", "R"]:
        dataset = create_dataset(complete_dataset, park_window=park_window, park_window_min=park_window_min, league_window=league_window, league_window_min=league_window_min, base_year=base_year, batSide=batSide)
        active_dataset, weather_interactions = clean_dataset(dataset)
        active_datasets[batSide] = run_predictions(active_dataset, batSide, weather_interactions, period_avgs, train=train)

    active_dataset_l = active_datasets["L"]
    active_dataset_r = active_datasets["R"]

    ### Merge
    # Columns to merge
    columns = ['gamePk', 'game_date', 'x_vect', 'y_vect', 'temperature', 'venue_id']
    league_avg_columns = [column for column in active_dataset_r if "league" in column]
    factor_columns = [column for column in active_dataset_r if "factor" in column]
    multiplier_columns = [column for column in active_dataset_r if "mult" in column]
    merge_columns = columns + league_avg_columns + factor_columns + multiplier_columns

    multiplier_dataset = pd.merge(active_dataset_l[merge_columns], active_dataset_r[merge_columns], on=columns, how='left', suffixes=("_l", "_r"))

    ### Save
    multiplier_dataset.to_csv(os.path.join(baseball_path, "Multiplier Dataset.csv"), index=False)

### Settings

In [None]:
# Default settings, applied only when this notebook runs on its own
if "running_pipeline" not in globals():
    # Read by create_multiplier_dataset to decide whether to fit and save new models
    train = False
    # Arguments passed to create_multiplier_dataset
    start_year = 2013
    end_year = 2024
    short = 50
    long = 300
    adjust = False    
    # Rolling window sizes and minimum counts, in rows of the per-side frame
    park_window = 10000
    park_window_min = 5000
    league_window = 60000
    league_window_min = 60000
    # Year used as the denominator for the league multipliers
    base_year = 2014
    
    # historic: look up saved multipliers per game instead of predicting them
    historic = True
    # run_multiplier_dataset: rebuild "Multiplier Dataset.csv" in the Run section
    run_multiplier_dataset = True

### Run

In [19]:
# Rebuild and save the multiplier dataset when requested in the settings
if run_multiplier_dataset == True:
    create_multiplier_dataset(start_year, end_year, short, long, adjust, park_window, park_window_min, league_window, league_window_min, base_year)

b1
b2
b3
hr
bb
hbp
so
fo
go
lo
po
b1
b2
b3
hr
bb
hbp
so
fo
go
lo
po


In [32]:
# Read in datasets
multiplier_dataset = pd.read_csv(os.path.join(baseball_path, "Multiplier Dataset.csv"))
period_avg_df = pd.read_csv(os.path.join(baseball_path, "Period Averages.csv"))
# Swish Analytics weather is only read for day-of runs; the file is named with today's date
if historic == False:
    swish_df = pd.read_csv(os.path.join(baseball_path, "A06. Weather", "1. Swish Analytics", f"Swish Analytics {todaysdate}.csv"), encoding='iso-8859-1')
else:
    swish_df = None
        
# Calculate park x weather factor, one output file per date in game_df
for date in game_df['date'].unique():
    daily_weather_df = park_and_weather_factors(game_df, multiplier_dataset, period_avg_df, swish_df=swish_df, date=date, overwrite_year=None, historic=historic)

    # Fill missings with 1
    # NOTE(review): this fills every column, including the weather columns (e.g. a missing
    # temperature becomes 1), not only the multipliers — confirm this is intended
    daily_weather_df.fillna(1, inplace=True)

    # To csv
    daily_weather_df.to_csv(os.path.join(baseball_path, "A06. Weather", "4. Park and Weather Factors", f'Park and Weather Factors {date}.csv'), index=False)

20240817
