# A06. Weather 

Note: All historic Park and Weather Factors files are created by M01. Park and Weather Factors.ipynb whenever new models are trained. A06. Weather is for daily files only.

### Imports

In [1]:
import sys
if not hasattr(sys.modules['__main__'], '__file__'):
    %run "C:\Users\james\Documents\MLB\Code\U1. Imports.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U2. Functions.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U3. Classes.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U4. Datasets.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U5. Models.ipynb"
    print("Imports executed")

Loaded 5 models into ensemble
Imports executed


### Games

In [48]:
if not hasattr(sys.modules['__main__'], '__file__'):
    # Interactive run only: pick the inclusive YYYYMMDD window of games to process
    start_date = '20250528'
    end_date = '20250528'
    all_game_df = pd.read_csv(os.path.join(baseball_path, "game_df.csv"))
    # Dates are compared as strings, so the filter works whatever dtype the CSV column was parsed as
    in_window = all_game_df['date'].astype(str).between(start_date, end_date)
    game_df = all_game_df[in_window].reset_index(drop=True)

### Venue

Merge in venue-specific data

In [49]:
# Venue attributes attached to every game: coordinates, field dimensions,
# elevation, orientation (azimuth), roof type and the active flag
venue_merge_columns = [
    'id',
    'location.defaultCoordinates.latitude', 'location.defaultCoordinates.longitude',
    'fieldInfo.leftLine', 'fieldInfo.center', 'fieldInfo.rightLine',
    'fieldInfo.leftCenter', 'fieldInfo.rightCenter',
    'location.elevation', 'location.azimuthAngle', 'fieldInfo.roofType', 'active',
]
# Left join keeps every game even when its venue is missing from the venue map
game_df = game_df.merge(venue_map_df[venue_merge_columns], left_on='venue_id', right_on='id', how='left')

Convert to datetime

In [50]:
# Parse the game start column into pandas datetimes so it can be compared with weather timestamps
game_df["game_datetime"] = pd.to_datetime(game_df["game_datetime"])

Drop if missing coordinates

In [51]:
# Weather lookups need both coordinates, so games without them are removed
coordinate_columns = ['location.defaultCoordinates.latitude', 'location.defaultCoordinates.longitude']
game_df = game_df.dropna(subset=coordinate_columns)

### Settings

In [None]:
# Number of closest reference games (by predicted value) averaged per row in the calibration cells
similar_games = 50

### Functions

##### 1. Open Meteo

Historic

In [57]:
# Setup the Open-Meteo API client with cache and retry on error
# expire_after=-1 is passed to the cache for this archive client; the forecast
# helper further down builds its own session with a 3600-second expiry instead.
cache_session = requests_cache.CachedSession('.cache', expire_after=-1)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
# Module-level client used by fetch_historical_weather_data
openmeteo = openmeteo_requests.Client(session=retry_session)

def fetch_historical_weather_data(latitude, longitude, game_datetime):
    """Fetch hourly archive weather for the game date and the following day.

    Args:
        latitude: Venue latitude passed straight to the API.
        longitude: Venue longitude passed straight to the API.
        game_datetime: Datetime-like with ``strftime``; only its calendar date is used.

    Returns:
        DataFrame with a UTC ``datetime`` column followed by one column per
        requested hourly variable (Fahrenheit / mph units are requested).
    """
    # The response exposes variables by position, in the order they were requested,
    # so this single list drives both the request and the decoding below.
    hourly_variables = [
        "temperature_2m", "relative_humidity_2m", "dew_point_2m",
        "weather_code", "surface_pressure", "wind_speed_10m", "wind_direction_10m"
    ]

    request = {
        "latitude": latitude,
        "longitude": longitude,
        "start_date": game_datetime.strftime("%Y-%m-%d"),
        # include next day to cover all 24 hours
        "end_date": (game_datetime + pd.Timedelta(days=1)).strftime("%Y-%m-%d"),
        "hourly": hourly_variables,
        "temperature_unit": "fahrenheit",
        "wind_speed_unit": "mph",
        "precipitation_unit": "inch",
        "timezone": "UTC"  # important!
    }

    # Uses the module-level `openmeteo` client; only the first response is read
    hourly = openmeteo.weather_api("https://archive-api.open-meteo.com/v1/archive", params=request)[0].Hourly()

    # Rebuild the hourly timestamps from the reported start, end and interval
    columns = {
        "datetime": pd.date_range(
            start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
            end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
            freq=pd.Timedelta(seconds=hourly.Interval()),
            inclusive="left"
        )
    }
    for position, variable in enumerate(hourly_variables):
        columns[variable] = hourly.Variables(position).ValuesAsNumpy()

    return pd.DataFrame(columns)


def create_historic_weather_df(game_df):
    """Append archive weather to each game, matched on the nearest hourly timestamp.

    For every row, weather is fetched with ``fetch_historical_weather_data`` and
    the hourly record closest in time to ``game_datetime`` is kept.

    Args:
        game_df: DataFrame with ``game_datetime`` and the two
            ``location.defaultCoordinates.*`` columns. It is modified in place:
            ``game_datetime`` is converted to UTC and the weather columns are added.

    Returns:
        The same ``game_df`` object with the weather columns filled in. Rows for
        which the API returned no hourly records get NaN.
    """

    # Convert game_datetime to UTC
    game_df["game_datetime"] = pd.to_datetime(game_df["game_datetime"], utc=True)

    # Lists to store the matched weather data
    weather_columns = [
        "temperature_2m", "relative_humidity_2m", "dew_point_2m",
        "weather_code", "surface_pressure", "wind_speed_10m", "wind_direction_10m"
    ]
    weather_data_lists = {col: [] for col in weather_columns}

    # Loop through each game in the DataFrame
    for _, row in game_df.iterrows():
        latitude = row["location.defaultCoordinates.latitude"]
        longitude = row["location.defaultCoordinates.longitude"]
        game_datetime = row["game_datetime"]

        # Fetch historical weather data for that day
        weather_data = fetch_historical_weather_data(latitude, longitude, game_datetime)

        if weather_data.empty:
            # Nothing to match against: keep the row, mark its weather as missing
            for col in weather_columns:
                weather_data_lists[col].append(np.nan)
            continue

        # Position (not label) of the closest timestamp. The previous
        # `.argsort()[0]` was a label lookup that only worked when the frame
        # happened to have a 0-based index; argmin also breaks ties consistently
        # in favour of the earlier hour.
        closest_position = (weather_data["datetime"] - game_datetime).abs().to_numpy().argmin()
        closest_weather_row = weather_data.iloc[closest_position]

        # Append the closest weather data to lists
        for col in weather_columns:
            weather_data_lists[col].append(closest_weather_row[col])

    # Add the weather data as new columns in game_df
    for col in weather_columns:
        game_df[col] = weather_data_lists[col]

    return game_df

Forecast

In [53]:
def fetch_weather_data(latitude, longitude, start, end):
    """Fetch hourly weather forecast or recent past data from Open-Meteo.

    Args:
        latitude: Venue latitude passed straight to the API.
        longitude: Venue longitude passed straight to the API.
        start: ISO 8601 start of the wanted window (parsed as UTC).
        end: ISO 8601 end of the wanted window (parsed as UTC, inclusive).

    Returns:
        DataFrame with a UTC ``datetime`` column plus one column per hourly
        variable, restricted to ``start <= datetime <= end`` and re-indexed
        from 0. It may be empty when no hourly record falls in the window.
    """
    # A client is built on every call with a one-hour cache expiry
    cache_session = requests_cache.CachedSession('.cache', expire_after=3600)
    retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
    openmeteo = openmeteo_requests.Client(session=retry_session)

    # The response exposes variables by position, in the order requested, so one
    # list drives both the request and the decoding below.
    hourly_variables = [
        "temperature_2m", "relative_humidity_2m", "dew_point_2m",
        "precipitation_probability", "surface_pressure",
        "wind_speed_10m", "wind_direction_10m", "weather_code"
    ]

    url = "https://api.open-meteo.com/v1/forecast"
    params = {
        "latitude": latitude,
        "longitude": longitude,
        "hourly": hourly_variables,
        # NOTE(review): confirm the API honours "start"/"end"; the window is
        # enforced locally by the filter below either way.
        "start": start,  # ISO 8601
        "end": end,      # ISO 8601
        "wind_speed_unit": "mph",
        "temperature_unit": "fahrenheit",
        "precipitation_unit": "inch",
        "timezone": "UTC",       # ✅ Ensure UTC so hourly timestamps align
        "past_days": 2           # ✅ Include recent data in case game time is recent past
    }

    responses = openmeteo.weather_api(url, params=params)
    response = responses[0]
    hourly = response.Hourly()

    hourly_data = {
        "datetime": pd.date_range(
            start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
            end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
            freq=pd.Timedelta(seconds=hourly.Interval()),
            inclusive="left"
        )
    }
    for position, variable in enumerate(hourly_variables):
        hourly_data[variable] = hourly.Variables(position).ValuesAsNumpy()

    df = pd.DataFrame(hourly_data)

    # Filter the data to only include the requested window
    in_window = (df["datetime"] >= pd.to_datetime(start, utc=True)) & \
                (df["datetime"] <= pd.to_datetime(end, utc=True))

    # Re-index from 0: the filtered rows otherwise keep their original labels
    # (e.g. 50, 51, 52), which breaks label-based lookups such as `series[0]`.
    return df[in_window].reset_index(drop=True)


def create_daily_weather_df(game_df):
    """Append hourly weather data (forecast or recent) to each game.

    For every row, a window of one hour either side of ``game_datetime`` is
    requested from ``fetch_weather_data`` and the record closest to the game
    time is kept.

    Args:
        game_df: DataFrame with ``game_datetime`` and the two
            ``location.defaultCoordinates.*`` columns. The weather columns are
            added to it in place.

    Returns:
        The same ``game_df`` object with the weather columns added. A row gets
        NaN when the fetch raises or returns no records in the window.
    """
    weather_columns = [
        "temperature_2m", "relative_humidity_2m", "dew_point_2m",
        "precipitation_probability", "surface_pressure",
        "wind_speed_10m", "wind_direction_10m", "weather_code"
    ]
    weather_data_lists = {col: [] for col in weather_columns}

    for _, row in game_df.iterrows():
        latitude = row["location.defaultCoordinates.latitude"]
        longitude = row["location.defaultCoordinates.longitude"]
        game_datetime = pd.to_datetime(row["game_datetime"], utc=True)

        # Fetch 2 hours around game start to ensure coverage
        start = (game_datetime - pd.Timedelta(hours=1)).isoformat()
        end = (game_datetime + pd.Timedelta(hours=1)).isoformat()

        try:
            weather_data = fetch_weather_data(latitude, longitude, start, end)
            if not weather_data.empty:
                # Find record closest to game time, by POSITION. The previous
                # `.argsort()[0]` looked up label 0, which does not exist once the
                # frame has been filtered to the window; the resulting KeyError was
                # swallowed by the except below and every game silently got NaN.
                closest_position = (weather_data["datetime"] - game_datetime).abs().to_numpy().argmin()
                closest = weather_data.iloc[closest_position]
                for col in weather_columns:
                    weather_data_lists[col].append(closest[col])
            else:
                # If API returned no data, append NaN
                for col in weather_columns:
                    weather_data_lists[col].append(np.nan)
        except Exception as e:
            # Best effort: one failed venue must not abort the whole slate
            print(f"⚠️ Weather fetch failed for {latitude},{longitude} at {game_datetime}: {e}")
            for col in weather_columns:
                weather_data_lists[col].append(np.nan)

    # Add the weather columns
    for col in weather_columns:
        game_df[col] = weather_data_lists[col]

    return game_df

##### 1. Swish Analytics

Swish Analytics contains weather projections to be used before MLB Stats API updates theirs.

Scrape

In [None]:
# Scrape Swish Analytics for weather data
def swishanalytics(date):
    """Scrape the Swish Analytics MLB weather page for one date.

    One row is produced per weather card, taken from the third column of the
    card's table (the second time period listed).

    Args:
        date: Date as ``YYYYMMDD`` (string, or anything whose ``str()`` is that).

    Returns:
        DataFrame with ``Time``, ``Weather``, ``Temperature``, ``Feels Like``,
        ``Humidity``, ``Speed`` (float, mph), ``Direction``, ``Matchup`` and
        ``FANGRAPHSTEAM`` columns (given the six-row card layout the rename below assumes).

    Raises:
        RuntimeError: The page did not return HTTP 200.
        ValueError: The page contained no usable weather tables.
    """
    # Reformat date to fit URL
    date = str(date)
    date_dash = f"{date[:4]}-{date[4:6]}-{date[6:8]}"

    # Send a GET request to the URL (bounded, so a stalled server cannot hang the notebook)
    response = requests.get(f"https://swishanalytics.com/mlb/weather?date={date_dash}", timeout=30)

    # Fail loudly here; previously execution fell through to an unbound `dfs`
    if response.status_code != 200:
        raise RuntimeError(f"Failed to retrieve the page. Status code: {response.status_code}")

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # One single-row DataFrame per weather card
    dfs = []
    for weather_card in soup.find_all('div', class_='weather-card'):
        # Matchup text, e.g. "<away> @ <home>"
        location_info = weather_card.find('h4', class_='lato inline vert-mid bold')
        location = location_info.text.strip() if location_info else None

        # Find the table within the weather card
        table = weather_card.find('table', class_='table-bordered')
        if not table:
            continue

        # Extract table data into a list of lists (header row first)
        data = [
            [cell.text.strip() for cell in table_row.find_all(['th', 'td'])]
            for table_row in table.find_all('tr')
        ]
        if len(data) < 2:
            continue  # header only, nothing to read

        df = pd.DataFrame(data)
        if df.shape[1] < 3:
            continue  # the second time period lives in the third column

        # Set the first row as the column headers, then drop it from the data
        df.columns = df.iloc[0]
        df = df[1:]

        # Create dataframe from the second time period scraped:
        # row label = that period's header, columns = table row numbers 1..n
        daily_weather_df = pd.DataFrame(df.iloc[:, 2]).T

        # Extract home team name: the text after the first "@"
        matchup_parts = location.split("@") if location else []
        home_team = matchup_parts[1].replace("\xa0\xa0", "") if len(matchup_parts) > 1 else None
        daily_weather_df['Matchup'] = location
        daily_weather_df['FANGRAPHSTEAM'] = home_team

        dfs.append(daily_weather_df)

    if not dfs:
        raise ValueError(f"No weather tables found on Swish Analytics for {date_dash}")

    # Append together dataframes
    df = pd.concat(dfs, axis=0)

    # Rename columns (keys 1-6 are the table row numbers)
    df = df.rename(columns={1:'Weather', 2:'Temperature', 3:'Feels Like', 4:'Humidity', 5:'Speed', 6:'Direction', 'BBREFTEAM': 'home_team'})

    # Clean
    df['Speed'] = df['Speed'].str.replace(" mph", "").astype(float)
    df['Temperature'] = df['Temperature'].str.replace('°', '')
    df['Feels Like'] = df['Feels Like'].str.replace('°', '')
    # The row labels hold the time-period header; expose them as a 'Time' column
    df = df.reset_index(names='Time')

    return df

##### 2. RotoGrinders

RotoGrinders hosts weather warnings used to identify matchups to avoid based on weather risk.

Scrape

In [None]:
def rotogrinders(date, team_map):
    """Scrape RotoGrinders' MLB weather blurbs and map teams to DK abbreviations.

    The page URL carries no date; ``date`` is only stamped onto the output.

    Args:
        date: Value written to the ``date`` column of every row.
        team_map: DataFrame with ``ROTOGRINDERSTEAM`` and ``DKTEAM`` columns.

    Returns:
        DataFrame with ``Tag``, ``Tag2``, ``Matchup``, ``Away``, ``Home``,
        ``Description`` and ``date`` columns (empty when the page lists no
        blurbs). If the HTTP status is not 200, an error message string is
        returned instead, so callers must check the type.
    """
    # Send a GET request to the URL (bounded, so a stalled server cannot hang the notebook)
    response = requests.get("https://rotogrinders.com/weather/mlb", timeout=30)

    if response.status_code != 200:
        # Return an error message if the response is not successful
        return "Failed to retrieve data. Response status code: {}".format(response.status_code)

    soup = BeautifulSoup(response.text, "html.parser")

    # One record per weather blurb
    data = []
    for li_element in soup.find_all("li", class_="weather-blurb"):
        # Colour-coded risk tags; the first is always read, the second only if present
        tag_elements = li_element.find_all("span", class_=["green", "yellow", "orange", "red"])
        tag = tag_elements[0].text.strip() if tag_elements else None
        tag2 = tag_elements[1].text.strip() if len(tag_elements) > 1 else None

        # Matchup is the bold span; the description is the span that follows it
        matchup_span = li_element.find("span", class_="bold")
        matchup = matchup_span.text.strip() if matchup_span else None
        description = None
        if matchup_span:
            description_span = matchup_span.find_next_sibling("span")
            description = description_span.text.strip() if description_span else None

        data.append({"Tag": tag, "Tag2": tag2, "Matchup": matchup, "Description": description})

    if not data:
        # No blurbs listed: return an empty frame with the usual columns
        # rather than failing on the missing 'Matchup' column
        return pd.DataFrame(columns=['Tag', 'Tag2', 'Matchup', 'Away', 'Home', 'Description', 'date'])

    # Convert the list of dictionaries to a DataFrame
    df = pd.DataFrame(data)

    # Split "<away> @ <home>". n=1 plus reindex guarantees exactly two columns even
    # when no matchup (or a malformed one) is present, which used to raise.
    matchup_parts = df['Matchup'].str.split(" @ ", n=1, expand=True).reindex(columns=[0, 1])
    df['away'] = matchup_parts[0]
    df['home'] = matchup_parts[1]

    # Add in DK team abbreviations
    df = df.merge(team_map[['ROTOGRINDERSTEAM', 'DKTEAM']], left_on=['away'], right_on=['ROTOGRINDERSTEAM'], how='left', suffixes=("", "_away"))
    df = df.merge(team_map[['ROTOGRINDERSTEAM', 'DKTEAM']], left_on=['home'], right_on=['ROTOGRINDERSTEAM'], how='left', suffixes=("", "_home"))
    df = df[['Tag', 'Tag2', 'Matchup', 'DKTEAM', 'DKTEAM_home', 'Description']]
    df = df.rename(columns={'DKTEAM':'Away', 'DKTEAM_home': 'Home'})

    # Add the date column to the DataFrame
    df['date'] = date

    return df

##### 3. Park x Weather Factors

Calculate wind x and y vectors

In [None]:
def calculate_vectors(row, azimuth_column, wind_column, speed_column):
    """Resolve a row's wind into x/y components relative to the park's azimuth.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) holding the three columns.
        azimuth_column: Name of the column with the park azimuth, in degrees.
        wind_column: Name of the column with the wind direction, in degrees.
        speed_column: Name of the column with the wind speed.

    Returns:
        ``pd.Series`` indexed ``['x_vect', 'y_vect']``.
    """
    # Wind direction measured relative to the park's orientation
    relative_angle = math.radians(row[wind_column] - row[azimuth_column])
    speed = row[speed_column]

    # Unit components are rounded to 5 decimals before scaling; the -1 flips the sign
    components = {
        'x_vect': round(math.sin(relative_angle), 5) * speed * -1,
        'y_vect': round(math.cos(relative_angle), 5) * speed * -1,
    }
    return pd.Series(components)

### Run

##### 1. Open Meteo

Columns to extract

In [58]:
# Columns from game_df
game_columns = ['game_id', 'game_datetime', 'game_date', 'date', 'year', 'game_type', 'status', 'away_team', 'home_team', 'doubleheader', 'game_num', 'venue_id', 'venue_name']
# Columns from the venue map (merged into game_df in the Venue section)
venue_columns = ['location.defaultCoordinates.latitude', 'location.defaultCoordinates.longitude', 
                 'fieldInfo.leftLine', 'fieldInfo.center', 'fieldInfo.rightLine', 'fieldInfo.leftCenter',
                 'fieldInfo.rightCenter', 'location.elevation', 'location.azimuthAngle', 'fieldInfo.roofType', 'active']
# Columns from Open Meteo (written for both historic and forecast files)
weather_columns = ['temperature_2m', 'relative_humidity_2m', 'dew_point_2m', 'surface_pressure', 'wind_speed_10m', 'wind_direction_10m', 'weather_code']
# Forecast-only columns from Open Meteo
forecast_only_columns = ['precipitation_probability']

In [59]:
%%time
# Loop over dates: today's games use the forecast endpoint, every other date the archive
for date in game_df['date'].unique():
    print(date)

    # Filter the same way in both branches (the forecast branch used to compare
    # `astype(int)` against `date`, which matches nothing when dates are strings),
    # and copy so the helpers can add columns without touching game_df.
    date_games_df = game_df[game_df['date'] == date].copy()
    output_path = os.path.join(baseball_path, "A06. Weather", "1. Open Meteo", f"Open Meteo {date}.csv")

    if int(date) == int(todaysdate):
        print("Open Meteo: Today")
        # Create daily weather dataframe (forecast)
        create_daily_weather_df(date_games_df)[game_columns + venue_columns + weather_columns + forecast_only_columns].to_csv(
            output_path, index=False)
    else:
        print("Open Meteo: Historic")
        # Create historic weather dataframe
        create_historic_weather_df(date_games_df)[game_columns + venue_columns + weather_columns].to_csv(
            output_path, index=False)
        # Pause between dates after archive requests
        time.sleep(2)

Open Meteo: Historic
CPU times: total: 78.1 ms
Wall time: 2.04 s


In [18]:
# Inspect the first few dates and the column's dtype
game_df['date'].head()

0    20250528
1    20250528
2    20250528
3    20250528
4    20250528
Name: date, dtype: int64

##### 1. Swish Analytics

In [10]:
try:
    # Scrape Swish Analytics
    swishanalytics_df = swishanalytics(todaysdate)
    # To CSV
    swishanalytics_df.to_csv(os.path.join(baseball_path, "A06. Weather", "1. Swish Analytics", f"Swish Analytics {todaysdate}.csv"), index=False, encoding='iso-8859-1')
except Exception as e:
    # Best-effort scrape: keep the notebook running, but report why it failed
    # (a bare `except:` also trapped KeyboardInterrupt and hid the cause)
    print(f"Could not scrape Swish Analytics weather data. ({type(e).__name__}: {e})")

Could not scrape Swish Analytics weather data.


##### 2. RotoGrinders

In [None]:
try:
    # Scrape RotoGrinders
    rotogrinders_df = rotogrinders(todaysdate, team_map)
    # rotogrinders() returns an error message string when the HTTP request fails;
    # surface it instead of tripping over `.to_csv` on a str
    if isinstance(rotogrinders_df, str):
        raise RuntimeError(rotogrinders_df)
    # To CSV
    rotogrinders_df.to_csv(os.path.join(baseball_path, "A06. Weather", "2. RotoGrinders", f"RotoGrinders {todaysdate}.csv"), index=False)
except Exception as e:
    # Best-effort scrape: keep the notebook running, but report why it failed
    print(f"Could not scrape RotoGrinders weather data. ({type(e).__name__}: {e})")

##### 3. Park x Weather Factors

In [None]:
# Column-name groups for the park x weather model. Only the two meteo_* lists are
# used when wfx_inputs is assembled below; the mlb_* lists are not referenced
# in the visible cells of this notebook.
mlb_weather_variables = ['x_vect', 'y_vect', 'temperature'] # drop weather
# Open Meteo counterparts of the MLB wind-vector / temperature variables
meteo_duplicates_variables = ['meteo_x_vect', 'meteo_y_vect', 'temperature_2m']
# Weather variables available only from Open Meteo
meteo_weather_variables = ['relative_humidity_2m', 'dew_point_2m', 'surface_pressure']
mlb_park_variables = ['fieldInfo.leftLine', 'fieldInfo.center', 'fieldInfo.rightLine', 'fieldInfo.leftCenter', 'fieldInfo.rightCenter', 'location.elevation'] # drop roof type

In [None]:
# Would be nice to generate this in a cleverer way

In [None]:
# Venue indicator columns ("venue_<venue_id>") created on the LHB/RHB frames and appended to wfx_inputs
venue_dummy_list = ['venue_1', 'venue_2', 'venue_3', 'venue_4', 'venue_5', 'venue_7', 'venue_10', 'venue_12', 'venue_13', 'venue_14', 'venue_15', 'venue_17', 'venue_19', 'venue_22', 'venue_31', 'venue_32', 'venue_680', 'venue_2392', 'venue_2394', 'venue_2395', 'venue_2602', 'venue_2680', 'venue_2681', 'venue_2889', 'venue_3289', 'venue_3309', 'venue_3312', 'venue_3313', 'venue_4169', 'venue_4705', 'venue_5325']

Read in Meteo data

In [None]:
# Load today's Open Meteo file written by the "Run > 1. Open Meteo" loop above
meteo_df = pd.read_csv(os.path.join(baseball_path, "A06. Weather", "1. Open Meteo", f"Open Meteo {todaysdate}.csv"), encoding='iso-8859-1')

Read in park latest

In [None]:
# Latest park rows, split by batter side so each side gets its own merge below
park_latest_df = pd.read_csv(os.path.join(baseball_path, "Park Latest.csv"))
l_park_latest_df = park_latest_df[park_latest_df['batSide'] == "L"]
r_park_latest_df = park_latest_df[park_latest_df['batSide'] == "R"]

# Every column whose name ends in "pfx" becomes a model input
pfx_variables = [col for col in park_latest_df if col.endswith("pfx")]

Read in event averages

In [None]:
# Event averages, renamed to "<column>_pred_batted"; these columns are model inputs
# and the denominator of the unadjusted weather factor
event_averages = pd.read_csv(os.path.join(baseball_path, "Event Averages.csv"))
event_averages = event_averages.add_suffix("_pred_batted")
event_variables = list(event_averages.columns)

Add weather data

In [None]:
# Wind components relative to each park's azimuth, from the Open Meteo wind direction and speed
meteo_df[['meteo_x_vect', 'meteo_y_vect']] = meteo_df.apply(lambda row: calculate_vectors(row, 'location.azimuthAngle', 'wind_direction_10m', 'wind_speed_10m'), axis=1)
# create_box is defined outside this notebook; its result is unpacked into five columns per game.
# NOTE(review): this overwrites the 'date' column read from the CSV - confirm that is intended.
meteo_df[['weather', 'wind', 'venue', 'date', 'missing_weather']] = meteo_df['game_id'].apply(lambda game_id: pd.Series(create_box(game_id)))

Adjust weather data in dome

In [None]:
# Rows whose 'weather' text mentions a roof or dome (case-insensitive; missing text never matches)
mask = meteo_df['weather'].str.contains('Roof|Dome', case=False, na=False)

# Fixed conditions substituted for the outdoor readings on those rows: no wind, constant climate
indoor_conditions = {
    'temperature_2m': 68,
    'meteo_x_vect': 0,
    'meteo_y_vect': 0,
    'relative_humidity_2m': 60,
    'dew_point_2m': 57,
}
for column, indoor_value in indoor_conditions.items():
    meteo_df.loc[mask, column] = indoor_value

##### Inputs

In [None]:
# Feature columns passed to scale_wfx / predict_wfx, in this order
wfx_inputs = event_variables + pfx_variables + meteo_duplicates_variables + meteo_weather_variables + venue_dummy_list + ['b_L']

##### LHB

Merge

In [None]:
# Attach the left-handed-batter park rows to each game (games without a park row are kept)
l_wfx_df = meteo_df.merge(l_park_latest_df, on=['venue_id'], how='left')
# Cross join: every game row is paired with every row of event_averages
# (presumably a single row - verify, otherwise games are duplicated)
l_wfx_df = l_wfx_df.merge(event_averages, how='cross')

Assign venue dummies

In [None]:
# Build the "venue_<id>" label once, then flag each known venue with a 0/1 column
l_venue_labels = "venue_" + l_wfx_df["venue_id"].astype(str)
for venue_dummy in venue_dummy_list:
    l_wfx_df[venue_dummy] = (l_venue_labels == venue_dummy).astype(int)

Assign batter is lefty dummy

In [None]:
# Batter-is-left-handed indicator: 1 for every row of the LHB frame
l_wfx_df['b_L'] = 1

Scale

In [None]:
# Feature matrix in wfx_inputs column order, transformed with scale_wfx (defined outside this notebook)
X2 = l_wfx_df[wfx_inputs].values
X2_scaled = scale_wfx.transform(X2)

Predict

In [None]:
# One prediction column per entry of events_list, from the model loaded outside this notebook
predictions2 = predict_wfx.predict(X2_scaled)

prediction_df2 = pd.DataFrame(predictions2, columns=events_list)
prediction_df2 = prediction_df2.add_suffix('_pred_weather')

# Join predictions to their rows by position. drop=True keeps the old index from
# being inserted as a stray 'index' column, which plain reset_index() did.
l_wfx_df = pd.concat([l_wfx_df.reset_index(drop=True), prediction_df2.reset_index(drop=True)], axis=1)

In [None]:
# Preview the LHB frame with the prediction columns attached
l_wfx_df.head()

Calculate WFX

In [None]:
# Unadjusted weather factor: weather-aware prediction divided by the event-average baseline
for event in events_list:
    l_wfx_df[f'{event}_wfx_unadj'] = l_wfx_df[f'{event}_pred_weather'] / l_wfx_df[f'{event}_pred_batted']

Calibrate

In [None]:
# Reference table used by both calibration cells (LHB columns end in _l, RHB columns in _r)
park_and_weather_df = pd.read_csv(os.path.join(baseball_path, "Park and Weather Factors.csv"))
park_and_weather_df.head()

In [None]:
# Reference venue ids do not depend on the event, so pull them once
pwf_venue = park_and_weather_df['venue_id'].values

for event in events_list:
    pred_weather_col = f'{event}_pred_weather'

    # Reference columns for left-handed batters, as arrays for speed
    pwf_pred_weather = park_and_weather_df[f'{event}_pred_weather_l'].values
    pwf_pred_batted = park_and_weather_df[f'{event}_pred_batted_l'].values
    pwf_actual = park_and_weather_df[f'{event}_l'].values

    pred_batted_output = []
    actual_output = []

    for venue, target_pred_weather in zip(l_wfx_df['venue_id'], l_wfx_df[pred_weather_col]):
        # Reference games played at the same venue
        same_venue = pwf_venue == venue
        venue_pred_weather = pwf_pred_weather[same_venue]

        # No reference games, or no prediction to match on: leave the row uncalibrated
        if len(venue_pred_weather) == 0 or np.isnan(target_pred_weather):
            pred_batted_output.append(np.nan)
            actual_output.append(np.nan)
            continue

        # The `similar_games` reference games whose prediction is closest to this game's
        nearest_idx = np.argsort(np.abs(venue_pred_weather - target_pred_weather))[:similar_games]
        pred_batted_output.append(np.nanmean(pwf_pred_batted[same_venue][nearest_idx]))
        actual_output.append(np.nanmean(pwf_actual[same_venue][nearest_idx]))

    # Save results to l_wfx_df (this replaces the event-average baseline column)
    l_wfx_df[f'{event}_pred_batted'] = pred_batted_output
    l_wfx_df[event] = actual_output

    # Calculate adjusted wfx: mean actual rate over mean baseline among the similar games
    l_wfx_df[f'{event}_wfx_adj'] = l_wfx_df[event] / l_wfx_df[f'{event}_pred_batted']

##### RHB

Merge

In [None]:
# Attach the right-handed-batter park rows to each game (games without a park row are kept)
r_wfx_df = meteo_df.merge(r_park_latest_df, on=['venue_id'], how='left')
# Cross join: every game row is paired with every row of event_averages
# (presumably a single row - verify, otherwise games are duplicated)
r_wfx_df = r_wfx_df.merge(event_averages, how='cross')

Assign venue dummies

In [None]:
# Build the "venue_<id>" label once, then flag each known venue with a 0/1 column
r_venue_labels = "venue_" + r_wfx_df["venue_id"].astype(str)
for venue_dummy in venue_dummy_list:
    r_wfx_df[venue_dummy] = (r_venue_labels == venue_dummy).astype(int)

Assign batter is lefty dummy

In [None]:
# Batter-is-left-handed indicator: 0 for every row of the RHB frame
r_wfx_df['b_L'] = 0

Scale

In [None]:
# Feature matrix in wfx_inputs column order, transformed with scale_wfx (defined outside this notebook).
# X2 / X2_scaled are reused from the LHB section and overwritten here.
X2 = r_wfx_df[wfx_inputs].values
X2_scaled = scale_wfx.transform(X2)

Predict

In [None]:
# One prediction column per entry of events_list, from the model loaded outside this notebook
predictions2 = predict_wfx.predict(X2_scaled)

prediction_df2 = pd.DataFrame(predictions2, columns=events_list)
prediction_df2 = prediction_df2.add_suffix('_pred_weather')

# Join predictions to their rows by position. drop=True keeps the old index from
# being inserted as a stray 'index' column, which plain reset_index() did.
r_wfx_df = pd.concat([r_wfx_df.reset_index(drop=True), prediction_df2.reset_index(drop=True)], axis=1)

Calculate WFX

In [None]:
# Unadjusted weather factor: weather-aware prediction divided by the event-average baseline
for event in events_list:
    r_wfx_df[f'{event}_wfx_unadj'] = r_wfx_df[f'{event}_pred_weather'] / r_wfx_df[f'{event}_pred_batted']

Calibrate

In [None]:
# Reference venue ids do not depend on the event, so pull them once
pwf_venue = park_and_weather_df['venue_id'].values

for event in events_list:
    pred_weather_col = f'{event}_pred_weather'

    # Reference columns for right-handed batters, as arrays for speed
    pwf_pred_weather = park_and_weather_df[f'{event}_pred_weather_r'].values
    pwf_pred_batted = park_and_weather_df[f'{event}_pred_batted_r'].values
    pwf_actual = park_and_weather_df[f'{event}_r'].values

    pred_batted_output = []
    actual_output = []

    for venue, target_pred_weather in zip(r_wfx_df['venue_id'], r_wfx_df[pred_weather_col]):
        # Reference games played at the same venue
        same_venue = pwf_venue == venue
        venue_pred_weather = pwf_pred_weather[same_venue]

        # No reference games, or no prediction to match on: leave the row uncalibrated
        if len(venue_pred_weather) == 0 or np.isnan(target_pred_weather):
            pred_batted_output.append(np.nan)
            actual_output.append(np.nan)
            continue

        # The `similar_games` reference games whose prediction is closest to this game's
        nearest_idx = np.argsort(np.abs(venue_pred_weather - target_pred_weather))[:similar_games]
        pred_batted_output.append(np.nanmean(pwf_pred_batted[same_venue][nearest_idx]))
        actual_output.append(np.nanmean(pwf_actual[same_venue][nearest_idx]))

    # Save results to r_wfx_df (this replaces the event-average baseline column)
    r_wfx_df[f'{event}_pred_batted'] = pred_batted_output
    r_wfx_df[event] = actual_output

    # Calculate adjusted wfx: mean actual rate over mean baseline among the similar games
    r_wfx_df[f'{event}_wfx_adj'] = r_wfx_df[event] / r_wfx_df[f'{event}_pred_batted']

Combine

In [None]:
# Add the RHB factor columns to the LHB frame; overlapping "wfx" columns get _l / _r suffixes.
# NOTE(review): the join key is (venue_id, game_num), which presumably identifies a game
# only within a single date's file - confirm before reusing on multi-date data.
wfx_df = pd.merge(l_wfx_df, r_wfx_df[["venue_id", "game_num"] + [col for col in r_wfx_df if "wfx" in col]], on=['venue_id', 'game_num'], how='left', suffixes=("_l", "_r"))

Rename (game_id is generated in historic wfx code)

In [None]:
# The output file identifies games as 'gamePk' (see keep_columns below)
wfx_df.rename(columns={'game_id': 'gamePk'}, inplace=True)

Keep relevant columns

In [None]:
# Non-factor columns written to the daily file: game info, venue attributes,
# Open Meteo readings, derived wind vectors and the create_box weather fields
keep_columns = ['gamePk', 'game_datetime', 'game_date', 'date', 'year', 'game_type', 'status', 'away_team', 'home_team', 'doubleheader', 'game_num', 'venue_id', 'venue_name', 
                'location.defaultCoordinates.latitude', 'location.defaultCoordinates.longitude', 'fieldInfo.leftLine', 'fieldInfo.center', 'fieldInfo.rightLine', 'fieldInfo.leftCenter', 'fieldInfo.rightCenter', 
                'location.elevation', 'location.azimuthAngle', 'fieldInfo.roofType', 'active', 
                'temperature_2m', 'relative_humidity_2m', 'dew_point_2m', 'surface_pressure', 'wind_speed_10m', 'wind_direction_10m', 'weather_code', 'precipitation_probability', 
                'meteo_x_vect', 'meteo_y_vect', 'weather', 'wind', 'missing_weather']

Write to CSV

In [None]:
# Write the kept columns plus every column containing "wfx" to today's Park and Weather Factors file
wfx_df[keep_columns + [col for col in wfx_df if "wfx" in col]].to_csv(os.path.join(baseball_path, "A06. Weather", "3. Park and Weather Factors", f"Park and Weather Factors {todaysdate}.csv"), index=False, encoding='iso-8859-1')