# A06. Weather 

Note: All historic Park and Weather Factors files are created by M01. Park and Weather Factors.ipynb when new models are trained. A06. Weather produces daily files only.

### Imports

In [1]:
import sys
if not hasattr(sys.modules['__main__'], '__file__'):
    %run "C:\Users\james\Documents\MLB\Code\U1. Imports.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U2. Utilities.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U3. Classes.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U4. Datasets.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U5. Models.ipynb"
    print("Imports executed")

C:\Users\james\anaconda3\lib\site-packages\pybaseball\statcast.py
Imports executed


### Games

In [2]:
if not hasattr(sys.modules['__main__'], '__file__'):
    # Runs only when __main__ has no __file__ attribute (e.g. an interactive kernel);
    # otherwise game_df is expected to exist already.
    # Date range: today only
    start_date = todaysdate
    end_date = todaysdate
    all_game_df = pd.read_csv(os.path.join(baseball_path, "game_df.csv"))
    # Dates are compared as strings, so the column is cast once up front
    game_dates = all_game_df['date'].astype(str)
    on_or_after_start = game_dates >= start_date
    on_or_before_end = game_dates <= end_date
    game_df = all_game_df[on_or_after_start & on_or_before_end].reset_index(drop=True)

### Venue

Merge in venue-specific data

In [3]:
# Venue attributes attached to every game: coordinates, field dimensions,
# elevation, azimuth angle, roof type and the active flag.
venue_detail_columns = [
    'id',
    'location.defaultCoordinates.latitude', 'location.defaultCoordinates.longitude',
    'fieldInfo.leftLine', 'fieldInfo.center', 'fieldInfo.rightLine',
    'fieldInfo.leftCenter', 'fieldInfo.rightCenter',
    'location.elevation', 'location.azimuthAngle', 'fieldInfo.roofType', 'active',
]
# Left join keeps every game, even when its venue_id has no match in venue_map_df
game_df = game_df.merge(
    venue_map_df[venue_detail_columns],
    how='left',
    left_on=['venue_id'],
    right_on=['id'],
)

Convert to datetime

In [4]:
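# Disabled: create_historic_weather_df and create_daily_weather_df both parse game_datetime themselves.
# game_df["game_datetime"] = pd.to_datetime(game_df["game_datetime"])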

Drop if missing coordinates

In [5]:
# Latitude and longitude are required inputs of the weather requests below
coordinate_columns = [
    'location.defaultCoordinates.latitude',
    'location.defaultCoordinates.longitude',
]
game_df = game_df.dropna(subset=coordinate_columns)

### Settings

In [None]:
# NOTE(review): not referenced in the visible part of this notebook — confirm where it is used
similar_games = 50

### Functions

##### 1. Open Meteo

Historic

In [6]:
# Shared Open-Meteo client used by fetch_historical_weather_data:
# an HTTP cache wrapped in a retrying session.
cache_session = requests_cache.CachedSession(
    '.cache',
    expire_after=-1,  # presumably "never expire" in requests-cache — confirm against its docs
)
retry_session = retry(
    cache_session,
    retries=5,
    backoff_factor=0.2,
)
openmeteo = openmeteo_requests.Client(session=retry_session)

def fetch_historical_weather_data(latitude, longitude, game_datetime):
    """Return one day of hourly Open-Meteo archive data for a location.

    Args:
        latitude: Latitude sent to the API.
        longitude: Longitude sent to the API.
        game_datetime: Object exposing ``strftime``; only its calendar date is
            used, as both the start and the end date of the request.

    Returns:
        DataFrame with a UTC ``datetime`` column followed by one column per
        requested hourly variable.
    """
    hourly_variables = [
        "temperature_2m", "relative_humidity_2m", "dew_point_2m",
        "weather_code", "surface_pressure", "wind_speed_10m", "wind_direction_10m",
    ]

    # The archive request covers a single calendar day
    request_day = game_datetime.strftime("%Y-%m-%d")

    archive_url = "https://archive-api.open-meteo.com/v1/archive"
    query = {
        "latitude": latitude,
        "longitude": longitude,
        "start_date": request_day,
        "end_date": request_day,
        "hourly": hourly_variables,
        "temperature_unit": "fahrenheit",
        "wind_speed_unit": "mph",
        "precipitation_unit": "inch",
    }

    # Only the first response is used (one location was requested)
    hourly = openmeteo.weather_api(archive_url, params=query)[0].Hourly()

    # Rebuild the timestamp axis from the reported start, end and step
    timestamps = pd.date_range(
        start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
        end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
        freq=pd.Timedelta(seconds=hourly.Interval()),
        inclusive="left",
    )

    # Positional read-out: assumes variables are returned in request order
    columns = {"datetime": timestamps}
    for position, variable in enumerate(hourly_variables):
        columns[variable] = hourly.Variables(position).ValuesAsNumpy()

    return pd.DataFrame(columns)

def create_historic_weather_df(game_df):
    """Attach the archive weather reading closest to each game's start time.

    For every row, one day of hourly data is fetched with
    ``fetch_historical_weather_data`` and the row whose timestamp is nearest to
    ``game_datetime`` is kept.

    Args:
        game_df: DataFrame with ``location.defaultCoordinates.latitude``,
            ``location.defaultCoordinates.longitude`` and ``game_datetime``
            columns. It is not modified; callers must use the returned frame.

    Returns:
        A copy of ``game_df`` with ``game_datetime`` converted to UTC datetimes
        and one added column per weather variable.
    """
    weather_columns = [
        "temperature_2m", "relative_humidity_2m", "dew_point_2m",
        "weather_code", "surface_pressure", "wind_speed_10m", "wind_direction_10m"
    ]

    # Work on a copy: callers pass filtered slices of a larger frame, and assigning
    # columns to such a slice triggers SettingWithCopyWarning / ambiguous writes.
    game_df = game_df.copy()

    # Convert game_datetime to UTC so it can be compared with the UTC weather timestamps
    game_df["game_datetime"] = pd.to_datetime(game_df["game_datetime"], utc=True)

    matched_rows = []
    for _, game in game_df.iterrows():
        first_pitch = game["game_datetime"]

        # Fetch historical weather data for that day
        weather_data = fetch_historical_weather_data(
            game["location.defaultCoordinates.latitude"],
            game["location.defaultCoordinates.longitude"],
            first_pitch,
        )

        # Position of the timestamp closest to first pitch. argmin() returns the
        # first minimum, so an exact tie resolves to the earlier hour every time.
        nearest_position = (weather_data["datetime"] - first_pitch).abs().argmin()
        matched_rows.append(weather_data.iloc[nearest_position])

    # Add the matched weather values as new columns
    for col in weather_columns:
        game_df[col] = [matched[col] for matched in matched_rows]

    return game_df

Forecast

In [7]:
# TODO: confirm that doubleheaders (two games at the same venue on the same date) are handled correctly

In [8]:
def fetch_weather_data(latitude, longitude, start, end):
    """Return hourly Open-Meteo forecast rows whose timestamp lies in [start, end].

    Args:
        latitude: Latitude sent to the API.
        longitude: Longitude sent to the API.
        start: Window start, parseable by ``pd.to_datetime``.
        end: Window end, parseable by ``pd.to_datetime``.

    Returns:
        DataFrame with a UTC ``date`` column plus one column per requested
        hourly variable, restricted to rows with ``start <= date <= end``.
    """
    hourly_variables = [
        "temperature_2m", "relative_humidity_2m", "dew_point_2m",
        "precipitation_probability", "surface_pressure",
        "wind_speed_10m", "wind_direction_10m", "weather_code",
    ]

    # A new cached, retrying client is built on every call (cache expire_after=3600)
    session = retry(
        requests_cache.CachedSession('.cache', expire_after=3600),
        retries=5,
        backoff_factor=0.2,
    )
    client = openmeteo_requests.Client(session=session)

    query = {
        "latitude": latitude,
        "longitude": longitude,
        "hourly": hourly_variables,
        # NOTE(review): confirm the forecast API honours "start"/"end"; the rows are
        # filtered to the window again below regardless.
        "start": start,
        "end": end,
        "wind_speed_unit": "mph",
        "temperature_unit": "fahrenheit",
        "precipitation_unit": "inch",
        "past_days": 2,
    }
    # Only the first response is used (one location was requested)
    hourly = client.weather_api("https://api.open-meteo.com/v1/forecast", params=query)[0].Hourly()

    # Timestamp axis rebuilt from the reported start, end and step
    forecast = {
        "date": pd.date_range(
            start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
            end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
            freq=pd.Timedelta(seconds=hourly.Interval()),
            inclusive="left",
        )
    }
    # Positional read-out: assumes variables are returned in request order
    for position, variable in enumerate(hourly_variables):
        forecast[variable] = hourly.Variables(position).ValuesAsNumpy()

    hourly_dataframe = pd.DataFrame(data=forecast)
    hourly_dataframe["date"] = pd.to_datetime(hourly_dataframe["date"], utc=True)

    # Keep only rows inside the requested window (both ends inclusive)
    window_start = pd.to_datetime(start, utc=True)
    window_end = pd.to_datetime(end, utc=True)
    in_window = (hourly_dataframe["date"] >= window_start) & (hourly_dataframe["date"] <= window_end)

    return hourly_dataframe[in_window]


# Now iterate over each game in game_df to fetch the weather data
def create_daily_weather_df(game_df):
    """Attach the first forecast hour at or after each game's start time.

    For every row, ``fetch_weather_data`` is asked for the window from
    ``game_datetime`` to one hour later, and the first returned row is kept.

    Args:
        game_df: DataFrame with ``location.defaultCoordinates.latitude``,
            ``location.defaultCoordinates.longitude`` and ``game_datetime``
            columns. It is not modified; callers must use the returned frame.

    Returns:
        A copy of ``game_df`` with one added column per forecast variable.

    Raises:
        IndexError: If no forecast row falls inside a game's one-hour window.
    """
    forecast_columns = [
        "temperature_2m", "relative_humidity_2m", "dew_point_2m",
        "precipitation_probability", "surface_pressure",
        "wind_speed_10m", "wind_direction_10m", "weather_code",
    ]

    # Work on a copy: callers pass filtered slices of a larger frame, and assigning
    # columns to such a slice triggers SettingWithCopyWarning / ambiguous writes.
    game_df = game_df.copy()

    first_hours = []
    for _, game in game_df.iterrows():
        game_datetime = pd.to_datetime(game["game_datetime"])

        # One-hour window starting at first pitch, as ISO 8601 strings
        start = game_datetime.isoformat()
        end = (game_datetime + pd.Timedelta(hours=1)).isoformat()

        weather_data = fetch_weather_data(
            game["location.defaultCoordinates.latitude"],
            game["location.defaultCoordinates.longitude"],
            start,
            end,
        )

        # Same exception type as the former bare iloc[0] failure, but with context
        if weather_data.empty:
            raise IndexError(f"No forecast rows returned between {start} and {end}")

        first_hours.append(weather_data.iloc[0])

    # Append the new weather columns
    for col in forecast_columns:
        game_df[col] = [hour[col] for hour in first_hours]

    return game_df

##### 1. Swish Analytics

Swish Analytics contains weather projections to be used before MLB Stats API updates theirs.

Scrape

In [9]:
# Scrape Swish Analytics for weather data
def swishanalytics(date):
    """Scrape the Swish Analytics MLB weather page for one date.

    Every ``weather-card`` div that contains a bordered table contributes one
    output row, taken from the table's third column.

    Args:
        date: Date string in ``YYYYMMDD`` form (sliced into ``YYYY-MM-DD``).

    Returns:
        DataFrame with one row per weather card, including the renamed weather
        columns plus ``Matchup`` and ``FANGRAPHSTEAM``.

    Raises:
        RuntimeError: If the page does not answer with HTTP 200.
        ValueError: If no weather card with a table is found.
    """
    # Reformat date to fit URL
    date_dash = f"{date[:4]}-{date[4:6]}-{date[6:8]}"
    url = "https://swishanalytics.com/mlb/weather?date=" + date_dash

    # A timeout keeps a stalled connection from hanging the notebook
    response = requests.get(url, timeout=30)

    # Fail with the real cause instead of a NameError on the undefined result list
    if response.status_code != 200:
        raise RuntimeError(f"Failed to retrieve the page. Status code: {response.status_code}")

    soup = BeautifulSoup(response.text, 'html.parser')

    dfs = []
    for weather_card in soup.find_all('div', class_='weather-card'):
        # Time and location labels of the card (None when the element is missing)
        time_info = weather_card.find('small', class_='text-muted')
        location_info = weather_card.find('h4', class_='lato inline vert-mid bold')
        game_time = time_info.text.strip() if time_info else None
        location = location_info.text.strip() if location_info else None

        table = weather_card.find('table', class_='table-bordered')
        if not table:
            continue

        # Table cells as a list of rows
        data = [
            [cell.text.strip() for cell in row.find_all(['th', 'td'])]
            for row in table.find_all('tr')
        ]
        df = pd.DataFrame(data)

        # First row holds the headers; copy so the column assignments below
        # write to an independent frame rather than a slice
        df.columns = df.iloc[0]
        df = df[1:].copy()
        df['Time'] = game_time
        df['Location'] = location

        # Create dataframe from the second time period scraped (third table column)
        daily_weather_df = pd.DataFrame(df.iloc[:, 2]).T
        # Extract home team name: the text after "@" in the location label
        daily_weather_df['Matchup'] = df['Location'][1]
        daily_weather_df['FANGRAPHSTEAM'] = daily_weather_df['Matchup'].str.split("@", expand=True).iloc[:, 1]
        daily_weather_df['FANGRAPHSTEAM'] = daily_weather_df['FANGRAPHSTEAM'].str.replace("\xa0\xa0", "")

        dfs.append(daily_weather_df)

    if not dfs:
        raise ValueError(f"No weather cards with tables found at {url}")

    # Append together dataframes
    df = pd.concat(dfs, axis=0)

    # Rename columns
    df.rename(columns={1:'Weather', 2:'Temperature', 3:'Feels Like', 4:'Humidity', 5:'Speed', 6:'Direction', 'BBREFTEAM': 'home_team'}, inplace=True)

    # Clean
    df['Speed'] = df['Speed'].str.replace(" mph", "").astype(float)
    df['Temperature'] = df['Temperature'].str.replace('°', '')
    df['Feels Like'] = df['Feels Like'].str.replace('°', '')
    # The index holds the header of the scraped table column; expose it as 'Time'
    df.reset_index(drop=False, inplace=True, names='Time')

    return df

##### 2. RotoGrinders

RotoGrinders hosts weather warnings used to identify matchups to avoid based on weather risk.

Scrape

In [10]:
def rotogrinders(date, team_map):
    """Scrape the RotoGrinders MLB weather blurbs.

    Args:
        date: Value stored in the ``date`` column of the result. It is not part
            of the request; the URL is fixed.
        team_map: DataFrame with ``ROTOGRINDERSTEAM`` and ``DKTEAM`` columns,
            used to translate both teams of each matchup.

    Returns:
        DataFrame with columns ``Tag``, ``Tag2``, ``Matchup``, ``Away``,
        ``Home``, ``Description`` and ``date``. When the page does not answer
        with HTTP 200, an error message string is returned instead (kept for
        backward compatibility; callers must check the type).
    """
    # URL of the web page containing the blurbs
    url = "https://rotogrinders.com/weather/mlb"

    # A timeout keeps a stalled connection from hanging the notebook
    response = requests.get(url, timeout=30)

    if response.status_code != 200:
        # Return an error message if the response is not successful
        return "Failed to retrieve data. Response status code: {}".format(response.status_code)

    soup = BeautifulSoup(response.text, "html.parser")

    data = []
    for li_element in soup.find_all("li", class_="weather-blurb"):
        # Up to two coloured tags per blurb
        tag_elements = li_element.find_all("span", class_=["green", "yellow", "orange", "red"])
        tag = tag_elements[0].text.strip() if tag_elements else None
        tag2 = tag_elements[1].text.strip() if len(tag_elements) > 1 else None

        # Matchup is the bold span; the description is its next sibling span
        matchup_span = li_element.find("span", class_="bold")
        matchup = matchup_span.text.strip() if matchup_span else None
        if matchup_span:
            description_span = matchup_span.find_next_sibling("span")
            description = description_span.text.strip() if description_span else None
        else:
            description = None

        data.append({"Tag": tag, "Tag2": tag2, "Matchup": matchup, "Description": description})

    # Convert the list of dictionaries to a DataFrame
    df = pd.DataFrame(data)

    df[['away', 'home']] = df['Matchup'].str.split(" @ ", expand=True)

    # Add in DK team abbreviations (away first, then home with the "_home" suffix)
    df = df.merge(team_map[['ROTOGRINDERSTEAM', 'DKTEAM']], left_on=['away'], right_on=['ROTOGRINDERSTEAM'], how='left', suffixes=("", "_away"))
    df = df.merge(team_map[['ROTOGRINDERSTEAM', 'DKTEAM']], left_on=['home'], right_on=['ROTOGRINDERSTEAM'], how='left', suffixes=("", "_home"))
    df = df[['Tag', 'Tag2', 'Matchup', 'DKTEAM', 'DKTEAM_home', 'Description']]
    df.rename(columns={'DKTEAM':'Away', 'DKTEAM_home': 'Home'}, inplace=True)

    # Add the date column to the DataFrame
    df['date'] = date

    return df

##### 3. Park x Weather Factors

Calculate wind x and y vectors

In [11]:
def calculate_vectors(row, azimuth_column, wind_column, speed_column):
    """Split a wind reading into two components relative to an azimuth angle.

    The angle used is ``row[wind_column] - row[azimuth_column]`` in degrees.
    Its sine and cosine are rounded to five decimals, multiplied by the speed
    and negated.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) holding the three values.
        azimuth_column: Key of the azimuth angle in degrees.
        wind_column: Key of the wind direction in degrees.
        speed_column: Key of the wind speed.

    Returns:
        Series indexed by ``x_vect`` (sine component) and ``y_vect`` (cosine
        component).
    """
    relative_angle = math.radians(row[wind_column] - row[azimuth_column])
    speed = row[speed_column]

    components = {
        'x_vect': round(math.sin(relative_angle), 5) * speed * -1,
        'y_vect': round(math.cos(relative_angle), 5) * speed * -1,
    }
    return pd.Series(components)

### Run

##### 1. Open Meteo

Columns to extract

In [12]:
# Columns taken from game_df
game_columns = [
    'game_id', 'game_datetime', 'game_date', 'date', 'year', 'game_type', 'status',
    'away_team', 'home_team', 'doubleheader', 'game_num', 'venue_id', 'venue_name',
]
# Columns taken from the venue map
venue_columns = [
    'location.defaultCoordinates.latitude', 'location.defaultCoordinates.longitude',
    'fieldInfo.leftLine', 'fieldInfo.center', 'fieldInfo.rightLine',
    'fieldInfo.leftCenter', 'fieldInfo.rightCenter',
    'location.elevation', 'location.azimuthAngle', 'fieldInfo.roofType', 'active',
]
# Columns from Open Meteo (written for both forecast and historic files)
weather_columns = [
    'temperature_2m', 'relative_humidity_2m', 'dew_point_2m', 'surface_pressure',
    'wind_speed_10m', 'wind_direction_10m', 'weather_code',
]
# Forecast-only columns from Open Meteo
forecast_only_columns = ['precipitation_probability']

In [13]:
%%time
# Loop over dates: today's games use the forecast API, earlier dates the archive API
open_meteo_dir = os.path.join(baseball_path, "A06. Weather", "1. Open Meteo")
for date in game_df['date'].unique():
    print(date)
    # Copy the day's games so the builders add columns to an independent frame,
    # not to a filtered slice of game_df (avoids SettingWithCopyWarning)
    games_on_date = game_df[game_df['date'] == date].copy()
    output_path = os.path.join(open_meteo_dir, f"Open Meteo {date}.csv")
    if int(date) == int(todaysdate):
        # Create daily weather dataframe (forecast)
        daily_weather_df = create_daily_weather_df(games_on_date)
        daily_weather_df[game_columns + venue_columns + weather_columns + forecast_only_columns].to_csv(output_path, index=False)
    else:
        # Create historic weather dataframe
        historic_weather_df = create_historic_weather_df(games_on_date)
        historic_weather_df[game_columns + venue_columns + weather_columns].to_csv(output_path, index=False)
        # Pause between dates on the historic path
        time.sleep(2)

20250530
CPU times: total: 109 ms
Wall time: 94.2 ms


##### 1. Swish Analytics

In [14]:
try:
    # Scrape Swish Analytics
    swishanalytics_df = swishanalytics(todaysdate)
    # To CSV
    swishanalytics_df.to_csv(os.path.join(baseball_path, "A06. Weather", "1. Swish Analytics", f"Swish Analytics {todaysdate}.csv"), index=False, encoding='iso-8859-1')
except Exception as error:
    # Best-effort step: keep going, but report the cause. Catching Exception
    # (not a bare except) lets KeyboardInterrupt / kernel interrupts propagate.
    print("Could not scrape Swish Analytics weather data.")
    print(f"Reason: {error!r}")

##### 2. RotoGrinders

In [15]:
try:
    # Scrape RotoGrinders
    rotogrinders_df = rotogrinders(todaysdate, team_map)
    # rotogrinders() returns an error message string instead of a DataFrame
    # when the request fails; surface that message as the failure reason.
    if isinstance(rotogrinders_df, str):
        raise RuntimeError(rotogrinders_df)
    # To CSV
    rotogrinders_df.to_csv(os.path.join(baseball_path, "A06. Weather", "2. RotoGrinders", f"RotoGrinders {todaysdate}.csv"), index=False)
except Exception as error:
    # Best-effort step: keep going, but report the cause. Catching Exception
    # (not a bare except) lets KeyboardInterrupt / kernel interrupts propagate.
    print("Could not scrape RotoGrinders weather data.")
    print(f"Reason: {error!r}")

##### 3. Park x Weather Factors

In [16]:
# Weather variables named as in the MLB data (the 'weather' text is dropped)
mlb_weather_variables = [
    'x_vect',
    'y_vect',
    'temperature',
]
# Open Meteo counterparts of the three variables above
meteo_duplicates_variables = [
    'meteo_x_vect',
    'meteo_y_vect',
    'temperature_2m',
]
# Weather variables available from Open Meteo only
meteo_weather_variables = [
    'relative_humidity_2m',
    'dew_point_2m',
    'surface_pressure',
]
# Park dimensions and elevation (roof type is dropped)
mlb_park_variables = [
    'fieldInfo.leftLine',
    'fieldInfo.center',
    'fieldInfo.rightLine',
    'fieldInfo.leftCenter',
    'fieldInfo.rightCenter',
    'location.elevation',
]

In [17]:
# TODO: generate venue_dummy_list programmatically instead of hard-coding it

In [18]:
# Venue ids that get a dummy column; the order here is the order of the dummy names
venue_dummy_ids = [
    1, 2, 3, 4, 5, 7, 10, 12, 13, 14, 15, 17, 19, 22, 31, 32,
    680, 2392, 2394, 2395, 2602, 2680, 2681, 2889,
    3289, 3309, 3312, 3313, 4169, 4705, 5325,
]
venue_dummy_list = [f"venue_{venue_id}" for venue_id in venue_dummy_ids]

Read in Meteo data

In [19]:
# The Open Meteo files are written above with to_csv's default encoding (UTF-8),
# so read them back as UTF-8; decoding them as iso-8859-1 never fails but
# silently garbles any non-ASCII text.
meteo_df = pd.read_csv(
    os.path.join(baseball_path, "A06. Weather", "1. Open Meteo", f"Open Meteo {todaysdate}.csv"),
    encoding='utf-8',
)

Read in park latest

In [20]:
park_latest_df = pd.read_csv(os.path.join(baseball_path, "Park Latest.csv"))

# One frame per batter side
bat_side = park_latest_df['batSide']
l_park_latest_df = park_latest_df[bat_side == "L"]
r_park_latest_df = park_latest_df[bat_side == "R"]

# Every column whose name ends in "pfx"
pfx_variables = [column for column in park_latest_df.columns if column.endswith("pfx")]

Read in event averages

In [21]:
# Load the event averages and tag every column with the "_pred_batted" suffix
event_averages = (
    pd.read_csv(os.path.join(baseball_path, "Event Averages.csv"))
    .add_suffix("_pred_batted")
)
event_variables = event_averages.columns.tolist()

Add weather data

In [22]:
# Wind components relative to each venue's azimuth angle
meteo_df[['meteo_x_vect', 'meteo_y_vect']] = meteo_df.apply(
    calculate_vectors,
    axis=1,
    args=('location.azimuthAngle', 'wind_direction_10m', 'wind_speed_10m'),
)
# create_box is defined outside this section; its values are spread over five
# columns. Note that this overwrites the existing 'date' column of meteo_df.
box_columns = ['weather', 'wind', 'venue', 'date', 'missing_weather']
meteo_df[box_columns] = meteo_df['game_id'].apply(lambda game_pk: pd.Series(create_box(game_pk)))

Adjust weather data in dome

In [23]:
# Rows whose weather text mentions "Roof" or "Dome" (case-insensitive) get fixed values
mask = meteo_df['weather'].str.contains('Roof|Dome', case=False, na=False)

indoor_defaults = {
    'temperature_2m': 68,
    'meteo_x_vect': 0,
    'meteo_y_vect': 0,
    'relative_humidity_2m': 60,
    'dew_point_2m': 57,
}
for column, indoor_value in indoor_defaults.items():
    meteo_df.loc[mask, column] = indoor_value

##### Inputs

In [24]:
# Model input columns, in order; the same list selects the matrix passed to the scaler
wfx_inputs = [
    *event_variables,
    *pfx_variables,
    *meteo_duplicates_variables,
    *meteo_weather_variables,
    *venue_dummy_list,
    'b_L',
]

##### LHB

Merge

In [25]:
# Left-handed park rows joined by venue, then the event averages attached to
# every row through a cross join
l_wfx_df = (
    meteo_df
    .merge(l_park_latest_df, on=['venue_id'], how='left')
    .merge(event_averages, how='cross')
)

Assign venue dummies

In [26]:
# "venue_<id>" label of each row, compared against every dummy name
venue_labels = "venue_" + l_wfx_df["venue_id"].astype(str)
for venue_dummy in venue_dummy_list:
    l_wfx_df[venue_dummy] = venue_labels.eq(venue_dummy).astype(int)

Assign batter is lefty dummy

In [27]:
# Constant 1: this frame is built from the batSide == "L" park rows
l_wfx_df['b_L'] = 1

Scale

In [28]:
# Input matrix in wfx_inputs column order, then scaled with scale_wfx
X2 = l_wfx_df[wfx_inputs].to_numpy()
X2_scaled = scale_wfx.transform(X2)

Predict

In [29]:
# Model outputs: one column per entry of events_list, suffixed "_pred_weather"
predictions2 = predict_wfx.predict(X2_scaled)

prediction_df2 = pd.DataFrame(predictions2, columns=events_list).add_suffix('_pred_weather')

# reset_index() without drop=True also carries an 'index' column into l_wfx_df.
# NOTE(review): confirm whether later code relies on that column before removing it.
l_wfx_df = pd.concat([l_wfx_df, prediction_df2.reset_index()], axis=1)

In [30]:
# Preview the LHB frame with the appended prediction columns
l_wfx_df.head()

Unnamed: 0,game_id,game_datetime,game_date,date,year,game_type,status,away_team,home_team,doubleheader,game_num,venue_id,venue_name,location.defaultCoordinates.latitude,location.defaultCoordinates.longitude,fieldInfo.leftLine,fieldInfo.center,fieldInfo.rightLine,fieldInfo.leftCenter,fieldInfo.rightCenter,location.elevation,location.azimuthAngle,fieldInfo.roofType,active,temperature_2m,relative_humidity_2m,dew_point_2m,surface_pressure,wind_speed_10m,wind_direction_10m,weather_code,precipitation_probability,meteo_x_vect,meteo_y_vect,weather,wind,venue,missing_weather,batSide,b1_pfx,b2_pfx,b3_pfx,hr_pfx,bb_pfx,hbp_pfx,so_pfx,fo_pfx,go_pfx,lo_pfx,po_pfx,b1_pred_batted,b2_pred_batted,b3_pred_batted,hr_pred_batted,bb_pred_batted,hbp_pred_batted,so_pred_batted,fo_pred_batted,go_pred_batted,lo_pred_batted,po_pred_batted,venue_1,venue_2,venue_3,venue_4,venue_5,venue_7,venue_10,venue_12,venue_13,venue_14,venue_15,venue_17,venue_19,venue_22,venue_31,venue_32,venue_680,venue_2392,venue_2394,venue_2395,venue_2602,venue_2680,venue_2681,venue_2889,venue_3289,venue_3309,venue_3312,venue_3313,venue_4169,venue_4705,venue_5325,b_L,index,b1_pred_weather,b2_pred_weather,b3_pred_weather,hr_pred_weather,bb_pred_weather,hbp_pred_weather,so_pred_weather,fo_pred_weather,go_pred_weather,lo_pred_weather,po_pred_weather
0,777712,2025-05-30T18:20:00Z,2025-05-30,"May 30, 2025",2025,R,Scheduled,CIN,CHC,N,1,17,Wrigley Field,41.948171,-87.655503,355.0,400.0,353.0,368.0,368.0,595.0,37.0,Open,True,75.8183,37.0,47.694008,987.19257,2.4607,90.0,0.0,2.0,-1.965213,-1.480898,"75 degrees, Clear.","0 mph, L To R.",Wrigley Field.,True,L,1.046935,0.996954,1.47182,0.892449,1.000008,1.0,0.999992,1.036133,0.990883,0.880929,1.003787,0.138941,0.043404,0.004022,0.031336,0.085344,0.011031,0.226843,0.127452,0.226957,0.057744,0.046926,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0.154122,0.044656,0.003659,0.032154,0.08558,0.016181,0.226734,0.129039,0.215021,0.047288,0.045566
1,777720,2025-05-30T20:30:00Z,2025-05-30,"May 30, 2025",2025,R,Scheduled,CHW,BAL,N,1,2,Oriole Park at Camden Yards,39.283787,-76.621689,333.0,400.0,318.0,398.0,373.0,33.0,31.0,Open,True,74.272095,68.0,63.029957,1001.277,6.409698,119.24873,3.0,18.0,-6.406685,-0.19588,"75 degrees, Clear.","0 mph, L To R.",Oriole Park at Camden Yards.,True,L,1.100934,0.861811,0.807394,1.041875,1.000008,1.0,0.999994,1.058403,0.9642,0.921835,0.954123,0.138941,0.043404,0.004022,0.031336,0.085344,0.011031,0.226843,0.127452,0.226957,0.057744,0.046926,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.155726,0.044284,0.003218,0.031623,0.08359,0.017365,0.228912,0.132714,0.210866,0.04959,0.042111
2,777715,2025-05-30T22:45:00Z,2025-05-30,"May 30, 2025",2025,R,Scheduled,MIL,PHI,N,1,2681,Citizens Bank Park,39.905391,-75.16717,329.0,401.0,330.0,381.0,398.0,20.0,9.0,Open,True,73.3649,76.0,65.34075,998.7625,2.864758,141.34016,3.0,16.0,-2.117514,1.9295,"75 degrees, Clear.","0 mph, L To R.",Citizens Bank Park.,True,L,1.036726,0.999595,1.610196,1.088406,1.000006,1.0,0.999999,1.019099,0.980931,0.909913,0.942164,0.138941,0.043404,0.004022,0.031336,0.085344,0.011031,0.226843,0.127452,0.226957,0.057744,0.046926,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,2,0.149951,0.045831,0.003674,0.035135,0.084344,0.012557,0.228732,0.130671,0.216915,0.047765,0.044426
3,777716,2025-05-30T23:07:00Z,2025-05-30,"May 30, 2025",2025,R,Scheduled,OAK,TOR,N,1,14,Rogers Centre,43.64155,-79.38915,328.0,404.0,328.0,375.0,375.0,270.0,345.0,Retractable,True,71.7368,44.0,48.661594,989.4597,5.1451,270.0,3.0,7.0,4.969806,-1.331655,"75 degrees, Clear.","0 mph, L To R.",Rogers Centre.,True,L,0.984151,1.073566,1.019324,0.973045,1.000007,1.0,1.000003,1.050248,0.994713,0.889615,1.015461,0.138941,0.043404,0.004022,0.031336,0.085344,0.011031,0.226843,0.127452,0.226957,0.057744,0.046926,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,0.151459,0.045342,0.003156,0.033147,0.08546,0.015791,0.229692,0.125047,0.218206,0.046226,0.046473
4,777721,2025-05-30T23:10:00Z,2025-05-30,"May 30, 2025",2025,R,Scheduled,LAA,CLE,N,1,5,Progressive Field,41.495861,-81.685255,325.0,405.0,325.0,410.0,375.0,653.0,0.0,Open,True,69.4283,36.0,41.31412,978.7857,4.273782,222.87889,3.0,10.0,2.908095,3.131785,"75 degrees, Clear.","0 mph, L To R.",Progressive Field.,True,L,0.990857,1.041352,0.585774,1.030759,1.000007,1.0,0.999996,1.008579,0.995168,0.964885,1.059732,0.138941,0.043404,0.004022,0.031336,0.085344,0.011031,0.226843,0.127452,0.226957,0.057744,0.046926,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,4,0.147703,0.044285,0.003008,0.033135,0.081537,0.013953,0.230942,0.132386,0.216975,0.049189,0.046888


Calculate WFX

In [31]:
# Per event: ratio of its "_pred_weather" column to its "_pred_batted" column.
# NOTE(review): the suffix is spelled "_wfx_unaj" here, while the columns of
# Park and Weather Factors.csv use "_wfx_unadj" — confirm before renaming.
for event in events_list:
    weather_prediction = l_wfx_df[f'{event}_pred_weather']
    batted_prediction = l_wfx_df[f'{event}_pred_batted']
    l_wfx_df[f'{event}_wfx_unaj'] = weather_prediction.div(batted_prediction)

Calibrate

In [32]:
# Load the Park and Weather Factors file and preview its first rows
park_and_weather_df = pd.read_csv(
    os.path.join(baseball_path, "Park and Weather Factors.csv")
)
park_and_weather_df.head()

Unnamed: 0,venue_id,gamePk,date,b1_wfx_unadj_l,b2_wfx_unadj_l,b3_wfx_unadj_l,hr_wfx_unadj_l,bb_wfx_unadj_l,hbp_wfx_unadj_l,so_wfx_unadj_l,fo_wfx_unadj_l,go_wfx_unadj_l,lo_wfx_unadj_l,po_wfx_unadj_l,b1_wfx_adj_l,b2_wfx_adj_l,b3_wfx_adj_l,hr_wfx_adj_l,bb_wfx_adj_l,hbp_wfx_adj_l,so_wfx_adj_l,fo_wfx_adj_l,go_wfx_adj_l,lo_wfx_adj_l,po_wfx_adj_l,b1_wfx_unadj_r,b2_wfx_unadj_r,b3_wfx_unadj_r,hr_wfx_unadj_r,bb_wfx_unadj_r,hbp_wfx_unadj_r,so_wfx_unadj_r,fo_wfx_unadj_r,go_wfx_unadj_r,lo_wfx_unadj_r,po_wfx_unadj_r,b1_wfx_adj_r,b2_wfx_adj_r,b3_wfx_adj_r,hr_wfx_adj_r,bb_wfx_adj_r,hbp_wfx_adj_r,so_wfx_adj_r,fo_wfx_adj_r,go_wfx_adj_r,lo_wfx_adj_r,po_wfx_adj_r,b1_pred_weather_l,b2_pred_weather_l,b3_pred_weather_l,hr_pred_weather_l,bb_pred_weather_l,hbp_pred_weather_l,so_pred_weather_l,fo_pred_weather_l,go_pred_weather_l,lo_pred_weather_l,po_pred_weather_l,b1_pred_batted_l,b2_pred_batted_l,b3_pred_batted_l,hr_pred_batted_l,bb_pred_batted_l,hbp_pred_batted_l,so_pred_batted_l,fo_pred_batted_l,go_pred_batted_l,lo_pred_batted_l,po_pred_batted_l,b1_pred_batted_mean_l,b2_pred_batted_mean_l,b3_pred_batted_mean_l,hr_pred_batted_mean_l,bb_pred_batted_mean_l,hbp_pred_batted_mean_l,so_pred_batted_mean_l,fo_pred_batted_mean_l,go_pred_batted_mean_l,lo_pred_batted_mean_l,po_pred_batted_mean_l,b1_pred_weather_r,b2_pred_weather_r,b3_pred_weather_r,hr_pred_weather_r,bb_pred_weather_r,hbp_pred_weather_r,so_pred_weather_r,fo_pred_weather_r,go_pred_weather_r,lo_pred_weather_r,po_pred_weather_r,b1_pred_batted_r,b2_pred_batted_r,b3_pred_batted_r,hr_pred_batted_r,bb_pred_batted_r,hbp_pred_batted_r,so_pred_batted_r,fo_pred_batted_r,go_pred_batted_r,lo_pred_batted_r,po_pred_batted_r,b1_pred_batted_mean_r,b2_pred_batted_mean_r,b3_pred_batted_mean_r,hr_pred_batted_mean_r,bb_pred_batted_mean_r,hbp_pred_batted_mean_r,so_pred_batted_mean_r,fo_pred_batted_mean_r,go_pred_batted_mean_r,lo_pred_batted_mean_r,po_pred_batted_mean_r,b1_l,b2_l,b3_l,hr_l,bb_l,hbp_l,so_l,fo_l,go_l,lo_l,po_l,b1_r,b2_r,b3_r,hr_r,bb_r,hbp_r,s
o_r,fo_r,go_r,lo_r,po_r
0,12,529406,20180329,1.115586,1.159499,0.889512,1.131525,0.951435,1.134072,0.997126,0.980588,0.922546,0.888798,1.06677,,,,,,,,,,,,1.094843,1.11309,0.82455,1.164049,0.961712,1.104002,0.999752,0.982998,0.94701,0.859987,1.041321,,,,,,,,,,,,0.155082,0.050297,0.003578,0.035407,0.081143,0.012509,0.22608,0.125128,0.209358,0.051341,0.050075,0.139014,0.043378,0.004023,0.031291,0.085285,0.01103,0.226732,0.127606,0.226935,0.057765,0.046941,,,,,,,,,,,,0.152199,0.048283,0.003317,0.036425,0.08202,0.012178,0.226676,0.125436,0.21491,0.049677,0.048881,0.139014,0.043378,0.004023,0.031291,0.085285,0.01103,0.226732,0.127606,0.226935,0.057765,0.046941,,,,,,,,,,,,0.0,0.045455,0.045455,0.0,0.136364,0.0,0.090909,0.045455,0.409091,0.090909,0.136364,0.104167,0.083333,0.0,0.020833,0.125,0.0,0.3125,0.0625,0.1875,0.0625,0.041667
1,4169,529407,20180329,1.063381,1.223914,1.047671,0.944888,1.050863,0.83706,0.964986,1.020015,0.953395,0.901789,1.044793,0.0,1.047874,11.29918,0.0,1.598916,0.0,0.400954,0.356211,1.802679,1.573782,2.905,1.067349,1.148446,0.912611,0.911729,1.051841,1.129531,0.969555,0.99326,0.963826,0.891194,1.07923,0.749324,1.921102,0.0,0.665786,1.465673,0.0,1.378278,0.489791,0.826228,1.081975,0.887639,0.147825,0.053091,0.004215,0.029567,0.089623,0.009233,0.218793,0.13016,0.216359,0.052092,0.049044,0.139014,0.043378,0.004023,0.031291,0.085285,0.01103,0.226732,0.127606,0.226935,0.057765,0.046941,0.139014,0.043378,0.004023,0.031291,0.085285,0.01103,0.226732,0.127606,0.226935,0.057765,0.046941,0.148377,0.049817,0.003671,0.028529,0.089706,0.012459,0.219829,0.126745,0.218726,0.05148,0.05066,0.139014,0.043378,0.004023,0.031291,0.085285,0.01103,0.226732,0.127606,0.226935,0.057765,0.046941,0.139014,0.043378,0.004023,0.031291,0.085285,0.01103,0.226732,0.127606,0.226935,0.057765,0.046941,0.033333,0.066667,0.033333,0.1,0.1,0.066667,0.2,0.1,0.166667,0.033333,0.1,0.122449,0.081633,0.0,0.0,0.163265,0.040816,0.204082,0.081633,0.163265,0.061224,0.081633
2,7,529408,20180329,1.144791,1.106598,1.004392,0.687393,1.019535,0.847727,0.99626,1.059088,0.895499,0.968752,1.0821,0.119892,1.292378,9.792623,1.597886,1.385727,3.021935,0.641526,0.569938,1.268552,1.075418,2.517667,1.088015,1.096175,0.98775,0.709007,1.009644,0.98788,0.998163,1.051764,0.921278,0.966828,1.120384,0.815082,1.901499,0.0,0.332893,1.69001,1.850164,1.139189,0.564759,0.772832,1.070935,1.313343,0.159142,0.048002,0.00404,0.021509,0.086951,0.009351,0.225884,0.135146,0.20322,0.05596,0.050795,0.139014,0.043378,0.004023,0.031291,0.085285,0.01103,0.226732,0.127606,0.226935,0.057765,0.046941,0.139014,0.043378,0.004023,0.031291,0.085285,0.01103,0.226732,0.127606,0.226935,0.057765,0.046941,0.151249,0.04755,0.003974,0.022186,0.086108,0.010897,0.226316,0.134211,0.20907,0.055849,0.052592,0.139014,0.043378,0.004023,0.031291,0.085285,0.01103,0.226732,0.127606,0.226935,0.057765,0.046941,0.139014,0.043378,0.004023,0.031291,0.085285,0.01103,0.226732,0.127606,0.226935,0.057765,0.046941,0.115385,0.076923,0.0,0.038462,0.076923,0.0,0.076923,0.115385,0.307692,0.115385,0.076923,0.116667,0.066667,0.0,0.1,0.116667,0.033333,0.116667,0.15,0.2,0.083333,0.016667
3,680,529409,20180329,1.036469,1.04187,0.786183,1.039963,0.999095,0.97953,0.977613,1.011694,0.980905,1.004758,1.014253,0.356602,1.452694,6.528415,1.474971,1.224469,2.014623,0.540773,0.681369,1.297655,1.382776,2.224684,1.063223,0.978735,0.735875,0.999767,0.933928,1.201623,0.98193,1.021234,0.9985,0.980483,0.988701,0.823135,1.77996,0.0,1.287186,1.58266,2.240755,0.930979,0.768338,0.808991,1.194834,0.993914,0.144084,0.045194,0.003163,0.032542,0.085208,0.010805,0.221656,0.129098,0.222602,0.05804,0.04761,0.139014,0.043378,0.004023,0.031291,0.085285,0.01103,0.226732,0.127606,0.226935,0.057765,0.046941,0.139014,0.043378,0.004023,0.031291,0.085285,0.01103,0.226732,0.127606,0.226935,0.057765,0.046941,0.147803,0.042455,0.00296,0.031284,0.07965,0.013254,0.222635,0.130315,0.226595,0.056637,0.046411,0.139014,0.043378,0.004023,0.031291,0.085285,0.01103,0.226732,0.127606,0.226935,0.057765,0.046941,0.139014,0.043378,0.004023,0.031291,0.085285,0.01103,0.226732,0.127606,0.226935,0.057765,0.046941,0.102564,0.025641,0.0,0.0,0.051282,0.025641,0.307692,0.102564,0.25641,0.051282,0.076923,0.166667,0.041667,0.0,0.041667,0.041667,0.041667,0.166667,0.083333,0.333333,0.041667,0.041667
4,15,529410,20180329,1.055282,1.217879,1.130785,1.056671,1.008806,1.204303,1.003096,0.933827,0.968688,0.912303,0.946173,0.4519,1.237297,4.896311,1.106229,1.068677,2.092109,0.744849,0.711966,1.255712,1.259026,2.078192,1.014664,1.121716,0.984943,1.068252,0.975299,1.59191,1.007535,0.920288,1.009242,0.929753,0.927737,0.917081,1.575108,0.0,1.298282,1.309135,2.624921,0.882004,0.739517,0.973956,1.076455,0.967345,0.146699,0.052829,0.004549,0.033065,0.086036,0.013284,0.227434,0.119162,0.219829,0.052699,0.044414,0.139014,0.043378,0.004023,0.031291,0.085285,0.01103,0.226732,0.127606,0.226935,0.057765,0.046941,0.139014,0.043378,0.004023,0.031291,0.085285,0.01103,0.226732,0.127606,0.226935,0.057765,0.046941,0.141053,0.048658,0.003962,0.033427,0.083178,0.017559,0.228441,0.117434,0.229032,0.053707,0.043549,0.139014,0.043378,0.004023,0.031291,0.085285,0.01103,0.226732,0.127606,0.226935,0.057765,0.046941,0.139014,0.043378,0.004023,0.031291,0.085285,0.01103,0.226732,0.127606,0.226935,0.057765,0.046941,0.216216,0.027027,0.027027,0.0,0.081081,0.0,0.27027,0.081081,0.297297,0.0,0.0,0.216216,0.027027,0.0,0.054054,0.135135,0.0,0.297297,0.081081,0.135135,0.054054,0.0


In [33]:
# Calibrate the LHB weather factors against reference games at the same venue.
# For every event, each row of l_wfx_df is matched to the (up to) `similar_games`
# rows of park_and_weather_df at the same venue whose predicted-weather value is
# closest to the row's own prediction; their NaN-ignoring means feed the adjusted factor.
for event in events_list:
    # Reference columns (left-handed batter versions) as plain arrays
    hist_venue = park_and_weather_df['venue_id'].values
    hist_weather = park_and_weather_df[f'{event}_pred_weather_l'].values
    hist_batted = park_and_weather_df[f'{event}_pred_batted_l'].values
    hist_actual = park_and_weather_df[f'{event}_l'].values

    batted_means = []
    actual_means = []

    todays_pairs = zip(l_wfx_df['venue_id'].values, l_wfx_df[f'{event}_pred_weather'].values)
    for venue_id, todays_pred in todays_pairs:
        same_venue = hist_venue == venue_id
        venue_weather = hist_weather[same_venue]

        if len(venue_weather) > 0 and not np.isnan(todays_pred):
            # Positions (within the venue subset) of the closest predicted-weather values
            closest = np.argsort(np.abs(venue_weather - todays_pred))[:similar_games]
            batted_means.append(np.nanmean(hist_batted[same_venue][closest]))
            actual_means.append(np.nanmean(hist_actual[same_venue][closest]))
        else:
            # No reference rows for this venue, or no prediction to match on
            batted_means.append(np.nan)
            actual_means.append(np.nan)

    # Store the neighbour averages on l_wfx_df
    l_wfx_df[f'{event}_pred_batted'] = batted_means
    l_wfx_df[event] = actual_means

    # Adjusted factor: mean actual rate over mean batted-ball prediction
    l_wfx_df[f'{event}_wfx_adj'] = l_wfx_df[event] / l_wfx_df[f'{event}_pred_batted']

##### RHB

Merge

In [34]:
# Build the RHB working frame: left-join r_park_latest_df onto the weather rows by venue,
# then cross-join event_averages so its columns are attached to every row
# (assumes event_averages is a single row -- TODO confirm).
r_wfx_df = (
    meteo_df
    .merge(r_park_latest_df, on=['venue_id'], how='left')
    .merge(event_averages, how='cross')
)

Assign venue dummies

In [35]:
# One 0/1 indicator column per name in venue_dummy_list: 1 where the name equals
# "venue_" followed by the row's venue_id, 0 otherwise.
rhb_venue_labels = "venue_" + r_wfx_df["venue_id"].astype(str)
for venue_dummy in venue_dummy_list:
    r_wfx_df[venue_dummy] = rhb_venue_labels.eq(venue_dummy).astype(int)

Assign the batter-is-lefty dummy (0 for right-handed batters)

In [36]:
# Every row in the RHB frame gets the lefty indicator set to 0
r_wfx_df = r_wfx_df.assign(b_L=0)

Scale

In [37]:
# Model inputs for the RHB rows, in wfx_inputs column order, as a NumPy array
X2 = r_wfx_df.loc[:, wfx_inputs].to_numpy()
# Apply the scale_wfx transform (presumably the scaler fitted at training time -- verify)
X2_scaled = scale_wfx.transform(X2)

Predict

In [38]:
# Predict the per-event values for the RHB rows from the scaled inputs
predictions2 = predict_wfx.predict(X2_scaled)

# One column per event, named '<event>_pred_weather'
prediction_df2 = pd.DataFrame(predictions2, columns=events_list)
prediction_df2 = prediction_df2.add_suffix('_pred_weather')

# prediction_df2 already carries a fresh 0..n-1 index, so it is concatenated as-is.
# (Calling reset_index() without drop=True here would add a stray 'index' column to r_wfx_df.)
r_wfx_df = pd.concat([r_wfx_df, prediction_df2], axis=1)

Calculate WFX

In [39]:
# Unadjusted factor per event: predicted-weather value divided by the '<event>_pred_batted' column
for event in events_list:
    rhb_weather_pred = r_wfx_df[f'{event}_pred_weather']
    rhb_batted_pred = r_wfx_df[f'{event}_pred_batted']
    r_wfx_df[f'{event}_wfx_unadj'] = rhb_weather_pred.div(rhb_batted_pred)

Calibrate

In [40]:
# Calibrate the RHB weather factors against reference games at the same venue.
# For every event, each row of r_wfx_df is matched to the (up to) `similar_games`
# rows of park_and_weather_df at the same venue whose predicted-weather value is
# closest to the row's own prediction; their NaN-ignoring means feed the adjusted factor.
for event in events_list:
    # Reference columns (right-handed batter versions) as plain arrays
    hist_venue = park_and_weather_df['venue_id'].values
    hist_weather = park_and_weather_df[f'{event}_pred_weather_r'].values
    hist_batted = park_and_weather_df[f'{event}_pred_batted_r'].values
    hist_actual = park_and_weather_df[f'{event}_r'].values

    batted_means = []
    actual_means = []

    todays_pairs = zip(r_wfx_df['venue_id'].values, r_wfx_df[f'{event}_pred_weather'].values)
    for venue_id, todays_pred in todays_pairs:
        same_venue = hist_venue == venue_id
        venue_weather = hist_weather[same_venue]

        if len(venue_weather) > 0 and not np.isnan(todays_pred):
            # Positions (within the venue subset) of the closest predicted-weather values
            closest = np.argsort(np.abs(venue_weather - todays_pred))[:similar_games]
            batted_means.append(np.nanmean(hist_batted[same_venue][closest]))
            actual_means.append(np.nanmean(hist_actual[same_venue][closest]))
        else:
            # No reference rows for this venue, or no prediction to match on
            batted_means.append(np.nan)
            actual_means.append(np.nan)

    # Store the neighbour averages on r_wfx_df
    r_wfx_df[f'{event}_pred_batted'] = batted_means
    r_wfx_df[event] = actual_means

    # Adjusted factor: mean actual rate over mean batted-ball prediction
    r_wfx_df[f'{event}_wfx_adj'] = r_wfx_df[event] / r_wfx_df[f'{event}_pred_batted']

Combine

In [41]:
# Bring the RHB factor columns onto the LHB frame, one row per (venue_id, game_num).
# The factor columns are named '<event>_wfx_unadj' / '<event>_wfx_adj', so they must be
# selected with a substring test: an endswith("wfx") filter matches none of them, which
# would leave the RHB factors out and the '_l' / '_r' suffixes unapplied.
r_wfx_columns = [col for col in r_wfx_df if "wfx" in col]
wfx_df = pd.merge(
    l_wfx_df,
    r_wfx_df[["venue_id", "game_num"] + r_wfx_columns],
    on=['venue_id', 'game_num'],
    how='left',
    suffixes=("_l", "_r"),  # same-named factor columns become '<name>_l' (LHB) and '<name>_r' (RHB)
)

Rename (game_id is generated in historic wfx code)

In [42]:
# Use 'gamePk' as the game identifier column name (no-op if 'game_id' is absent)
wfx_df = wfx_df.rename(columns={'game_id': 'gamePk'})

Keep relevant columns

In [43]:
# Non-factor columns carried into the output file, grouped by theme
game_info_columns = [
    'gamePk', 'game_datetime', 'game_date', 'date', 'year', 'game_type', 'status',
    'away_team', 'home_team', 'doubleheader', 'game_num', 'venue_id', 'venue_name',
]
venue_info_columns = [
    'location.defaultCoordinates.latitude', 'location.defaultCoordinates.longitude',
    'fieldInfo.leftLine', 'fieldInfo.center', 'fieldInfo.rightLine', 'fieldInfo.leftCenter', 'fieldInfo.rightCenter',
    'location.elevation', 'location.azimuthAngle', 'fieldInfo.roofType', 'active',
]
weather_info_columns = [
    'temperature_2m', 'relative_humidity_2m', 'dew_point_2m', 'surface_pressure',
    'wind_speed_10m', 'wind_direction_10m', 'weather_code', 'precipitation_probability',
    'meteo_x_vect', 'meteo_y_vect', 'weather', 'wind', 'missing_weather',
]
keep_columns = game_info_columns + venue_info_columns + weather_info_columns

Write to CSV

In [44]:
# Output = the kept descriptive columns followed by every column whose name contains "wfx"
output_columns = keep_columns + [col for col in wfx_df if "wfx" in col]
output_path = os.path.join(
    baseball_path,
    "A06. Weather",
    "3. Park and Weather Factors",
    f"Park and Weather Factors {todaysdate}.csv",
)
wfx_df[output_columns].to_csv(output_path, index=False, encoding='iso-8859-1')