# A06. Weather
This scrapes weather data and calculates park and weather factors
- Type: Data
- Run Frequency: Pre-contest
- Sources:
    - Swish Analytics
    - RotoGrinders
    - Ballpark Pal (deprecated) 
- Dates:
    - Created: 9/23/2023
    - Updated: 4/21/2024

##### 1. Swish Analytics

##### Wind

In [1]:
# This reverses winds so that they're named for where they're going, not where they're from. This is so vectors make more sense logically.
def wind_reverser(direction):
    """Flip a compass abbreviation so it names where the wind is blowing to.

    Each uppercase N, S, E and W is exchanged for its opposite letter and the
    result is returned in uppercase. Lowercase letters in the input are not
    flipped; they are only uppercased.
    """
    # Single pass: every cardinal letter maps straight to its opposite
    opposite_letters = str.maketrans("NSEW", "SNWE")
    flipped = direction.translate(opposite_letters)

    return flipped.upper()

In [2]:
# This calculates number of degrees for each direction
def find_degree(direction):
    """Convert a 16-point compass abbreviation to degrees.

    "N" is 0 and each following point clockwise ("NNE", "NE", ...) adds
    22.5 degrees, so "E" is 90, "S" is 180 and "W" is 270.

    Parameters
    ----------
    direction : str
        Compass abbreviation such as "N", "SSW" or "WNW". Leading/trailing
        whitespace and letter case are ignored.

    Returns
    -------
    float
        Degrees in the range [0, 337.5].

    Raises
    ------
    ValueError
        If ``direction`` is not a string or is not one of the 16 points
        (for example a missing value coming out of a failed merge).
    """
    # Ordered clockwise from north; a point's position times 22.5 is its angle
    compass_points = (
        "N", "NNE", "NE", "ENE",
        "E", "ESE", "SE", "SSE",
        "S", "SSW", "SW", "WSW",
        "W", "WNW", "NW", "NNW",
    )

    if not isinstance(direction, str):
        raise ValueError(f"Unknown compass direction: {direction!r}")

    try:
        position = compass_points.index(direction.strip().upper())
    except ValueError:
        # Re-raise with the offending value so the bad row is easy to find
        raise ValueError(f"Unknown compass direction: {direction!r}") from None

    return position * 22.5

In [3]:
# This calculates the x and y vectors given the park's orientation and the wind's direction
def calculate_vectors(row):
    """Split a row's wind speed into two components relative to center field.

    Reads ``row['CF']``, ``row['Direction']`` and ``row['Speed']``. The wind
    direction is first passed through ``wind_reverser`` and the reversed value
    is written back to ``row['Direction']``.

    Returns a tuple ``(x_vect, y_vect)``: the speed multiplied by the sine and
    by the cosine (each rounded to 5 decimals) of the angle between the
    reversed wind direction and the center-field direction. ``y_vect`` equals
    the full speed when the two directions coincide.
    """
    # Heading of center field, in degrees
    cf_heading = find_degree(row['CF'])

    # Reverse the wind direction, store it on the row, then convert to degrees
    row['Direction'] = wind_reverser(row['Direction'])
    wind_heading = find_degree(row['Direction'])

    # Angle of the wind relative to center field, in radians
    relative_angle = math.radians(wind_heading - cf_heading)

    # Unit components are rounded before scaling by the wind speed
    speed = row['Speed']
    across = round(math.sin(relative_angle), 5) * speed
    along = round(math.cos(relative_angle), 5) * speed

    return across, along

##### Scrape

In [4]:
# Scrape Swish Analytics for weather data
def swishanalytics(date):
    """Scrape the Swish Analytics MLB weather page for one date.

    For every weather card on the page, the third column of the card's table
    (the second time period scraped) is kept, matched to a team through the
    module-level ``team_map`` DataFrame, and converted to wind vectors with
    ``calculate_vectors``.

    Parameters
    ----------
    date : str
        Date as "YYYYMMDD"; it is re-formatted to "YYYY-MM-DD" for the URL.

    Returns
    -------
    pandas.DataFrame
        One row per weather card with columns Matchup, home_team, Weather,
        Feels Like, Humidity, Speed, Direction, FANGRAPHSTEAM, CF,
        temperature, x_vect and y_vect.

    Raises
    ------
    RuntimeError
        If the page does not answer with HTTP 200.
    ValueError
        If the page contains no weather tables.
    """
    # Reformat date to fit URL
    date_dash = f"{date[:4]}-{date[4:6]}-{date[6:8]}"

    # Swish Analytics URL
    url = "https://swishanalytics.com/mlb/weather?date=" + date_dash

    # A timeout keeps a stalled connection from hanging the run indefinitely
    response = requests.get(url, timeout=30)

    # Stop here with the status code rather than failing later on missing data
    if response.status_code != 200:
        raise RuntimeError(f"Failed to retrieve the page. Status code: {response.status_code}")

    # Parse the HTML content and find all divs with the class 'weather-card'
    soup = BeautifulSoup(response.text, 'html.parser')
    weather_cards = soup.find_all('div', class_='weather-card')

    # One single-row DataFrame per weather card
    dfs = []

    for weather_card in weather_cards:
        # Extract time and location information
        time_info = weather_card.find('small', class_='text-muted')
        location_info = weather_card.find('h4', class_='lato inline vert-mid bold')
        game_time = time_info.text.strip() if time_info else None
        location = location_info.text.strip() if location_info else None

        # Cards without a table carry no weather data
        table = weather_card.find('table', class_='table-bordered')
        if not table:
            continue

        # Extract table data into a list of lists
        data = [
            [cell.text.strip() for cell in table_row.find_all(['th', 'td'])]
            for table_row in table.find_all('tr')
        ]

        # Convert data into a pandas DataFrame, using the first row as the header
        df = pd.DataFrame(data)
        df.columns = df.iloc[0]
        # Copy so the column assignments below act on an independent frame
        df = df[1:].copy()

        # Add time and location as additional columns
        df['Time'] = game_time
        df['Location'] = location

        # Create dataframe from the second time period scraped
        # (its columns are the table's row labels 1..n)
        daily_weather_df = pd.DataFrame(df.iloc[:, 2]).T

        # The team name used for the team_map lookup is the text after the "@"
        daily_weather_df['Matchup'] = df['Location'][1]
        daily_weather_df['FANGRAPHSTEAM'] = daily_weather_df['Matchup'].str.split("@", expand=True).iloc[:, 1]
        daily_weather_df['FANGRAPHSTEAM'] = daily_weather_df['FANGRAPHSTEAM'].str.replace("\xa0\xa0", "")

        dfs.append(daily_weather_df)

    if not dfs:
        raise ValueError(f"No weather tables found at {url}")

    # Append together dataframes
    df = pd.concat(dfs, axis=0)

    # Identify CF (team_map is read from module scope)
    df = df.merge(team_map[['FANGRAPHSTEAM', 'BBREFTEAM', 'CF']], on='FANGRAPHSTEAM', how='left')

    # Rename columns
    df.rename(columns={1:'Weather', 2:'temperature', 3:'Feels Like', 4:'Humidity', 5:'Speed', 6:'Direction', 'BBREFTEAM': 'home_team'}, inplace=True)

    # Remove units so the values can be used numerically
    df['Speed'] = df['Speed'].str.replace(" mph", "").astype(float)
    df['temperature'] = df['temperature'].str.replace('°', '')
    df['Feels Like'] = df['Feels Like'].str.replace('°', '')

    # Apply the calculate_vectors function row-wise and assign results to new columns
    df[['x_vect', 'y_vect']] = df.apply(calculate_vectors, axis=1, result_type='expand')

    return df[['Matchup', 'home_team', 'Weather', 'Feels Like', 'Humidity', 'Speed', 'Direction', 'FANGRAPHSTEAM', 'CF', 'temperature', 'x_vect', 'y_vect']]

##### 2. RotoGrinders

In [5]:
def rotogrinders(date, team_map):
    """Scrape the RotoGrinders MLB weather blurbs into a DataFrame.

    Parameters
    ----------
    date
        Value written to the ``date`` column of the result; the URL requested
        does not depend on it.
    team_map : pandas.DataFrame
        Must contain ``ROTOGRINDERSTEAM`` and ``DKTEAM`` columns, used to
        translate the scraped team names.

    Returns
    -------
    pandas.DataFrame or str
        On HTTP 200, a frame with columns Tag, Tag2, Matchup, Away, Home,
        Description and date. Otherwise a string describing the failed
        status code.
    """
    url = "https://rotogrinders.com/weather/mlb"
    response = requests.get(url)

    # Anything other than a 200 is reported back as a message string
    if response.status_code != 200:
        return "Failed to retrieve data. Response status code: {}".format(response.status_code)

    soup = BeautifulSoup(response.text, "html.parser")

    records = []
    for blurb in soup.find_all("li", class_="weather-blurb"):
        # Up to two colored tags per blurb; missing ones become None
        colored_spans = blurb.find_all("span", class_=["green", "yellow", "orange", "red"])
        tag_texts = [span.text.strip() for span in colored_spans[:2]]
        tag_texts += [None] * (2 - len(tag_texts))
        first_tag, second_tag = tag_texts

        # The bold span holds the matchup; the span after it holds the description
        bold_span = blurb.find("span", class_="bold")
        if bold_span:
            matchup_text = bold_span.text.strip()
            following_span = bold_span.find_next_sibling("span")
            description_text = following_span.text.strip() if following_span else None
        else:
            matchup_text = None
            description_text = None

        records.append({"Tag": first_tag, "Tag2": second_tag, "Matchup": matchup_text, "Description": description_text})

    weather_df = pd.DataFrame(records)

    # Split "AWAY @ HOME" into its two sides
    weather_df[['away', 'home']] = weather_df['Matchup'].str.split(" @ ", expand=True)

    # Add in DK team abbreviations, first for the away side and then for the home side
    abbreviations = team_map[['ROTOGRINDERSTEAM', 'DKTEAM']]
    for team_side in ('away', 'home'):
        weather_df = weather_df.merge(
            abbreviations,
            left_on=[team_side],
            right_on=['ROTOGRINDERSTEAM'],
            how='left',
            suffixes=("", f"_{team_side}"),
        )

    # The away merge leaves DKTEAM unsuffixed; the home merge adds DKTEAM_home
    weather_df = weather_df[['Tag', 'Tag2', 'Matchup', 'DKTEAM', 'DKTEAM_home', 'Description']]
    weather_df = weather_df.rename(columns={'DKTEAM':'Away', 'DKTEAM_home': 'Home'})

    # Add the date column to the DataFrame
    weather_df['date'] = date

    return weather_df

##### 3. Ballpark Pal

This is not currently supported after Ballpark Pal switched to subscription-only in 2024.

In [6]:
# Scrape Ballpark Pal for weather factors
# Note: these factor in park as well and are relative to league average conditions
def ballparkpal(date):
    """Scrape the Ballpark Pal park-factors table for one date.

    Parameters
    ----------
    date : str
        Date as "YYYYMMDD"; it is re-formatted to "YYYY-MM-DD" for the URL
        and written unchanged to the ``date`` column of the result.

    Returns
    -------
    pandas.DataFrame
        Columns Park, Time, Away, Home, HR, 2B/3B, 1B, Runs (percentages
        converted to decimals), GameNum (running count of rows per Park,
        starting at 1) and date.

    Raises
    ------
    RuntimeError
        If the page does not answer with HTTP 200.
    ValueError
        If no table is found at the expected XPath.
    """
    headers = {
    'Accept': 'text/html',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    # Reformat date to fit URL
    date_dash = f"{date[:4]}-{date[4:6]}-{date[6:8]}"

    url = f'https://ballparkpal.com/ParkFactors.php?date={date_dash}'

    # A timeout keeps a stalled connection from hanging the run indefinitely
    response = requests.get(url, headers=headers, timeout=30)

    # Without a page or a table there is no frame to return, so raise
    if response.status_code != 200:
        raise RuntimeError(f"Request failed with status code: {response.status_code}")

    tree = html.fromstring(response.text)
    table_elements = tree.xpath('/html/body/div[1]/table')
    if not table_elements:
        raise ValueError("No table found at the specified XPath.")

    # Extract table rows
    rows = table_elements[0].xpath('.//tr')

    # Extract table header (assuming it's the first row)
    header = [th.text_content().strip() for th in rows[0].xpath('.//th')]

    # Extract table data
    data = [
        [td.text_content().strip() for td in table_row.xpath('.//td')]
        for table_row in rows[1:]
    ]

    # Create DataFrame
    df = pd.DataFrame(data, columns=header)

    # Extracting the parts of the 'Game' column: text before the first
    # h:mm time, the time itself, the text between the time and "@",
    # and the text after "@ "
    df['Park'] = df['Game'].str.extract(r'^(.*?)\s\d{1,2}:\d{2}')
    df['Time'] = df['Game'].str.extract(r'(\d{1,2}:\d{2})')
    df['Away'] = df['Game'].str.extract(r'\d{1,2}:\d{2}(.*?)@')
    df['Away'] = df['Away'].str.strip()
    df['Home'] = df['Game'].str.extract(r'@ (.*)$')

    # Converting percentage columns to decimals
    cols_to_convert = ['HR', '2B/3B', '1B', 'Runs']
    for col in cols_to_convert:
        df[col] = df[col].str.rstrip('%').astype(float) / 100

    # Keep only the parsed columns ('Game' is no longer needed); copy so the
    # assignments below act on an independent frame
    df = df[['Park', 'Time', 'Away', 'Home', 'HR', '2B/3B', '1B', 'Runs']].copy()

    # Number repeated rows for the same park in page order
    df['GameNum'] = df.groupby('Park').cumcount() + 1

    df['date'] = date

    return df

##### 4. Park and Weather Factors

In [7]:
# Run the A02 MLB API notebook so that create_box (called by park_and_weather_factors below) is defined in this session
# NOTE(review): absolute, user-specific Windows path — this cell only resolves on a machine with this directory layout
%run "C:\Users\james\Documents\MLB\Code\A02. MLB API.ipynb"

In [8]:
def park_and_weather_factors(game_df, multiplier_dataset, period_avg_df, swish_df=None, date=None, overwrite_year=None, historic=False):
    """Build per-game weather rows and event multipliers for one date.

    Weather for each game comes from ``create_box``; when that reports the
    weather as missing, the Swish Analytics values in ``swish_df`` are used
    instead. Multipliers are either taken from ``multiplier_dataset``
    (``historic=True``) or predicted from the ``{event}_{side}_model`` objects
    found in module globals.

    Relies on names defined outside this function: ``create_box``,
    ``clean_weather``, ``team_map``, ``events_list``, ``sm`` and the models.

    Parameters
    ----------
    game_df : pandas.DataFrame
        Games with at least the columns date, game_datetime, game_id,
        venue_id, away_team, home_team and year.
    multiplier_dataset : pandas.DataFrame
        If ``historic``, supplies the columns whose names contain "mult",
        keyed by game. Otherwise supplies venue_id plus the columns whose
        names contain "league" or "factor"; the last row per venue is used.
    period_avg_df : pandas.DataFrame
        One column per event; the value at row label 0 is used as the
        event's average rate.
    swish_df : pandas.DataFrame, optional
        Output of ``swishanalytics`` (needs home_team, temperature, x_vect,
        y_vect).
    date : str, optional
        Date to extract from ``game_df``.
    overwrite_year : optional
        If given, replaces the ``year`` column of the result.
    historic : bool, default False
        Selects the multiplier source described above.

    Returns
    -------
    pandas.DataFrame
        One row per game with identifying columns, weather columns and
        ``{event}_mult_l`` / ``{event}_mult_r`` for every event in
        ``events_list`` (missing multipliers filled with 1).

    Raises
    ------
    ValueError
        If a game has no box-score weather and ``swish_df`` was not supplied.

    Notes
    -----
    Two inputs are modified in place, exactly as callers of earlier versions
    saw: ``swish_df`` gains a ``game_num`` column, and in historic mode the
    ``gamePk`` column of ``multiplier_dataset`` is renamed to ``game_id``.
    """
    print(date)
    # Extract daily games
    daily_game_df = game_df.query(f'date == "{date}"').reset_index(drop=True)
    daily_game_df.sort_values('game_datetime', inplace=True)
    daily_game_df['game_num'] = daily_game_df.groupby('home_team').cumcount() + 1

    # If we have Swish weather data, merge it in
    has_swish = swish_df is not None
    if has_swish:
        # Add game num so repeated home teams (e.g. doubleheaders) pair up in order
        swish_df['game_num'] = swish_df.groupby('home_team').cumcount() + 1
        daily_game_df = daily_game_df.merge(swish_df, on=['home_team', 'game_num'], how='left')

    daily_weather_list = []
    # Loop over games (values are looked up by index label 0..n-1)
    for i in range(len(daily_game_df)):
        # Extract relevant information
        game_id = daily_game_df['game_id'][i]
        venue_id = daily_game_df['venue_id'][i]
        away_team = daily_game_df['away_team'][i]
        home_team = daily_game_df['home_team'][i]
        game_datetime = daily_game_df['game_datetime'][i]
        game_date = daily_game_df['date'][i]
        year = daily_game_df['year'][i]

        # If we have Swish Analytics data, extract it
        if has_swish:
            temperature = daily_game_df['temperature'][i]
            x_vect = float(daily_game_df['x_vect'][i])
            y_vect = float(daily_game_df['y_vect'][i])

        # Extract weather from box score
        weather, wind, park, full_date, missing_weather = create_box(game_id)

        # Create game weather dataframe
        game_weather_dictionary = {
            'game_id': game_id,
            'away_team': away_team,
            'home_team': home_team,
            'venue_id': venue_id,
            'park': park,
            'datetime': game_datetime,
            'date': game_date,
            'year': year,
            'weather': weather,
            'wind': wind
        }
        game_weather_df = pd.DataFrame(game_weather_dictionary, index=[0])
        game_weather_df = clean_weather(game_weather_df)

        # If we do not have MLB Stats API data yet, use Swish Analytics
        # (equality comparison kept: create_box's return type is not visible here)
        if missing_weather == True:
            if not has_swish:
                raise ValueError(
                    f"{away_team}@{home_team} has no box score weather and no Swish Analytics data was supplied."
                )
            print(f"{away_team}@{home_team} uses Swish Analytics weather.")
            game_weather_df['temperature'] = float(temperature)
            game_weather_df['x_vect'] = x_vect
            game_weather_df['y_vect'] = y_vect
            game_weather_df['windDirection'] = "Predicted"

        daily_weather_list.append(game_weather_df)

    # Create day weather dataframe
    daily_weather_df = pd.concat(daily_weather_list, axis=0).reset_index(drop=True)

    # Overwrite year (good option if early in season, but may want to develop a rolling method)
    if overwrite_year is not None:
        daily_weather_df['year'] = overwrite_year

    # Add park dummies
    active_venues = [str(venue) for venue in team_map['VENUE_ID']]
    venue_id_text = daily_weather_df['venue_id'].astype(str)
    for venue in active_venues:
        daily_weather_df[f'venue_{venue}'] = (venue_id_text == venue).astype(int)
    active_venue_columns = [f"venue_{venue}" for venue in active_venues]

    # Create Weather x Park interaction terms.
    # They do not depend on batter side, so they are built once.
    weather_interactions = []
    for venue_column in active_venue_columns:
        for weather_column in ['x_vect', 'y_vect', 'temperature']:
            interaction_name = venue_column + '_' + weather_column
            daily_weather_df[interaction_name] = daily_weather_df[venue_column] * daily_weather_df[weather_column]
            weather_interactions.append(interaction_name)

    # If it's historic,
    if historic:
        # We already have the multiplier
        multiplier_columns = [column for column in multiplier_dataset.columns if "mult" in column]
        multiplier_dataset.rename(columns={'gamePk':'game_id'}, inplace=True)

        # So keep them
        daily_weather_df = pd.merge(daily_weather_df, multiplier_dataset[['game_id'] + multiplier_columns], on=['game_id'], how='left')

    # If it's the day of,
    else:
        # We need to calculate it from the league averages, park factors, and league environment (period averages)
        league_avg_columns = [column for column in multiplier_dataset.columns if "league" in column]
        factor_columns = [column for column in multiplier_dataset.columns if "factor" in column]

        keep_columns = ['venue_id'] + league_avg_columns + factor_columns

        # Keep most recent game at each venue
        last_game_df = multiplier_dataset.drop_duplicates('venue_id', keep='last')[keep_columns]

        # Merge that onto the weather
        daily_weather_df = pd.merge(daily_weather_df, last_game_df, on=['venue_id'], how='left')

        ### Calculate multipliers
        # Loop over events
        for event in events_list:
            # Period average for this event, shared by both sides
            event_average = period_avg_df[event][0]

            # Loop over sides
            for side in ['l', 'r']:
                # Select model (looked up by name in module globals)
                model = globals().get(f'{event}_{side}_model')

                # Assign long-term average to average of team's stats for predicting a team-agnostic rate
                daily_weather_df[f'{event}_b_long'] = event_average
                daily_weather_df[f'{event}_p_long'] = event_average

                daily_weather_df[f'{event}_league'] = daily_weather_df[f'{event}_league_{side}']
                daily_weather_df[f'{event}_factor'] = daily_weather_df[f'{event}_factor_{side}']

                # Model prediction inputs
                X = daily_weather_df[[f'{event}_b_long', f'{event}_p_long', f'{event}_league', f'{event}_factor'] + weather_interactions]
                X = sm.add_constant(X, has_constant='add')

                # Predict probability of event
                daily_weather_df[f'predicted_{event}'] = model.predict(X)
                # Calculate multiplier
                daily_weather_df[f'{event}_mult_{side}'] = daily_weather_df[f'predicted_{event}'] / event_average

    # Keep relevant variables
    multiplier_names = ([f'{event}_mult_l' for event in events_list] +
                        [f'{event}_mult_r' for event in events_list])
    daily_weather_df = daily_weather_df[['game_id', 'away_team', 'home_team', 'venue_id', 'park', 'datetime', 'date', 'year',
                                         'weather', 'wind', 'temperature', 'windSpeed', 'windDirection', 'x_vect', 'y_vect'] +
                                         multiplier_names].copy()

    # Fill missings with 1.
    # Assign the filled column back: an in-place fillna on a selected column
    # is chained assignment and does not reliably update the frame.
    for multiplier_name in multiplier_names:
        daily_weather_df[multiplier_name] = daily_weather_df[multiplier_name].fillna(1)

    # # Set park factors to 1 if venue_id is not among active parks
    # # Check if venue_id is not present in team_map['VENUE_ID']
    # not_in_team_map = ~daily_weather_df['venue_id'].isin(team_map['VENUE_ID'].astype(int))

    # # Set values to 1 for columns in event_factors_l and event_factors_r where the condition is met
    # for column in event_factors_l + event_factors_r:
    #     # Set values to 1 where the condition is met
    #     daily_weather_df.loc[not_in_team_map, column] = 1

    return daily_weather_df

In [9]:
# %run "C:\Users\james\Documents\MLB\Code\U1. Imports.ipynb"
# %run "C:\Users\james\Documents\MLB\Code\U2. Utilities.ipynb"
# %run "C:\Users\james\Documents\MLB\Code\U3. Classes.ipynb"

In [13]:
# start_date = "20220407"
# end_date = todaysdate
# game_df = read_and_save_games(team_map, generate=True)
# game_df = game_df[(game_df['date'] >= start_date) & (game_df['date'] <= end_date)].reset_index(drop=True)

In [14]:
# multiplier_dataset = pd.read_csv(os.path.join(baseball_path, "Multiplier Dataset.csv"))
# period_avg_df = pd.read_csv(os.path.join(baseball_path, "Period Averages.csv"))

In [15]:
# for date in game_df['date'].unique():
#     daily_weather_df = park_and_weather_factors(game_df, multiplier_dataset, period_avg_df, swish_df=None, date=date, overwrite_year=None, historic=True)
#     daily_weather_df.to_csv(os.path.join(baseball_path, "A06. Weather", "4. Park and Weather Factors", f"Park and Weather Factors {date}.csv"), index=False)