# 08. Weather
Source: <br>
1. Swish Analytics <br>
2. RotoGrinders <br>
3. Ballpark Pal <br>

Description: This scrapes Swish Analytics and RotoGrinders for daily weather data, and Ballpark Pal for weather-related park factors <br>
Historic data can be found using the stats API <br>

### Wind

In [4]:
# This reverses winds so that they're named for where they're going, not where they're from. This is so vectors make more sense logically.
def wind_reverser(direction):
    """Return the opposite compass heading, in upper case.

    Swaps N with S and E with W in a single pass, so a wind reported by the
    direction it blows *from* (e.g. "NNE") becomes the direction it blows
    *toward* ("SSW"). Only upper-case letters are swapped; everything else
    is simply upper-cased.
    """
    # One translation table performs all four swaps at once, so a letter that
    # has already been flipped can never be flipped back.
    opposite = str.maketrans("NSEW", "SNWE")
    return direction.translate(opposite).upper()

In [5]:
# This calculates number of degrees for each direction
def find_degree(direction):
    """Convert a 16-point compass direction into degrees.

    "N" is 0 and each following point clockwise ("NNE", "NE", ... "NNW")
    adds 22.5 degrees.

    Args:
        direction: Compass abbreviation such as "N", "ESE" or "NNW".
            Strings are matched after stripping surrounding whitespace and
            upper-casing.

    Returns:
        float: The bearing in degrees, a multiple of 22.5 in [0, 337.5].

    Raises:
        ValueError: If ``direction`` is not one of the 16 compass points
            (previously this surfaced as an UnboundLocalError).
    """
    # Ordered clockwise from north; the position in this tuple is the number
    # of 22.5-degree steps away from north.
    compass_points = (
        "N", "NNE", "NE", "ENE",
        "E", "ESE", "SE", "SSE",
        "S", "SSW", "SW", "WSW",
        "W", "WNW", "NW", "NNW",
    )

    key = direction.strip().upper() if isinstance(direction, str) else direction

    try:
        steps = compass_points.index(key)
    except ValueError:
        raise ValueError(f"Unknown compass direction: {direction!r}") from None

    return steps * 22.5

In [6]:
# This calculates the x and y vectors given the park's orientation and the wind's direction
def calculate_vectors(row):
    """Split the wind into components relative to the park's center field.

    Args:
        row: Mapping/Series with 'CF' (compass direction of center field),
            'Direction' (compass direction the wind is reported from) and
            'Speed' (numeric wind speed). 'Direction' is overwritten with
            the reversed heading (where the wind is blowing toward).

    Returns:
        tuple: ``(x_vect, y_vect)`` where ``y_vect`` is the speed component
        along the center-field direction (cosine of the angle between wind
        heading and center field) and ``x_vect`` is the perpendicular
        component (sine of that angle).
    """
    # Bearing of center field, in degrees
    cf_bearing = find_degree(row['CF'])

    # Flip the wind so it is described by where it is heading, then take its bearing
    row['Direction'] = wind_reverser(row['Direction'])
    wind_bearing = find_degree(row['Direction'])

    # Angle of the wind measured from the center-field direction
    theta = math.radians(wind_bearing - cf_bearing)

    # Unit components are rounded before scaling so that values such as
    # cos(90 degrees) become exactly 0 rather than a tiny float residue
    sideways = round(math.sin(theta), 5) * row['Speed']
    toward_cf = round(math.cos(theta), 5) * row['Speed']

    return sideways, toward_cf

### Swish Analytics

In [7]:
# Scrape Swish Analytics for weather data
def scrape_swishanalytics(team_map):
    """Scrape today's Swish Analytics weather page and write the daily weather file.

    The page for ``todaysdate_dash`` is fetched, the matchup headings are
    paired with the page's weather tables, and the "Plus1" column (the
    second of each table's last five columns) is used as the temperature,
    wind speed and wind direction for the game. Wind vectors relative to
    each park's center field are computed with ``calculate_vectors`` and
    one dummy column per venue is added. The result is written to
    ``Daily_Weather_{todaysdate}.xlsx`` under ``baseball_path`` and returned.

    Relies on names defined elsewhere in the project: ``todaysdate``,
    ``todaysdate_dash`` and ``baseball_path``.

    Args:
        team_map: DataFrame containing at least the columns 'FANGRAPHSTEAM',
            'BBREFTEAM', 'FULLNAME' and 'VENUE_ID'.

    Returns:
        pandas.DataFrame: One row per matchup with the weather variables,
        wind vectors and venue dummies that were written to the Excel file.

    Raises:
        ValueError: If no matchup headings are found on the page.
    """
    # Local import keeps this block self-contained; read_html is handed a
    # file-like object because recent pandas deprecates raw HTML strings.
    from io import StringIO

    team_map = team_map[['FANGRAPHSTEAM', 'BBREFTEAM', 'FULLNAME', 'VENUE_ID']]

    # Swish Analytics URL
    url = "https://swishanalytics.com/mlb/weather?date=" + todaysdate_dash

    # Browser header
    hdr = {'User-Agent': 'Mozilla/5.0'}

    # Request the page and parse it; the context manager closes the connection
    req = urllib.request.Request(url, headers=hdr)
    with urlopen(req) as response:
        soup = BeautifulSoup(response, "html.parser")

    # Create a list of matchups in the order of the weather tables
    matchup_list = []
    for t in soup.find_all(text=True):
        if "\xa0\xa0@\xa0\xa0" in t and t.parent.name != "small":
            matchup_list.append(t.parent)

    # Clean list of matchups: strip the heading markup and the non-breaking spaces
    matchup_list_clean = []
    for matchup in matchup_list:
        matchup_clean = str(matchup)
        matchup_clean = matchup_clean.replace('<h4 class="lato inline vert-mid bold">\xa0\xa0', "")
        matchup_clean = matchup_clean.replace('\xa0\xa0', "")
        matchup_clean = matchup_clean.replace('</h4>', "")
        matchup_list_clean.append(matchup_clean)

    if not matchup_list_clean:
        raise ValueError(f"No matchups found on {url}")

    # Request data and parse every table on the page once
    r = requests.get(url, headers=hdr)
    tables = pd.read_html(StringIO(r.text))

    # Collect one weather table per matchup
    frames = []
    for i, matchup in enumerate(matchup_list_clean):
        # The weather table for matchup i is taken from position 3 + 2*i
        df = tables[3 + (i * 2)]

        # Rename the last five columns so they'll be consistent when the
        # tables are stacked (they're usually hours)
        num_col = len(df.columns)
        df = df.rename(columns={
            df.columns[num_col-5]: "Start",
            df.columns[num_col-4]: "Plus1",
            df.columns[num_col-3]: "Plus2",
            df.columns[num_col-2]: "Plus3",
            df.columns[num_col-1]: "Plus4",
        })

        # Create a column with the matchup
        df['Matchup'] = matchup

        frames.append(df)

    df_weather = pd.concat(frames)

    # Clean up a bit: drop the unit suffixes so the values can be made numeric
    df_weather = df_weather.reset_index()
    for col in ('Start', 'Plus1'):
        df_weather[col] = df_weather[col].str.replace(u"°", "")
        df_weather[col] = df_weather[col].str.replace(" mph", "")

    # Create temperature, wind speed, and wind direction dataframes,
    # keeping only the relevant variables and labelling them appropriately
    row_label = df_weather['Unnamed: 0']
    df_temp = df_weather.loc[row_label == "Temp", ['Matchup', 'Plus1']].rename(columns={'Plus1': 'TEMP_PARK_CT'})
    df_speed = df_weather.loc[row_label == "Wind Speed", ['Matchup', 'Plus1']].rename(columns={'Plus1': 'Speed'})
    df_dir = df_weather.loc[row_label == "Wind Dir", ['Matchup', 'Plus1']].rename(columns={'Plus1': 'Direction'})

    # Merge them all together to get the weather data
    df_weather_inputs = df_temp.merge(df_speed, on='Matchup', how='inner')
    df_weather_inputs = df_weather_inputs.merge(df_dir, on='Matchup', how='inner')

    # Choose the home team (the part of the matchup after the "@")
    df_weather_inputs['FANGRAPHSTEAM'] = df_weather_inputs['Matchup'].str.split("@").str[1]
    df_weather_inputs = pd.merge(df_weather_inputs, team_map, on='FANGRAPHSTEAM', how='left')
    df_weather_inputs.rename(columns={'BBREFTEAM': 'Team'}, inplace=True)

    # Convert to numeric
    df_weather_inputs['Speed'] = df_weather_inputs['Speed'].astype('float')
    df_weather_inputs['TEMP_PARK_CT'] = df_weather_inputs['TEMP_PARK_CT'].astype('int')

    # Read in ballpark orientations (which direction center field is in relation to home plate)
    orientations = pd.read_excel(os.path.join(baseball_path, "Utilities", "Park Orientations.xlsx"))

    # Merge with weather data
    weather = orientations.merge(df_weather_inputs, on='Team', how='inner')

    # Calculate wind vectors
    weather[['x_vect', 'y_vect']] = weather.apply(calculate_vectors, axis=1, result_type='expand')

    # Only keep second game of a double header
    weather = weather.drop_duplicates(subset='Matchup', keep='last')

    # Read in list of parks
    all_parks = pd.read_csv(os.path.join(baseball_path, "Utilities", "All Parks.csv"))
    # Create dummies for each park (set = 0)
    venue_dummies = pd.get_dummies(all_parks['venue_id'], prefix='venue')
    for col in venue_dummies.columns:
        weather[col] = 0
    # Loop over VENUE_ID, set venue_ dummy = 1 if it corresponds to VENUE_ID
    for venue in weather['VENUE_ID'].unique().tolist():
        venue_dummy = "venue_" + str(venue)
        weather[venue_dummy] = np.where(weather['VENUE_ID'] == venue, 1, 0)

    # Create variables
    # More variables exist in backtest data (from API) that aren't needed when scraping day-of
    # NOTE(review): this overwrites the scraped wind speed with "Missing", and
    # 'Speed' is selected twice below, so the output carries two identical
    # 'Speed' columns - confirm this is the layout downstream readers expect.
    weather['game_date'] = todaysdate
    weather['venue_id'] = weather['VENUE_ID']
    weather['BBREFTEAM'] = weather['Team']
    weather['weather'] = "Missing"
    weather['Speed'] = "Missing"
    weather['CF_angle'] = "Missing"
    weather['wind_angle'] = "Missing"
    weather['angle'] = "Missing"

    # Keep relevant variables
    weather = weather[['venue_id', 'game_date', 'weather', 'Speed', 'BBREFTEAM', 'CF',
             'TEMP_PARK_CT', 'Speed', 'CF_angle', 'wind_angle', 'angle', 'x_vect',
             'y_vect',
             'venue_1',    'venue_2',    'venue_3',    'venue_4',    'venue_5', 'venue_7',
             'venue_10',   'venue_12',   'venue_13',   'venue_14',   'venue_15',
             'venue_16',   'venue_17',   'venue_19',   'venue_22',   'venue_31',
             'venue_32',   'venue_680',  'venue_2392', 'venue_2394', 'venue_2395',
             'venue_2535', 'venue_2536', 'venue_2602', 'venue_2680', 'venue_2681',
             'venue_2701', 'venue_2735', 'venue_2756', 'venue_2889', 'venue_3289',
             'venue_3309', 'venue_3312', 'venue_3313', 'venue_4169', 'venue_4705',
             'venue_5010', 'venue_5325', 'venue_5365', 'venue_5381', 'venue_5445']]

    # Create daily weather file
    weather.to_excel(os.path.join(baseball_path, "8. Weather", "A. Swish Analytics", f"Daily_Weather_{todaysdate}.xlsx"), index=False)

    return weather

### RotoGrinders Weather Preview

In [8]:
def scrape_rotogrinders():
    """Scrape the RotoGrinders MLB weather preview into a DataFrame and CSV.

    Each ``<li>`` of the page's ``ul`` with class "lst data" becomes one row
    holding its first colored tag, its optional second tag, the matchup text
    and the description. The table is written to
    ``RotoGrinders {todaysdate}.csv`` under ``baseball_path`` and returned.

    Relies on ``todaysdate`` and ``baseball_path`` being defined elsewhere in
    the project.

    Returns:
        pandas.DataFrame: Columns "Tag", "Tag2", "Matchup", "Description".

    Raises:
        ValueError: If the weather list element is not present on the page.
    """
    # URL of the web page containing the table
    url = "https://rotogrinders.com/weather/mlb"

    # Send a GET request to the URL and parse the returned HTML
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    ul_element = soup.find("ul", class_="lst data")
    if ul_element is None:
        raise ValueError(f"Weather list ('ul' with class 'lst data') not found on {url}")

    tag_classes = ["green tag", "yellow tag", "orange tag", "red tag"]

    # Collect one record per list item and build the DataFrame once at the end
    # (DataFrame.append was removed in pandas 2.0 and copies on every call)
    records = []
    for li_element in ul_element.find_all("li"):
        # Find all span elements with tag class
        tag_elements = li_element.find_all("span", class_=tag_classes)

        # First tag is always used; the second is None when absent
        tag = tag_elements[0].text.strip()
        tag2 = tag_elements[1].text.strip() if len(tag_elements) > 1 else None

        # Extract the matchup (whitespace removed) and description
        matchup = li_element.find("span", class_="game").text.strip().replace(" ", "").replace("\n", "")
        description = li_element.find("span", class_="description").text.strip().replace(" - ", "")

        records.append({"Tag": tag, "Tag2": tag2, "Matchup": matchup, "Description": description})

    df = pd.DataFrame(records, columns=["Tag", "Tag2", "Matchup", "Description"])

    df.to_csv(os.path.join(baseball_path, "8. Weather", "B. RotoGrinders", "RotoGrinders {}.csv").format(todaysdate))

    return df

### Ballpark Pal

In [14]:
# Scrape Ballpark Pal for weather factors
# Note: these factor in park as well and are relative to league average conditions
def scrape_ballparkpal(date_dash):
    """Scrape Ballpark Pal park factors for one date into a DataFrame and CSV.

    The first table under ``/html/body/div[1]`` is read, its 'Game' column is
    split into park, start time, away team and home team, and the percentage
    columns are converted to decimals. The result is written to
    ``Ballpark Pal {date}.csv`` (date without dashes) under ``baseball_path``
    and returned.

    Relies on ``baseball_path`` being defined elsewhere in the project.

    Args:
        date_dash: Date string with dashes, inserted into the URL as the
            ``date`` query parameter (e.g. "2022-04-07").

    Returns:
        pandas.DataFrame: Columns 'Park', 'Time', 'Away', 'Home', 'HR',
        '2B/3B', '1B', 'Runs' and 'GameNum'.

    Raises:
        RuntimeError: If the request does not return status 200 or the
            table is not found (previously this printed a message and then
            crashed with an UnboundLocalError when saving).
    """
    headers = {
    'Accept': 'text/html',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    url = f'https://ballparkpal.com/ParkFactors.php?date={date_dash}'

    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        raise RuntimeError(f"Request failed with status code: {response.status_code}")

    tree = html.fromstring(response.text)
    table_elements = tree.xpath('/html/body/div[1]/table')
    if not table_elements:
        raise RuntimeError("No table found at the specified XPath.")

    # Extract table rows
    rows = table_elements[0].xpath('.//tr')

    # Extract table header (assuming it's the first row)
    header = [th.text_content().strip() for th in rows[0].xpath('.//th')]

    # Extract table data
    data = [
        [td.text_content().strip() for td in row.xpath('.//td')]
        for row in rows[1:]
    ]

    # Create DataFrame
    df = pd.DataFrame(data, columns=header)

    # Extracting the parts of the 'Game' column:
    # "<park> <h:mm> <away> @ <home>"
    df['Park'] = df['Game'].str.extract(r'^(.*?)\s\d{1,2}:\d{2}')
    df['Time'] = df['Game'].str.extract(r'(\d{1,2}:\d{2})')
    df['Away'] = df['Game'].str.extract(r'\d{1,2}:\d{2}(.*?)@')
    df['Away'] = df['Away'].str.strip()
    df['Home'] = df['Game'].str.extract(r'@ (.*)$')

    # Converting percentage columns to decimals
    cols_to_convert = ['HR', '2B/3B', '1B', 'Runs']
    for col in cols_to_convert:
        df[col] = df[col].str.rstrip('%').astype(float) / 100

    # Keep only the parsed columns (this also drops 'Game'); copy so the
    # new column below is added to an independent frame
    df = df[['Park', 'Time', 'Away', 'Home', 'HR', '2B/3B', '1B', 'Runs']].copy()

    # Number the games at each park in page order (1 for the first, 2 for a second, ...)
    df['GameNum'] = df.groupby('Park').cumcount() + 1

    date = date_dash.replace('-', '')

    df.to_csv(os.path.join(baseball_path, "8. Weather", "C. Ballpark Pal", f"Ballpark Pal {date}.csv"), index=False)

    return df


In [22]:
# source_dir = r'C:\Users\james\Documents\MLB\Data2\8. Weather\A. Swish Analytics'
# target_dir = r'C:\Users\james\Documents\MLB\Data2\8. Weather\C. Ballpark Pal'

# # Get a list of file names in the source directory
# file_list = os.listdir(source_dir)

# # Regular expression pattern to match the date in the file name
# date_pattern = r'Daily_Weather_(\d{4})(\d{2})(\d{2}).xlsx'

# for file_name in file_list:
#     # Extract the date from the file name using regex
#     match = re.match(date_pattern, file_name)
#     if match:
#         year, month, day = match.groups()
#         date_dash = f"{year}-{month}-{day}"
#         date_plain = date_dash.replace("-", "")
#         print(date_dash)
        
#         # Check if a file with the same date exists in the target directory
#         target_file_path = os.path.join(target_dir, f"Ballpark Pal {date_plain}.csv")
#         if not os.path.exists(target_file_path):
#             scrape_ballparkpal(date_dash)
#             time.sleep(15)
#         else:
#             print("Already done")


2022-04-07
Already done
2022-04-08
Already done
2022-04-09
Already done
2022-04-10
Already done
2022-04-11
Already done
2022-04-12
Already done
2022-04-13
Already done
2022-04-14
Already done
2022-04-15
Already done
2022-04-16
Already done
2022-04-17
Already done
2022-04-18
Already done
2022-04-19
Already done
2022-04-20
Already done
2022-04-21
Already done
2022-04-22
Already done
2022-04-23
Already done
2022-04-24
Already done
2022-04-25
Already done
2022-04-26
Already done
2022-04-27
Already done
2022-04-28
Already done
2022-04-29
Already done
2022-04-30
Already done
2022-05-01
Already done
2022-05-02
Already done
2022-05-03
Already done
2022-05-04
Already done
2022-05-05
Already done
2022-05-06
Already done
2022-05-07
Already done
2022-05-08
Already done
2022-05-09
Already done
2022-05-10
Already done
2022-05-11
Already done
2022-05-12
Already done
2022-05-13
Already done
2022-05-14
Already done
2022-05-15
Already done
2022-05-16
Already done
2022-05-17
Already done
2022-05-18
Alrea

In [9]:
# import ast
# import datetime
# import dateutil.parser
# import distutils.dir_util
# import glob
# import IPython.display
# import json
# import math
# import numpy as np
# import os
# import pandas as pd
# import pathlib
# import pickle
# import pyautogui
# import pytz
# import re
# import requests
# import selenium
# import shutil
# import statsapi
# import statsmodels.formula.api as smf
# import time
# import unidecode
# import warnings
# import webbrowser
# import xlrd
# import random
# import urllib
# from urllib.request import urlopen, Request
# import zipfile

# from bs4 import BeautifulSoup
# from datetime import date
# from IPython.display import display, Javascript
# from joblib import Parallel, delayed
# from lxml import html
# from pathlib import Path
# from scipy import stats
# from sklearn.preprocessing import StandardScaler
# from selenium.webdriver.common.by import By
# from selenium.webdriver.common.keys import Keys
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium import webdriver
# from openpyxl import load_workbook
# from functools import partial

# from statsapi import get
# from pydfs_lineup_optimizer import get_optimizer, Site, Sport, Player, TeamStack, PlayerFilter, RandomFantasyPointsStrategy

# os.chdir(r"C:\Users\james\Documents\MLB\Code")

# # from Utilities import *
# # from Classes import *
# # from simulation_functions_three import *

# import smtplib
# import ssl
# from email.mime.text import MIMEText
# from email.mime.multipart import MIMEMultipart
# from email.mime.base import MIMEBase
# from email import encoders

# # Ensure the warning is ignored only once
# warnings.simplefilter(action="ignore")

# # Display the DataFrame
# pd.set_option("display.max_rows", None)
# pd.set_option("display.max_columns", None)
# pd.set_option("display.width", None)
# pd.set_option("display.max_colwidth", None)

# # Set paths
# model_path = r"C:\Users\james\Documents\MLB\Code\Models"
# baseball_path = r"C:\Users\james\Documents\MLB\Data2"
# download_path = r"C:\Users\james\Downloads"