# A6. Weather
Source: Swish Analytics <br>

Description: This notebook scrapes Swish Analytics for the current day's weather data. <br>
Historical data can be retrieved using the stats API. <br>

### Imports

In [12]:
import datetime
import io
import math
import os
import pickle
import urllib
import warnings
from urllib.request import urlopen, Request

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Silence library warnings so the notebook output stays readable
warnings.filterwarnings("ignore")

# Root folder of the MLB data tree. Set the MLB_DATA_PATH environment variable to run
# the notebook on another machine; when it is not set, the original local path is used.
baseball_path = os.environ.get("MLB_DATA_PATH", r"C:\Users\james\Documents\MLB\Data")

In [13]:
# Today's date, in every format the notebook needs
# datetime.date object
todaysdate_dt = datetime.date.today()

# "YYYY-MM-DD" string
todaysdate_dash = todaysdate_dt.isoformat()

# "MM/DD/YYYY" string
todaysdate_slash = f"{todaysdate_dt.month:02d}/{todaysdate_dt.day:02d}/{todaysdate_dt.year:04d}"

# "YYYYMMDD" string
todaysdate = "".join(todaysdate_dash.split("-"))

In [14]:
# Load the team lookup table (team names, team codes, and the number Fangraphs uses in its URLs)
team_map_file = os.path.join(baseball_path, "Utilities", "Team Map.csv")
team_map = pd.read_csv(team_map_file)

# Only the team identifiers and the venue ID are needed in this notebook
team_map = team_map.loc[:, ['FANGRAPHSTEAM', 'BBREFTEAM', 'FULLNAME', 'VENUE_ID']]

### Wind

In [15]:
# Winds are reported by where they come from. This flips every compass letter so the
# direction names where the wind is going, which makes the vectors easier to reason about.
def wind_reverser(direction):
    """Return the opposite compass direction in upper case (e.g. "NNE" -> "SSW")."""
    # Swap N<->S and E<->W in a single pass so no letter is flipped twice
    opposite_letters = str.maketrans("NSEW", "SNWE")
    return direction.translate(opposite_letters).upper()

In [16]:
# The 16 compass points in clockwise order starting at north; neighbours are 22.5 degrees apart
_COMPASS_POINTS = ("N", "NNE", "NE", "ENE", "E", "ESE", "SE", "SSE",
                   "S", "SSW", "SW", "WSW", "W", "WNW", "NW", "NNW")


# This converts a compass direction into degrees
def find_degree(direction):
    """Return the bearing of a 16-point compass direction in degrees.

    Parameters
    ----------
    direction : str
        One of the upper-case compass abbreviations "N", "NNE", ..., "NNW".

    Returns
    -------
    float
        Degrees clockwise from north: 0.0 for "N", 90.0 for "E", up to 337.5 for "NNW".

    Raises
    ------
    ValueError
        If `direction` is not one of the 16 recognised abbreviations. Previously an
        unknown value failed with an UnboundLocalError that did not name the input.
    """
    try:
        steps_from_north = _COMPASS_POINTS.index(direction)
    except ValueError:
        raise ValueError("Unknown compass direction: {!r}".format(direction)) from None

    return steps_from_north * 22.5

In [17]:
# This calculates the x and y vectors given the park's orientation and the wind's direction
def calculate_vectors(row):
    """Split the wind into components relative to the park's center-field line.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'CF' (compass direction of center field), 'Direction'
        (compass direction the wind is coming from) and 'Speed' (numeric wind speed).

    Returns
    -------
    tuple
        (x_vect, y_vect). y_vect is the part of the wind along the center-field line
        (positive when blowing toward center field); x_vect is the part across it
        (positive when the wind heads clockwise of center field).

    The row is only read, never modified: pandas does not support functions passed to
    DataFrame.apply mutating the object they receive.
    """
    # Bearing of center field
    park_angle = find_degree(row['CF'])
    # Bearing the wind is heading toward (the reported direction is where it comes from)
    wind_angle = find_degree(wind_reverser(row['Direction']))

    # Angle of the wind relative to the center-field line
    angle = wind_angle - park_angle
    angle_radians = math.radians(angle)

    # Unit components are rounded to 5 decimals before scaling, so values such as
    # cos(90 degrees) come out as exactly 0 instead of a tiny floating-point residue
    x_vect = round(math.sin(angle_radians), 5) * row['Speed']
    y_vect = round(math.cos(angle_radians), 5) * row['Speed']

    return x_vect, y_vect

### Scrape, scrape

In [18]:
# Scrape Swish Analytics for today's weather data
# Create the URL
url = "https://swishanalytics.com/mlb/weather?date=" + todaysdate_dash

# Send a browser-like User-Agent header with the request
hdr = {'User-Agent':'Mozilla/5.0'}

# Download the page and parse its HTML. The with-block closes the connection once the
# page has been parsed, and the timeout keeps a stalled request from hanging the notebook.
req = urllib.request.Request(url, headers=hdr)
with urlopen(req, timeout=30) as response:
    soup = BeautifulSoup(response, "html.parser")

# Collect the element around every matchup heading, in page order (the weather tables
# are matched to these by position later). A matchup is recognised by the "  @  "
# separator written with non-breaking spaces; copies inside <small> tags are skipped.
matchup_list = [
    t.parent
    for t in soup.find_all(string=True)
    if "\xa0\xa0@\xa0\xa0" in t and t.parent.name != "small"
]

# Strip the surrounding <h4> markup and the non-breaking spaces so that only the
# two team names separated by "@" remain
matchup_list_clean = [
    str(matchup)
    .replace('<h4 class="lato inline vert-mid bold">\xa0\xa0', "")
    .replace('\xa0\xa0', "")
    .replace('</h4>', "")
    for matchup in matchup_list
]

### Weather 

In [19]:
# Create dataframes with weather by park
r = requests.get(url, headers=hdr, timeout=30)
# Stop here on an HTTP error instead of trying to parse an error page
r.raise_for_status()

# Parse every table on the page once. The HTML is wrapped in StringIO because newer
# pandas versions no longer accept a raw HTML string.
page_tables = pd.read_html(io.StringIO(r.text))

# Collect one weather table per matchup. The weather tables start at table 3 and then
# appear at every second table, in the same order as the matchup list.
# The tables are gathered in a list and combined once with pd.concat: DataFrame.append
# no longer exists in pandas 2.0+, and the old try/except around it silently kept only
# the last table when the append failed.
hourly_names = ["Start", "Plus1", "Plus2", "Plus3", "Plus4"]
weather_tables = []
for i, matchup in enumerate(matchup_list_clean):
    # This is the table number
    table = 3 + (i * 2)
    df = page_tables[table]

    # The last five columns are usually labelled with hours, so give them fixed names
    # to keep the tables consistent when they are stacked
    df = df.rename(columns=dict(zip(df.columns[-5:], hourly_names)))

    # Create a column with the matchup
    df['Matchup'] = matchup
    weather_tables.append(df)

if not weather_tables:
    raise ValueError("No matchups were found at " + url + "; there is no weather data to process.")

df_weather = pd.concat(weather_tables).reset_index()

# Clean up a bit: drop the degree sign and the " mph" unit so the values can be made numeric
for col in ['Start', 'Plus1']:
    df_weather[col] = (
        df_weather[col]
        .str.replace("°", "", regex=False)
        .str.replace(" mph", "", regex=False)
    )

# Create temperature, wind speed, and wind direction dataframes.
# Only the Plus1 column (the second of the five hourly columns) is kept for each.
measurement = df_weather['Unnamed: 0']
df_temp = df_weather.loc[measurement == "Temp", ['Matchup', 'Plus1']].rename(columns={'Plus1': 'TEMP_PARK_CT'})
df_speed = df_weather.loc[measurement == "Wind Speed", ['Matchup', 'Plus1']].rename(columns={'Plus1': 'Speed'})
df_dir = df_weather.loc[measurement == "Wind Dir", ['Matchup', 'Plus1']].rename(columns={'Plus1': 'Direction'})

# Merge them all together to get the weather data
df_weather_inputs = df_temp.merge(df_speed, on='Matchup', how='inner')
df_weather_inputs = df_weather_inputs.merge(df_dir, on='Matchup', how='inner')

# Choose the home team (the part of the matchup after the "@")
df_weather_inputs['FANGRAPHSTEAM'] = df_weather_inputs['Matchup'].str.split("@").str[1]
df_weather_inputs = pd.merge(df_weather_inputs, team_map, on='FANGRAPHSTEAM', how='left')
df_weather_inputs = df_weather_inputs.rename(columns={'BBREFTEAM':'Team'})

# Convert to numeric
df_weather_inputs['Speed'] = df_weather_inputs['Speed'].astype('float')
df_weather_inputs['TEMP_PARK_CT'] = df_weather_inputs['TEMP_PARK_CT'].astype('int')

In [20]:
# Load each ballpark's orientation (the direction of center field as seen from home plate)
park_orientations_file = os.path.join(baseball_path, "Utilities", "Park Orientations.xlsx")
orientations = pd.read_excel(park_orientations_file)

# Attach today's weather to the parks that host a game
weather = pd.merge(orientations, df_weather_inputs, on='Team', how='inner')

# Work out the wind components for every row
wind_components = weather.apply(calculate_vectors, axis=1, result_type='expand')
weather[['x_vect', 'y_vect']] = wind_components

# For a double header, keep only the last row of each matchup (the second game)
weather = weather.drop_duplicates(subset='Matchup', keep='last')

In [21]:
# Read in list of parks
all_parks = pd.read_csv(os.path.join(baseball_path, "Utilities", "All Parks.csv"))
# Create dummies for each park (set = 0)
venue_dummies = pd.get_dummies(all_parks['venue_id'], prefix='venue')
for col in venue_dummies.columns:
    weather[col] = 0
# Loop over VENUE_ID, set venue_ dummy = 1 if it corresponds to VENUE_ID
for venue in weather['VENUE_ID'].dropna().unique().tolist():
    # VENUE_ID turns into floats when an earlier merge leaves missing values. Without
    # this conversion the column would be named e.g. "venue_15.0" and the real
    # "venue_15" dummy would silently stay 0.
    venue_label = int(venue) if isinstance(venue, float) and venue.is_integer() else venue
    venue_dummy = "venue_" + str(venue_label)
    weather[venue_dummy] = np.where(weather['VENUE_ID'] == venue, 1, 0)

# Create variables
# More variables exist in backtest data (from API) that aren't needed when scraping day-of
weather['game_date'] = todaysdate
weather['venue_id'] = weather['VENUE_ID']
weather['BBREFTEAM'] = weather['Team']
weather['weather'] = "Missing"
# NOTE(review): this overwrites the scraped wind speed with "Missing", and 'Speed' is
# listed twice in the column selection below, so the output has two identical 'Speed'
# columns. Left unchanged because the file layout may be relied on elsewhere - confirm
# whether one of the two was meant to be a different column.
weather['Speed'] = "Missing"
weather['CF_angle'] = "Missing"
weather['wind_angle'] = "Missing"
weather['angle'] = "Missing"

# Keep relevant variables
weather = weather[['venue_id', 'game_date', 'weather', 'Speed', 'BBREFTEAM', 'CF', 
         'TEMP_PARK_CT', 'Speed', 'CF_angle', 'wind_angle', 'angle', 'x_vect', 
         'y_vect',
         'venue_1',    'venue_2',    'venue_3',    'venue_4',    'venue_5', 'venue_7', 
         'venue_10',   'venue_12',   'venue_13',   'venue_14',   'venue_15',   
         'venue_16',   'venue_17',   'venue_19',   'venue_22',   'venue_31',
         'venue_32',   'venue_680',  'venue_2392', 'venue_2394', 'venue_2395',
         'venue_2535', 'venue_2536', 'venue_2602', 'venue_2680', 'venue_2681',
         'venue_2701', 'venue_2735', 'venue_2756', 'venue_2889', 'venue_3289',
         'venue_3309', 'venue_3312', 'venue_3313', 'venue_4169', 'venue_4705',
         'venue_5010', 'venue_5325', 'venue_5365', 'venue_5381', 'venue_5445']]


# Create daily weather file
weather.to_excel(os.path.join(baseball_path, "A6. Weather", "Daily_Weather_" + todaysdate + ".xlsx"), index=False)


# Show today's parks, strongest wind toward center field first
weather[['BBREFTEAM', 'x_vect', 'y_vect', 'TEMP_PARK_CT']].sort_values('y_vect', ascending=False) 

Unnamed: 0,BBREFTEAM,x_vect,y_vect,TEMP_PARK_CT
17,OAK,0.0,9.6,61
14,LAD,3.329316,8.037756,62
2,BOS,-3.176244,7.668204,64
12,HOU,7.0711,7.0711,87
19,SEA,2.181276,5.266116,59
16,NYM,11.640888,4.821768,64
1,BAL,10.901784,4.515624,73
11,DET,1.72206,4.15746,72
0,ARI,8.77686,3.63546,91
15,MIN,6.46716,2.67876,83


In [22]:
# Record when the notebook was last executed
run_date = datetime.date.today()
run_time = datetime.datetime.now().strftime("%H:%M:%S")
print("Code was last run on: {} at {}.".format(run_date, run_time))

Code was last run on: 2023-06-14 at 18:28:54.
