# A6. Weather
Source: Swish Analytics <br>

Description: This scrapes Swish Analytics for daily weather data <br>
Historic data can be found using the stats API <br>

To do: <br>
    Clean team names <br>
    Actually, clean everything <br>

### Imports

In [1]:
import pandas as pd
import numpy as np
import math
import os
import requests
import datetime
from datetime import date
import urllib
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import pickle
import warnings
warnings.filterwarnings("ignore")

baseball_path = r"C:\Users\james\Documents\MLB\Data"

In [2]:
# Today's date as a YYYY-MM-DD string, used when building the URL
todaysdate = date.today().isoformat()

In [3]:
# Today's date in every format used by this notebook
# datetime.date object
todaysdate_dt = datetime.date.today()

# "YYYY-MM-DD" string
todaysdate_dash = todaysdate_dt.isoformat()

# "MM/DD/YYYY" string
todaysdate_slash = todaysdate_dt.strftime("%m/%d/%Y")

# "YYYYMMDD" string (this is the form used in the URL and the output file name)
todaysdate = todaysdate_dt.strftime("%Y%m%d")

'20230523'

### Wind

In [3]:
# This reverses winds so that they're named for where they're going, not where they're from. This is so vectors make more sense logically.
def wind_reverser(direction):
    """Return the opposite compass direction, upper-cased.

    Winds are reported by where they come FROM; flipping each letter
    (N<->S, E<->W) names them by where they are going TO, which makes the
    vector maths easier to reason about.

    Only upper-case letters are swapped. Any other character (including
    lower-case n/s/e/w) is left as-is and merely upper-cased on return.
    """
    # One-pass swap of each cardinal letter with its opposite
    opposite = str.maketrans("NSEW", "SNWE")
    return direction.translate(opposite).upper()

In [4]:
# This calculates number of degrees for each direction
def find_degree(direction):
    """Convert a 16-point compass direction into degrees.

    The points run clockwise from north in 22.5-degree steps:
    "N" -> 0.0, "NNE" -> 22.5, "NE" -> 45.0, ... "NNW" -> 337.5.

    Parameters
    ----------
    direction : str
        One of the sixteen upper-case compass points ("N", "NNE", ..., "NNW").

    Returns
    -------
    float
        The bearing in degrees, in the range [0.0, 337.5].

    Raises
    ------
    ValueError
        If ``direction`` is not one of the sixteen recognised points
        (previously this surfaced as a confusing UnboundLocalError).
    """
    # Position in this tuple is the number of 22.5-degree steps from north
    compass_points = (
        "N", "NNE", "NE", "ENE",
        "E", "ESE", "SE", "SSE",
        "S", "SSW", "SW", "WSW",
        "W", "WNW", "NW", "NNW",
    )

    try:
        steps = compass_points.index(direction)
    except ValueError:
        raise ValueError("Unknown compass direction: {!r}".format(direction)) from None

    return steps * 22.5

In [5]:
# This calculates the x and y vectors given the park's orientation and the wind's direction
def calculate_vectors(row):
    """Split a park's wind into components relative to centerfield.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'CF' (compass direction of centerfield, as accepted by
        find_degree), 'Direction' (compass direction the wind is coming
        from) and 'Speed' (numeric wind speed).

    Returns
    -------
    tuple
        ``(x_vect, y_vect)``. ``y_vect`` is the speed scaled by the cosine of
        the angle between the wind's heading and centerfield (positive when
        the wind is heading toward centerfield). ``x_vect`` is the speed
        scaled by the sine of that angle (positive when the heading is
        clockwise of centerfield).

    Notes
    -----
    ``row`` is not modified. The reversed direction is kept in a local
    variable so that calling this twice on the same row gives the same answer.
    """
    # Bearing of centerfield
    park_angle = find_degree(row['CF'])

    # Bearing the wind is heading toward (reported direction is where it comes from)
    wind_heading = wind_reverser(row['Direction'])
    wind_angle = find_degree(wind_heading)

    # Angle of the wind relative to centerfield, in radians
    angle = math.radians(wind_angle - park_angle)

    # Round the trig factors before scaling so tiny float noise doesn't leak into the vectors
    x_vect = round(math.sin(angle), 5) * row['Speed']
    y_vect = round(math.cos(angle), 5) * row['Speed']

    return x_vect, y_vect

### Scrape, scrape

In [6]:
# Scrape swishanalytics for weather data
# Build the URL for today's slate
url = "https://swishanalytics.com/mlb/weather?date=" + todaysdate

# Send a browser-like User-Agent so the request looks like it comes from a browser
hdr = {'User-Agent':'Mozilla/5.0'}

# Fetch the page and parse the HTML
req = Request(url, headers=hdr)
response = urlopen(req)
soup = BeautifulSoup(response, "html.parser")

# Collect the parent tag of every text node that looks like "Away  @  Home"
# (non-breaking spaces around the @), skipping the copies inside <small> tags.
# These come out in the same order as the weather tables on the page.
matchup_list = [
    node.parent
    for node in soup.find_all(text=True)
    if "\xa0\xa0@\xa0\xa0" in node and node.parent.name != "small"
]

# Strip the surrounding <h4> markup and the non-breaking spaces, leaving "Away@Home"
matchup_list_clean = [
    str(tag)
    .replace('<h4 class="lato inline vert-mid bold">\xa0\xa0', "")
    .replace('\xa0\xa0', "")
    .replace('</h4>', "")
    for tag in matchup_list
]

matchup_list_clean

['White Sox@Guardians',
 'Rangers@Pirates',
 'Cardinals@Reds',
 'Diamondbacks@Phillies',
 'Blue Jays@Rays',
 'Orioles@Yankees',
 'Padres@Nationals',
 'Dodgers@Braves',
 'Tigers@Royals',
 'Giants@Twins',
 'Mets@Cubs',
 'Astros@Brewers',
 'Marlins@Rockies',
 'Red Sox@Angels',
 'Athletics@Mariners']

### Weather 

In [7]:
# Create dataframes with weather by park

r = requests.get(url, headers=hdr)

# Parse every table on the page once, instead of re-parsing the HTML for each matchup
page_tables = pd.read_html(r.text)

# Collect one weather table per matchup (same order as matchup_list_clean)
weather_frames = []
for i, matchup in enumerate(matchup_list_clean):
    # This is the table number: weather tables sit at positions 3, 5, 7, ...
    table = 3 + (i * 2)
    df = page_tables[table]

    # Rename the last five columns so they're consistent when the tables are
    # stacked (on the page they're usually hours)
    hour_columns = df.columns[-5:]
    df = df.rename(columns=dict(zip(hour_columns, ["Start", "Plus1", "Plus2", "Plus3", "Plus4"])))

    # Create a column with the matchup
    df['Matchup'] = matchup

    weather_frames.append(df)

if not weather_frames:
    raise ValueError("No matchups were scraped from " + url + "; cannot build the weather table")

# Stack all tables. pd.concat replaces DataFrame.append, which was removed in
# pandas 2.0. The old try/except fallback silently kept only the last table there.
df_weather = pd.concat(weather_frames)

# Tidy up: fresh index, then strip the degree sign and " mph" unit from the
# two hour columns that are used (Start and Plus1)
df_weather = df_weather.reset_index()
for hour_col in ('Start', 'Plus1'):
    for unit in (u"°", " mph"):
        df_weather[hour_col] = df_weather[hour_col].str.replace(unit, "")

# The first (unnamed) column of each table says which metric the row holds.
# Pull out one frame per metric, keeping only the matchup and the Plus1 reading,
# and name that reading after the metric.
metric = df_weather['Unnamed: 0']
wanted = ['Matchup', 'Plus1']

df_temp = df_weather.loc[metric == "Temp", wanted].rename(columns={'Plus1': 'TEMP_PARK_CT'})
df_speed = df_weather.loc[metric == "Wind Speed", wanted].rename(columns={'Plus1': 'Speed'})
df_dir = df_weather.loc[metric == "Wind Dir", wanted].rename(columns={'Plus1': 'Direction'})

# Join the three metrics back together, one row per matchup
df_weather_inputs = (
    df_temp
    .merge(df_speed, on='Matchup', how='inner')
    .merge(df_dir, on='Matchup', how='inner')
)

# Choose the home team: the club named after the "@" in the matchup.
# Each pair is (text to look for in Matchup, team abbreviation).
home_team_patterns = [
    ("@Diamondbacks", "ARI"),
    ("@Braves", "ATL"),
    ("@Orioles", "BAL"),
    ("@Red Sox", "BOS"),
    ("@Cubs", "CHC"),
    ("@White Sox", "CHW"),
    ("@Reds", "CIN"),
    ("@Guardians", "CLE"),
    ("@Rockies", "COL"),
    ("@Tigers", "DET"),
    ("@Astros", "HOU"),
    ("@Royals", "KCR"),
    ("@Angels", "LAA"),
    ("@Dodgers", "LAD"),
    ("@Marlins", "MIA"),
    ("@Brewers", "MIL"),
    ("@Twins", "MIN"),
    ("@Mets", "NYM"),
    ("@Yankees", "NYY"),
    ("@Athletics", "OAK"),
    ("@Phillies", "PHI"),
    ("@Pirates", "PIT"),
    ("@Padres", "SDP"),
    ("@Mariners", "SEA"),
    ("@Giants", "SFG"),
    ("@Cardinals", "STL"),
    ("@Rays", "TBR"),
    ("@Rangers", "TEX"),
    ("@Blue Jays", "TOR"),
    ("@Nationals", "WSN"),
]

# Start blank, then fill in the abbreviation wherever a pattern matches.
# Rows matching no pattern keep the empty string.
df_weather_inputs['Team'] = ""
for pattern, abbreviation in home_team_patterns:
    is_home = df_weather_inputs['Matchup'].str.contains(pattern)
    df_weather_inputs['Team'] = np.where(is_home, abbreviation, df_weather_inputs['Team'])

# Convert the scraped strings to numeric
df_weather_inputs = df_weather_inputs.astype({'Speed': 'float', 'TEMP_PARK_CT': 'int'})

In [8]:
# Load ballpark orientations (which direction center field is in relation to home plate)
orientations_file = os.path.join(baseball_path, "Utilities", "Park Orientations.xlsx")
orientations = pd.read_excel(orientations_file)

# Attach today's weather to each park; parks without a game today drop out (inner join)
weather = pd.merge(orientations, df_weather_inputs, on='Team', how='inner')

# Work out the wind components relative to center field for every row
wind_vectors = weather.apply(calculate_vectors, axis=1, result_type='expand')
weather[['x_vect', 'y_vect']] = wind_vectors

# A double header repeats the matchup; keep only its last row (the second game)
weather = weather.drop_duplicates(subset=['Matchup'], keep='last')

weather

Unnamed: 0,Team,CF,Matchup,TEMP_PARK_CT,Speed,Direction,x_vect,y_vect
0,ATL,SSE,Dodgers@Braves,67,6.3,ENE,6.3,0.0
1,CHC,NE,Mets@Cubs,65,8.0,ESE,-7.39104,-3.06144
2,CIN,ESE,Cardinals@Reds,80,6.9,ESE,0.0,-6.9
3,CLE,N,White Sox@Guardians,73,8.2,NNE,-3.137976,-7.575816
4,COL,N,Marlins@Rockies,74,7.2,ENE,-6.651936,-2.755296
5,KCR,NE,Tigers@Royals,79,6.8,ESE,-6.282384,-2.602224
6,LAA,NE,Red Sox@Angels,64,8.1,SSW,-3.099708,7.483428
7,MIL,SE,Astros@Brewers,72,8.5,SSE,-3.25278,-7.85298
8,MIN,E,Giants@Twins,79,5.9,SSE,-5.450892,-2.257812
9,NYY,E,Orioles@Yankees,60,7.8,SSE,-7.206264,-2.984904


In [9]:
# Rename parks to their Retrosheet names for later merging.
# PARK_ID starts as a copy of the team abbreviation; Team itself is kept, not renamed.
weather = weather.assign(PARK_ID=weather['Team'])

def park_renamer(df):
    """Replace team abbreviations in ``df['PARK_ID']`` with park ids.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'PARK_ID' column holding team abbreviations
        (e.g. "LAA", "NYY").

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with 'PARK_ID' updated in place. Values that
        are not a known team abbreviation are left unchanged.

    Notes
    -----
    The column is rewritten with a single assignment instead of the chained
    ``df['PARK_ID'].loc[mask] = value`` form, which pandas does not guarantee
    will write through to ``df`` (it does nothing under Copy-on-Write).
    """
    team_to_park = {
        'LAA': "ANA01",
        'ARI': "PHO01",
        'BAL': "BAL12",
        'BOS': "BOS07",
        'CHC': "CHI11",
        'CHW': "CHI12",
        'CIN': "CIN09",
        'CLE': "CLE08",
        'COL': "DEN02",
        'DET': "DET05",
        'HOU': "HOU03",
        'KCR': "KAN06",
        'LAD': "LOS03",
        'MIA': "MIA02",
        'MIL': "MIL06",
        'MIN': "MIN04",
        'NYY': "NYC21",
        'NYM': "NYC20",
        'OAK': "OAK01",
        'PHI': "PHI13",
        'PIT': "PIT08",
        'SDP': "SAN02",
        'SEA': "SEA03",
        'SFG': "SFO03",
        'STL': "STL10",
        'TOR': "TOR02",
        'WSN': "WAS11",
        'TEX': "ARL03",
        'ATL': "ATL03",
        'TBR': "STP01",
    }

    # No park id is also a team abbreviation, so one pass is equivalent to
    # the original sequence of one-at-a-time replacements
    df['PARK_ID'] = df['PARK_ID'].replace(team_to_park)

    return df

# Swap the team abbreviations in PARK_ID for park ids
weather = park_renamer(weather)

# NOTE: a filter that dropped parks ARL03, ATL03 and STP01 ("parks we won't use
# for weather, for now") used to sit here; it is disabled, so those parks are kept.

# Date string for the day-specific file (dashes removed if any are present)
todaysdate = todaysdate.replace("-", "")

# Home team nickname is whatever follows the "@" in the matchup
weather['Home'] = weather['Matchup'].str.split("@").str.get(1)

# Create a zero-filled venue_<id> column for every venue listed in All Parks.csv
all_parks_file = os.path.join(baseball_path, "Utilities", "All Parks.csv")
all_parks = pd.read_csv(all_parks_file)
venue_dummies = pd.get_dummies(all_parks['venue_id'], prefix='venue')
for dummy_col in venue_dummies.columns:
    weather[dummy_col] = 0

# Team lookup: full name, FanGraphs team name and venue id
team_map_file = os.path.join(baseball_path, "Utilities", "Team Map.csv")
team_map = pd.read_csv(team_map_file)[['FULLNAME', 'FANGRAPHSTEAM', 'VENUE_ID']]

# Attach the venue id by matching the home nickname to the FanGraphs team name
weather = weather.merge(team_map, how='inner', left_on='Home', right_on='FANGRAPHSTEAM')

# Switch on (1) the dummy belonging to each row's own venue, 0 for every other row
for venue in weather['VENUE_ID'].unique().tolist():
    venue_dummy = "venue_{}".format(venue)
    weather[venue_dummy] = np.where(weather['VENUE_ID'] == venue, 1, 0)

# Identification columns for the output file
weather['game_date'] = todaysdate
weather['venue_id'] = weather['VENUE_ID']
weather['BBREFTEAM'] = weather['Team']

# Columns that are filled with the placeholder "Missing" in the daily file.
# NOTE(review): 'Speed' is in this list, so the scraped wind speed is overwritten
# with "Missing", and 'Speed' is then selected twice below, giving two identical
# columns. Presumably this mirrors the layout of another file - confirm before changing.
for placeholder_col in ('weather', 'Speed', 'CF_angle', 'wind_angle', 'angle'):
    weather[placeholder_col] = "Missing"

# Final column layout: descriptive columns first, then one dummy per venue id
leading_columns = [
    'venue_id', 'game_date', 'weather', 'Speed', 'BBREFTEAM', 'CF',
    'TEMP_PARK_CT', 'Speed', 'CF_angle', 'wind_angle', 'angle', 'x_vect',
    'y_vect',
]
venue_ids = [
    1, 2, 3, 4, 5, 7,
    10, 12, 13, 14, 15,
    16, 17, 19, 22, 31,
    32, 680, 2392, 2394, 2395,
    2535, 2536, 2602, 2680, 2681,
    2701, 2735, 2756, 2889, 3289,
    3309, 3312, 3313, 4169, 4705,
    5010, 5325, 5365, 5381, 5445,
]
venue_columns = ['venue_' + str(venue_id) for venue_id in venue_ids]

weather = weather[leading_columns + venue_columns]
    
    
# Write the day-specific weather workbook
filename = "Daily_Weather_{}.xlsx".format(todaysdate)
filepath = r"C:\Users\james\Documents\MLB\Data\A6. Weather"
file = os.path.join(filepath, filename)

weather.to_excel(file)


# Quick look: parks ordered from largest to smallest y_vect
preview_columns = ['BBREFTEAM', 'x_vect', 'y_vect', 'TEMP_PARK_CT']
weather[preview_columns].sort_values(by='y_vect', ascending=False)

Unnamed: 0,BBREFTEAM,x_vect,y_vect,TEMP_PARK_CT
6,LAA,-3.099708,7.483428,64
10,PHI,-8.222532,3.405852,67
0,ATL,6.3,0.0,67
8,MIN,-5.450892,-2.257812,79
5,KCR,-6.282384,-2.602224,79
13,TBR,6.374772,-2.640492,81
4,COL,-6.651936,-2.755296,74
14,WSN,-6.744324,-2.793564,67
12,SEA,6.9291,-2.8701,57
9,NYY,-7.206264,-2.984904,60


In [10]:
# Stamp the output with the date and wall-clock time of this run
run_date = datetime.date.today()
run_time = datetime.datetime.now().strftime("%H:%M:%S")
print("Code was last run on: {} at {}.".format(run_date, run_time))

Code was last run on: 2023-05-23 at 17:41:47.
