In [7]:
import pandas as pd
from baseball_scraper import statcast
from datetime import datetime
import warnings
from IPython.display import clear_output
import pickle as pkl
import feather
from bs4 import BeautifulSoup as bs
from bs4 import Comment
import requests


warnings.simplefilter("ignore")

## Collect Baseball Pitches 2012-2022

In [2]:
date_range = pd.date_range("03/26/2012", "12/01/2012")

In [3]:
def collect_pitches(date_range, df):
    for date in date_range:
        print(str(date.date()))
        # Collects all pitches in the given date range and appends to a singular df
        daily_pitches = statcast(start_dt=str(date.date()), end_dt=str(date.date()))
        df = df.append(daily_pitches)
        clear_output(wait=True)
    return df

In [539]:
starter = statcast(start_dt='2012-03-25', end_dt='2012-03-25')
x = collect_pitches(date_range, starter)

2012-12-01


## Import and Breakup Pitches 2016-2020

In [157]:
prior_data = pd.read_feather("All Pitch Data 2016-2020")
prior_data["year"] = prior_data.game_date.apply(lambda x: x.year)

In [158]:
pitches_2016 = prior_data[prior_data.year == 2016].reset_index(drop=True)
pitches_2017 = prior_data[prior_data.year == 2017].reset_index(drop=True)
pitches_2018 = prior_data[prior_data.year == 2018].reset_index(drop=True)
pitches_2019 = prior_data[prior_data.year == 2019].reset_index(drop=True)
pitches_2020 = prior_data[prior_data.year == 2020].reset_index(drop=True)

# Fix a strange series to begin 2020 where the Blue Jays played as the home team IN Washington

strange_games = pitches_2020[(pitches_2020.home_team == "TOR") & (pitches_2020.away_team == "WSH")].index
pitches_2020.loc[strange_games, "home_team"] = "WSH"
pitches_2020.loc[strange_games, "away_team"] = "TOR"

In [159]:
pkl.dump(pitches_2016,open("pitches_2016.pkl","wb"))
pkl.dump(pitches_2017,open("pitches_2017.pkl","wb"))
pkl.dump(pitches_2018,open("pitches_2018.pkl","wb"))
pkl.dump(pitches_2019,open("pitches_2019.pkl","wb"))
pkl.dump(pitches_2020,open("pitches_2020.pkl","wb"))

## Scrape Weather Data

WEATHER COLLECTION WEBSITES

Baseball Reference and/or Fangraphs if they have forecasts for tomorrow's games
https://www.oddstrader.com/mlb/weather/
https://swishanalytics.com/mlb/weather?date=2023-10-23

In [114]:
all_weather = pd.DataFrame()

for year in ["2022"]:#["2012", "2013","2014","2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022"]:
    url = "https://www.baseball-reference.com/leagues/majors/{}-schedule.shtml".format(year)
    webpage = requests.get(url)
    soup = bs(webpage.content, "html.parser")
    games = soup.find_all("p", {"class":"game"})
    for game in games:
        game_url = "https://www.baseball-reference.com/" + game.em.a['href']
        game_webpage = requests.get(game_url)
        game_soup = bs(game_webpage.content, "html.parser")
        game_weather = game_soup.find_all("div", {"id":"all_5821622361"})
        if len(game_weather) == 0:
            game_weather = game_soup.find_all("div", {"id":"all_6469399717"})
        if len(game_weather) == 0:
            game_weather = game_soup.find_all("div", {"id":"all_2016723098"})
        if len(game_weather) == 0:
            game_weather = game_soup.find_all("div", {"id":"div_5821622361"})
        if len(game_weather) == 0:
            game_weather = game_soup.find_all("div", {"id":"all_3471289817"})
        if len(game_weather) == 0:
            game_weather = game_soup.find_all("div", {"id":"all_3390179539"})
        if len(game_weather) == 0:
            game_weather = game_soup.find_all("div", {"id":"all_8155599404"})
        if len(game_weather) == 0:
            game_weather = game_soup.find_all("div", {"id":"all_4881795978"})  
        if len(game_weather) == 0:
            game_weather = game_soup.find_all("div", {"id":"all_6350237457"})  
        if len(game_weather) == 0:
            game_weather = game_soup.find_all("div", {"id":"all_3716958924"}) 
            
        
            
        comments = game_weather[0].find_all(text=lambda text:isinstance(text, Comment))
        weather = [bs(cmt).find_all('div') for cmt in comments][0][-1].text
        
        names = game_soup.find_all("a", {"itemprop":"name"})
        if len(names) == 0:
            try:
                away_team = game_soup.find_all("table", {"class":"teams poptip"})[0]["data-tip"].split(" at ")[0]
            except:
                home_team = game_soup.find_all("div", {"class":"box"})[0].h1.text.split(" vs ")[-1].split(" Box")[0]

            try:    
                home_team = game_soup.find_all("table", {"class":"teams poptip"})[0]["data-tip"].split(" at ")[1]
            except:
                away_team = game_soup.find_all("div", {"class":"box"})[0].h1.text.split(" vs")[0]
        else:
            if year < "2022":
                away_team = names[0].text
                home_team = names[1].text
            else:
                home_team = names[1].text
                away_team = names[0].text
                
        timing = game_soup.find_all("div", {"class":"scorebox_meta"})[0].div.text
        
        weather_insert = pd.Series({"home_team":home_team, "away_team":away_team, "date":timing, "weather":weather})
        
        all_weather = all_weather.append(weather_insert, ignore_index = True)
        
        print(timing)
        
        clear_output(wait=True)
    all_weather.away_team = all_weather.away_team.apply(lambda x: " ".join(x) if type(x) == list else x)

IndexError: list index out of range

In [134]:
all_weather.date = all_weather.date.apply(lambda x: datetime.strptime(" ".join(x.split(", ")[1:]), "%B %d %Y"))
all_weather.home_team = all_weather.home_team.apply(lambda x: "Los Angeles Angels" if x == "Los Angeles Angels of Anaheim" else x)
all_weather.away_team = all_weather.away_team.apply(lambda x: "Los Angeles Angels" if x == "Los Angeles Angels of Anaheim" else x)
all_weather.home_team = all_weather.home_team.apply(lambda x: "Cleveland Guardians" if x == "Cleveland Indians" else x)
all_weather.away_team = all_weather.away_team.apply(lambda x: "Cleveland Guardians" if x == "Cleveland Indians" else x)
#pkl.dump(all_weather, open("weather_data.pkl", "wb"))

AttributeError: 'Timestamp' object has no attribute 'split'

In [140]:
all_weather.date = all_weather.date.apply(lambda x: str(x).split(" ")[0])

In [141]:
weather = pkl.load(open("weather_data.pkl", "rb"))

In [142]:
weather = weather[weather.date < "2022-01-01"]

In [143]:
weather = weather.append(all_weather)

In [145]:
pkl.dump(weather, open("weather_data.pkl","wb"))

## Attach the weather data to the pitch data

In [38]:
pitches_2012 = pkl.load(open("pitches_2012.pkl","rb"))
pitches_2013 = pkl.load(open("pitches_2013.pkl","rb"))
pitches_2014 = pkl.load(open("pitches_2014.pkl","rb"))
pitches_2015 = pkl.load(open("pitches_2015.pkl","rb"))
pitches_2016 = pkl.load(open("pitches_2016.pkl","rb"))
pitches_2017 = pkl.load(open("pitches_2017.pkl","rb"))
pitches_2018 = pkl.load(open("pitches_2018.pkl","rb"))
pitches_2019 = pkl.load(open("pitches_2019.pkl","rb"))
pitches_2020 = pkl.load(open("pitches_2020.pkl","rb"))
pitches_2021 = pkl.load(open("pitches_2021.pkl","rb"))
pitches_2022 = pkl.load(open("pitches_2022.pkl","rb"))