In [6]:
import pandas as pd
from baseball_scraper import statcast
import datetime
from datetime import datetime as dt
import warnings
from IPython.display import clear_output
import pickle as pkl
import feather
from bs4 import BeautifulSoup as bs
from bs4 import Comment
import requests
import time
import numpy as np
import re

# Suppress every warning category for the rest of the kernel session.
warnings.simplefilter("ignore")

# Execute the shared Google Cloud helper notebook from its absolute local path.
# NOTE(review): presumably this is what defines upload_to_cloud_storage,
# delete_from_cloud_storage and upload_to_gdrive used by the cells below — confirm.
%run "/Users/jaredzirkes/Desktop/Python/Google Drive/google_cloud_functions.ipynb"


## Collect Baseball Pitches from Statcast

In [27]:
def pull_statcast_pitch_range(start_date, end_date):
    """Download Statcast pitches one day at a time for an inclusive date range.

    Each day is requested with ``statcast``. A request that raises is retried
    once after a 30 second pause; if the retry also raises, the loop stops and
    whatever was pulled before the failure is returned.

    Parameters
    ----------
    start_date, end_date : str or datetime-like
        Inclusive bounds handed to ``pd.date_range``.

    Returns
    -------
    pandas.DataFrame
        The concatenation of every daily result, or an empty DataFrame when
        nothing was pulled (empty range, or a failure on the very first day).
    """
    pitch_dict = {}
    pull_failed = False

    for date in pd.date_range(start_date, end_date):
        # statcast is called with the date part only (YYYY-MM-DD).
        day = str(date).split(" ")[0]
        try:
            print(date)
            pitches = statcast(day)
            time.sleep(4)
            clear_output(wait = False)
        # Catch Exception rather than everything so Ctrl-C / kernel interrupt
        # still stops a long-running pull.
        except Exception:
            print("{} Threw an exception".format(date))
            time.sleep(30)
            try:
                pitches = statcast(day)
                clear_output(wait = False)
            except Exception:
                print("The program broke on {} re-run the update pitch files function to continue pulling pitches".format(date))
                pull_failed = True
                break
        pitch_dict[date] = pitches

    try:
        pulled_pitches = pd.concat(list(pitch_dict.values()))
    except ValueError:
        # pd.concat raises ValueError when there is nothing to concatenate.
        # Only report "up to date" when that is not the result of a failed pull.
        if not pull_failed:
            print("The pitches file is up to date")
        return pd.DataFrame()

    return pulled_pitches

In [12]:
def update_pitch_files(fill_previous_year = False):
    """Bring the current year's pitches pickle up to date and mirror it to cloud storage.

    Loads ``pitches_<year>.pkl``, pulls every day from the day after the latest
    stored ``game_date`` through yesterday, appends the new pitches, rewrites
    the pickle, uploads it to the ``yearly_pitches_files`` bucket and then tries
    to delete the upload dated three days ago.

    Parameters
    ----------
    fill_previous_year : bool, default False
        Back-filling earlier seasons is not implemented; passing a truthy value
        raises ``ValueError``.

    Raises
    ------
    ValueError
        If ``fill_previous_year`` is truthy.
    """
    # Fail fast on the unsupported mode instead of at the end of the function.
    if fill_previous_year:
        raise ValueError("There is not currently code for filling in previous years. Either write in in the Update pitch files function or run manually using the pull statcast pitch range function")

    # Take "today" once so the year, the date window and the file names agree
    # even if the function runs across midnight.
    today = dt.today()
    current_year = today.year
    pitches_path = "/Users/jaredzirkes/Desktop/Python/Non-GitHub/Sports Projects/MLB Pitch and Weather Pickles/pitches_{}.pkl".format(current_year)

    # Load the relevant pitches file and find the date from which statcast
    # data still needs to be pulled.
    with open(pitches_path, "rb") as pitches_handle:
        pitches_file = pkl.load(pitches_handle)
    # max() skips missing dates, whereas sorting would place them last.
    final_pulled_date = pitches_file.game_date.max()

    # Pull from the day after the last stored game until yesterday
    # (to avoid errors on today's data).
    start_date = dt.strftime(final_pulled_date + datetime.timedelta(1), "%Y-%m-%d")
    end_date = dt.strftime(today - datetime.timedelta(1), "%Y-%m-%d")

    pulled_pitches = pull_statcast_pitch_range(start_date, end_date)

    if len(pulled_pitches) == 0:
        return

    updated_pitches = pd.concat([pitches_file, pulled_pitches]).reset_index(drop=True).drop_duplicates()
    with open(pitches_path, "wb") as pitches_handle:
        pkl.dump(updated_pitches, pitches_handle)

    # Upload the pitch file to Google Cloud Storage.
    cloud_bucket_name = "yearly_pitches_files"
    project_id = "inspiring-dryad-422903-c4"
    file_name = "All Pitches {} (Updated {})".format(current_year, today.strftime("%Y-%m-%d"))
    upload_to_cloud_storage(project_id, cloud_bucket_name, file_name, pitches_path)

    # Delete the pitch file uploaded three days ago; a missing file is not fatal.
    stale_stamp = (today - datetime.timedelta(3)).strftime("%Y-%m-%d")
    try:
        file_to_delete_name = "All Pitches {} (Updated {})".format(current_year, stale_stamp)
        delete_from_cloud_storage(project_id, cloud_bucket_name, file_to_delete_name)
    except Exception:
        print("The Pitches File from {} is not in the Cloud Bucket. No Files Have Been Deleted.".format(stale_stamp))

## Scrape Weather Data

#### Scrape Start time weather info from previous games (likely on an annual or semi-annual basis for updating coefficients)

In [21]:
# RERUN FOR 2022 and 2023!!

In [22]:
# Ids of the <div> wrapping the weather block on a baseball-reference box
# score page, tried in this order until one matches.
_WEATHER_DIV_IDS = (
    "all_5821622361",
    "all_6469399717",
    "all_2016723098",
    "div_5821622361",
    "all_3471289817",
    "all_3390179539",
    "all_8155599404",
    "all_4881795978",
    "all_6350237457",
    "all_3716958924",
    "all_7703495808",
    "all_3758413896",
    "all_2566301813",
    "all_5702473528",
)

# Franchise names that are stored under a single canonical name.
_TEAM_RENAMES = {
    "Los Angeles Angels of Anaheim": "Los Angeles Angels",
    "Cleveland Indians": "Cleveland Guardians",
}


def _find_weather_divs(game_soup):
    """Return the first non-empty match over the known weather div ids, else an empty list."""
    for div_id in _WEATHER_DIV_IDS:
        matches = game_soup.find_all("div", {"id": div_id})
        if len(matches) > 0:
            return matches
    return []


def _extract_team_names(game_soup):
    """Return ``(away_team, home_team)`` read from a box score page."""
    names = game_soup.find_all("a", {"itemprop":"name"})
    if len(names) > 0:
        return names[0].text, names[1].text

    try:
        matchup = game_soup.find_all("table", {"class":"teams poptip"})[0]["data-tip"].split(" at ")
        return matchup[0], matchup[1]
    except Exception:
        # Fall back to the page title; both names come from the same source so
        # a value from a previous game can never leak into this one.
        title = game_soup.find_all("div", {"class":"box"})[0].h1.text
        return title.split(" vs")[0], title.split(" vs ")[-1].split(" Box")[0]


def _normalize_team(name):
    """Join list-valued names with spaces and map renamed franchises to their canonical name."""
    if isinstance(name, list):
        name = " ".join(name)
    return _TEAM_RENAMES.get(name, name)


def update_prior_mlb_weather_data(years):
    """Scrape game-time weather from baseball-reference box scores and append it to the stored history.

    For every season in ``years`` the schedule page is read and each linked box
    score is visited (with a 7-8.5 second pause per game). Scraping of a season
    stops at the first game without a box score link or the first game played
    in October or later. The scraped rows are concatenated with the previously
    saved weather pickle, written to ``weather_data_through_<last year>.pkl``
    and uploaded to Google Drive.

    Parameters
    ----------
    years : iterable of str or int
        Seasons to scrape, in chronological order. The last entry decides
        which prior pickle is loaded and how the output file is named.

    Raises
    ------
    ValueError
        If ``years`` is empty.
    IndexError
        If a box score page has none of the known weather div ids.
    """
    years = list(years)
    if len(years) == 0:
        raise ValueError("years must contain at least one season to scrape")
    final_year = years[-1]

    # Rows are collected in a list and turned into a DataFrame once, instead of
    # growing a DataFrame row by row.
    weather_records = []

    for year in years:
        url = "https://www.baseball-reference.com/leagues/majors/{}-schedule.shtml".format(year)
        webpage = requests.get(url)
        soup = bs(webpage.content, "html.parser")
        games = soup.find_all("p", {"class":"game"})

        for game in games:
            # Pause before every box score request.
            time.sleep(np.random.random()*1.5 + 7)

            try:
                suffix = game.em.a["href"]
            except AttributeError: # Stop this season if there is no box score (for example in a spring training game)
                break

            game_url = "https://www.baseball-reference.com/" + suffix
            game_webpage = requests.get(game_url)
            game_soup = bs(game_webpage.content, "html.parser")
            timing = game_soup.find_all("div", {"class":"scorebox_meta"})[0].div.text
            print(timing)
            game_date = dt.strptime(timing, "%A, %B %d, %Y")
            if game_date.month >= 10:
                break

            game_weather = _find_weather_divs(game_soup)
            try:
                comments = game_weather[0].find_all(text=lambda text:isinstance(text, Comment))
            except IndexError:
                print(game_url)
                raise IndexError("If pulling a new year of weather data, check the id code from the game url and add a new if statement above")

            # The weather text is the last <div> inside the first HTML comment.
            weather = bs(comments[0]).find_all('div')[-1].text

            away_team, home_team = _extract_team_names(game_soup)

            weather_records.append({
                "home_team": _normalize_team(home_team),
                "away_team": _normalize_team(away_team),
                # Stored as YYYY-MM-DD; converted per row so already converted
                # rows are never re-parsed when several seasons are scraped.
                "date": game_date.strftime("%Y-%m-%d"),
                "weather": weather,
            })

            clear_output(wait=True)

    pulled_weather_data = pd.DataFrame(weather_records, columns=["home_team", "away_team", "date", "weather"])

    if int(final_year) > 2022:
        prior_path = "/Users/jaredzirkes/Desktop/Python/Non-GitHub/Sports Projects/MLB Pitch and Weather Pickles/weather_data_through_{}.pkl".format(int(final_year)-1)
    else:
        prior_path = "/Users/jaredzirkes/Desktop/Python/Non-GitHub/Sports Projects/MLB Pitch and Weather Pickles/Original All Weather Data.pkl"
    with open(prior_path, "rb") as prior_handle:
        prior_weather_data = pkl.load(prior_handle)

    updated_weather_data = pd.concat([prior_weather_data, pulled_weather_data]).reset_index(drop=True).drop_duplicates()

    file_path = "/Users/jaredzirkes/Desktop/Python/Non-GitHub/Sports Projects/MLB Pitch and Weather Pickles/weather_data_through_{}.pkl".format(final_year)
    with open(file_path, "wb") as output_handle:
        pkl.dump(updated_weather_data, output_handle)

    # Upload the updated weather file to Google Drive.
    file_title = "Prior Weather Data Through {} (Updated {})".format(final_year, dt.today().strftime("%Y-%m-%d"))
    drive_folder_id = "1yD4DEn12VLA6i9hj7jGQDKWEZAdbUAjq"
    upload_to_gdrive(file_path, file_title, drive_folder_id)

    clear_output()

#### Scrape Rotowire Weather for Today's Games
###### Check to make sure there are sometimes "in" wind forecasts

In [11]:
def update_daily_mlb_weather(save_files = True):
    """Scrape today's game weather from Rotowire and optionally append it to the stored history.

    For every game box on the Rotowire weather page that is not postponed,
    four readings are stored per field: the start-of-game value followed by
    the three hourly values. Games flagged as "Dome" get fixed values
    (0% rain, 72 degrees, 0 wind speed, no wind direction).

    Parameters
    ----------
    save_files : bool, default True
        When truthy, the scraped rows are appended to ``daily_weather_data.pkl``,
        the file is uploaded to the ``daily_rotowire_weather_data`` bucket and
        the upload dated two days ago is deleted if present.

    Returns
    -------
    pandas.DataFrame
        One row per scraped game with columns ``game_id``, ``rain_percentage``,
        ``temprature``, ``wind_speed``, ``wind_direction`` and ``is_dome``
        (each cell a list of four values). Returned so the scrape is usable
        with ``save_files=False``.
    """
    url = "https://www.rotowire.com/baseball/weather.php"
    webpage = requests.get(url)
    soup = bs(webpage.content, "html.parser")
    weather_boxes = soup.find_all("div",{"class":"weather-box"})

    today = dt.today()
    weather_dict = {}

    for game in weather_boxes:
        visiting_team = game.find_all("a", {"class":"weather-box__team is-visit"})[0].div.text
        home_team = game.find_all("a", {"class":"weather-box__team is-home"})[0].div.text
        game_time = " ".join(game.find_all("div",{"class":"weather-box__date"})[0].text.split("at ")[-1].split(" ")[0:2])

        game_id = "{} @ {} on {} {}".format(visiting_team, home_team, today.strftime("%Y-%m-%d"), game_time)

        # The hourly forecast boxes; the first one also carries the "Dome" flag.
        hourly_boxes = game.find_all("div", {"class":"lineup__weather-text"})
        is_dome = (hourly_boxes[0].b.text == "Dome")

        starting_weather_text = game.find_all("div", {"class":"weather-box__weather"})[0].find_all("div")[-1].text
        is_postponed = (starting_weather_text.split(" ")[-1].strip(".")) == "postponed"

        # Skip postponed games before anything is stored for them, so they do
        # not end up as empty rows in the saved data.
        if is_postponed:
            continue

        if is_dome:
            rain_percentage = [0.0 for x in range(4)]
            temprature = [72.0 for x in range(4)]
            wind_speed = [0.0 for x in range(4)]
            wind_direction = [None for x in range(4)]
        else:
            # Start-of-game conditions come from the summary sentence.
            rain_percentage = [float(starting_weather_text.split("%")[0].split(" ")[-1])]
            temprature = [float(starting_weather_text.split("°")[0].split(" ")[-1])]
            wind_speed = [float(starting_weather_text.split(" MPH")[0].split(" ")[-1])]
            wind_direction = [starting_weather_text.split("blowing ")[1].split(" in")[0]]

            # Now collect the weather info for each hour of the game.
            for hour in range(0,3):
                hour_text = hourly_boxes[hour].text
                rain_percentage.append(float(hour_text.split("% ")[0].split("\n")[-1]))
                temprature.append(float(re.findall(r'\d+', hour_text.split("°")[0].split(" ")[-1])[0]))
                wind_speed.append(float(hour_text.split(" ")[2]))
                wind_direction.append(hour_text.split("mph")[-1].strip())

        weather_dict[game_id] = {
            "rain_percentage": rain_percentage,
            "temprature": temprature,
            "wind_speed": wind_speed,
            "wind_direction": wind_direction,
            "is_dome": [is_dome for x in range(4)],
        }

    daily_weather_data = pd.DataFrame(weather_dict).T.reset_index(names="game_id")

    if save_files:
        file_path = "/Users/jaredzirkes/Desktop/Python/Non-GitHub/Sports Projects/MLB Pitch and Weather Pickles/daily_weather_data.pkl"
        with open(file_path, "rb") as prior_handle:
            prior_weather_data = pkl.load(prior_handle)

        updated_weather_data = pd.concat([prior_weather_data, daily_weather_data]).reset_index(drop=True)
        # Cells hold lists, which cannot be hashed, so duplicates are detected
        # on the string form and the surviving positions are selected.
        updated_weather_data = updated_weather_data.iloc[updated_weather_data.astype(str).drop_duplicates().index]
        with open(file_path, "wb") as output_handle:
            pkl.dump(updated_weather_data, output_handle)

        # Upload the Rotowire weather file to Google Cloud Storage.
        cloud_bucket_name = "daily_rotowire_weather_data"
        project_id = "inspiring-dryad-422903-c4"
        file_name= "Roto Wire Data Through {}".format(today.strftime("%Y-%m-%d"))
        upload_to_cloud_storage(project_id, cloud_bucket_name, file_name, file_path)

        # Delete the Rotowire weather file from two days ago; a missing file is not fatal.
        stale_stamp = (today - datetime.timedelta(2)).strftime("%Y-%m-%d")
        try:
            file_to_delete_name = "Roto Wire Data Through {}".format(stale_stamp)
            delete_from_cloud_storage(project_id, cloud_bucket_name, file_to_delete_name)
        except Exception:
            print("The RotoWire Weather File from {} is not in the Drive File. No Files Have Been Deleted.".format(stale_stamp))

    return daily_weather_data