# A1. Depth Charts
Source: MLB.com team depth charts <br>
        Wayback Machine for past games <br>

Description: This scrapes bullpen depth charts and infers leverage from order <br>

### Imports

In [8]:
import pandas as pd
import requests
import os
import datetime
import re

import warnings
# Silence every warning category for the rest of the notebook.
# NOTE(review): this also hides pandas deprecation/chained-assignment warnings, so API changes go unnoticed.
warnings.simplefilter(action="ignore")

import sys 
# Add the local code folder to the import search path so the Utilities import below can be resolved.
# NOTE(review): hardcoded absolute Windows path - only valid on the original author's machine.
sys.path.append(r'C:\Users\james\Documents\MLB\Code')


# NOTE(review): import_ipynb presumably lets notebooks be imported like modules - confirm.
import import_ipynb
# Star import: helpers used later (e.g. remove_accents) presumably come from here - verify against Utilities.
from Utilities import *

# Root data folder: the team map is read from it and the depth-chart CSVs are written beneath it.
# NOTE(review): hardcoded absolute path; consider making it configurable.
baseball_path = r"C:\Users\james\Documents\MLB\Data"

In [9]:
# Today's date, pre-rendered in every format used further down
# datetime.date object
todaysdate_dt = datetime.date.today()

# ISO string: YYYY-MM-DD
todaysdate_dash = todaysdate_dt.isoformat()

# US-style string: MM/DD/YYYY
todaysdate_slash = todaysdate_dt.strftime("%m/%d/%Y")

# Compact string: YYYYMMDD
todaysdate = todaysdate_dt.strftime("%Y%m%d")

In [10]:
# Load the lookup table of team names, team codes, and the shorthand MLB uses in its URLs,
# keeping only the three columns needed right now
team_map = pd.read_csv(
    os.path.join(baseball_path, "Utilities", "Team Map.csv")
).loc[:, ['FULLNAME', 'BBREFTEAM', 'MLBURL']]

### Bullpen Depth Charts

In [11]:
# This scrapes the bullpen depth chart for teams via their website or via the Wayback Machine
# Top reliever will be the closer. Usually other high-leverage pitchers will be near top
# Need this header to trick site into thinking this isn't a scrape
header = {
  "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
  "X-Requested-With": "XMLHttpRequest"
}


def _select_active_relievers(tables):
    """Return the bullpen table with injured-list and minor-league rows removed.

    The bullpen is expected in the third table on the page and, failing that,
    in the second one. A table qualifies when it has a text column named
    "Bullpen.1". Rows whose "Bullpen.1" text contains "IL-" or " Minors", or is
    missing altogether, are dropped, and the index is renumbered from 0.

    Raises:
        ValueError: if neither candidate table looks like a bullpen table.
    """
    for position in (2, 1):
        try:
            candidate = tables[position]
            listing = candidate["Bullpen.1"]
            # na=True treats missing text as inactive, so such rows are dropped
            inactive = (
                listing.str.contains("IL-", regex=False, na=True)
                | listing.str.contains(" Minors", regex=False, na=True)
            )
        except (IndexError, KeyError, AttributeError):
            # Table missing, no "Bullpen.1" column, or column is not text: try the next candidate
            continue
        return candidate[~inactive].reset_index(drop=True)
    raise ValueError("No table with a 'Bullpen.1' column was found on the page")


def _leverage_for_rank(rank):
    """Map a 0-based depth-chart position to a leverage score (4, 3, 2 or 0)."""
    if rank == 0:
        # The top guy should be the closer
        return 4
    if rank < 4:
        # Positions 1-3 are set up/high leverage
        # NOTE(review): an earlier comment described five set-up slots, but the cutoff is three - confirm intent
        return 3
    if rank < 11:
        # Positions 4-10 are low leverage
        return 2
    # Anyone listed deeper than that gets no leverage
    return 0


def scrape_bullpen(url, header, abbrev):
    """Scrape one team's bullpen depth chart and infer leverage from listing order.

    Args:
        url: Depth-chart page to download (live site or an archived copy).
        header: HTTP request headers sent with the download.
        abbrev: Team abbreviation. Not used inside this function; kept so
            existing callers keep working.

    Returns:
        DataFrame with columns 'Name', 'B/T' and 'Leverage', one row per active
        reliever in page order, indexed 0..n-1.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if no bullpen table can be located on the page.
    """
    # Get data from URL; the timeout stops a stalled connection from hanging the whole run
    r = requests.get(url, headers=header, timeout=30)
    r.raise_for_status()
    dfs = pd.read_html(r.text, encoding='iso-8859-1')

    # Bullpen can be one of two tables; IL and minor-league pitchers are removed
    df = _select_active_relievers(dfs)

    # Assign leverage by position in one step (no chained df[col][i] assignment,
    # which is silently ignored when pandas Copy-on-Write is active)
    df['Leverage'] = pd.Series(
        [_leverage_for_rank(rank) for rank in range(len(df))],
        index=df.index,
        dtype="int64",
    )

    # The name is everything before the first "B/T" marker in column Bullpen.1
    names = df['Bullpen.1'].str.split("B/T", n=1).str[0]
    # Remove numbers (regex must be requested explicitly on pandas 2.0+)
    names = names.str.replace(r'\d+', '', regex=True)
    # Remove closer tag (plain-text match, so the parentheses need no escaping)
    names = names.str.replace("(CL)", '', regex=False)
    # Remove accents, then trim surrounding whitespace
    df['Name'] = names.map(remove_accents).str.strip()

    # Keep Name, Bats/Throws, Leverage
    return df[['Name', 'B/T', 'Leverage']]

### All Depth Charts

In [12]:
# This loops over teams and scrapes all depth charts
def create_depth_charts(start_date, end_date, use_wayback=False):
    """Scrape every team's bullpen depth chart for each day in a date range.

    For each date a folder "Depth<YYYYMMDD>" is created under
    "<baseball_path>/A1. Depth Charts", and one CSV per team named
    "Depth_Chart_<BBREFTEAM>_<YYYYMMDD>.csv" is written into it.

    Args:
        start_date: First date to process, as a "YYYYMMDD" string.
        end_date: Last date to process (inclusive), as a "YYYYMMDD" string.
        use_wayback: When False (default) the live MLB.com page is scraped for
            every date, which is only meaningful for today's date. When True
            the Wayback Machine copy for each date is requested instead, which
            is what backfilling past dates needs.

    Raises:
        ValueError: if either date string does not match "YYYYMMDD".
        Any error raised by scrape_bullpen or while writing a file propagates
        and stops the run.
    """
    # Date range
    current = datetime.datetime.strptime(start_date, "%Y%m%d")
    last = datetime.datetime.strptime(end_date, "%Y%m%d")
    one_day = datetime.timedelta(days=1)

    # Loop over dates
    while current <= last:
        print(current)
        date = current.strftime("%Y%m%d")

        # Create the output directory; an existing one is fine, and missing parents
        # are created too. Real failures (e.g. permissions) are no longer swallowed.
        output_dir = os.path.join(baseball_path, "A1. Depth Charts", "Depth" + date)
        os.makedirs(output_dir, exist_ok=True)

        for mlburl, abbrev in zip(team_map['MLBURL'], team_map['BBREFTEAM']):
            live_url = f"https://www.mlb.com/{mlburl}/roster/depth-chart"
            if use_wayback:
                # Wayback Machine is good for backtesting, but won't default to current date
                url = f"https://web.archive.org/web/{date}/{live_url}"
            else:
                url = live_url
            df = scrape_bullpen(url, header, abbrev)

            filename = "Depth_Chart_" + abbrev + "_" + date + ".csv"
            df.to_csv(os.path.join(output_dir, filename), encoding='iso-8859-1')

        current += one_day
        
        
# Last 20221021
# NOTE(review): the date above presumably records the last date a previous run covered - confirm or remove.
# Scrape depth charts for today only: start and end are both today's date as a YYYYMMDD string.
create_depth_charts(todaysdate, todaysdate)

2023-05-29 00:00:00


Note:
Scraping past bullpens is possible using the Wayback Machine. It returns the most recent snapshot as of the date provided in the URL, even if no snapshot exists for that exact date. So if a snapshot exists for 4/3 but not for 4/4, creating the depth chart for 4/4 will give you the same depth chart as 4/3. <br>
When running day-of, do not use the Wayback Machine, as it will return the last snapshot it captured rather than the current depth chart. <br>
The depth chart may not provide a pitcher at each leverage level, and it is prone to missing closers on occasion.

In [13]:
print(f"Code was last run on: {datetime.date.today()} at {datetime.datetime.now():%H:%M:%S}.")

Code was last run on: 2023-05-29 at 14:42:10.
