# A04. Bullpens
This scrapes bullpen depth charts from team websites, falling back to the Wayback Machine for missing historical data.
- Type: Data
- Run Frequency: Pre-contest
- Sources:
    - Team pages on mlb.com
    - Wayback Machine (if necessary)
- Dates:
    - Created: 9/23/2023
    - Updated: 4/21/2024

Note: This relies upon the belief that these depth charts are sorted by descending leverage, which appears largely correct.

In [21]:
# Extract bullpen information from MLB.com depth charts
def _active_relievers(table):
    """Return the rows of a depth-chart table not flagged as IL or Minors.

    Raises KeyError when the table has no "Bullpen.1" column, which is how
    the caller detects that it picked the wrong table.
    """
    listing = table["Bullpen.1"]
    # na=True marks rows with a missing listing as unavailable, so they are
    # dropped just as the previous `== False` comparison dropped them.
    unavailable = (
        listing.str.contains("IL-", regex=False, na=True)
        | listing.str.contains(" Minors", regex=False, na=True)
    )
    return table[~unavailable].reset_index(drop=True)


def _leverage_by_rank(n_pitchers):
    """Return one leverage score per depth-chart position, top row first.

    Position 0 gets 4 (closer), positions 1-3 get 3, positions 4-10 get 2 and
    anything deeper gets 0. Afterwards the last pitcher is forced to 2 and,
    if nobody holds a 3, the second-to-last pitcher is forced to 3.
    """
    scores = []
    for rank in range(n_pitchers):
        if rank == 0:
            scores.append(4)
        elif rank < 4:
            scores.append(3)
        elif rank < 11:
            scores.append(2)
        else:
            scores.append(0)

    # Always have a pitcher with leverage of 2
    if scores:
        scores[-1] = 2
    # Always have a pitcher with leverage of 3 (needs at least two pitchers)
    if len(scores) >= 2 and 3 not in scores:
        scores[-2] = 3
    return scores


def scrape_bullpen(mlburl, header, abbrev, historic=False, date=None):
    """Scrape one team's bullpen from its MLB.com depth chart.

    Parameters
    ----------
    mlburl : str
        Team slug used in the mlb.com URL.
    header : dict or None
        HTTP request headers. When empty or None, a browser-like default
        is used.
    abbrev : str
        Team abbreviation stored in the BBREFTEAM column.
    historic : bool
        When true, fetch the Wayback Machine snapshot for `date` instead of
        the live page.
    date : str or None
        Snapshot date used in the Wayback URL and stored in the date column.

    Returns
    -------
    pandas.DataFrame
        Columns Name, B/T, Leverage, date, BBREFTEAM; one row per reliever
        not flagged as IL or Minors, in depth-chart order.

    NOTE(review): an earlier comment described the tier after the closer as
    "the next five" pitchers, but the threshold in use assigns leverage 3 to
    three pitchers (positions 1-3). Confirm which is intended.
    """
    # URL
    if historic:
        url = f"https://web.archive.org/web/{date}/https://www.mlb.com/{mlburl}/roster/depth-chart"
    else:
        url = f"https://www.mlb.com/{mlburl}/roster/depth-chart"

    # Honour the caller's headers; fall back to a browser-like default
    request_headers = header or {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"
    }

    # Get data from URL
    # NOTE(review): TLS verification is disabled (verify=False); confirm this
    # is still required. The timeout keeps a stalled server from hanging the run.
    r = requests.get(url, headers=request_headers, verify=False, timeout=60)
    time.sleep(1)
    dfs = pd.read_html(r.text, encoding='iso-8859-1')

    # Bullpen can be one of two tables: try the third, fall back to the second
    try:
        df = _active_relievers(dfs[2])
    except (IndexError, KeyError, AttributeError):
        df = _active_relievers(dfs[1])

    # Leverage by depth-chart position (0 = not expected to pitch in relief)
    df['Leverage'] = _leverage_by_rank(len(df))

    # Extract name from column Bullpen.1: everything before the first "B/T"
    df['Name'] = df['Bullpen.1'].str.split("B/T", n=1).str[0]
    # Remove numbers
    df['Name'] = df['Name'].str.replace(r'\d+', '', regex=True)
    # Remove closer tag
    df['Name'] = df['Name'].str.replace(r"\(CL\)", '', regex=True)

    # Clean name
    df['Name'] = df['Name'].map(remove_accents)  # remove accents
    df['Name'] = df['Name'].str.strip()

    # Keep Name, Bats/Throws, Leverage (copy so the assignments below do not
    # write into a slice of the scraped table)
    df = df[['Name', 'B/T', 'Leverage']].copy()

    # Date
    df['date'] = date
    # Baseball Reference team abbreviation
    df['BBREFTEAM'] = abbrev

    return df

In [None]:
# Scrape every team's bullpen and write one csv per team
# Note: can write to csv in parallel but not to sql
def bullpens(date, team_map, historic):
    """Scrape each team's bullpen and save it as a CSV for `date`.

    Parameters
    ----------
    date : str
        Date label used in the output folder and file names, and passed on
        to scrape_bullpen.
    team_map : pandas.DataFrame
        Must provide the columns MLBURL and BBREFTEAM, one row per team.
    historic : bool
        Passed on to scrape_bullpen.

    Files are written to
    <baseball_path>/A04. Bullpens/Bullpens <date>/Bullpen <team> <date>.csv.
    Nothing is returned.
    """
    header = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"
    }

    # Create path: build it from baseball_path so the folder that is created
    # is the same folder the csv files are written to
    output_dir = os.path.join(baseball_path, "A04. Bullpens", f"Bullpens {date}")
    if os.path.exists(output_dir):
        print("Already exists")
    else:
        # exist_ok avoids a crash if a parallel run creates the folder first
        os.makedirs(output_dir, exist_ok=True)
        # Pause kept from the original; presumably lets the new folder settle
        time.sleep(1)

    # Scrape bullpens; zip walks rows by position, so it does not depend on
    # team_map having a 0..n-1 index
    for mlburl, bbrefteam in zip(team_map['MLBURL'], team_map['BBREFTEAM']):
        bullpen_df = scrape_bullpen(mlburl, header, bbrefteam, historic, date)
        # To csv
        bullpen_df.to_csv(os.path.join(output_dir, f"Bullpen {bbrefteam} {date}.csv"), index=False, encoding='iso-8859-1')