In [18]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
from io import StringIO

import re
from unidecode import unidecode  # for remove_accents fallback

# Thin wrapper around unidecode, used when normalising player names.
def remove_accents(input_str):
    """Return ``input_str`` after passing it through ``unidecode``.

    Args:
        input_str: Text to convert; handed to ``unidecode`` unchanged.

    Returns:
        Whatever ``unidecode`` produces for ``input_str`` (presumably an
        ASCII-only rendering of the text -- see the unidecode docs).
    """
    ascii_text = unidecode(input_str)
    return ascii_text

def _clean_roster_name(raw_name):
    """Strip digits and the "(CL)" tag from a roster label, de-accent it, and trim whitespace."""
    return remove_accents(re.sub(r'\(CL\)|\d+', '', raw_name)).strip()


def _active_relievers(table):
    """Return the rows of ``table`` whose "Bullpen.1" text mentions neither "IL-" nor " Minors"."""
    roster = table["Bullpen.1"]
    # na=True marks missing cells as inactive, so they are dropped just like
    # the original `contains(...) == False` comparison dropped them.
    inactive = (
        roster.str.contains("IL-", regex=False, na=True)
        | roster.str.contains(" Minors", regex=False, na=True)
    )
    return table[~inactive].reset_index(drop=True)


def _leverage_tiers(n_relievers):
    """Build the per-row leverage list: 4 for row 0, 3 for rows 1-3, 2 for rows 4-10, else 0."""
    tiers = []
    for slot in range(n_relievers):
        if slot == 0:
            tiers.append(4)
        elif slot < 4:
            tiers.append(3)
        elif slot < 11:
            tiers.append(2)
        else:
            tiers.append(0)

    # The last listed pitcher is always forced to tier 2.
    if tiers:
        tiers[-1] = 2
    # If that override removed the only tier-3 arm (two-row bullpen), promote
    # the second-to-last row. Guarded so a 0/1-row table no longer raises.
    if len(tiers) >= 2 and 3 not in tiers:
        tiers[-2] = 3
    return tiers


def scrape_bullpen(mlburl, bbrefteam, historic=False, date=None):
    """Scrape a team's bullpen from the MLB.com depth-chart page.

    Args:
        mlburl: Team slug inserted into ``https://www.mlb.com/{mlburl}/roster/depth-chart``.
        bbrefteam: Value written verbatim into the ``BBREFTEAM`` column
            (presumably the Baseball-Reference team code -- confirm with callers).
        historic: When true, fetch the page through ``web.archive.org`` instead
            of the live site.
        date: Inserted into the archive URL when ``historic`` is true and always
            written verbatim into the ``date`` column (``None`` for live scrapes).
            NOTE(review): ``historic=True`` with ``date=None`` builds a URL
            containing the text "None"; confirm callers always pass a date.

    Returns:
        pandas.DataFrame with columns ``Name``, ``B/T``, ``Leverage``, ``URL``,
        ``date``, ``BBREFTEAM`` and ``id`` (the last path segment of ``URL``),
        one row per listed reliever that is not tagged "IL-" or " Minors".

    Raises:
        IndexError / KeyError / AttributeError: if neither the third nor the
            second table on the page has a usable "Bullpen.1" column.
        requests.exceptions.RequestException: on connection failure or timeout.
    """
    live_url = f"https://www.mlb.com/{mlburl}/roster/depth-chart"
    url = f"https://web.archive.org/web/{date}/{live_url}" if historic else live_url

    headers = {
        "User-Agent": "Mozilla/5.0",
        "X-Requested-With": "XMLHttpRequest"
    }

    # A timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(url, headers=headers, timeout=30)
    time.sleep(1)  # fixed one-second pause after every request

    # Read tables with pandas
    dfs = pd.read_html(StringIO(r.text), encoding='iso-8859-1')

    # Use BeautifulSoup to collect player links, keyed by the cleaned name so
    # the keys line up with the cleaned names derived from the table below.
    soup = BeautifulSoup(r.text, 'html.parser')
    player_links = {}
    for tag in soup.select('a[href*="/player/"]'):
        label = tag.get_text(strip=True)
        if label:
            player_links[_clean_roster_name(label)] = f"https://www.mlb.com{tag['href']}"

    # Bullpen can be one of two tables: prefer the third, fall back to the
    # second. Only the failures that mean "wrong table" are caught, so
    # KeyboardInterrupt and genuine bugs are no longer swallowed.
    try:
        bullpen = _active_relievers(dfs[2])
    except (IndexError, KeyError, AttributeError):
        bullpen = _active_relievers(dfs[1])

    # Each cell looks like "<name/number/(CL)>B/T<handedness>"; n=1 guarantees
    # at most two pieces regardless of how often "B/T" appears.
    pieces = bullpen["Bullpen.1"].str.split("B/T", n=1)
    names = pieces.str[0].map(_clean_roster_name, na_action='ignore')
    # NOTE(review): the pattern only accepts L/R, so any other handedness code
    # comes out as NaN -- confirm that is intended.
    bats_throws = pieces.str[1].str.extract(r'([LR]+/[LR]+)', expand=False)

    # Build a fresh frame instead of assigning onto a slice of the scraped
    # table, which avoids SettingWithCopyWarning.
    result = pd.DataFrame({
        'Name': names,
        'B/T': bats_throws,
        'Leverage': _leverage_tiers(len(bullpen)),
    })

    # Add player URLs if available (NaN where no link matched the name)
    result['URL'] = result['Name'].map(player_links)
    result['date'] = date
    result['BBREFTEAM'] = bbrefteam

    # Player id is the final path segment of the URL; non-string (missing)
    # URLs pass through untouched so an all-missing column does not raise.
    result['id'] = result['URL'].map(
        lambda link: link.rsplit('/', 1)[-1] if isinstance(link, str) else link
    )

    # The original ended with df.drop(columns=['url'], inplace=True); no such
    # column exists (it is 'URL'), so that call raised KeyError. URL is kept.
    return result


In [19]:
# Example: scrape the Red Sox bullpen from the live depth chart
# (historic and date are left at their defaults), tagged with "BOS".
df = scrape_bullpen("redsox", "BOS")
print(df)


                 Name  B/T  Leverage                                URL  date  \
0     Aroldis Chapman  L/L         4  https://www.mlb.com/player/547973  None   
1       Justin Slaten  R/R         3  https://www.mlb.com/player/686580  None   
2    Garrett Whitlock  R/R         3  https://www.mlb.com/player/676477  None   
3       Justin Wilson  L/L         3  https://www.mlb.com/player/458677  None   
4          Zack Kelly  R/R         2  https://www.mlb.com/player/677161  None   
5       Greg Weissert  R/R         2  https://www.mlb.com/player/669711  None   
6  Brennan Bernardino  L/L         2  https://www.mlb.com/player/657514  None   
7     Josh Winckowski  R/R         2  https://www.mlb.com/player/670174  None   

  BBREFTEAM      id  
0       BOS  547973  
1       BOS  686580  
2       BOS  676477  
3       BOS  458677  
4       BOS  677161  
5       BOS  669711  
6       BOS  657514  
7       BOS  670174  
