# A04. Bullpens
This notebook scrapes each team's bullpen depth chart from its mlb.com team page, falling back to the Wayback Machine when historical data is missing.
- Type: Data
- Run Frequency: Pre-contest
- Sources:
    - Team pages on mlb.com
    - Wayback Machine (if necessary)
- Dates:
    - Created: 9/23/2023
    - Updated: 4/21/2024

Note: This relies on the assumption that each depth chart is sorted by descending leverage, which appears to be largely correct.

### Imports

In [1]:
import sys
if not hasattr(sys.modules['__main__'], '__file__'):
    %run "C:\Users\james\Documents\MLB\Code\U1. Imports.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U2. Functions.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U3. Classes.ipynb"
    print("Imports executed")

Imports executed


### Games

In [2]:
if not hasattr(sys.modules['__main__'], '__file__'):
    # Only runs when __main__ has no __file__ attribute (e.g. an interactive session).
    # Inclusive date window, written in the same digit layout as game_df's date column.
    start_date = "20250319"
    end_date = "20250319"

    # Load every game, then keep the rows whose date (compared as a string)
    # lies between start_date and end_date, both ends included.
    all_game_df = pd.read_csv(os.path.join(baseball_path, "game_df.csv"))
    game_df = (
        all_game_df
        .loc[lambda games: games['date'].astype(str).between(start_date, end_date)]
        .reset_index(drop=True)
    )

game_df created.


### Functions

##### Scrape Bullpen Data from MLB.com Depth Charts

In [None]:
def _clean_player_name(raw_name):
    """Remove jersey numbers and the "(CL)" tag from a name, then strip accents."""
    without_tags = re.sub(r'\(CL\)|\d+', '', raw_name).strip()
    return remove_accents(without_tags).strip()


def _find_bullpen_table(tables):
    """Return the first parsed table that has a populated "Bullpen.1" column.

    Positions 2 and then 1 are tried first (the bullpen can be either of those
    two tables); any remaining tables are checked afterwards as a fallback.
    """
    preferred = [pos for pos in (2, 1) if pos < len(tables)]
    remaining = [pos for pos in range(len(tables)) if pos not in preferred]
    for pos in preferred + remaining:
        table = tables[pos]
        if "Bullpen.1" in table.columns and table["Bullpen.1"].notna().any():
            return table
    raise ValueError("No table with a populated 'Bullpen.1' column was found")


def _leverage_by_rank(n_pitchers):
    """Build the list of leverage codes for a depth chart with n_pitchers rows.

    Row 0 gets 4, rows 1-3 get 3, rows 4-10 get 2 and later rows get 0.
    The last row is then forced to 2, and if no row is left with a 3 the
    second-to-last row is set to 3.
    """
    tiers = []
    for rank in range(n_pitchers):
        if rank == 0:
            tiers.append(4)
        elif rank < 4:
            tiers.append(3)
        elif rank < 11:
            tiers.append(2)
        else:
            tiers.append(0)

    if tiers:
        tiers[-1] = 2
    # Needs at least two rows; a one-row chart simply keeps the last-row value.
    if len(tiers) >= 2 and 3 not in tiers:
        tiers[-2] = 3
    return tiers


def scrape_bullpen(mlburl, bbrefteam, historic=False, date=None, timeout=30):
    """Scrape one team's bullpen from its mlb.com depth chart page.

    Parameters
    ----------
    mlburl : str
        Team slug used in the mlb.com URL (https://www.mlb.com/{mlburl}/...).
    bbrefteam : str
        Team abbreviation written to the BBREFTEAM column of the result.
    historic : bool, default False
        If True, fetch the page through web.archive.org for the given date
        instead of fetching the live page.
    date : optional
        Value written to the date column; also the Wayback Machine timestamp
        when historic is True (required in that case).
    timeout : float, default 30
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per pitcher kept from the bullpen table, with columns
        Name, B/T, Leverage, URL, id, date and BBREFTEAM.

    Raises
    ------
    ValueError
        If historic is True without a date, if no bullpen table is found, or
        if no pitchers remain after dropping "IL-" and " Minors" entries.
    requests.HTTPError
        If the server answers with an error status code.
    """
    if historic and date is None:
        raise ValueError("date is required when historic=True")

    live_url = f"https://www.mlb.com/{mlburl}/roster/depth-chart"
    url = f"https://web.archive.org/web/{date}/{live_url}" if historic else live_url

    headers = {
        "User-Agent": "Mozilla/5.0",
        "X-Requested-With": "XMLHttpRequest"
    }

    # A timeout keeps a stalled connection from hanging the worker forever
    r = requests.get(url, headers=headers, timeout=timeout)
    # Pause after every request (presumably to avoid hammering the server)
    time.sleep(1)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page
    r.raise_for_status()

    # Read tables with pandas
    dfs = pd.read_html(StringIO(r.text), encoding='iso-8859-1')

    # Use BeautifulSoup to map cleaned player names to their player-page URLs
    soup = BeautifulSoup(r.text, 'html.parser')
    player_links = {}
    for tag in soup.select('a[href*="/player/"]'):
        name = tag.get_text(strip=True)
        if not name:
            continue
        href = tag['href']
        # Keep only the part from "/player/" onward so that absolute hrefs and
        # archive-rewritten hrefs still produce a valid mlb.com URL.
        _, marker, tail = href.partition("/player/")
        player_path = marker + tail if marker else href
        player_links[_clean_player_name(name)] = f"https://www.mlb.com{player_path}"

    # Bullpen can be one of two tables
    table = _find_bullpen_table(dfs)

    # Drop injured-list and minor-league entries; na=True also drops cells
    # that are missing or not text.
    excluded = table["Bullpen.1"].str.contains(r"IL-| Minors", na=True)
    df = table[~excluded].reset_index(drop=True)
    if df.empty:
        raise ValueError(f"No active bullpen pitchers found for {bbrefteam} at {url}")

    # Leverage code is derived purely from the row's position in the chart
    df['Leverage'] = _leverage_by_rank(len(df))

    # Split each cell at the first "B/T" marker: name before, handedness after.
    # partition always yields three columns, even if the marker is absent.
    parts = df['Bullpen.1'].str.partition("B/T")
    df['Name'] = parts[0].apply(_clean_player_name)

    # Rebuild B/T column
    # NOTE(review): the pattern only accepts L and R, so any other code
    # (e.g. a switch hitter, if the site shows one) yields NaN - confirm.
    df['B/T'] = parts[2].str.extract(r'([LR]+/[LR]+)', expand=False)

    # Add player URLs; object dtype keeps .str usable when nothing matched
    df['URL'] = df['Name'].map(player_links).astype(object)

    # Extract player's MLB id from URL (last path segment)
    df['id'] = df['URL'].str.split('/').str[-1]

    # Final columns; copy so the assignments below do not target a slice
    df = df[['Name', 'B/T', 'Leverage', 'URL', 'id']].copy()
    df['date'] = date
    df['BBREFTEAM'] = bbrefteam

    return df


##### Write to CSV

In [4]:
# Create bullpen dataframe (and csv)
# Note: can write to csv in parallel but not to sql
def bullpens(date, team_map, historic):
    """Scrape every team's bullpen for one date and write one CSV per team.

    Parameters
    ----------
    date
        Date label used in the output folder and file names; passed through
        to scrape_bullpen.
    team_map : pandas.DataFrame
        Must contain an 'MLBURL' column (team website slug) and a 'BBREFTEAM'
        column (team abbreviation).
    historic : bool
        Passed through to scrape_bullpen.

    Returns
    -------
    None
        Files are written to
        <baseball_path>/A04. Bullpens/Bullpens {date}/Bullpen {team} {date}.csv
    """
    # Create folder, if necessary
    out_dir = os.path.join(baseball_path, 'A04. Bullpens', f"Bullpens {date}")
    os.makedirs(out_dir, exist_ok=True)
    time.sleep(1)

    # Pair the two columns positionally so this works whatever team_map's
    # index labels are (e.g. after filtering without reset_index).
    for mlburl, bbrefteam in zip(team_map['MLBURL'], team_map['BBREFTEAM']):
        # Scrape bullpens
        bullpen_df = scrape_bullpen(mlburl, bbrefteam, historic, date)
        # To CSV
        csv_path = os.path.join(out_dir, f"Bullpen {bbrefteam} {date}.csv")
        bullpen_df.to_csv(csv_path, index=False, encoding='iso-8859-1')

### Run

##### Scrape in Parallel

In [5]:
%%time
empty_list = Parallel(n_jobs=-1, verbose=0)(delayed(bullpens)(date=date, team_map=team_map, historic=len(game_df['date'].unique()) > 1) for date in list(game_df['date'].unique()))

CPU times: total: 172 ms
Wall time: 40.3 s
