# A2. Rosters
Source: MLB Stats API <br>

Description: This scrapes roster information from the MLB Stats API. <br>
It can retrieve historical data, although it might miss some players.

### Imports

In [1]:
import pandas as pd
import numpy as np
import os
import datetime
import requests

from joblib import Parallel, delayed
from bs4 import BeautifulSoup

import statsapi
from statsapi import get

import sys
sys.path.append(r"C:\Users\james\Documents\MLB\Code")
from Utilities import *

# import import_ipynb
# from Utilities import *

import warnings
warnings.simplefilter(action="ignore")

baseball_path = r"C:\Users\james\Documents\MLB\Data"

In [2]:
# Today's date, in each of the formats used further down
# datetime.date object
todaysdate_dt = datetime.date.today()

# "YYYY-MM-DD" string
todaysdate_dash = todaysdate_dt.isoformat()

# "MM/DD/YYYY" string
todaysdate_slash = todaysdate_dt.strftime("%m/%d/%Y")

# "YYYYMMDD" string
todaysdate = todaysdate_dt.strftime("%Y%m%d")

In [3]:
# Load the Chadwick register (player ID crosswalk), keeping only the ID and name columns
keep_list = [
    'key_mlbam',
    'key_fangraphs',
    'key_bbref_minors',
    'key_bbref',
    'name_first',
    'name_last',
]
chadwick = read_chadwick(keep_list)

In [4]:
# Team map: team names, team codes, and the number FanGraphs uses in its URLs.
# Only the full team name and the Baseball Reference team code are needed here.
team_map_path = os.path.join(baseball_path, "Utilities", "Team Map.csv")
team_map = pd.read_csv(team_map_path)[['FULLNAME', 'BBREFTEAM']]

### Box Score

In [5]:
# Creates box score variables
def create_box(gamePk):
    """Return the (weather, wind) strings from a game's box score info.

    Parameters
    ----------
    gamePk : game identifier passed straight to `statsapi.boxscore_data`.

    Returns
    -------
    tuple
        `(weather, wind)`. Each falls back to a neutral default
        ("75 degrees, Clear." / "0 mph, L To R.") when the box score does not
        contain exactly one row with that label.
    """
    # Read in the boxscore info rows (one row per label/value pair)
    box = pd.json_normalize(statsapi.boxscore_data(gamePk, timecode=None), record_path='gameBoxInfo')

    weather = _box_value(box, "Weather", "75 degrees, Clear.")
    wind = _box_value(box, "Wind", "0 mph, L To R.")

    return weather, wind


# Looks up a single labelled value in the box score info table
def _box_value(box, label, default):
    """Return the `value` of the single row of `box` whose `label` column equals `label`.

    `default` is returned instead when the frame has no `label`/`value` columns
    (e.g. an empty box score) or when the label matches zero or several rows.
    This replaces a bare `except:` so that unrelated errors (and Ctrl-C) are no
    longer silently turned into default weather.
    """
    if 'label' not in box.columns or 'value' not in box.columns:
        return default

    matches = box.loc[box['label'] == label, 'value']
    if len(matches) != 1:
        return default

    return matches.item()

### Batting Order

In [6]:
# Creates dataframe of players and their spot in the batting order
def create_order(gamePk, teamId, date, team="away"):
    """Return one row per player in a game's box score, with batting-order slot.

    Parameters
    ----------
    gamePk : game identifier passed to the `game` endpoint.
    teamId : not used by this function; kept so existing callers keep working.
    date : not used by this function; kept so existing callers keep working.
    team : "away" or "home" - which side of the box score to read.

    Returns
    -------
    pandas.DataFrame
        Columns `id`, `fullName`, `position`, `status`, `order`, followed by the
        columns of the module-level `chadwick` frame (left-merged on
        `id` == `key_mlbam`). `order` is NaN for players without a
        `battingOrder` entry.
    """
    # Fetch the game feed once. Previously it was re-requested for every player
    # just to read that player's battingOrder.
    game = statsapi.get("game", {"gamePk": gamePk})
    players = game['liveData']['boxscore']['teams'][team]['players']

    rows = []
    for player in players.values():
        person = player['person']
        rows.append([
            person['id'],
            person['fullName'],
            player['position']['name'],
            player['status']['description'],
            # Players who are not in the lineup have no battingOrder key
            player.get('battingOrder', np.nan),
        ])

    df = pd.DataFrame(rows, columns=['id', 'fullName', 'position', 'status', 'order'])
    # Attach the Chadwick register IDs/names for each MLBAM id
    df = df.merge(chadwick, left_on='id', right_on='key_mlbam', how='left')

    return df

### Rosters

In [7]:
# Creates a roster
def create_roster(chadwick, teamId, date):
    """Return a team's active roster on a given date as a DataFrame.

    Parameters
    ----------
    chadwick : not used by this function; kept so existing callers keep working.
    teamId : team identifier sent as `teamId` to the `team_roster` endpoint.
    date : date sent as `date` to the `team_roster` endpoint.

    Returns
    -------
    pandas.DataFrame
        One row per roster entry with columns `id`, `fullName`, `firstName`,
        `lastName`, `position` (primary position abbreviation), `batSide` and
        `pitchHand` (handedness codes). `firstName`/`lastName` are "Missing"
        when the API omits them.

    Raises
    ------
    KeyError
        If a roster entry lacks `id`, `fullName`, `primaryPosition`, `batSide`
        or `pitchHand`.
    """
    # One request for the whole roster. Previously the identical request was
    # repeated for the loop bound and for every field of every player.
    response = statsapi.get("team_roster", {"teamId": teamId, "rosterType": "activeRoster", "date": date, "hydrate": "person"})

    rows = []
    for entry in response['roster']:
        person = entry['person']
        rows.append([
            person['id'],
            person['fullName'],
            person.get('firstName', "Missing"),
            person.get('lastName', "Missing"),
            person['primaryPosition']['abbreviation'],
            person['batSide']['code'],
            person['pitchHand']['code'],
        ])

    df = pd.DataFrame(rows, columns=['id', 'fullName', 'firstName', 'lastName', 'position', 'batSide', 'pitchHand'])

    return df

In [8]:
# Creates all rosters
def create_rosters(x):
    """Write the away and home roster CSVs for one scheduled game.

    Parameters
    ----------
    x : game record indexed by string keys. Reads `game_date` ("YYYY-MM-DD"),
        `game_id`, `venue_id`, `game_type`, `game_num`, `summary`, and for each
        side `<side>_id`, `<side>_name`, `<side>_probable_pitcher`.

    Side effects
    ------------
    Creates `<baseball_path>/A2. Rosters/Rosters<YYYYMMDD>/` if needed and writes
    one `<team><YYYYMMDD>.csv` per side into it. Returns None.
    """
    # Folder name uses the date without dashes
    date = x['game_date'].replace("-", "")
    directory = "Rosters" + date

    # exist_ok makes this safe when several parallel workers create the same
    # folder; genuine failures (e.g. permissions) now surface here instead of
    # later at to_csv.
    os.makedirs(os.path.join(baseball_path, "A2. Rosters", directory), exist_ok=True)

    # Weather and wind are per game, so fetch them once for both teams
    weather, wind = create_box(x['game_id'])

    # Away first, then home (same order the files were always written in)
    for side in ("away", "home"):
        _write_team_roster(x, side, date, directory, weather, wind)


# Builds and writes the roster CSV for one side ("away" or "home") of a game
def _write_team_roster(x, side, date, directory, weather, wind):
    """Build one team's roster for game `x` and write it to CSV.

    Parameters
    ----------
    x : game record (see `create_rosters`).
    side : "away" or "home"; selects the `<side>_*` keys of `x` and the box score side.
    date : game date as "YYYYMMDD", used in folder and file names.
    directory : name of the roster output folder under "A2. Rosters".
    weather, wind : strings stamped onto every row.

    Reads the depth chart
    `A1. Depth Charts/Depth<date>/Depth_Chart_<team>_<date>.csv` and writes
    `A2. Rosters/<directory>/<team><date>.csv`, both under `baseball_path`.
    """
    team_id = x[side + '_id']

    # Active roster plus everyone in the box score
    roster = create_roster(chadwick, team_id, x['game_date'])
    order = create_order(x['game_id'], team_id, date, side)

    # Outer merge keeps players who are in the box score but not on the roster
    roster = roster.merge(order, on='id', how='outer', suffixes=("", "_fill"))

    # Fill in missings. Assign the result back instead of calling
    # fillna(inplace=True) on a column selection, which does not update the
    # frame under pandas Copy-on-Write.
    roster['fullName'] = roster['fullName'].fillna(roster['fullName_fill'])
    roster['position'] = roster['position'].fillna(roster['position_fill'])
    roster['firstName'] = roster['firstName'].fillna(roster['name_first'])
    roster['lastName'] = roster['lastName'].fillna(roster['name_last'])
    # Handedness is unknown for box-score-only players; default to right
    roster['batSide'] = roster['batSide'].fillna("R")
    roster['pitchHand'] = roster['pitchHand'].fillna("R")

    # Determine starting pitcher
    roster['starter'] = (roster['fullName'] == x[side + '_probable_pitcher']).astype('int')

    roster['teamName'] = x[side + '_name']
    roster['venue_id'] = x['venue_id']
    roster['game_date'] = x['game_date']
    roster['game_type'] = x['game_type']
    roster['game_num'] = x['game_num']
    roster['summary'] = x['summary']

    roster['weather'] = weather
    roster['wind'] = wind

    # Acquire team name in short form. The inner merge leaves no rows when the
    # team name is not in team_map, in which case the code is "Missing".
    roster = roster.merge(team_map, left_on=['teamName'], right_on=['FULLNAME'], how='inner')
    teamname = roster['BBREFTEAM'].iloc[0] if len(roster) > 0 else "Missing"

    # Read in depth chart
    depthfolder = "Depth" + date
    depthfile = "Depth_Chart_" + teamname + "_" + date + ".csv"
    depth_chart = pd.read_csv(os.path.join(baseball_path, "A1. Depth Charts", depthfolder, depthfile), encoding='iso-8859-1')
    depth_chart = depth_chart[['Name', 'Leverage']]

    # Merge rosters with depth chart; players absent from it get Leverage 0
    roster = roster.merge(depth_chart, left_on='fullName', right_on='Name', how='left')
    roster['Leverage'] = roster['Leverage'].fillna(0)

    roster = roster.drop(columns=['teamName', 'Name', 'fullName_fill', 'position_fill', 'name_first', 'name_last'])

    # Players missing from the roster had their position filled from the box
    # score, which uses full names ("Pitcher") rather than abbreviations, so a
    # position longer than 2 characters flags them.
    roster['missing'] = np.where(roster['position'].str.len() > 2, 1, 0)
    # Change from Pitcher to P to match the roster abbreviations
    roster['position'] = np.where(roster['position'] == 'Pitcher', 'P', roster['position'])

    # Write to csv
    outname = teamname + date + ".csv"
    roster.to_csv(os.path.join(baseball_path, "A2. Rosters", directory, outname), encoding='iso-8859-1')

### Run

In [9]:
# Schedule of today's games (start and end date are both today)
games = statsapi.schedule(start_date=todaysdate_slash, end_date=todaysdate_slash)

# One create_rosters job per game, run in parallel
roster_jobs = (delayed(create_rosters)(x=game) for game in games)
Parallel(n_jobs=-2, verbose=5)(roster_jobs)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 15 concurrent workers.
[Parallel(n_jobs=-2)]: Done   3 out of  16 | elapsed:  1.3min remaining:  5.8min
[Parallel(n_jobs=-2)]: Done   7 out of  16 | elapsed:  1.4min remaining:  1.8min
[Parallel(n_jobs=-2)]: Done  11 out of  16 | elapsed:  1.4min remaining:   38.7s


Done


[Parallel(n_jobs=-2)]: Done  16 out of  16 | elapsed:  2.2min finished


In [10]:
# Fill in missing FanGraphs IDs
# Roster directory written for today's games
directory = "Rosters" + todaysdate
roster_dir = os.path.join(baseball_path, "A2. Rosters", directory)

# The folder is only created while processing a game, so it may not exist
# (e.g. no games today). Only CSV files are processed, in a stable order.
if os.path.isdir(roster_dir):
    roster_files = sorted(name for name in os.listdir(roster_dir) if name.endswith(".csv"))
else:
    roster_files = []

for filename in roster_files:
    roster_path = os.path.join(roster_dir, filename)
    # Roster files are written with iso-8859-1, so read and re-write them with
    # the same encoding rather than the UTF-8 default.
    df = pd.read_csv(roster_path, encoding='iso-8859-1')
    print(filename)
    df["missing_fg"] = df['key_fangraphs'].isna()
    for i in df.index[df["missing_fg"]]:
        print(df.at[i, 'fullName'])
        try:
            # First three characters of the file name are passed as the team code
            fangraphs_id = new_ids(df.at[i, 'fullName'], filename[0:3], "fangraphs")
            # .at writes into the frame itself; df['col'][i] = ... is chained
            # assignment and may write to a temporary copy instead.
            df.at[i, 'key_fangraphs'] = fangraphs_id
        except Exception:
            # Best effort: leave the ID empty if the lookup fails
            print("Missing")

    # NOTE(review): the file is read without index_col and written with the
    # default index, so each run adds another "Unnamed: 0"-style column. Left
    # as-is in case downstream code relies on the current layout.
    df.to_csv(roster_path, encoding='iso-8859-1')

ARI20230614.csv
ATL20230614.csv
AJ Smith-Shawver
Dylan Dodd
Jared Shuster
BAL20230614.csv
Joey Ortiz
BOS20230614.csv
Joe Jacques
Masataka Yoshida
CHC20230614.csv
Matt Mervis
Miguel Amaya
CHW20230614.csv
CIN20230614.csv
Andrew Abbott
Brandon Williamson
Casey Legumina
Elly De La Cruz
Matt McLain
TJ Hopkins
CLE20230614.csv
David Fry
Logan Allen
Tanner Bibee
COL20230614.csv
Brenton Doyle
Coco Montes
DET20230614.csv
Braden Bristo
Brendan White
Mason Englert
Reese Olson
HOU20230614.csv
Cesar Salazar
Corey Julks
Grae Kessinger
J.P. France
KCR20230614.csv
Austin Cox
LAA20230614.csv
Jose Soriano
Sam Bachman
Zach Neto
LAD20230614.csv
Bobby Miller
Jonny Deluca
Nick Robertson
MIA20230614.csv
Eury Perez
MIL20230614.csv
Andruw Monasterio
Blake Perkins
Joey Wiemer
MIN20230614.csv
Edouard Julien
NYM20230614.csv
Josh Walker
Kodai Senga
NYY20230614.csv
Anthony Volpe
Matt Krook
OAK20230614.csv
Hogan Harris
Lucas Erceg
Luis Medina
Ryan Noda
Shintaro Fujinami
PHI20230614.csv
PIT20230614.csv
Jose Hernandez


In [11]:
# # Fix all IDs
# folder_path = r"C:\Users\james\Documents\MLB\Data\A2. Rosters"

# # Loop over all Roster folders
# for folder_name in os.listdir(r"C:\Users\james\Documents\MLB\Data\A2. Rosters"): 
#     if folder_name.startswith("Rosters2023"):
#         # Full path
#         folder_full_path = os.path.join(folder_path, folder_name)

#         # Loop over files within Roster folders
#         for file_name in os.listdir(folder_full_path):
#             print(file_name)
#             # Read in Roster csv
#             df = pd.read_csv(os.path.join(folder_full_path, file_name), encoding='utf-8')
#             # Create missing FanGraphs ID flag
#             df["missing_fg"] = df['key_fangraphs'].isna()
#             # Loop over players
#             for i in range(len(df)):
#                 # If they're missing an ID
#                 if df['missing_fg'][i] == True:
#                     # Print their name
#                     print(df['fullName'][i])
#                     # Then, attempt to look it up
#                     # try:
#                 #     # Google player, team, and "fangraphs"
#                     fangraphs_id = new_ids(df['fullName'][i], file_name[0:3], "fangraphs")
#                     # Replace their ID with what's found
#                     df['key_fangraphs'][i] = fangraphs_id
#                     # If not, it's missing
#                     # except:
#                     #     print("Missing")

#             df.to_csv(os.path.join(folder_full_path, file_name))

Certain players (e.g. Trey Mancini, Astros, 9/15/22) don't appear in the roster, but they do appear in the batting order. To keep them, gather id, fullName, position, and batting order from the box score (which will include Mancini) and do an outer merge with the roster; Mancini is then kept, albeit with some missing information. <br>
That information can be filled in later, which beats not having any idea he's on the roster at all. <br>
You'll just need to fill in firstName, lastName, batSide, and pitchHand, unless you can find a way to do this all at once. <br>


The MLB Stats API is also worth exploring for: <br>
1) Weather <br>
2) Probables <br>
    Updates very quickly. Can use for locks <br>
3) Live stats (dashboard, maybe?) <br>
4) Final scores <br>
5) Full rosters (not just active)

In [12]:
# Stamp the notebook output with the date and time of this run
run_date = datetime.date.today()
run_time = datetime.datetime.now().strftime("%H:%M:%S")
print("Code was last run on: {} at {}.".format(run_date, run_time))

Code was last run on: 2023-06-14 at 17:48:20.
