# A5. Stats
Source: FanGraphs API <br>

This notebook imports projected stats from FanGraphs <br>
It then calculates intermediate stats that aren't used directly in the models but are needed to build their inputs <br>

# Imports

In [1]:
import pandas as pd
import numpy as np
import os
import statsmodels.formula.api as smf
import glob
import warnings
warnings.filterwarnings("ignore")
import datetime
from datetime import date
import time
import re
import import_ipynb
from Utilities import *
import pickle

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# Base directories. Each one can be overridden with an environment variable so the
# notebook is not tied to a single machine; the defaults are the original locations.
model_path = os.environ.get("MLB_MODEL_PATH", r"C:\Users\james\Documents\MLB\Code\Models")
baseball_path = os.environ.get("MLB_DATA_PATH", r"C:\Users\james\Documents\MLB\Data")
download_path = os.environ.get("MLB_DOWNLOAD_PATH", r"C:\Users\james\Downloads")

importing Jupyter notebook from Utilities.ipynb


In [2]:
# Load the Chadwick register (player codes), requesting the id and name columns below.
keep_list = [
    'key_mlbam',
    'key_fangraphs',
    'name_first',
    'name_last',
]
chadwick = read_chadwick(keep_list)

In [3]:
# Today's date as strings: dashed ("YYYY-MM-DD") and compact ("YYYYMMDD")
todaysdate_dash = date.today().isoformat()
todaysdate = "".join(todaysdate_dash.split("-"))
todaysdate

'20230426'

In [4]:
# List of stats (outcome columns fed to the wOBA / OBP / SLG regressors)
simple_list = ['so', 'b1', 'b2', 'b3', 'hr', 'bb', 'hbp', 'lo', 'po', 'go', 'fo']


def _load_model(filename):
    """Unpickle one saved model from ``model_path`` and close the file afterwards."""
    # NOTE: pickle can execute arbitrary code while loading - only load trusted files.
    with open(os.path.join(model_path, filename), 'rb') as model_file:
        return pickle.load(model_file)


# Read in models
# Regressors applied to the predicted outcome probabilities
woba_reg = _load_model('woba_20220908.sav')
obp_reg = _load_model('obp_20220908.sav')
slg_reg = _load_model('slg_20220908.sav')

# Classifiers (used via predict_proba) for each handedness split
fg_vs_lhp = _load_model('fg_vs_lhp_20220905.sav')
fg_vs_rhp = _load_model('fg_vs_rhp_20220905.sav')
fg_vs_lhb = _load_model('fg_vs_lhb_20220905.sav')
fg_vs_rhb = _load_model('fg_vs_rhb_20220905.sav')

# Scrape FanGraphs

In [5]:
# Fangraphs API
def scrape_batters():
    """Download Steamer batter projections from the FanGraphs API and save a dated CSV.

    Reads the projections JSON, attaches MLBAM ids and names from the module-level
    ``chadwick`` table (matched on the FanGraphs id), adds the columns the later
    cells expect, writes ``Batters_FG_<todaysdate>.csv`` under ``baseball_path``
    and prints the "Brice Turang" row as a spot check.

    Returns:
        None. The CSV file and the printed row are the only outputs.
    """
    # Read in API json
    batters_lb = pd.read_json('https://www.fangraphs.com/api/projections?type=steamer&stats=bat&pos=all&team=0&players=0&lg=all')

    # Name is currently some weird thing with a bunch of data. We don't need all that
    batters_lb['Name'] = batters_lb['PlayerName']
    # Rename to match steamer. Note that steamerid = key_fangraphs
    batters_lb = batters_lb.rename(columns={'playerids': 'steamerid'})

    # Build the id lookup on a copy: rewriting chadwick itself would turn its missing
    # keys into the string "nan" for every later cell that tests them with isna().
    fg_lookup = chadwick[chadwick['key_fangraphs'].notna()].copy()
    # Convert to string and remove the trailing ".0" (anchored so only a suffix is cut)
    fg_lookup['key_fangraphs'] = (
        fg_lookup['key_fangraphs'].astype('str').str.replace(r'\.0$', "", regex=True)
    )

    # Merge with chadwick for mlbamid
    batters_lb = batters_lb.merge(fg_lookup, left_on='steamerid', right_on='key_fangraphs', how='left')

    # Create missing columns to match what's provided by steamer
    batters_lb['proj_date'] = todaysdate
    batters_lb['mlbamid'] = batters_lb['key_mlbam']
    batters_lb['bats'] = "MI"  # Not included in FanGraphs data
    batters_lb['playerid'] = batters_lb['steamerid']
    batters_lb['NIBB'] = batters_lb['BB'] - batters_lb['IBB']
    batters_lb = batters_lb.rename(
        columns={'name_first': 'firstname', 'name_last': 'lastname', 'minpos': 'position'}
    )

    # Keep relevant variables and in order
    batters_lb = batters_lb[['proj_date', 'mlbamid', 'steamerid', 'firstname', 'lastname', 'Team', 'position', 'bats', 'PA', 'IBB', 'NIBB',
                             'BB', 'SO', 'HBP', 'H', '2B', '3B', 'HR', 'OBP', 'SLG', 'wOBA', 'SB', 'CS', 'playerid', 'Name']]

    # Export to CSV
    filename = "Batters_FG_" + todaysdate + ".csv"
    batters_lb.to_csv(os.path.join(baseball_path, "A5. Stats - 1. FanGraphs", "Batters", filename), encoding='iso-8859-1')

    # Spot check on one known player
    print(batters_lb.query('Name == "Brice Turang"'))


scrape_batters()

    proj_date   mlbamid steamerid firstname lastname Team position bats   PA  \
268  20230426  668930.0     22186     Brice   Turang  MIL       2B   MI  390   

     IBB  ...  2B  3B  HR       OBP       SLG      wOBA  SB  CS  playerid  \
268    0  ...  16   1   6  0.307942  0.345596  0.290903  11   3     22186   

             Name  
268  Brice Turang  

[1 rows x 25 columns]


In [6]:
# Fangraphs API
def scrape_pitchers():
    """Download Steamer pitcher projections from the FanGraphs API and save a dated CSV.

    Reads the projections JSON, attaches MLBAM ids and names from the module-level
    ``chadwick`` table (matched on the FanGraphs id), adds the columns the later
    cells expect, prints the "Jared Shuster" row as a spot check and writes
    ``Pitchers_FG_<todaysdate>.csv`` under ``baseball_path``.

    Returns:
        None. The CSV file and the printed row are the only outputs.
    """
    # Read in API json
    pitchers_lb = pd.read_json('https://www.fangraphs.com/api/projections?type=steamer&stats=pit&pos=all&team=0&players=0&lg=all')

    # Name is currently some weird thing with a bunch of data. We don't need all that
    pitchers_lb['Name'] = pitchers_lb['PlayerName']
    # Rename to match steamer. Note that steamerid = key_fangraphs
    pitchers_lb = pitchers_lb.rename(columns={'playerids': 'steamerid'})

    # Build the id lookup on a copy: rewriting chadwick itself would turn its missing
    # keys into the string "nan" for every later cell that tests them with isna().
    fg_lookup = chadwick[chadwick['key_fangraphs'].notna()].copy()
    # Convert to string and remove the trailing ".0" (anchored so only a suffix is cut)
    fg_lookup['key_fangraphs'] = (
        fg_lookup['key_fangraphs'].astype('str').str.replace(r'\.0$', "", regex=True)
    )

    # Merge with chadwick for mlbamid
    pitchers_lb = pitchers_lb.merge(fg_lookup, left_on='steamerid', right_on='key_fangraphs', how='left')

    # Create missing columns to match what's provided by steamer
    pitchers_lb['proj_date'] = todaysdate
    pitchers_lb['mlbamid'] = pitchers_lb['key_mlbam']
    pitchers_lb['Throws'] = "MI"  # Not included in FanGraphs data
    pitchers_lb['playerid'] = pitchers_lb['steamerid']
    pitchers_lb = pitchers_lb.rename(
        columns={'name_first': 'firstname', 'name_last': 'lastname', 'minpos': 'position'}
    )

    # Keep relevant variables and in order
    pitchers_lb = pitchers_lb[['proj_date', 'mlbamid', 'steamerid', 'firstname', 'lastname', 'Throws', 'IP', 'G', 'GS', 'K/9',
                               'BB/9', 'H', 'HR', 'playerid', 'Name']]

    # Spot check on one known player
    print(pitchers_lb.query('Name == "Jared Shuster"'))

    # Export to CSV
    filename = "Pitchers_FG_" + todaysdate + ".csv"
    pitchers_lb.to_csv(os.path.join(baseball_path, "A5. Stats - 1. FanGraphs", "Pitchers", filename), encoding='iso-8859-1')


scrape_pitchers()

    proj_date   mlbamid steamerid firstname lastname Throws   IP   G  GS  \
161  20230426  694363.0     27472     Jared  Shuster     MI  111  19  19   

         K/9     BB/9    H  HR playerid           Name  
161  7.29813  2.83048  116  16    27472  Jared Shuster  


# Create Useful Stats

In [7]:
def create_intermediate_batters(date):
    """Build per-batter rate stats and stolen-base inputs from a saved FanGraphs file.

    Args:
        date: Date string used in the file name ``Batters_FG_<date>.csv``.

    Returns:
        DataFrame with lower-cased columns: name, mlbamid, playerid, sba_imp, sbr,
        obp, slg, woba, the per-PA rates (b1_rate, b2_rate, b3_rate, hr_rate,
        bb_rate, hbp_rate, so_rate) and the model outputs sba_2b, sba_3b, sb_2b
        and sb_3b. Rows with a missing or infinite value are dropped.
    """
    # Read in file. playerid is read as text: it mixes numeric ids with "sa..." ids
    # and the id fix-up below uses string methods on it.
    filename = "Batters_FG_" + date + ".csv"
    df = pd.read_csv(
        os.path.join(baseball_path, "A5. Stats - 1. FanGraphs", "Batters", filename),
        encoding='iso-8859-1',
        dtype={'playerid': str},
    )
    # Create singles
    df['1B'] = df['H'] - df['2B'] - df['3B'] - df['HR']

    # Basic stats
    hit_list = ['1B', '2B', '3B', 'HR', 'BB', 'HBP', 'SO']

    # Advance stats
    rate_list = ['OBP', 'SLG', 'wOBA']
    for stat in hit_list:
        rate = stat + "_rate"
        rate_list.append(rate)
        df[rate] = df[stat] / df['PA']

    # Stolen-base attempts per opportunity (opportunity = single, walk or HBP)
    df['SBA'] = df['SB'] + df['CS']
    df['SBO'] = df['1B'] + df['BB'] + df['HBP']
    # Cap imputed SBA at 0.15 (NaN from 0/0 is kept and filled below)
    df['sba_imp'] = (df['SBA'] / df['SBO']).clip(upper=0.15)

    # Determine stolen base success rate
    df['sbr'] = df['SB'] / df['SBA']

    keep_list = ['Name', 'mlbamid', 'playerid', 'sba_imp', 'sbr'] + rate_list
    # Copy so the assignments below act on this frame rather than on a view
    df = df[keep_list].copy()

    # Assign the filled column back: fillna(inplace=True) on a selected column is
    # chained assignment and leaves the frame unchanged under copy-on-write.
    df['sbr'] = df['sbr'].fillna(0.6)  # assume 25th percentile
    df['sba_imp'] = df['sba_imp'].fillna(0.05)  # assume imputed

    df.columns = df.columns.str.lower()
    df = df.rename(columns={'1b_rate': 'b1_rate', '2b_rate': 'b2_rate', '3b_rate': 'b3_rate'})

    # A zero PA gives infinite rates; treat them as missing so they are dropped too
    df = df.replace([np.inf, -np.inf], np.nan).dropna()

    # Stolen-base models: (output column, pickled model file, input column)
    stolen_base_models = [
        ('sba_2b', 'sba_2b_20220901.sav', 'sba_imp'),
        ('sba_3b', 'sba_3b_20220901.sav', 'sba_imp'),
        ('sb_2b', 'sb_2b_20220901.sav', 'sbr'),
        ('sb_3b', 'sb_3b_20220901.sav', 'sbr'),
    ]
    for out_col, model_file, input_col in stolen_base_models:
        # NOTE: pickle can execute arbitrary code while loading - trusted files only.
        with open(os.path.join(model_path, model_file), 'rb') as fh:
            model = pickle.load(fh)
        df[out_col] = model.predict(df[[input_col]])

    # Merge with chadwick to fix certain fangraphs ids
    df = df.merge(chadwick, left_on='mlbamid', right_on='key_mlbam', how='left')
    fg_key = df['key_fangraphs']
    # If key_fangraphs has been cast to text, a missing key is the literal "nan"
    # rather than NaN, so test for both.
    has_fg_key = fg_key.notna() & (fg_key.astype('str').str.lower() != 'nan')
    is_sa_id = df['playerid'].str.startswith("sa", na=False)
    # Replace temporary "sa..." ids with the real FanGraphs id when one is known
    df['playerid'] = np.where(is_sa_id & has_fg_key, fg_key, df['playerid'])
    # Back to text, removing the ".0" left by float-typed ids
    df['playerid'] = df['playerid'].astype('string').str.replace(r'\.0$', '', regex=True)

    df = df.drop(
        columns=['index', 'key_fangraphs', 'key_mlbam', 'name_first', 'name_last'],
        errors='ignore',
    )

    return df

In [8]:
def create_intermediate_pitchers(date):
    """Build per-pitcher per-9 rates from a saved FanGraphs pitcher file.

    Args:
        date: Date string used in the file name ``Pitchers_FG_<date>.csv``.

    Returns:
        DataFrame with playerid, mlbamid, H/9, HR/9, K/9 and BB/9, where temporary
        "sa..." playerids are replaced by the FanGraphs id from ``chadwick``
        whenever one is available.
    """
    # Read in file. playerid is read as text: it mixes numeric ids with "sa..." ids
    # and the id fix-up below uses string methods on it.
    filename = "Pitchers_FG_" + date + ".csv"
    df = pd.read_csv(
        os.path.join(baseball_path, "A5. Stats - 1. FanGraphs", "Pitchers", filename),
        encoding='iso-8859-1',
        dtype={'playerid': str},
    )

    df['H/9'] = df['H'] / df['IP'] * 9
    df['HR/9'] = df['HR'] / df['IP'] * 9

    # This is for manual changes to playerids. This occurred because Drey Jameson has a "real" playerid when looking back but doesn't have one in FG data from that day
    df['playerid'] = np.where(df['Name'] == "Drey Jameson", "26260", df['playerid'])

    keep_list = ['playerid', 'mlbamid', 'H/9', 'HR/9', 'K/9', 'BB/9']
    df = df[keep_list]

    # Merge with chadwick to fix certain fangraphs ids
    df = df.merge(chadwick, left_on='mlbamid', right_on='key_mlbam', how='left')
    fg_key = df['key_fangraphs']
    # If key_fangraphs has been cast to text, a missing key is the literal "nan"
    # rather than NaN, so test for both.
    has_fg_key = fg_key.notna() & (fg_key.astype('str').str.lower() != 'nan')
    is_sa_id = df['playerid'].str.startswith("sa", na=False)
    # Replace temporary "sa..." ids with the real FanGraphs id when one is known
    df['playerid'] = np.where(is_sa_id & has_fg_key, fg_key, df['playerid'])
    # Back to text, removing the ".0" left by float-typed ids
    df['playerid'] = df['playerid'].astype('string').str.replace(r'\.0$', '', regex=True)

    df = df.drop(
        columns=['index', 'key_fangraphs', 'key_mlbam', 'name_first', 'name_last'],
        errors='ignore',
    )

    return df

# Create Rosters

In [9]:
def _clean_fangraphs_key(players):
    """Drop rows without a FanGraphs id and return the rest with the id as clean text."""
    kept = players[players['key_fangraphs'].notna()].reset_index(drop=True)
    # Float-typed ids stringify as "12345.0"; strip that suffix only
    kept['key_fangraphs'] = (
        kept['key_fangraphs'].astype('string').str.replace(r'\.0$', "", regex=True)
    )
    return kept


def _project_vs_hand(features, classifier, suffix):
    """Predict outcome probabilities for one split and derive woba/obp/slg from them.

    Columns are named after ``classifier.classes_`` plus 'woba', 'obp' and 'slg',
    each with ``suffix`` appended.
    """
    split_df = pd.DataFrame(classifier.predict_proba(features), columns=classifier.classes_)
    split_df['woba'] = woba_reg.predict(split_df[simple_list])
    split_df['obp'] = obp_reg.predict(split_df[simple_list])
    split_df['slg'] = slg_reg.predict(split_df[simple_list])
    return split_df.add_suffix(suffix)


def _build_projection_sheet(roster, api_stats, fg_stats, api_key, hand_col, lefty_col,
                            feature_cols, model_vs_l, model_vs_r):
    """Join a roster to API and FanGraphs stats and append the "_l" / "_r" projections."""
    # Left merge: players in their first game won't be in the API data
    players = roster.merge(api_stats, left_on='id', right_on=api_key, how='left', suffixes=("", "_api"))
    players = _clean_fangraphs_key(players)
    # Dummy variable for being a lefty. This is necessary to project.
    players[lefty_col] = np.where(players[hand_col] == "L", 1, 0)
    # Inner merge: players without FanGraphs projections are dropped
    merged = players.merge(fg_stats, left_on='key_fangraphs', right_on='playerid', how='inner')

    features = merged[feature_cols]
    vs_l = _project_vs_hand(features, model_vs_l, "_l")
    vs_r = _project_vs_hand(features, model_vs_r, "_r")
    # merged and both prediction frames share a fresh 0..n-1 index, so rows line up
    return pd.concat([merged, vs_l, vs_r], axis=1)


def create_team_rosters(date=todaysdate):
    """Write one Excel workbook per roster file with batter and pitcher projections.

    For every CSV in ``A2. Rosters/Rosters<date>`` the roster is joined to the MLB
    API stats (``A4. Dataset``) and to the FanGraphs-based stats from
    ``create_intermediate_batters`` / ``create_intermediate_pitchers``, and the
    split models add outcome probabilities plus woba/obp/slg with "_l" and "_r"
    suffixes. Each workbook is saved in ``A5. Stats - 2. Teams/Daily<date>`` with
    a "Batters" and a "Pitchers" sheet.

    Args:
        date: Date string used in every folder and file name. The default is the
            value ``todaysdate`` had when this function was defined.

    Returns:
        None. The workbooks and the printed file names are the only outputs.
    """
    batter_features = ['b_L', 'b1_rate', 'b2_rate', 'b3_rate', 'hr_rate', 'bb_rate', 'hbp_rate', 'so_rate']
    pitcher_features = ['p_L', 'H/9', 'HR/9', 'K/9', 'BB/9']

    # Create new folder with daily rosters (no error if it already exists)
    team_folder = "Daily" + date
    team_path = os.path.join(baseball_path, "A5. Stats - 2. Teams", team_folder)
    os.makedirs(team_path, exist_ok=True)

    # Locate daily rosters
    rosters_folder = "Rosters" + date
    rosters_path = os.path.join(baseball_path, "A2. Rosters", rosters_folder)

    # Read in batter stats from API
    batter_filename = "Batters" + date + ".csv"
    batters_api = pd.read_csv(os.path.join(baseball_path, "A4. Dataset", "Batters", batter_filename), encoding='iso-8859-1')

    # And from FG
    batters = create_intermediate_batters(date)

    # Read in pitcher stats from API
    pitcher_filename = "Pitchers" + date + ".csv"
    pitchers_api = pd.read_csv(os.path.join(baseball_path, "A4. Dataset", "Pitchers", pitcher_filename), encoding='iso-8859-1')

    # And from FG
    pitchers = create_intermediate_pitchers(date)

    # Sorted for a stable order; anything that is not a roster CSV is skipped
    for filename in sorted(os.listdir(rosters_path)):
        if not filename.lower().endswith(".csv"):
            continue
        print(filename)
        # Read in roster
        df = pd.read_csv(os.path.join(rosters_path, filename), encoding='iso-8859-1')

        # Destination
        excel_file = os.path.splitext(filename)[0] + ".xlsx"
        file_name = os.path.join(team_path, excel_file)

        ### Batters: projections vs. left- and right-handed pitchers
        batters_df = _build_projection_sheet(
            df, batters_api, batters, 'batter', 'batSide', 'b_L',
            batter_features, fg_vs_lhp, fg_vs_rhp,
        )

        ### Pitchers: projections vs. left- and right-handed batters
        pitchers_df = _build_projection_sheet(
            df, pitchers_api, pitchers, 'pitcher', 'pitchHand', 'p_L',
            pitcher_features, fg_vs_lhb, fg_vs_rhb,
        )

        # Save both sheets through one writer instead of reopening the file to append
        with pd.ExcelWriter(file_name, engine='openpyxl') as writer:
            batters_df.to_excel(writer, sheet_name="Batters")
            pitchers_df.to_excel(writer, sheet_name='Pitchers')

In [10]:
# Build the per-team workbooks for today's date
create_team_rosters(date=todaysdate)

ARI20230426.csv
ATL20230426.csv
BAL20230426.csv
BOS20230426.csv
CHC20230426.csv
CHW20230426.csv
CIN20230426.csv
CLE20230426.csv
COL20230426.csv
DET20230426.csv
HOU20230426.csv
KCR20230426.csv
LAA20230426.csv
LAD20230426.csv
MIA20230426.csv
MIL20230426.csv
MIN20230426.csv
NYM20230426.csv
NYY20230426.csv
OAK20230426.csv
PHI20230426.csv
PIT20230426.csv
SDP20230426.csv
SEA20230426.csv
SFG20230426.csv
STL20230426.csv
TBR20230426.csv
TEX20230426.csv
TOR20230426.csv
WSN20230426.csv


# Run All

In [11]:
# # Loop over dates that we have fangraphs projections for
# for filename in os.listdir(r"C:\Users\james\Documents\MLB\Data\FanGraphs\Batters"):
#     date = filename[11:19]
#     print(date)
#     try:
#         create_team_rosters(date)
#     except:
#         print("it didn't work")
#     # create_team_rosters(date) 
    
# # Breaks at 1003

In [12]:
# Format one timestamp for both parts so the date and time cannot come from
# different clock reads (e.g. either side of midnight).
print("Code was last run on: {0:%Y-%m-%d} at {0:%H:%M:%S}.".format(datetime.datetime.now()))

Code was last run on: 2023-04-26 at 18:10:23.
