In [92]:
import pandas as pd
import numpy as np
import os
import statsmodels.formula.api as smf
import glob
import warnings
warnings.filterwarnings("ignore")
import datetime
from datetime import date
import time
import re
import import_ipynb
from Utilities import *
import pickle

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

baseball_path = r"C:\Users\james\Documents\MLB\Data"
download_path = r"C:\Users\james\Downloads"

In [93]:
# Set today's date
todaysdate = date.today()
todaysdate_dash = str(todaysdate)
todaysdate = todaysdate_dash.replace("-", "")
todaysdate

'20221126'

In [94]:
# List of stats
simple_list = ['so', 'b1', 'b2', 'b3', 'hr', 'bb', 'hbp', 'lo', 'po', 'go', 'fo']

# Read in models
woba_reg = pickle.load(open('woba_20220908.sav', 'rb'))
obp_reg = pickle.load(open('obp_20220908.sav', 'rb'))
slg_reg = pickle.load(open('slg_20220908.sav', 'rb'))

fg_vs_lhp = pickle.load(open('fg_vs_lhp_20220905.sav', 'rb'))
fg_vs_rhp = pickle.load(open('fg_vs_rhp_20220905.sav', 'rb'))
fg_vs_lhb = pickle.load(open('fg_vs_lhb_20220905.sav', 'rb'))
fg_vs_rhb = pickle.load(open('fg_vs_rhb_20220905.sav', 'rb'))

In [96]:
# Scrape batters from Fangraphs
def scrape_batters():
    # Set driver
    driver = webdriver.Chrome(executable_path=r'C:\Users\james\Documents\MLB\chromedriver.exe')
    # Choose url
    driver.get(f'https://www.fangraphs.com/projections.aspx?pos=all&stats=bat&type=steameru&team=0&lg=all&players=0') 
    # driver.get(f'https://www.fangraphs.com/projections.aspx?pos=all&stats=bat&type=steamer&team=0&lg=all&players=0')
    # Select element
    driver.execute_script("arguments[0].click();", WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="ProjectionBoard1_cmdCSV"]'))))
    
    # Find all Fangraphs downloads
    fangraphs_files = glob.glob(os.path.join(download_path, 'FanGraphs Leaderboard*.csv'))
    fangraphs_files.sort(key=os.path.getmtime)

    # Pick the second to last (this will be the highest number (which is above the unnumbered one)
    latest = fangraphs_files[-1]
    print(latest)

    batters_lb = pd.read_csv(os.path.join(download_path, latest), encoding='iso-8859-1')

    batters_lb = batters_lb.rename(columns={batters_lb.columns[0]: "Name"})
    
    print(batters_lb)
    
    filename = "df_" + todaysdate + ".csv"
    batters_lb.to_csv(os.path.join(baseball_path, "FanGraphs", "Batters", filename), encoding='iso-8859-1')
    
    try:
        driver.close()
    except:
        pass
    
# scrape_batters()

In [97]:
def scrape_pitchers():
    # Set driver
    driver = webdriver.Chrome(executable_path=r'r"C:\Users\james\Documents\MLB\Data\chromedriver.exe')
    # Choose url
    driver.get(f'https://www.fangraphs.com/projections.aspx?pos=all&stats=pit&type=steameru&team=0&lg=all&players=0')
    # Select element
    driver.execute_script("arguments[0].click();", WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="ProjectionBoard1_cmdCSV"]'))))

    # Find all Fangraphs downloads
    fangraphs_files = glob.glob(os.path.join(download_path, 'FanGraphs Leaderboard*.csv'))
    fangraphs_files.sort(key=os.path.getmtime)

    # Pick the second to last (this will be the highest number (which is above the unnumbered one)
    latest = fangraphs_files[-1]
    print(latest)

    pitchers_lb = pd.read_csv(os.path.join(download_path, latest), encoding='iso-8859-1')
    
    pitchers_lb = pitchers_lb.rename(columns={pitchers_lb.columns[0]: "Name"})
    
    print(pitchers_lb)

    filename = "Pitchers_FG_" + todaysdate + ".csv"
    pitchers_lb.to_csv(os.path.join(baseball_path, "FanGraphs", "Pitchers", filename), encoding='iso-8859-1')

    try:
        driver.close()
    except:
        pass
    
# scrape_pitchers()

In [98]:
def create_intermediate_batters(date):
    # Read in file
    filename = "Batters_FG_" + date + ".csv"
    df = pd.read_csv(os.path.join(baseball_path, "FanGraphs", "Batters", filename))
    # Create singles
    df['1B'] = df['H'] - df['2B'] - df['3B'] - df['HR']
    
    # Basic stats
    hit_list = ['1B', '2B', '3B', 'HR', 'BB', 'HBP', 'SO']

    # Advance stats
    rate_list = ['OBP', 'SLG', 'wOBA']
    for stat in hit_list:
        rate = stat + "_rate"
        rate_list.append(rate)
        df[rate] = df[stat] / df['PA']

    df['SBA'] = df['SB'] + df['CS']
    df['SBO'] = df['1B'] + df['BB'] + df['HBP']
    df['sba_imp'] = df['SBA'] / df['SBO']

    # Cap imputed SBA 
    df['sba_imp'] = np.where(df['sba_imp'] > 0.5, 0.5, df['sba_imp'])

    # Determine stolen base success rate
    df['sbr'] = df['SB'] / df['SBA']
    
    # This is for manual changes to fangraphs playerids. This occurred because Drey Jameson has a "real" playerid when looking back but doesn't have one in FG data from that day
    df['playerid'] = np.where(df['Name'] == "Matt Wallner", "26466", df['playerid'])
    df['playerid'] = np.where(df['Name'] == "Jordan Diaz", "22650", df['playerid'])
    df['playerid'] = np.where(df['Name'] == "Will Brennan", "25660", df['playerid'])
    df['playerid'] = np.where(df['Name'] == "Mike Siani", "22557", df['playerid'])
    df['playerid'] = np.where(df['Name'] == "Ezequiel Tovar", "24064", df['playerid'])
    df['playerid'] = np.where(df['Name'] == "Ford Proctor", "21634", df['playerid'])
    df['playerid'] = np.where(df['Name'] == "Logan O'Hoppe", "24729", df['playerid'])
    df['playerid'] = np.where(df['Name'] == "Francisco Alvarez", "26121", df['playerid'])
    df['playerid'] = np.where(df['Name'] == "Brian O'Keefe", "16680", df['playerid'])
    
    keep_list = ['Name', 'playerid', 'sba_imp', 'sbr'] + rate_list
    df = df[keep_list]
    df['sbr'].fillna(0.6, inplace=True) # assume 25th percentile 
    df['sba_imp'].fillna(1, inplace=True) # assume imputed
    df.columns = df.columns.str.lower()
    df.rename(columns={'1b_rate': 'b1_rate', '2b_rate': 'b2_rate', '3b_rate': 'b3_rate'}, inplace=True)

    df.dropna(inplace=True)
    
    sba_2b_reg = pickle.load(open('sba_2b_20220901.sav', 'rb'))
    df['sba_2b'] = sba_2b_reg.predict(df[['sba_imp']])

    sba_3b_reg = pickle.load(open('sba_3b_20220901.sav', 'rb'))
    df['sba_3b'] = sba_3b_reg.predict(df[['sba_imp']])

    sb_2b_reg = pickle.load(open('sb_2b_20220901.sav', 'rb'))
    df['sb_2b'] = sb_2b_reg.predict(df[['sbr']])

    sb_3b_reg = pickle.load(open('sb_3b_20220901.sav', 'rb'))
    df['sb_3b'] = sb_3b_reg.predict(df[['sbr']])
       
    return df

In [99]:
def create_intermediate_pitchers(date):
    # Read in file
    filename = "Pitchers_FG_" + date + ".csv"
    df = pd.read_csv(os.path.join(baseball_path, "FanGraphs", "Pitchers", filename))
    
    df['H/9'] = df['H'] / df['IP'] * 9
    df['HR/9'] = df['HR'] / df['IP'] * 9

    # This is for manual changes to playerids. This occurred because Drey Jameson has a "real" playerid when looking back but doesn't have one in FG data from that day
    df['playerid'] = np.where(df['Name'] == "Drey Jameson", "26260", df['playerid'])
    
    
    keep_list = ['playerid', 'H/9', 'HR/9', 'K/9', 'BB/9'] 
    df = df[keep_list]
    
    return df

In [100]:
def create_team_rosters(date=todaysdate):
    # Create new folder with daily rosters
    team_folder = "Daily" + date
    try:
        os.mkdir(os.path.join(baseball_path, "Rosters", "Teams", team_folder))
    except:
        pass
    
    # Locate daily rosters
    rosters_folder = "Rosters" + date
    rosters_path = os.path.join(baseball_path, "Rosters", "Rosters", rosters_folder)
    
    # Read in batter stats from API
    batter_filename = "Batters" + date + ".csv"
    batters_api = pd.read_csv(os.path.join(baseball_path, "API", "Inputs", "Batters", batter_filename))

    # And from FG
    batters = create_intermediate_batters(date)
    
    # Read in pitcher stats from API
    pitcher_filename = "Pitchers" + date + ".csv"
    pitchers_api = pd.read_csv(os.path.join(baseball_path, "API", "Inputs", "Pitchers", pitcher_filename))

    # And from FG
    pitchers = create_intermediate_pitchers(date)
    
    for filename in os.listdir(rosters_path):
        # Read in roster
        df = pd.read_csv(os.path.join(rosters_path, filename))

        # Destination     
        excel_file = filename.replace(".csv", "")
        excel_file = excel_file + ".xlsx"
        file_name = os.path.join(baseball_path, "Rosters", "Teams", team_folder, excel_file)


        ### Batters
        # Merge with stats from MLB API
        # We want a left merge because players in first game won't be in API data
        batter_df = df.merge(batters_api, left_on='id', right_on='batter', how='left', suffixes=("", "_api"))

        # Convert fangraphs ID to string 
        batter_df['key_fangraphs'].fillna("999999", inplace=True)
        batter_df['key_fangraphs'] = batter_df['key_fangraphs'].astype('int').astype('string')

        # Create dummy variable for if they're a lefty. This is necessary to project.
        batter_df['b_L'] = np.where(batter_df['batSide'] == "L", 1, 0)
        
        # print(batters.query('Name == "Matt Wallner"')['playerid'])
        # Get fangraphs projections
        # We want an inner merge because everyone should be in fangraphs data (or we don't care about them anyway!)
        batters_merged = batter_df.merge(batters, left_on='key_fangraphs', right_on='playerid', how='inner')
               
        # Predict stats
        # Vs. left-handed pitchers
        vs_lhp_preds = fg_vs_lhp.predict_proba(batters_merged[['b_L', 'b1_rate', 'b2_rate', 'b3_rate', 'hr_rate', 'bb_rate', 'hbp_rate', 'so_rate']])
        vs_lhp_df = pd.DataFrame(vs_lhp_preds, columns=fg_vs_lhp.classes_)

        vs_lhp_df['woba'] = woba_reg.predict(vs_lhp_df[simple_list])
        vs_lhp_df['obp'] = obp_reg.predict(vs_lhp_df[simple_list])
        vs_lhp_df['slg'] = slg_reg.predict(vs_lhp_df[simple_list])

        vs_lhp_df = vs_lhp_df.add_suffix("_l")

        # Vs. right-handed pitchers
        vs_rhp_preds = fg_vs_rhp.predict_proba(batters_merged[['b_L', 'b1_rate', 'b2_rate', 'b3_rate', 'hr_rate', 'bb_rate', 'hbp_rate', 'so_rate']])
        vs_rhp_df = pd.DataFrame(vs_rhp_preds, columns=fg_vs_rhp.classes_)

        vs_rhp_df['woba'] = woba_reg.predict(vs_rhp_df[simple_list])
        vs_rhp_df['obp'] = obp_reg.predict(vs_rhp_df[simple_list])
        vs_rhp_df['slg'] = slg_reg.predict(vs_rhp_df[simple_list])

        vs_rhp_df = vs_rhp_df.add_suffix("_r")

        batters_df = pd.concat([batters_merged, vs_lhp_df, vs_rhp_df], axis=1)
        # print(batters_df.query('playerid == "26466"'))
        
        # Save as Excel
        batters_df.to_excel(file_name, sheet_name="Batters", engine='openpyxl')


        ### Pitchers
        # Merge with stats from MLB API
        # We want a left merge because players in first game won't be in API data
        pitcher_df = df.merge(pitchers_api, left_on='id', right_on='pitcher', how='left', suffixes=("", "_api"))

        # Convert fangraphs ID to string 
        pitcher_df['key_fangraphs'].fillna("999999", inplace=True)
        pitcher_df['key_fangraphs'] = pitcher_df['key_fangraphs'].astype('int').astype('string')

        # Create dummy variable for if they're a lefty. This is necessary to project.
        pitcher_df['p_L'] = np.where(pitcher_df['pitchHand'] == "L", 1, 0)

        # Get fangraphs projections
        # We want an inner merge because everyone should be in fangraphs data (or we don't care about them anyway!)
        pitchers_merged = pitcher_df.merge(pitchers, left_on='key_fangraphs', right_on='playerid', how='inner')

        # Predict stats
        # Vs. left-handed pitchers
        vs_lhb_preds = fg_vs_lhb.predict_proba(pitchers_merged[['p_L', 'H/9', 'HR/9', 'K/9', 'BB/9']])
        vs_lhb_df = pd.DataFrame(vs_lhb_preds, columns=fg_vs_lhb.classes_)

        vs_lhb_df['woba'] = woba_reg.predict(vs_lhb_df[simple_list])
        vs_lhb_df['obp'] = obp_reg.predict(vs_lhb_df[simple_list])
        vs_lhb_df['slg'] = slg_reg.predict(vs_lhb_df[simple_list])

        vs_lhb_df = vs_lhb_df.add_suffix("_l")

        # Vs. right-handed pitchers
        vs_rhb_preds = fg_vs_rhb.predict_proba(pitchers_merged[['p_L', 'H/9', 'HR/9', 'K/9', 'BB/9']])
        vs_rhb_df = pd.DataFrame(vs_rhb_preds, columns=fg_vs_rhb.classes_)

        vs_rhb_df['woba'] = woba_reg.predict(vs_rhb_df[simple_list])
        vs_rhb_df['obp'] = obp_reg.predict(vs_rhb_df[simple_list])
        vs_rhb_df['slg'] = slg_reg.predict(vs_rhb_df[simple_list])

        vs_rhb_df = vs_rhb_df.add_suffix("_r")

        pitchers_df = pd.concat([pitchers_merged, vs_lhb_df, vs_rhb_df], axis=1)

        with pd.ExcelWriter(file_name, mode='a', engine='openpyxl') as writer:  
            pitchers_df.to_excel(writer, sheet_name='Pitchers')
            


In [101]:
# create_team_rosters("20220917")

In [102]:
# Loop over dates that we have fangraphs projections for
for filename in os.listdir(r"C:\Users\james\Documents\MLB\Data\FanGraphs\Batters"):
    date = filename[11:19]
    print(date)
    create_team_rosters(date)
    
# Breaks at 1003

20220914
20220915
20220916
20220917
20220918
20220919
20220920
20220921
20220922
20220923
20220924
20220925
20220926
20220927
20220928
20220929
20220930
20221001
20221002
20221003
20221004
20221005
20221006


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'C:\\Users\\james\\Documents\\MLB\\Data\\Rosters\\Rosters\\Rosters20221006'