# A5. Stats
Source: FanGraphs API <br>

This notebook imports Steamer projections (batters and pitchers) from the FanGraphs API. <br>
It then calculates intermediate stats that aren't used directly in the models but help us get there. <br>

### Imports

In [65]:
import pandas as pd
import numpy as np
import os
import statsmodels.formula.api as smf
import glob
import warnings
warnings.filterwarnings("ignore")
import datetime
from datetime import date
import time
import re
import pickle

import sys
sys.path.append(r"C:\Users\james\Documents\MLB\Code")
from Utilities import *

# import import_ipynb
# from Utilities import *

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# Folder holding the pickled stolen-base regression models (*.sav) loaded further down
model_path = r"C:\Users\james\Documents\MLB\Code\Models"
# Root of the data tree; every CSV/Excel path read or written below is built from it
baseball_path = r"C:\Users\james\Documents\MLB\Data2"
# Downloads folder (not referenced by the code visible in this notebook)
download_path = r"C:\Users\james\Downloads"

In [66]:
# Today's date, captured once and rendered in every format the notebook needs
# YYYY-MM-DD (datetime.date)
todaysdate_dt = datetime.date.today()

# YYYY-MM-DD (string)
todaysdate_dash = todaysdate_dt.isoformat()

# MM/DD/YYYY
todaysdate_slash = todaysdate_dt.strftime("%m/%d/%Y")

# YYYYMMDD
todaysdate = todaysdate_dt.strftime("%Y%m%d")

In [67]:
 # This reads in Chadwick register with player codes.
# Column names handed to read_chadwick (defined in Utilities): the MLBAM id,
# the FanGraphs id and the player's first/last name.
keep_list = [
    'key_mlbam',
    'key_fangraphs',
    'name_first',
    'name_last',
]
chadwick = read_chadwick(keep_list)

In [68]:
# Crosswalk of MLBAM ids to Steamer ids. The MLBAM column is renamed to
# 'mlbamid_fill' because it is only used to fill ids Chadwick could not supply.
crosswalk = (
    pd.read_csv(os.path.join(baseball_path, "Utilities", "Crosswalk.csv"), encoding='iso-8859-1')
    .loc[:, ['mlbamid', 'steamerid']]
    .rename(columns={'mlbamid': 'mlbamid_fill'})
)

### Scrape FanGraphs

In [69]:
# Fangraphs API
def scrape_batters():
    """Download today's Steamer batter projections from FanGraphs and save them as CSV.

    Uses the notebook-level ``chadwick`` and ``crosswalk`` frames to attach an
    MLBAM id to each projected player, then writes
    ``Batters_FG_<todaysdate>.csv`` under ``7. Stats/A. Raw FanGraphs/Batters``.

    Side effects:
        * Performs an HTTP request against the FanGraphs projections API.
        * Rewrites ``chadwick['key_fangraphs']`` in place as strings.
        * Writes the CSV file described above.

    Returns:
        None
    """
    # Read in API json
    batters_lb = pd.read_json('https://www.fangraphs.com/api/projections?type=steamer&stats=bat&pos=all&team=0&players=0&lg=all')

    # Name is currently some weird thing with a bunch of data. We don't need all that
    batters_lb['Name'] = batters_lb['PlayerName']
    # Rename to match steamer. Note that steamerid = key_fangraphs
    batters_lb = batters_lb.rename(columns={'playerids': 'steamerid'})

    # Convert the Chadwick FanGraphs key to string and remove the trailing .0
    # so it can be matched against steamerid
    chadwick['key_fangraphs'] = (chadwick['key_fangraphs']).astype('str')
    chadwick['key_fangraphs'] = chadwick['key_fangraphs'].str.replace(r'\.\d', "", regex=True)

    # Merge with chadwick for mlbamid, and with the crosswalk for a fallback id
    batters_lb = batters_lb.merge(chadwick, left_on='steamerid', right_on='key_fangraphs', how='left')
    batters_lb = batters_lb.merge(crosswalk, on='steamerid', how='left')

    # Create missing columns to match what's provided by steamer
    batters_lb['proj_date'] = todaysdate
    # Prefer the Chadwick id and fall back to the crosswalk id. This is an
    # explicit assignment: a chained ``df[col].fillna(..., inplace=True)`` does
    # not update the frame once pandas Copy-on-Write is active.
    batters_lb['mlbamid'] = batters_lb['key_mlbam'].fillna(batters_lb['mlbamid_fill'])
    batters_lb['bats'] = "MI" # Not included in FanGraphs data
    batters_lb['playerid'] = batters_lb['steamerid']
    batters_lb['NIBB'] = batters_lb['BB'] - batters_lb['IBB']
    batters_lb = batters_lb.rename(columns={'name_first': 'firstname', 'name_last': 'lastname', 'minpos': 'position'})

    # Keep relevant variables and in order
    batters_lb = batters_lb[['proj_date', 'mlbamid', 'steamerid', 'firstname', 'lastname', 'Team', 'position', 'bats',
                             'PA', 'IBB', 'NIBB', 'BB', 'SO', 'HBP', 'H', '2B', '3B', 'HR', 'OBP', 'SLG', 'wOBA', 'SB', 'CS', 'playerid', 'Name']]

    # Export to CSV
    batters_lb.to_csv(os.path.join(baseball_path, "7. Stats", "A. Raw FanGraphs", "Batters", "Batters_FG_" + todaysdate + ".csv"), encoding='iso-8859-1')

scrape_batters()

In [70]:
# Fangraphs API
def scrape_pitchers():
    """Download today's Steamer pitcher projections from FanGraphs and save them as CSV.

    Uses the notebook-level ``chadwick`` and ``crosswalk`` frames to attach an
    MLBAM id to each projected player, then writes
    ``Pitchers_FG_<todaysdate>.csv`` under ``7. Stats/A. Raw FanGraphs/Pitchers``.

    Side effects:
        * Performs an HTTP request against the FanGraphs projections API.
        * Rewrites ``chadwick['key_fangraphs']`` in place as strings.
        * Writes the CSV file described above.

    Returns:
        None
    """
    # Read in API json
    pitchers_lb = pd.read_json('https://www.fangraphs.com/api/projections?type=steamer&stats=pit&pos=all&team=0&players=0&lg=all')

    # Name is currently some weird thing with a bunch of data. We don't need all that
    pitchers_lb['Name'] = pitchers_lb['PlayerName']
    # Rename to match steamer. Note that steamerid = key_fangraphs
    pitchers_lb = pitchers_lb.rename(columns={'playerids': 'steamerid'})

    # Convert the Chadwick FanGraphs key to string and remove the trailing .0
    # so it can be matched against steamerid
    chadwick['key_fangraphs'] = (chadwick['key_fangraphs']).astype('str')
    chadwick['key_fangraphs'] = chadwick['key_fangraphs'].str.replace(r'\.\d', "", regex=True)

    # Merge with chadwick for mlbamid, and with the crosswalk for a fallback id
    pitchers_lb = pitchers_lb.merge(chadwick, left_on='steamerid', right_on='key_fangraphs', how='left')
    pitchers_lb = pitchers_lb.merge(crosswalk, on='steamerid', how='left')

    # Create missing columns to match what's provided by steamer
    pitchers_lb['proj_date'] = todaysdate
    # Prefer the Chadwick id and fall back to the crosswalk id. This is an
    # explicit assignment: a chained ``df[col].fillna(..., inplace=True)`` does
    # not update the frame once pandas Copy-on-Write is active.
    pitchers_lb['mlbamid'] = pitchers_lb['key_mlbam'].fillna(pitchers_lb['mlbamid_fill'])
    pitchers_lb['Throws'] = "MI" # Not included in FanGraphs data
    pitchers_lb['playerid'] = pitchers_lb['steamerid']
    pitchers_lb = pitchers_lb.rename(columns={'name_first': 'firstname', 'name_last': 'lastname', 'minpos': 'position'})

    # Keep relevant variables and in order
    pitchers_lb = pitchers_lb[['proj_date', 'mlbamid', 'steamerid', 'firstname', 'lastname', 'Throws',
                               'IP', 'G', 'GS', 'K/9', 'BB/9', 'H', 'HR', 'playerid', 'Name']]

    # Export to CSV
    pitchers_lb.to_csv(os.path.join(baseball_path, "7. Stats", "A. Raw FanGraphs", "Pitchers", "Pitchers_FG_" + todaysdate + ".csv"), encoding='iso-8859-1')

scrape_pitchers()

### Create Useful Stats

In [71]:
def create_intermediate_batters(date):
    """Turn a raw FanGraphs batter file into per-PA rates and stolen-base inputs.

    Reads ``Batters_FG_<date>.csv``, derives singles, per-PA event rates and
    stolen-base attempt/success rates, applies the four pickled stolen-base
    regressions from ``model_path``, repairs ``sa``-prefixed player ids using
    the notebook-level ``chadwick`` frame, and writes ``Batters_FG2_<date>.csv``.

    Args:
        date: Date string used in the file names (the notebook passes YYYYMMDD).
            Note that inside this function the name shadows ``datetime.date``.

    Returns:
        pandas.DataFrame: the cleaned batter table that was written to disk.
    """
    # Read in file
    filename = "Batters_FG_" + date + ".csv"
    df = pd.read_csv(os.path.join(baseball_path, "7. Stats", "A. Raw FanGraphs", "Batters", filename), encoding='iso-8859-1')
    # Create singles
    df['1B'] = df['H'] - df['2B'] - df['3B'] - df['HR']

    # Basic stats
    hit_list = ['1B', '2B', '3B', 'HR', 'BB', 'HBP', 'SO']

    # Advanced stats, followed by one per-PA rate for every basic stat
    rate_list = ['OBP', 'SLG', 'wOBA']
    for stat in hit_list:
        rate = stat + "_rate"
        rate_list.append(rate)
        df[rate] = df[stat] / df['PA']

    # Stolen base attempts, and the times on first that made an attempt possible
    df['SBA'] = df['SB'] + df['CS']
    df['SBO'] = df['1B'] + df['BB'] + df['HBP']
    df['sba_imp'] = df['SBA'] / df['SBO']

    # Cap imputed SBA
    df['sba_imp'] = np.where(df['sba_imp'] > 0.15, 0.15, df['sba_imp'])

    # Determine stolen base success rate
    df['sbr'] = df['SB'] / df['SBA']

    keep_list = ['Name', 'mlbamid', 'playerid', 'sba_imp', 'sbr'] + rate_list
    # Work on an explicit copy so the assignments below reliably update ``df``
    df = df[keep_list].copy()

    # Plain assignment instead of a chained ``df[col].fillna(..., inplace=True)``,
    # which does not update the frame under pandas Copy-on-Write
    df['sbr'] = df['sbr'].fillna(0.6) # assume 25th percentile
    df['sba_imp'] = df['sba_imp'].fillna(0.05) # assume low prob

    df.columns = df.columns.str.lower()
    df = df.rename(columns={'1b_rate': 'b1_rate', '2b_rate': 'b2_rate', '3b_rate': 'b3_rate'})

    # Drop every row that still has a missing value
    df = df.dropna()

    # Apply the pickled regressions: (output column, model file, input column).
    # NOTE: pickle.load executes arbitrary code - only load model files you trust.
    for out_col, model_file, feature in (
        ('sba_2b', 'sba_2b_20220901.sav', 'sba_imp'),
        ('sba_3b', 'sba_3b_20220901.sav', 'sba_imp'),
        ('sb_2b', 'sb_2b_20220901.sav', 'sbr'),
        ('sb_3b', 'sb_3b_20220901.sav', 'sbr'),
    ):
        # ``with`` closes the model file; a bare open() left the handle dangling
        with open(os.path.join(model_path, model_file), 'rb') as model_handle:
            reg = pickle.load(model_handle)
        df[out_col] = reg.predict(df[[feature]])

    # Merge with chadwick to fix certain fangraphs ids
    df = df.merge(chadwick, left_on='mlbamid', right_on='key_mlbam', how='left')

    fg_key = df['key_fangraphs']
    # The scrape cells cast chadwick['key_fangraphs'] to str, which can turn a
    # missing key into the literal text 'nan'; treat that as missing as well so
    # a valid playerid is never overwritten with "nan".
    has_fg_key = fg_key.notna() & (fg_key.astype('str').str.lower() != 'nan')
    # Cast first so the .str accessor works even if the CSV column was read as numeric
    playerid = df['playerid'].astype('string')
    has_sa_prefix = playerid.str.startswith("sa").fillna(False).astype(bool)

    df['playerid'] = np.where(has_sa_prefix & has_fg_key, fg_key, playerid)
    df['playerid'] = df['playerid'].astype('string')
    # Strip only a trailing ".0" left behind by float-typed ids
    df['playerid'] = df['playerid'].str.replace(r'\.0$', '', regex=True)

    # Remove the helper columns brought in by the chadwick merge
    # NOTE(review): 'index' is presumably added by read_chadwick - confirm
    df = df.drop(columns=['index', 'key_fangraphs', 'key_mlbam', 'name_first', 'name_last'])

    # Write intermediate FanGraphs data to csv
    df.to_csv(os.path.join(baseball_path, "7. Stats", "B. Clean FanGraphs", "Batters", "Batters_FG2_" + date + ".csv"), encoding='iso-8859-1')

    return df

In [72]:
def create_intermediate_pitchers(date):
    """Turn a raw FanGraphs pitcher file into per-nine-innings rates.

    Reads ``Pitchers_FG_<date>.csv``, derives H/9 and HR/9, renames K/9 and
    BB/9, repairs ``sa``-prefixed player ids using the notebook-level
    ``chadwick`` frame, and writes ``Pitchers_FG2_<date>.csv``.

    Args:
        date: Date string used in the file names (the notebook passes YYYYMMDD).
            Note that inside this function the name shadows ``datetime.date``.

    Returns:
        pandas.DataFrame: the cleaned pitcher table that was written to disk.
    """
    # Read in file
    filename = "Pitchers_FG_" + date + ".csv"
    df = pd.read_csv(os.path.join(baseball_path, "7. Stats", "A. Raw FanGraphs", "Pitchers", filename), encoding='iso-8859-1')

    # Hits and home runs allowed per nine innings
    df['H9'] = df['H'] / df['IP'] * 9
    df['HR9'] = df['HR'] / df['IP'] * 9

    df = df.rename(columns={'K/9': 'K9', 'BB/9': 'BB9'})

    keep_list = ['playerid', 'mlbamid', 'H9', 'HR9', 'K9', 'BB9']
    df = df[keep_list]

    # Merge with chadwick to fix certain fangraphs ids
    df = df.merge(chadwick, left_on='mlbamid', right_on='key_mlbam', how='left')

    fg_key = df['key_fangraphs']
    # The scrape cells cast chadwick['key_fangraphs'] to str, which can turn a
    # missing key into the literal text 'nan'; treat that as missing as well so
    # a valid playerid is never overwritten with "nan".
    has_fg_key = fg_key.notna() & (fg_key.astype('str').str.lower() != 'nan')
    # Cast first so the .str accessor works even if the CSV column was read as numeric
    playerid = df['playerid'].astype('string')
    has_sa_prefix = playerid.str.startswith("sa").fillna(False).astype(bool)

    df['playerid'] = np.where(has_sa_prefix & has_fg_key, fg_key, playerid)
    df['playerid'] = df['playerid'].astype('string')
    # Strip only a trailing ".0" left behind by float-typed ids
    df['playerid'] = df['playerid'].str.replace(r'\.0$', '', regex=True)

    # Remove the helper columns brought in by the chadwick merge
    # NOTE(review): 'index' is presumably added by read_chadwick - confirm
    df = df.drop(columns=['index', 'key_fangraphs', 'key_mlbam', 'name_first', 'name_last'])

    # Write intermediate FanGraphs data to csv
    df.to_csv(os.path.join(baseball_path, "7. Stats", "B. Clean FanGraphs", "Pitchers", "Pitchers_FG2_" + date + ".csv"), encoding='iso-8859-1')

    return df

In [73]:
def batter_merge(date):
    """Join the day's API batter dataset with the cleaned FanGraphs batter projections.

    Args:
        date: Date string used in the file names (the notebook passes YYYYMMDD).

    Returns:
        pandas.DataFrame: outer merge of the API rows (keyed on ``key_mlbam``)
        with the FanGraphs rows (keyed on ``mlbamid``).
    """
    # Batter stats from the API, run through fix_fangraphs (from Utilities;
    # per the original note it makes the id a string and removes a trailing .0)
    api_csv = os.path.join(baseball_path, "4. Dataset", "Batters", "Batters" + date + ".csv")
    api_stats = fix_fangraphs(pd.read_csv(api_csv, encoding='iso-8859-1'))

    # Batter projections from FanGraphs, with playerid mirrored into key_fangraphs
    fg_stats = create_intermediate_batters(date)
    fg_stats['key_fangraphs'] = fg_stats['playerid']

    # Merge API data with FG data
    return api_stats.merge(fg_stats, left_on='key_mlbam', right_on='mlbamid', how='outer')

In [74]:
def pitcher_merge(date):
    """Join the day's API pitcher dataset with the cleaned FanGraphs pitcher projections.

    Args:
        date: Date string used in the file names (the notebook passes YYYYMMDD).

    Returns:
        pandas.DataFrame: outer merge of the API rows (keyed on ``key_mlbam``)
        with the FanGraphs rows (keyed on ``mlbamid``).
    """
    # Pitcher stats from the API, run through fix_fangraphs (from Utilities;
    # per the original note it makes the id a string and removes a trailing .0)
    api_csv = os.path.join(baseball_path, "4. Dataset", "Pitchers", "Pitchers" + date + ".csv")
    api_stats = fix_fangraphs(pd.read_csv(api_csv, encoding='iso-8859-1'))

    # Pitcher projections from FanGraphs, with playerid mirrored into key_fangraphs
    fg_stats = create_intermediate_pitchers(date)
    fg_stats['key_fangraphs'] = fg_stats['playerid']

    # Merge API data with FG data
    return api_stats.merge(fg_stats, left_on='key_mlbam', right_on='mlbamid', how='outer')

### Create Rosters

In [75]:
def create_team_rosters(date=todaysdate):
    """Write one Excel workbook per team roster with merged batter and pitcher stats.

    For every roster CSV in ``6. Rosters/Rosters<date>`` the roster is merged
    with the combined API + FanGraphs batter and pitcher tables, and saved to
    ``7. Stats/C. Teams/Daily<date>/<roster name>.xlsx`` with a "Batters" and a
    "Pitchers" sheet.

    Args:
        date: Date string used in folder and file names (YYYYMMDD). Defaults to
            the value ``todaysdate`` had when this function was defined.

    Returns:
        None
    """
    # Create new folder with daily rosters. exist_ok makes re-runs harmless,
    # while real failures (e.g. permissions) surface here instead of being swallowed.
    team_folder = "Daily" + date
    team_dir = os.path.join(baseball_path, "7. Stats", "C. Teams", team_folder)
    os.makedirs(team_dir, exist_ok=True)

    # Locate daily rosters
    rosters_folder = "Rosters" + date
    rosters_path = os.path.join(baseball_path, "6. Rosters", rosters_folder)

    # Merge API and FG data
    batters_df = batter_merge(date)
    pitchers_df = pitcher_merge(date)

    # Sorted so the processing order (and the printed log) is deterministic
    for filename in sorted(os.listdir(rosters_path)):
        # Ignore anything in the folder that is not a roster CSV
        if not filename.lower().endswith(".csv"):
            continue
        print(filename)

        # Read in roster
        df = pd.read_csv(os.path.join(rosters_path, filename), encoding='iso-8859-1')

        # Destination
        excel_file = os.path.splitext(filename)[0] + ".xlsx"
        file_name = os.path.join(team_dir, excel_file)

        ### Batters
        batters_merged = df.merge(batters_df, left_on='id', right_on='batter', how='left', suffixes=("", "_api"))

        # Only keep batters; copy so the new column is not set on a slice
        batters_merged = batters_merged.query('position != "P"').copy()

        # Create dummy variable for if they're a lefty. This is necessary to project. (maybe move this)
        batters_merged['b_L'] = np.where(batters_merged['batSide'] == "L", 1, 0)

        ### Pitchers
        pitchers_merged = df.merge(pitchers_df, left_on='id', right_on='pitcher', how='left', suffixes=("", "_api"))

        # Only keep pitchers (including two-way players); copy for the same reason as above
        pitchers_merged = pitchers_merged[pitchers_merged['position'].isin(['P', 'TWP'])].copy()

        # Create dummy variable for if they're a lefty. This is necessary to project. (maybe move this)
        pitchers_merged['p_L'] = np.where(pitchers_merged['pitchHand'] == "L", 1, 0)

        # Save both sheets through one writer rather than writing the file and
        # then reopening it in append mode
        with pd.ExcelWriter(file_name, engine='openpyxl') as writer:
            batters_merged.to_excel(writer, sheet_name="Batters")
            pitchers_merged.to_excel(writer, sheet_name='Pitchers')

### Run One

In [76]:
# Build the per-team workbooks for today's date; each roster file name is printed as it is processed
create_team_rosters(todaysdate)

ARI20230616.csv
ATL20230616.csv
BAL20230616.csv
BOS20230616.csv
CHC20230616.csv
CHW20230616.csv
CIN20230616.csv
CLE20230616.csv
COL20230616.csv
DET20230616.csv
HOU20230616.csv
KCR20230616.csv
LAA20230616.csv
LAD20230616.csv
MIA20230616.csv
MIL20230616.csv
MIN20230616.csv
NYM20230616.csv
NYY20230616.csv
OAK20230616.csv
PHI20230616.csv
PIT20230616.csv
SDP20230616.csv
SEA20230616.csv
SFG20230616.csv
STL20230616.csv
TBR20230616.csv
TEX20230616.csv
TOR20230616.csv
WSN20230616.csv


### Run All

In [77]:
# Disabled backfill: would call create_team_rosters for every 2023 date found in the player-sims folder.
# NOTE(review): each date would also need its Batters_FG_/Pitchers_FG_ raw files on disk - confirm before re-enabling.
# for filename in os.listdir(r"C:\Users\james\Documents\MLB\Data\A8. Sims - 1. Players"): 
#     # 2023 
#     if filename.endswith(".csv") and filename.startswith("Player_Sims_2023"):
#         # Pull out date
#         date = filename[12:20]
#         print(date)
#         create_team_rosters(date)

In [78]:
# Record when the notebook last finished. The clock is read once so the date
# and the time always come from the same instant (two separate reads could
# disagree if the cell runs across midnight).
print(datetime.datetime.now().strftime("Code was last run on: %Y-%m-%d at %H:%M:%S."))

Code was last run on: 2023-06-16 at 17:01:43.
