In [1]:
import pandas as pd
import numpy as np
import os
import statsmodels.formula.api as smf
import glob
import warnings
warnings.filterwarnings("ignore")
import datetime
from datetime import date
import time
import re

import pickle

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

baseball_path = r"C:\Users\james\OneDrive\Documents\MLB\Data"
download_path = r"C:\Users\james\Downloads"

In [2]:
todaysdate = date.today()
todaysdate_dash = str(todaysdate)
todaysdate = todaysdate_dash.replace("-", "")
todaysdate

'20220930'

In [3]:
def remove_accents(old):
    new = re.sub(r'[àáâãäå]', 'a', old)
    new = re.sub(r'[èéêë]', 'e', new)
    new = re.sub(r'[ìíîï]', 'i', new)
    new = re.sub(r'[òóôõö]', 'o', new)
    new = re.sub(r'[ùúûü]', 'u', new)
    new = re.sub(r'[ñ]', 'n', new)
    return new

In [4]:
# Set driver
driver = webdriver.Chrome(executable_path=r'C:\Users\james\OneDrive\Documents\MLB\chromedriver.exe')
# Choose url
driver.get(f'https://www.fangraphs.com/projections.aspx?pos=all&stats=bat&type=steameru&team=0&lg=all&players=0')
# Select element
driver.execute_script("arguments[0].click();", WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="ProjectionBoard1_cmdCSV"]'))))

time.sleep(15)

In [5]:
# Find all Fangraphs downloads
fangraphs_files = glob.glob(os.path.join(download_path, 'FanGraphs Leaderboard*.csv'))
fangraphs_files.sort(key=os.path.getmtime)
  
# Pick the second to last (this will be the highest number (which is above the unnumbered one)
latest = fangraphs_files[-1]
print(latest)

batters_lb = pd.read_csv(os.path.join(download_path, latest))

filename = "Batters_FG_" + todaysdate + ".csv"
batters_lb.to_csv(os.path.join(baseball_path, "FanGraphs", "Batters", filename))


try:
    driver.close()
except:
    pass
    

batters_lb[['First', 'Last']] = batters_lb['Name'].str.split(" ", n=1, expand=True)

batters_lb['First2'] = batters_lb['First'].str.slice(0,2).str.lower()
batters_lb['Last5'] = batters_lb['Last'].str.slice(0,5).str.lower()

batters_lb['First2'] = batters_lb.apply(lambda x: remove_accents(x['First2']), axis=1)
batters_lb['Last5'] = batters_lb.apply(lambda x: remove_accents(x['Last5']), axis=1)
    
batters_lb

C:\Users\james\Downloads\FanGraphs Leaderboard - 2022-09-30T173516.113.csv


Unnamed: 0,Name,Team,G,PA,AB,H,2B,3B,HR,R,...,Fld,-1.2,Off,Def,WAR,playerid,First,Last,First2,Last5
0,Aaron Judge,NYY,157,702,577,180,29,0,63,135,...,3.2,,86.6,-0.7,11.3,15640,Aaron,Judge,aa,judge
1,Nolan Arenado,STL,150,632,568,166,42,1,31,74,...,12.8,,36.1,13.0,7.4,9777,Nolan,Arenado,no,arena
2,Manny Machado,SDP,151,654,584,173,37,1,32,101,...,5.8,,42.1,6.2,7.4,11493,Manny,Machado,ma,macha
3,Paul Goldschmidt,STL,153,663,571,181,42,0,36,106,...,-4.2,,63.2,-16.3,7.3,9218,Paul,Goldschmidt,pa,golds
4,Freddie Freeman,LAD,159,712,616,200,47,2,21,117,...,2.7,,53.0,-9.2,7.1,5361,Freddie,Freeman,fr,freem
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4196,Jeremy Santana,,0,1,1,0,0,0,0,0,...,0.0,,-0.1,0.0,,sa3018599,Jeremy,Santana,je,santa
4197,Robert Hendrick,,0,1,1,0,0,0,0,0,...,0.0,,-0.1,0.0,,sa3015573,Robert,Hendrick,ro,hendr
4198,Jean Estrada,,0,1,1,0,0,0,0,0,...,0.0,,-0.1,0.0,,sa3016153,Jean,Estrada,je,estra
4199,Ruben Yustiz,,0,1,1,0,0,0,0,0,...,0.0,,-0.1,0.0,,sa3016179,Ruben,Yustiz,ru,yusti


In [6]:
batters_fg = batters_lb.copy()
batters_fg['1B'] = batters_fg['H'] - batters_fg['2B'] - batters_fg['3B'] - batters_fg['HR']

hit_list = ['1B', '2B', '3B', 'HR', 'BB', 'HBP', 'SO']

rate_list = ['OBP', 'SLG', 'wOBA']
for stat in hit_list:
    rate = stat + "_rate"
    rate_list.append(rate)
    batters_fg[rate] = batters_fg[stat] / batters_fg['PA']
    
batters_fg['SBA'] = batters_fg['SB'] + batters_fg['CS']
batters_fg['SBO'] = batters_fg['1B'] + batters_fg['BB'] + batters_fg['HBP']
batters_fg['sba_imp'] = batters_fg['SBA'] / batters_fg['SBO']

# Cap imputed SBA 
batters_fg['sba_imp'] = np.where(batters_fg['sba_imp'] > 0.5, 0.5, batters_fg['sba_imp'])



batters_fg['sbr'] = batters_fg['SB'] / batters_fg['SBA']
    
keep_list = ['playerid', 'First2', 'Last5', 'sba_imp', 'sbr'] + rate_list
batters_fg = batters_fg[keep_list]
batters_fg['sbr'].fillna(0.6, inplace=True) # assume 25th percentile 
batters_fg.columns = batters_fg.columns.str.lower()
batters_fg.rename(columns={'1b_rate': 'b1_rate', '2b_rate': 'b2_rate', '3b_rate': 'b3_rate'}, inplace=True)
batters_fg.dropna(inplace=True)
batters_fg.describe()

Unnamed: 0,sba_imp,sbr,obp,slg,woba,b1_rate,b2_rate,b3_rate,hr_rate,bb_rate,hbp_rate,so_rate
count,665.0,665.0,665.0,665.0,665.0,665.0,665.0,665.0,665.0,665.0,665.0,665.0
mean,0.077342,0.685931,0.293857,0.355156,0.287167,0.136587,0.037949,0.003004,0.022968,0.078138,0.010581,0.248795
std,0.103867,0.249917,0.060849,0.10932,0.064604,0.046358,0.02746,0.005047,0.018015,0.039325,0.013141,0.085628
min,0.0,0.0,0.038,0.01,0.037,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.6,0.265,0.301,0.255,0.111111,0.025316,0.0,0.008403,0.054159,0.0,0.192
50%,0.038462,0.615385,0.3,0.365,0.294,0.137037,0.03856,0.0,0.022346,0.075188,0.007576,0.240214
75%,0.111111,0.888889,0.328,0.418,0.323,0.162791,0.049365,0.004785,0.034091,0.1,0.014337,0.298805
max,0.5,1.0,0.703,1.169,0.795,0.333333,0.5,0.058824,0.096774,0.285714,0.125,0.571429


In [7]:
sba_2b_reg = pickle.load(open('sba_2b_20220901.sav', 'rb'))
batters_fg['sba_2b'] = sba_2b_reg.predict(batters_fg[['sba_imp']])

sba_3b_reg = pickle.load(open('sba_3b_20220901.sav', 'rb'))
batters_fg['sba_3b'] = sba_3b_reg.predict(batters_fg[['sba_imp']])

sb_2b_reg = pickle.load(open('sb_2b_20220901.sav', 'rb'))
batters_fg['sb_2b'] = sb_2b_reg.predict(batters_fg[['sbr']])

sb_3b_reg = pickle.load(open('sb_3b_20220901.sav', 'rb'))
batters_fg['sb_3b'] = sb_3b_reg.predict(batters_fg[['sbr']])
batters_fg.describe()

Unnamed: 0,sba_imp,sbr,obp,slg,woba,b1_rate,b2_rate,b3_rate,hr_rate,bb_rate,hbp_rate,so_rate,sba_2b,sba_3b,sb_2b,sb_3b
count,665.0,665.0,665.0,665.0,665.0,665.0,665.0,665.0,665.0,665.0,665.0,665.0,665.0,665.0,665.0,665.0
mean,0.077342,0.685931,0.293857,0.355156,0.287167,0.136587,0.037949,0.003004,0.022968,0.078138,0.010581,0.248795,0.048113,0.013608,0.710657,0.628366
std,0.103867,0.249917,0.060849,0.10932,0.064604,0.046358,0.02746,0.005047,0.018015,0.039325,0.013141,0.085628,0.062515,0.022442,0.167492,0.060746
min,0.0,0.0,0.038,0.01,0.037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001563,-0.003103,0.25095,0.461639
25%,0.0,0.6,0.265,0.301,0.255,0.111111,0.025316,0.0,0.008403,0.054159,0.0,0.192,0.001563,-0.003103,0.653066,0.607479
50%,0.038462,0.615385,0.3,0.365,0.294,0.137037,0.03856,0.0,0.022346,0.075188,0.007576,0.240214,0.024712,0.005207,0.663377,0.611219
75%,0.111111,0.888889,0.328,0.418,0.323,0.162791,0.049365,0.004785,0.034091,0.1,0.014337,0.298805,0.068438,0.020904,0.846678,0.677698
max,0.5,1.0,0.703,1.169,0.795,0.333333,0.5,0.058824,0.096774,0.285714,0.125,0.571429,0.3025,0.104928,0.921144,0.704705


In [8]:
# Import batters data to get handedness data
batter_df = pd.read_csv(os.path.join(baseball_path, "API", "Inputs", "Batters.csv"))
batter_hands = batter_df[['batter', 'batSide_l', 'key_fangraphs']]
batter_hands['b_L'] = np.where(batter_hands['batSide_l'] == "L", 1, 0)
batter_hands.drop(columns={'batSide_l'}, inplace=True)

batters_fg['playerid'] = pd.to_numeric(batters_fg['playerid'], errors='coerce')
print(batters_fg['playerid'])


# Merge onto existing data
batters_fg = batters_fg.merge(batter_hands, left_on='playerid', right_on='key_fangraphs', how='left')
batters_fg['b_L'].fillna(0, inplace=True)

batters_fg

0       15640.0
1        9777.0
2       11493.0
3        9218.0
4        5361.0
         ...   
3200     3269.0
3201     1744.0
3202     3142.0
3224     3086.0
3301     9927.0
Name: playerid, Length: 665, dtype: float64


Unnamed: 0,playerid,first2,last5,sba_imp,sbr,obp,slg,woba,b1_rate,b2_rate,...,bb_rate,hbp_rate,so_rate,sba_2b,sba_3b,sb_2b,sb_3b,batter,key_fangraphs,b_L
0,15640.0,aa,judge,0.093596,0.842105,0.424,0.690,0.459,0.125356,0.041311,...,0.156695,0.007123,0.247863,0.057896,0.017120,0.815324,0.666327,592450.0,15640.0,0.0
1,9777.0,no,arena,0.052632,0.625000,0.358,0.534,0.381,0.145570,0.066456,...,0.083861,0.011076,0.117089,0.033241,0.008269,0.669821,0.613556,571448.0,9777.0,0.0
2,11493.0,ma,macha,0.058824,0.900000,0.367,0.528,0.380,0.157492,0.056575,...,0.100917,0.001529,0.203364,0.036967,0.009607,0.854124,0.680399,592518.0,11493.0,0.0
3,9218.0,pa,golds,0.037234,1.000000,0.403,0.580,0.419,0.155354,0.063348,...,0.120664,0.007541,0.215686,0.023973,0.004942,0.921144,0.704705,502671.0,9218.0,0.0
4,5361.0,fr,freem,0.073394,0.812500,0.406,0.512,0.392,0.182584,0.066011,...,0.116573,0.007022,0.144663,0.045737,0.012755,0.795482,0.659131,518692.0,5361.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
809,3269.0,ro,cano,0.000000,0.600000,0.184,0.192,0.170,0.123810,0.009524,...,0.038095,0.000000,0.238095,0.001563,-0.003103,0.653066,0.607479,429664.0,3269.0,1.0
810,1744.0,mi,cabre,0.008696,1.000000,0.302,0.320,0.274,0.194508,0.025172,...,0.061785,0.006865,0.233410,0.006797,-0.001224,0.921144,0.704705,408234.0,1744.0,0.0
811,3142.0,ro,chiri,0.021739,1.000000,0.268,0.292,0.255,0.099548,0.040724,...,0.090498,0.018100,0.307692,0.014647,0.001594,0.921144,0.704705,455139.0,3142.0,0.0
812,3086.0,mi,morel,0.000000,0.600000,0.299,0.418,0.312,0.333333,0.000000,...,0.000000,0.000000,0.333333,0.001563,-0.003103,0.653066,0.607479,519048.0,3086.0,1.0


In [9]:
simple_list = ['so', 'b1', 'b2', 'b3', 'hr', 'bb', 'hbp', 'lo', 'po', 'go', 'fo']

# Read in models
woba_reg = pickle.load(open('woba_20220908.sav', 'rb'))
obp_reg = pickle.load(open('obp_20220908.sav', 'rb'))
slg_reg = pickle.load(open('slg_20220908.sav', 'rb'))

fg_vs_lhp = pickle.load(open('fg_vs_lhp_20220905.sav', 'rb'))
fg_vs_rhp = pickle.load(open('fg_vs_rhp_20220905.sav', 'rb'))
fg_vs_lhb = pickle.load(open('fg_vs_lhb_20220905.sav', 'rb'))
fg_vs_rhb = pickle.load(open('fg_vs_rhb_20220905.sav', 'rb'))

In [10]:
# Create vs lhp dataframe
vs_lhp_preds = fg_vs_lhp.predict_proba(batters_fg[['b_L', 'b1_rate', 'b2_rate', 'b3_rate', 'hr_rate', 'bb_rate', 'hbp_rate', 'so_rate']])
vs_lhp_df = pd.DataFrame(vs_lhp_preds, columns=fg_vs_lhp.classes_)

vs_lhp_df['woba'] = woba_reg.predict(vs_lhp_df[simple_list])
vs_lhp_df['obp'] = obp_reg.predict(vs_lhp_df[simple_list])
vs_lhp_df['slg'] = slg_reg.predict(vs_lhp_df[simple_list])

vs_lhp_df = vs_lhp_df.add_suffix("_l")

vs_lhp_df

Unnamed: 0,b1_l,b2_l,b3_l,bb_l,fo_l,go_l,hbp_l,hr_l,lo_l,po_l,so_l,woba_l,obp_l,slg_l
0,0.119824,0.031250,0.001060,0.173680,0.116497,0.161172,0.007342,0.072280,0.033023,0.038686,0.245185,0.421931,0.405436,0.478348
1,0.152685,0.081511,0.002942,0.092719,0.160049,0.192524,0.006259,0.046459,0.072455,0.082531,0.109866,0.407364,0.382576,0.513893
2,0.150698,0.066488,0.003395,0.109652,0.125010,0.208695,0.004271,0.046498,0.048851,0.040728,0.195714,0.397701,0.381002,0.482399
3,0.145579,0.064175,0.003150,0.133407,0.117928,0.191974,0.005629,0.050536,0.043868,0.037041,0.206711,0.415590,0.402477,0.488632
4,0.168988,0.056658,0.006497,0.089327,0.114567,0.262998,0.009261,0.018565,0.056529,0.035467,0.181144,0.338201,0.349296,0.379562
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
809,0.118103,0.012799,0.000326,0.027805,0.098667,0.416386,0.001601,0.003451,0.055246,0.062520,0.203097,0.148679,0.164084,0.157242
810,0.188290,0.030391,0.001431,0.060363,0.109200,0.292989,0.002954,0.011212,0.062189,0.024244,0.216737,0.274270,0.294641,0.298683
811,0.093273,0.042962,0.002322,0.097408,0.124170,0.209289,0.011571,0.014542,0.054945,0.042480,0.307036,0.246138,0.262078,0.247581
812,0.300948,0.009108,0.001739,0.013270,0.033920,0.326737,0.000204,0.000907,0.029754,0.007356,0.276058,0.291796,0.326175,0.327751


In [11]:
# Create vs rhp dataframe
vs_rhp_preds = fg_vs_rhp.predict_proba(batters_fg[['b_L', 'b1_rate', 'b2_rate', 'b3_rate', 'hr_rate', 'bb_rate', 'hbp_rate', 'so_rate']])
vs_rhp_df = pd.DataFrame(vs_rhp_preds, columns=fg_vs_rhp.classes_)

vs_rhp_df['woba'] = woba_reg.predict(vs_rhp_df[simple_list])
vs_rhp_df['obp'] = obp_reg.predict(vs_rhp_df[simple_list])
vs_rhp_df['slg'] = slg_reg.predict(vs_rhp_df[simple_list])

vs_rhp_df = vs_rhp_df.add_suffix("_r")

vs_rhp_df

Unnamed: 0,b1_r,b2_r,b3_r,bb_r,fo_r,go_r,hbp_r,hr_r,lo_r,po_r,so_r,woba_r,obp_r,slg_r
0,0.119709,0.048722,0.000700,0.140875,0.112293,0.166009,0.005323,0.086605,0.041074,0.035943,0.242747,0.448865,0.401934,0.569504
1,0.143963,0.070449,0.002736,0.073306,0.177115,0.215018,0.010484,0.049762,0.064624,0.068463,0.124080,0.381843,0.350700,0.497147
2,0.156465,0.060277,0.001393,0.091977,0.128488,0.211688,0.002388,0.048662,0.053647,0.040751,0.204264,0.382688,0.361163,0.477676
3,0.152611,0.071253,0.001113,0.109772,0.116671,0.193645,0.005748,0.055721,0.049984,0.034618,0.208862,0.421994,0.396219,0.524857
4,0.197418,0.069114,0.002153,0.115699,0.124696,0.238408,0.005089,0.032901,0.054421,0.026669,0.133431,0.416991,0.422374,0.476463
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
809,0.119591,0.019512,0.001021,0.034613,0.136324,0.308115,0.001819,0.009546,0.074085,0.057121,0.238253,0.177052,0.186102,0.199043
810,0.199345,0.029950,0.001380,0.052627,0.112045,0.262366,0.006037,0.009448,0.061888,0.029667,0.235246,0.276650,0.298788,0.302841
811,0.102565,0.044280,0.001814,0.078674,0.121414,0.204336,0.017262,0.015658,0.051204,0.048387,0.314405,0.248700,0.260254,0.264786
812,0.336612,0.025707,0.000446,0.029209,0.054399,0.216031,0.000491,0.002570,0.044853,0.005181,0.284500,0.356886,0.395035,0.400070


In [12]:
batters_fg = batters_fg[['playerid', 'batter', 'b_L', 'first2', 'last5', 'sba_2b', 'sba_3b', 'sb_2b', 'sb_3b']]
batters_fg.rename(columns={'first2': 'First2', 'last5':'Last5'},inplace=True)
batters_fg = pd.concat([batters_fg, vs_lhp_df, vs_rhp_df], axis=1)
batters_fg


Unnamed: 0,playerid,batter,b_L,First2,Last5,sba_2b,sba_3b,sb_2b,sb_3b,b1_l,...,fo_r,go_r,hbp_r,hr_r,lo_r,po_r,so_r,woba_r,obp_r,slg_r
0,15640.0,592450.0,0.0,aa,judge,0.057896,0.017120,0.815324,0.666327,0.119824,...,0.112293,0.166009,0.005323,0.086605,0.041074,0.035943,0.242747,0.448865,0.401934,0.569504
1,9777.0,571448.0,0.0,no,arena,0.033241,0.008269,0.669821,0.613556,0.152685,...,0.177115,0.215018,0.010484,0.049762,0.064624,0.068463,0.124080,0.381843,0.350700,0.497147
2,11493.0,592518.0,0.0,ma,macha,0.036967,0.009607,0.854124,0.680399,0.150698,...,0.128488,0.211688,0.002388,0.048662,0.053647,0.040751,0.204264,0.382688,0.361163,0.477676
3,9218.0,502671.0,0.0,pa,golds,0.023973,0.004942,0.921144,0.704705,0.145579,...,0.116671,0.193645,0.005748,0.055721,0.049984,0.034618,0.208862,0.421994,0.396219,0.524857
4,5361.0,518692.0,1.0,fr,freem,0.045737,0.012755,0.795482,0.659131,0.168988,...,0.124696,0.238408,0.005089,0.032901,0.054421,0.026669,0.133431,0.416991,0.422374,0.476463
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
809,3269.0,429664.0,1.0,ro,cano,0.001563,-0.003103,0.653066,0.607479,0.118103,...,0.136324,0.308115,0.001819,0.009546,0.074085,0.057121,0.238253,0.177052,0.186102,0.199043
810,1744.0,408234.0,0.0,mi,cabre,0.006797,-0.001224,0.921144,0.704705,0.188290,...,0.112045,0.262366,0.006037,0.009448,0.061888,0.029667,0.235246,0.276650,0.298788,0.302841
811,3142.0,455139.0,0.0,ro,chiri,0.014647,0.001594,0.921144,0.704705,0.093273,...,0.121414,0.204336,0.017262,0.015658,0.051204,0.048387,0.314405,0.248700,0.260254,0.264786
812,3086.0,519048.0,1.0,mi,morel,0.001563,-0.003103,0.653066,0.607479,0.300948,...,0.054399,0.216031,0.000491,0.002570,0.044853,0.005181,0.284500,0.356886,0.395035,0.400070


# Pitchers

In [13]:
# Set driver
driver = webdriver.Chrome(executable_path=r'C:\Users\james\OneDrive\Documents\MLB\chromedriver.exe')
# Choose url
driver.get(f'https://www.fangraphs.com/projections.aspx?pos=all&stats=pit&type=steameru&team=0&lg=all&players=0')
# Select element
driver.execute_script("arguments[0].click();", WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="ProjectionBoard1_cmdCSV"]'))))

time.sleep(15)

TimeoutException: Message: 


In [None]:
# Find all Fangraphs downloads
fangraphs_files = glob.glob(os.path.join(download_path, 'FanGraphs Leaderboard*.csv'))
fangraphs_files.sort(key=os.path.getmtime)
  
# Pick the second to last (this will be the highest number (which is above the unnumbered one)
latest = fangraphs_files[-1]
print(latest)

pitchers_lb = pd.read_csv(os.path.join(download_path, latest))

filename = "Pitchers_FG_" + todaysdate + ".csv"
pitchers_lb.to_csv(os.path.join(baseball_path, "FanGraphs", "Pitchers", filename))

try:
    driver.close()
except:
    pass
    

pitchers_lb[['First', 'Last']] = pitchers_lb['Name'].str.split(" ", n=1, expand=True)

pitchers_lb['First2'] = pitchers_lb['First'].str.slice(0,2).str.lower()
pitchers_lb['Last5'] = pitchers_lb['Last'].str.slice(0,5).str.lower()

pitchers_lb['First2'] = pitchers_lb.apply(lambda x: remove_accents(x['First2']), axis=1)
pitchers_lb['Last5'] = pitchers_lb.apply(lambda x: remove_accents(x['Last5']), axis=1)
    
pitchers_lb

In [None]:
pitchers_fg = pitchers_lb.copy()

# Import batters data to get handedness data
pitcher_df = pd.read_csv(os.path.join(baseball_path, "API", "Inputs", "Pitchers.csv"))
pitcher_hands = pitcher_df[['pitcher', 'pitchHand_l', 'key_fangraphs']]
pitcher_hands['p_L'] = np.where(pitcher_hands['pitchHand_l'] == "L", 1, 0)
pitcher_hands.drop(columns={'pitchHand_l'}, inplace=True)


pitchers_fg['playerid'] = pd.to_numeric(pitchers_fg['playerid'], errors='coerce')

# Merge onto existing data
pitchers_fg = pitchers_fg.merge(pitcher_hands, left_on='playerid', right_on='key_fangraphs', how='left')
pitchers_fg['p_L'].fillna(0, inplace=True)


pitchers_fg['H/9'] = pitchers_fg['H'] / pitchers_fg['IP'] * 9
pitchers_fg['HR/9'] = pitchers_fg['HR'] / pitchers_fg['IP'] * 9
    
keep_list = ['playerid', 'pitcher', 'First2', 'Last5', 'p_L', 'H/9', 'HR/9', 'K/9', 'BB/9'] 
pitchers_fg = pitchers_fg[keep_list]
pitchers_fg


In [None]:
# Create vs lhb dataframe
vs_lhb_preds = fg_vs_lhb.predict_proba(pitchers_fg[['p_L', 'H/9', 'HR/9', 'K/9', 'BB/9']])
vs_lhb_df = pd.DataFrame(vs_lhb_preds, columns=fg_vs_lhb.classes_)

vs_lhb_df['woba'] = woba_reg.predict(vs_lhb_df[simple_list])
vs_lhb_df['obp'] = obp_reg.predict(vs_lhb_df[simple_list])
vs_lhb_df['slg'] = slg_reg.predict(vs_lhb_df[simple_list])

vs_lhb_df = vs_lhb_df.add_suffix("_l")

vs_lhb_df

In [None]:
# Create vs rhb dataframe
vs_rhb_preds = fg_vs_rhb.predict_proba(pitchers_fg[['p_L', 'H/9', 'HR/9', 'K/9', 'BB/9']])
vs_rhb_df = pd.DataFrame(vs_rhb_preds, columns=fg_vs_rhb.classes_)

vs_rhb_df['woba'] = woba_reg.predict(vs_rhb_df[simple_list])
vs_rhb_df['obp'] = obp_reg.predict(vs_rhb_df[simple_list])
vs_rhb_df['slg'] = slg_reg.predict(vs_rhb_df[simple_list])

vs_rhb_df = vs_rhb_df.add_suffix("_r")

vs_rhb_df

In [None]:
pitchers_fg = pitchers_fg[['playerid', 'pitcher', 'p_L', 'First2', 'Last5']]

pitchers_fg = pd.concat([pitchers_fg, vs_lhb_df, vs_rhb_df], axis=1)
pitchers_fg


# Export 

In [None]:
# Batters
batters_fg.to_csv(os.path.join(baseball_path, "FanGraphs", "Batters_FG.csv"))
# Pitchers
pitchers_fg.to_csv(os.path.join(baseball_path, "FanGraphs", "Pitchers_FG.csv"))


batters_fg.query('playerid == 671739')

In [None]:
batters_fg.dropna(subset='playerid', axis=0, inplace=True)
pitchers_fg.dropna(subset='playerid', axis=0, inplace=True)

batters_fg.query('Last5 == "harri"')

# Merge

Batters

In [None]:
# Merged on MLB id
batter_merged1 = batter_df.merge(batters_fg, on='batter', how='inner', suffixes=("", "_fg1"))
# Merged on name
batter_merged2 = batter_df.merge(batters_fg, on=['First2', 'Last5'], how='inner', suffixes=("", "_fg2"))

# List of batters who merged
batter_merged1_list = batter_merged1['batterName'].unique()
batter_merged2_list = batter_merged2['batterName'].unique()

# print(batter_merged1.query('batterName == "Michael Harris II"'))
# print(batters_fg.query('batter == 671739'))

# Keep only those in the second group who didn't merge in the first group
batter_merged2 = batter_merged2[~batter_merged2['batterName'].isin(batter_merged1_list)]

# Append them together
batter_merged = pd.concat([batter_merged1, batter_merged2], axis=0)

# Replace with FanGraphs imputations if sample is small 
stat_list = ['so', 'b1', 'b2', 'b3', 'hr', 'bb', 'hbp', 'lo', 'po', 'go', 'fo', 'woba', 'obp', 'slg']

# Fill in missings with 0 so the < will work 
batter_merged['pa_b_l'].fillna(0, inplace=True)
batter_merged['pa_b_r'].fillna(0, inplace=True)

# Replace small sample size stats with FanGraphs imputations
# Left
for stat in stat_list:
    short = stat + "_b_l"
    long = stat + "_b_long_l"
    fangraphs = stat + "_l"
    batter_merged[short] = np.where(batter_merged['pa_b_l'] < 40, batter_merged[fangraphs], batter_merged[short])
    batter_merged[long] = np.where(batter_merged['pa_b_l'] < 40, batter_merged[fangraphs], batter_merged[long])

# Right
for stat in stat_list:
    short = stat + "_b_r"
    long = stat + "_b_long_r"
    fangraphs = stat + "_r"
    batter_merged[short] = np.where(batter_merged['pa_b_r'] < 40, batter_merged[fangraphs], batter_merged[short])
    batter_merged[long] = np.where(batter_merged['pa_b_r'] < 40, batter_merged[fangraphs], batter_merged[long])
    
batter_merged['imp'] = np.where(batter_merged['pa_b_r'] < 40, 1, 0)
    
batter_merged.to_csv(os.path.join(baseball_path, "Model Input", "Batters.csv"))

Pitchers

In [None]:
# Merged on MLB id
pitcher_merged1 = pitcher_df.merge(pitchers_fg, on='pitcher', how='inner', suffixes=("", "_fg1"))
# Merged on name
pitcher_merged2 = pitcher_df.merge(pitchers_fg, on=['First2', 'Last5'], how='inner', suffixes=("", "_fg2"))

# List of batters who merged
pitcher_merged1_list = pitcher_merged1['pitcherName'].unique()
pitcher_merged2_list = pitcher_merged2['pitcherName'].unique()

# Keep only those in the second group who didn't merge in the first group
pitcher_merged2 = pitcher_merged2[~pitcher_merged2['pitcherName'].isin(pitcher_merged1_list)]

# Append them together
pitcher_merged = pd.concat([pitcher_merged1, pitcher_merged2], axis=0)

# Replace with FanGraphs imputations if sample is small 
stat_list = ['so', 'b1', 'b2', 'b3', 'hr', 'bb', 'hbp', 'lo', 'po', 'go', 'fo', 'woba', 'obp', 'slg']

# Fill in missings with 0 so the < will work 
pitcher_merged['pa_p_l'].fillna(0, inplace=True)
pitcher_merged['pa_p_r'].fillna(0, inplace=True)

# Replace small sample size stats with FanGraphs imputations
# Left
for stat in stat_list:
    short = stat + "_p_l"
    long = stat + "_p_long_l"
    fangraphs = stat + "_l"
    pitcher_merged[short] = np.where(pitcher_merged['pa_p_l'] < 40, pitcher_merged[fangraphs], pitcher_merged[short])
    pitcher_merged[long] = np.where(pitcher_merged['pa_p_l'] < 40, pitcher_merged[fangraphs], pitcher_merged[long])

# Right
for stat in stat_list:
    short = stat + "_p_r"
    long = stat + "_p_long_r"
    fangraphs = stat + "_r"
    pitcher_merged[short] = np.where(pitcher_merged['pa_p_r'] < 40, pitcher_merged[fangraphs], pitcher_merged[short])
    pitcher_merged[long] = np.where(pitcher_merged['pa_p_r'] < 40, pitcher_merged[fangraphs], pitcher_merged[long])
    

pitcher_merged['imp'] = np.where(pitcher_merged['pa_p_r'] < 40, 1, 0)
    
pitcher_merged.to_csv(os.path.join(baseball_path, "Model Input", "Pitchers.csv"))

In [None]:
# Problem: all rookies are righties. This is because they don't have a fangraphs ID in chadwick
# Can add manually, or can do some sort of merge with fg splits but that's really messy
# I'd try to do a nice fix of chadwick somehow. Merging it with rosters somehow idk

In [None]:
print("Code was last run on: " + str(datetime.date.today()))