# 07. Stats
Source: <br> 
1. FanGraphs API <br>

This notebook imports projected stats from FanGraphs <br>
It also calculates intermediate stats that aren't fed to the models directly but are needed to build their inputs <br>

### Scrape FanGraphs

In [19]:
# FanGraphs API - Batters
def scrape_batters():
    """Download Steamer batter projections from FanGraphs and save them as a dated CSV.

    Reads the FanGraphs projections endpoint, attaches MLBAM ids through the
    module-level ``chadwick`` and ``crosswalk`` tables, derives the extra
    columns of the Steamer-style layout, and writes
    ``Batters_FG_<todaysdate>.csv`` under "7. Stats/A. Raw FanGraphs/Batters".

    Side effects: ``chadwick['key_fangraphs']`` and ``crosswalk['steamerid']``
    are converted to strings in place on the shared module-level frames.

    Returns:
        None. The result is only written to disk.
    """
    # Read in API json
    batters_lb = pd.read_json('https://www.fangraphs.com/api/projections?type=steamer&stats=bat&pos=all&team=0&players=0&lg=all')

    # Name is currently some weird thing with a bunch of data. We don't need all that
    batters_lb['Name'] = batters_lb['PlayerName']
    # Rename to match steamer. Note that steamerid = key_fangraphs
    batters_lb.rename(columns={'playerids': 'steamerid'}, inplace=True)

    # Make the chadwick key a string without the float ".0" so it can be
    # joined against steamerid
    chadwick['key_fangraphs'] = chadwick['key_fangraphs'].astype('str')
    chadwick['key_fangraphs'] = chadwick['key_fangraphs'].str.replace(r'\.\d', "", regex=True)

    # Merge with chadwick for mlbamid, then with the crosswalk for a fallback id
    batters_lb = batters_lb.merge(chadwick, left_on='steamerid', right_on='key_fangraphs', how='left')
    crosswalk['steamerid'] = crosswalk['steamerid'].astype('str')
    batters_lb = batters_lb.merge(crosswalk, on='steamerid', how='left')

    # Create missing columns to match what's provided by steamer
    batters_lb['proj_date'] = todaysdate
    batters_lb['mlbamid'] = batters_lb['key_mlbam']
    batters_lb['bats'] = "MI"  # Not included in FanGraphs data
    batters_lb['playerid'] = batters_lb['steamerid']
    batters_lb['NIBB'] = batters_lb['BB'] - batters_lb['IBB']
    batters_lb.rename(columns={'name_first': 'firstname', 'name_last': 'lastname', 'minpos': 'position'}, inplace=True)

    # Fall back to the crosswalk id where chadwick had no match. Assign the
    # result back: an in-place fillna on a column selection is chained
    # assignment and does not update the frame under pandas Copy-on-Write.
    batters_lb['mlbamid'] = batters_lb['mlbamid'].fillna(batters_lb['mlbamid_fill'])

    # Keep relevant variables and in order
    batters_lb = batters_lb[['proj_date', 'mlbamid', 'steamerid', 'firstname', 'lastname', 'Team', 'position', 'bats',
                             'PA', 'IBB', 'NIBB', 'BB', 'SO', 'HBP', 'H', '2B', '3B', 'HR', 'OBP', 'SLG', 'wOBA', 'SB', 'CS', 'playerid', 'Name']]

    # Export to CSV
    batters_lb.to_csv(os.path.join(baseball_path, "7. Stats", "A. Raw FanGraphs", "Batters", "Batters_FG_" + todaysdate + ".csv"), encoding='iso-8859-1')

In [20]:
# FanGraphs API - Pitchers
def scrape_pitchers():
    """Download Steamer pitcher projections from FanGraphs and save them as a dated CSV.

    Reads the FanGraphs projections endpoint, attaches MLBAM ids through the
    module-level ``chadwick`` and ``crosswalk`` tables, derives the extra
    columns of the Steamer-style layout, and writes
    ``Pitchers_FG_<todaysdate>.csv`` under "7. Stats/A. Raw FanGraphs/Pitchers".

    Side effects: ``chadwick['key_fangraphs']`` and ``crosswalk['steamerid']``
    are converted to strings in place on the shared module-level frames.

    Returns:
        None. The result is only written to disk.
    """
    # Read in API json
    pitchers_lb = pd.read_json('https://www.fangraphs.com/api/projections?type=steamer&stats=pit&pos=all&team=0&players=0&lg=all')

    # Name is currently some weird thing with a bunch of data. We don't need all that
    pitchers_lb['Name'] = pitchers_lb['PlayerName']
    # Rename to match steamer. Note that steamerid = key_fangraphs
    pitchers_lb.rename(columns={'playerids': 'steamerid'}, inplace=True)

    # Make the chadwick key a string without the float ".0" so it can be
    # joined against steamerid
    chadwick['key_fangraphs'] = chadwick['key_fangraphs'].astype('str')
    chadwick['key_fangraphs'] = chadwick['key_fangraphs'].str.replace(r'\.\d', "", regex=True)

    # Merge with chadwick for mlbamid, then with the crosswalk for a fallback id
    pitchers_lb = pitchers_lb.merge(chadwick, left_on='steamerid', right_on='key_fangraphs', how='left')
    crosswalk['steamerid'] = crosswalk['steamerid'].astype('str')
    pitchers_lb = pitchers_lb.merge(crosswalk, on='steamerid', how='left')

    # Create missing columns to match what's provided by steamer
    pitchers_lb['proj_date'] = todaysdate
    pitchers_lb['mlbamid'] = pitchers_lb['key_mlbam']
    pitchers_lb['Throws'] = "MI"  # Not included in FanGraphs data
    pitchers_lb['playerid'] = pitchers_lb['steamerid']
    pitchers_lb.rename(columns={'name_first': 'firstname', 'name_last': 'lastname', 'minpos': 'position'}, inplace=True)

    # Fall back to the crosswalk id where chadwick had no match. Assign the
    # result back: an in-place fillna on a column selection is chained
    # assignment and does not update the frame under pandas Copy-on-Write.
    pitchers_lb['mlbamid'] = pitchers_lb['mlbamid'].fillna(pitchers_lb['mlbamid_fill'])

    # Keep relevant variables and in order
    pitchers_lb = pitchers_lb[['proj_date', 'mlbamid', 'steamerid', 'firstname', 'lastname', 'Throws',
                               'IP', 'G', 'GS', 'K/9', 'BB/9', 'H', 'HR', 'playerid', 'Name']]

    # Export to CSV
    pitchers_lb.to_csv(os.path.join(baseball_path, "7. Stats", "A. Raw FanGraphs", "Pitchers", "Pitchers_FG_" + todaysdate + ".csv"), encoding='iso-8859-1')

### Create Useful Stats

In [21]:
def create_intermediate_batters(date):
    """Build the cleaned batter projection table for ``date`` and save it.

    Reads ``Batters_FG_<date>.csv``, derives singles, per-PA rates and
    stolen-base attempt/success rates, applies four pickled stolen-base
    regressors from ``model_path``, repairs "sa"-prefixed player ids using
    ``chadwick``, and writes ``Batters_FG2_<date>.csv`` under
    "7. Stats/B. Clean FanGraphs/Batters".

    Args:
        date: Date string used verbatim in the input and output file names.

    Returns:
        The cleaned DataFrame (lower-cased column names).
    """
    # Read in file
    filename = "Batters_FG_" + date + ".csv"
    df = pd.read_csv(os.path.join(baseball_path, "7. Stats", "A. Raw FanGraphs", "Batters", filename), encoding='iso-8859-1')
    # Create singles
    df['1B'] = df['H'] - df['2B'] - df['3B'] - df['HR']

    # Basic stats
    hit_list = ['1B', '2B', '3B', 'HR', 'BB', 'HBP', 'SO']

    # Advanced stats, extended below with one "<stat>_rate" per basic stat
    rate_list = ['OBP', 'SLG', 'wOBA']
    for stat in hit_list:
        rate = stat + "_rate"
        rate_list.append(rate)
        df[rate] = df[stat] / df['PA']

    # Stolen base attempts relative to the opportunities counted here
    # (singles, walks and hit-by-pitch)
    df['SBA'] = df['SB'] + df['CS']
    df['SBO'] = df['1B'] + df['BB'] + df['HBP']
    df['sba_imp'] = df['SBA'] / df['SBO']

    # Cap imputed SBA
    df['sba_imp'] = np.where(df['sba_imp'] > 0.15, 0.15, df['sba_imp'])

    # Determine stolen base success rate
    df['sbr'] = df['SB'] / df['SBA']

    keep_list = ['Name', 'mlbamid', 'playerid', 'sba_imp', 'sbr'] + rate_list
    # Copy so the fills below act on an independent frame, not a slice
    df = df[keep_list].copy()

    # Assign the filled columns back: an in-place fillna on a column selection
    # is chained assignment and does not update the frame under Copy-on-Write.
    df['sbr'] = df['sbr'].fillna(0.6)  # assume 25th percentile
    df['sba_imp'] = df['sba_imp'].fillna(0.05)  # assume low prob

    df.columns = df.columns.str.lower()
    df.rename(columns={'1b_rate': 'b1_rate', '2b_rate': 'b2_rate', '3b_rate': 'b3_rate'}, inplace=True)

    # Any row still missing a value is dropped before the models run
    df.dropna(inplace=True)

    # (output column, pickled model file, single input feature)
    stolen_base_models = (
        ('sba_2b', 'sba_2b_20220901.sav', 'sba_imp'),
        ('sba_3b', 'sba_3b_20220901.sav', 'sba_imp'),
        ('sb_2b', 'sb_2b_20220901.sav', 'sbr'),
        ('sb_3b', 'sb_3b_20220901.sav', 'sbr'),
    )
    for target, model_file, feature in stolen_base_models:
        # NOTE(review): pickle.load can execute code from the file; these are
        # expected to be locally produced model files.
        with open(os.path.join(model_path, model_file), 'rb') as handle:
            regressor = pickle.load(handle)
        df[target] = regressor.predict(df[[feature]])

    # Merge with chadwick to fix certain fangraphs ids
    df = df.merge(chadwick, left_on='mlbamid', right_on='key_mlbam', how='left')
    # NOTE(review): .str requires playerid to be a string column; if every id
    # in the CSV is numeric this line raises - confirm against real data.
    replace_id = df['playerid'].str.startswith("sa") & ~df['key_fangraphs'].isna()
    df['playerid'] = np.where(replace_id, df['key_fangraphs'], df['playerid'])
    df['playerid'] = df['playerid'].astype('string')
    df['playerid'] = df['playerid'].str.replace(r'\.0', '', regex=True)

    df.drop(columns=['index', 'key_fangraphs', 'key_mlbam', 'name_first', 'name_last'], inplace=True)

    # Write intermediate FanGraphs data to csv
    df.to_csv(os.path.join(baseball_path, "7. Stats", "B. Clean FanGraphs", "Batters", "Batters_FG2_" + date + ".csv"), encoding='iso-8859-1')

    return df

In [22]:
def create_intermediate_pitchers(date):
    """Build the cleaned pitcher projection table for ``date`` and save it.

    Reads ``Pitchers_FG_<date>.csv``, computes hits and home runs per nine
    innings, keeps the per-nine columns plus the ids, repairs "sa"-prefixed
    player ids using ``chadwick``, and writes ``Pitchers_FG2_<date>.csv``
    under "7. Stats/B. Clean FanGraphs/Pitchers".

    Args:
        date: Date string used verbatim in the input and output file names.

    Returns:
        The cleaned DataFrame.
    """
    raw_csv = os.path.join(baseball_path, "7. Stats", "A. Raw FanGraphs", "Pitchers", "Pitchers_FG_" + date + ".csv")
    projections = pd.read_csv(raw_csv, encoding='iso-8859-1')

    # Per-nine-inning rates for the two counting stats
    innings = projections['IP']
    projections['H9'] = projections['H'] / innings * 9
    projections['HR9'] = projections['HR'] / innings * 9

    # Slash-free names for the rates FanGraphs already provides
    projections = projections.rename(columns={'K/9': 'K9', 'BB/9': 'BB9'})
    projections = projections[['playerid', 'mlbamid', 'H9', 'HR9', 'K9', 'BB9']]

    # Bring in chadwick so "sa..." ids can be swapped for its fangraphs key
    projections = projections.merge(chadwick, left_on='mlbamid', right_on='key_mlbam', how='left')
    starts_with_sa = projections['playerid'].str.startswith("sa")
    has_chadwick_key = projections['key_fangraphs'].notna()
    projections['playerid'] = np.where(
        starts_with_sa & has_chadwick_key,
        projections['key_fangraphs'],
        projections['playerid'],
    )
    # Normalise to string and strip the ".0" left by float-typed keys
    projections['playerid'] = (
        projections['playerid'].astype('string').str.replace(r'\.0', '', regex=True)
    )

    projections = projections.drop(
        columns=['index', 'key_fangraphs', 'key_mlbam', 'name_first', 'name_last']
    )

    # Write intermediate FanGraphs data to csv
    clean_csv = os.path.join(baseball_path, "7. Stats", "B. Clean FanGraphs", "Pitchers", "Pitchers_FG2_" + date + ".csv")
    projections.to_csv(clean_csv, encoding='iso-8859-1')

    return projections

In [23]:
def batter_merge(date):
    """Combine the API batter dataset for ``date`` with the FanGraphs projections.

    Args:
        date: Date string used verbatim in the input file names.

    Returns:
        Outer merge of the API batters (``key_mlbam``) with the cleaned
        FanGraphs batters (``mlbamid``).
    """
    # API stats, passed through fix_fangraphs (per the original comment it
    # makes the id a string and removes the trailing .0)
    api_csv = os.path.join(baseball_path, "4. Dataset", "Batters", "Batters" + date + ".csv")
    batters_api = fix_fangraphs(pd.read_csv(api_csv, encoding='iso-8859-1'))

    # FanGraphs projections, with playerid mirrored into key_fangraphs
    batters_fg = create_intermediate_batters(date)
    batters_fg['key_fangraphs'] = batters_fg['playerid']

    return batters_api.merge(batters_fg, left_on='key_mlbam', right_on='mlbamid', how='outer')

In [24]:
def pitcher_merge(date):
    """Combine the API pitcher dataset for ``date`` with the FanGraphs projections.

    Args:
        date: Date string used verbatim in the input file names.

    Returns:
        Outer merge of the API pitchers (``key_mlbam``) with the cleaned
        FanGraphs pitchers (``mlbamid``).
    """
    # API stats, passed through fix_fangraphs (per the original comment it
    # makes the id a string and removes the trailing .0)
    api_csv = os.path.join(baseball_path, "4. Dataset", "Pitchers", "Pitchers" + date + ".csv")
    pitchers_api = fix_fangraphs(pd.read_csv(api_csv, encoding='iso-8859-1'))

    # FanGraphs projections, with playerid mirrored into key_fangraphs
    pitchers_fg = create_intermediate_pitchers(date)
    pitchers_fg['key_fangraphs'] = pitchers_fg['playerid']

    return pitchers_api.merge(pitchers_fg, left_on='key_mlbam', right_on='mlbamid', how='outer')

In [25]:
# Switch the working directory to the Models folder: the pickle files in the
# following cells are opened by bare filename, so they resolve against it.
# NOTE(review): this changes the cwd for the whole process until it is reset.
os.chdir(r"C:\Users\james\Documents\MLB\Code\Models")

In [26]:
## Normal stats  
# Standardize the data using StandardScaler
# Loads the pickled scalers used on the regular batter and pitcher stats.
# The files are opened by bare filename, so they resolve against the current
# working directory.
# NOTE(review): pickle.load can execute code from the file; only load trusted files.
scaler_filename = "batter_stats_scaler.pkl"
with open(scaler_filename, "rb") as file:
    batter_scaler = pickle.load(file)
    
scaler_filename = "pitcher_stats_scaler.pkl"
with open(scaler_filename, "rb") as file:
    pitcher_scaler = pickle.load(file)

In [27]:
## FG Stats
# Standardize the data using StandardScaler
# Loads the pickled scalers used on the FanGraphs batter and pitcher stats.
# The files are opened by bare filename, so they resolve against the current
# working directory.
# NOTE(review): pickle.load can execute code from the file; only load trusted files.
scaler_filename = "batter_stats_fg_scaler.pkl"
with open(scaler_filename, "rb") as file:
    batter_fg_scaler = pickle.load(file)
    
scaler_filename = "pitcher_stats_fg_scaler.pkl"
with open(scaler_filename, "rb") as file:
    pitcher_fg_scaler = pickle.load(file)

In [28]:
## Imputations
# Loads the pickled models whose predictions replace small-sample stats.
# The files are opened by bare filename, so they resolve against the current
# working directory.
# NOTE(review): pickle.load can execute code from the file; only load trusted files.
imp_filename = "batter_imputations.pkl"
with open(imp_filename, "rb") as file:
    batter_stats_model = pickle.load(file)

imp_filename = "pitcher_imputations.pkl"
with open(imp_filename, "rb") as file:
    pitcher_stats_model = pickle.load(file)

In [29]:
# Change the working directory back to the Code folder so that running this
# notebook from another notebook does not leave that notebook in a different directory.
# NOTE(review): this line is not reached if an earlier cell raises, so the
# working directory would then stay wherever it was last set.
os.chdir(r"C:\Users\james\Documents\MLB\Code")

In [1]:
# %run "Imports.ipynb"

### Batter

In [2]:
# Base stat names; the batter- and pitcher-specific column names are built from these
stats = [
    # Outcome stats
    'b1', 'b2', 'b3', 'hr', 'bb', 'hbp',
    'so', 'fo', 'go', 'lo', 'po',
    # Summary stats
    'iso', 'slg', 'obp', 'woba', 'estimated_woba_using_speedangle',
    # Direction stats
    'to_left', 'to_middle', 'to_right',
    # Contact/pitch measurements and sample-size counts
    'hard_hit', 'barrel', 'totalDistance', 'maxSpeed', 'maxSpin', 'launchSpeed', 'ab', 'pa',
]

# Batter columns: "<stat>_b" and "<stat>_b_long"
batter_stats_short = [stat + "_b" for stat in stats]
batter_stats_long = [stat + "_b_long" for stat in stats]

# FanGraphs stats: per-event rates followed by the three summary stats
batter_stats_fg = [
    event + "_rate" for event in ('b1', 'b2', 'b3', 'hr', 'bb', 'hbp', 'so')
] + ['woba', 'slg', 'obp']

### Pitcher

In [4]:
# Pitcher columns: "<stat>_p" and "<stat>_p_long", built from the shared stats list
pitcher_stats_short = [stat + "_p" for stat in stats]
pitcher_stats_long = [stat + "_p_long" for stat in stats]

# FanGraphs stats: per-nine-inning rates
pitcher_stats_fg = [per_nine + "/9" for per_nine in ('H', 'HR', 'K', 'BB')]

### Other

In [5]:
# Venues: ids kept as strings, one "venue_<id>" column name per id
venue_nums = (
    "1 2 3 4 5 7 10 12 13 14 15 16 17 19 22 31 32 "
    "680 2392 2394 2395 2535 2536 2602 2680 2681 2701 2735 2756 "
    "2889 3289 3309 3312 3313 4169 4705 5010 5325 5365 5381 5445"
).split()

venues = ["venue_" + num for num in venue_nums]

# Years: "year_2015" through "year_2023"
years = ["year_" + str(year) for year in range(2015, 2024)]

# Matchup, weather, and game state
other_list = [
    'p_L', 'b_L',
    'x_vect', 'y_vect', 'temperature',
    'onFirst', 'onSecond', 'onThird',
    'inning', 'top', 'score_diff',
]

### Exclusions

In [7]:
# Columns left out of the model inputs: stats that do not apply to the
# position, or that we just don't want
exclude = (
    # Batter-side pitch measurements
    ["maxSpeed_b", "maxSpin_b", "maxSpeed_b_long", "maxSpin_b_long"]
    # Pitcher-side distance and launch speed
    + ["totalDistance_p", "totalDistance_p_long", "launchSpeed_p", "launchSpeed_p_long"]
    # At-bat / plate-appearance counts, batter side
    + ["ab_b", "pa_b", "ab_b_long", "pa_b_long"]
    # At-bat / plate-appearance counts, pitcher side
    + ["ab_p", "pa_p", "ab_p_long", "pa_p_long"]
)

### Input Lists

In [8]:
# Batter inputs (into final model): short and long lists minus the exclusions
batter_stats = [
    item for item in batter_stats_short + batter_stats_long if item not in exclude
]

# Pitcher inputs (into final model): short and long lists minus the exclusions
pitcher_stats = [
    item for item in pitcher_stats_short + pitcher_stats_long if item not in exclude
]

# All inputs into final model, in this fixed order
inputs = [*batter_stats, *pitcher_stats, *venues, *years, *other_list]

# Add additional variables for ease of use
inputs_plus = [
    *inputs,
    'batterName', 'pitcherName', 'batter', 'pitcher', 'batSide', 'pitchHand', 'eventsModel',
]

### Create Rosters

In [15]:
def _standardize_split(frame, stat_cols, scaler, suffix):
    """Scale the ``<stat><suffix>`` columns of ``frame`` in place with ``scaler``.

    The columns are temporarily renamed to their unsuffixed names so the frame
    handed to ``scaler.transform`` carries the bare stat names, then renamed back.
    """
    split_to_plain = {stat + suffix: stat for stat in stat_cols}
    plain_to_split = {stat: stat + suffix for stat in stat_cols}

    frame.rename(columns=split_to_plain, inplace=True)
    frame[stat_cols] = scaler.transform(frame[stat_cols])
    frame.rename(columns=plain_to_split, inplace=True)


def _impute_small_sample(frame, stat_cols, model, model_inputs, suffix, pa_col,
                         flag_col, flag_value, label, min_pa=40):
    """Overwrite the ``<stat><suffix>`` columns of low-PA rows with model predictions.

    Copies the suffixed stats into the unsuffixed columns, sets ``flag_col`` to
    ``flag_value`` on every row, predicts from ``model_inputs`` for rows whose
    ``pa_col`` is below ``min_pa``, and writes the prediction into those rows'
    suffixed stat columns. If the prediction fails, a message is printed and
    the affected cells are set to missing, as before.
    """
    split_cols = [f'{stat}{suffix}' for stat in stat_cols]
    for stat, split_col in zip(stat_cols, split_cols):
        frame[stat] = frame[split_col]

    frame[flag_col] = flag_value

    small_sample = frame[pa_col] < min_pa
    try:
        prediction = model.predict(frame.loc[small_sample, model_inputs])
    except Exception:
        # Best effort, e.g. when no row is below the threshold. Exception (not
        # a bare except) so Ctrl-C / SystemExit still propagate.
        # NOTE(review): any failure here blanks the stats of every low-PA row.
        prediction = None
        print(f"No {label} imputations")

    frame.loc[small_sample, split_cols] = prediction


def process_file(rosters_path, team_folder, filename, date, batters_df, pitchers_df):
    """Build one team's standardized batter and pitcher sheets and save them to Excel.

    Reads the roster CSV ``filename`` from ``rosters_path``, merges it with the
    batter and pitcher tables, standardizes the FanGraphs stats and the
    ``_l``/``_r`` split stats with the module-level scalers, replaces the split
    stats of rows with fewer than 40 PA by model predictions, and writes a
    workbook with "Batters" and "Pitchers" sheets under
    "7. Stats/C. Teams/<team_folder>".

    Args:
        rosters_path: Folder containing the roster CSV.
        team_folder: Output sub-folder name under "7. Stats/C. Teams".
        filename: Roster CSV file name; also determines the workbook name.
        date: Not used in the body; kept so existing callers keep working.
        batters_df: Batter table, joined on its ``batter`` column.
        pitchers_df: Pitcher table, joined on its ``pitcher`` column.

    Returns:
        None. The result is only written to disk.
    """
    print(filename)
    # Read in roster
    df = pd.read_csv(os.path.join(rosters_path, filename), encoding='iso-8859-1')

    # Destination
    excel_file = filename.replace(".csv", "") + ".xlsx"
    file_name = os.path.join(baseball_path, "7. Stats", "C. Teams", team_folder, excel_file)

    ### Batters
    batters_merged = df.merge(batters_df, left_on='id', right_on='batter', how='left', suffixes=("", "_api"))

    # Only keep batters; copy so the column assignments below act on an
    # independent frame rather than a filtered view
    batters_merged = batters_merged.query('position != "P"').copy()

    # Create dummy variable for if they're a lefty. This is necessary to project. (maybe move this)
    batters_merged['b_L'] = np.where(batters_merged['batSide'] == "L", 1, 0)

    ## Standardize FG
    batters_merged[batter_stats_fg] = batter_fg_scaler.transform(batters_merged[batter_stats_fg])

    ## Standardize normal stats (left split, then right split)
    for suffix in ('_l', '_r'):
        _standardize_split(batters_merged, batter_stats, batter_scaler, suffix)

    ## Impute for small sample
    batter_stats_fg2 = batter_stats_fg + ['b_L', 'p_L']
    # Vs left (p_L = 1), then vs right (p_L = 0)
    _impute_small_sample(batters_merged, batter_stats, batter_stats_model, batter_stats_fg2,
                         '_l', 'pa_b_l', 'p_L', 1, "batter L")
    _impute_small_sample(batters_merged, batter_stats, batter_stats_model, batter_stats_fg2,
                         '_r', 'pa_b_r', 'p_L', 0, "batter R")

    # Save as Excel (creates or overwrites the workbook)
    batters_merged.to_excel(file_name, sheet_name="Batters", engine='openpyxl')

    ### Pitcher
    pitchers_merged = df.merge(pitchers_df, left_on='id', right_on='pitcher', how='left', suffixes=("", "_api"))

    # Only keep pitchers; copy for the same reason as above
    desired_positions = ['P', 'TWP', 'Pitcher', 'Two-Way Player']
    pitchers_merged = pitchers_merged[pitchers_merged['position'].isin(desired_positions)].copy()

    # Create dummy variable for if they're a lefty. This is necessary to project. (maybe move this)
    pitchers_merged['p_L'] = np.where(pitchers_merged['pitchHand'] == "L", 1, 0)

    # The FG scaler and imputation model are fed the slash-style names;
    # they are renamed back before saving
    pitchers_merged.rename(columns={'H9': 'H/9', 'HR9': 'HR/9', 'K9': 'K/9', 'BB9': 'BB/9'}, inplace=True)
    pitcher_stats_fg = ['H/9', 'HR/9', 'K/9', 'BB/9']

    ## Standardize FG
    pitchers_merged[pitcher_stats_fg] = pitcher_fg_scaler.transform(pitchers_merged[pitcher_stats_fg])

    ## Standardize normal stats (left split, then right split)
    for suffix in ('_l', '_r'):
        _standardize_split(pitchers_merged, pitcher_stats, pitcher_scaler, suffix)

    ## Impute for small sample
    pitcher_stats_fg2 = pitcher_stats_fg + ['b_L', 'p_L']
    # Vs left (b_L = 1), then vs right (b_L = 0)
    _impute_small_sample(pitchers_merged, pitcher_stats, pitcher_stats_model, pitcher_stats_fg2,
                         '_l', 'pa_p_l', 'b_L', 1, "pitcher L")
    _impute_small_sample(pitchers_merged, pitcher_stats, pitcher_stats_model, pitcher_stats_fg2,
                         '_r', 'pa_p_r', 'b_L', 0, "pitcher R")

    pitchers_merged.rename(columns={'H/9': 'H9', 'HR/9': 'HR9', 'K/9': 'K9', 'BB/9': 'BB9'}, inplace=True)

    # Save as Excel: append a second sheet to the workbook written above
    with pd.ExcelWriter(file_name, mode='a', engine='openpyxl') as writer:
        pitchers_merged.to_excel(writer, sheet_name='Pitchers')

In [16]:
def create_team_rosters(date=None):
    """Create the per-team stat workbooks for every roster file of ``date``.

    Ensures the output folder "7. Stats/C. Teams/Daily<date>" exists, builds
    the merged batter and pitcher tables once, and runs ``process_file`` in
    parallel for each file found in "6. Rosters/Rosters<date>".

    Args:
        date: Date string used verbatim in folder and file names. The default
            of ``None`` is kept for compatibility, but a string is required:
            it is concatenated into the folder names.

    Returns:
        None.
    """
    # Create new folder with daily rosters. exist_ok covers re-runs for the
    # same date; other failures (e.g. permissions) now surface here instead of
    # later when the workbooks are written.
    team_folder = "Daily" + date
    os.makedirs(os.path.join(baseball_path, "7. Stats", "C. Teams", team_folder), exist_ok=True)

    # Locate daily rosters
    rosters_folder = "Rosters" + date
    rosters_path = os.path.join(baseball_path, "6. Rosters", rosters_folder)

    # Merge API and FG data
    batters_df = batter_merge(date)
    pitchers_df = pitcher_merge(date)

    # Get the list of files in the rosters_path directory
    file_list = os.listdir(rosters_path)

    # Use Parallel to run the process_file function in parallel for each file
    Parallel(n_jobs=-2)(
        delayed(process_file)(rosters_path, team_folder, filename, date, batters_df, pitchers_df)
        for filename in file_list
    )


### Run

In [14]:
def run_stats(date=None, historic=False):
    """Create team rosters for one date, or for every date in ``history``.

    Args:
        date: Date string passed to ``create_team_rosters``; only used when
            ``historic`` is false.
        historic: When true, ignore ``date`` and loop over the ``date`` column
            of the module-level ``history`` table, printing a message and
            moving on when a date fails.

    Returns:
        None.
    """
    if not historic:
        create_team_rosters(date)
        return

    # Maybe add parallel processing support to this
    for day in history['date']:
        try:
            create_team_rosters(day)
        except Exception:
            # Best effort per date. Exception (not a bare except) so that
            # Ctrl-C can still stop a long backfill.
            print("Missing for {}.".format(day))