# A7. Matchups
Source: DKSalaries, A5. Stats <br>

Description: Combines the DraftKings salary file (DKSalaries) with the per-team stat workbooks from A5. Stats to create one matchup Excel file per game, used as simulation input <br>

# Imports

# Could you move the last part of A5 Stats here?

In [1]:
import numpy as np
import pandas as pd
import os
import xlrd
import unidecode
import datetime
from datetime import date
from pathlib import Path
import os
import glob
import re
import import_ipynb
from Utilities import *

import warnings
warnings.simplefilter(action="ignore")
from distutils.dir_util import copy_tree

# Root data folder; every input and output path below is built from it (machine-specific)
baseball_path = r"C:\Users\james\Documents\MLB\Data"
# Folder that is searched for downloaded DKSalaries*.csv files (machine-specific)
download_path = r"C:\Users\james\Downloads"

importing Jupyter notebook from Utilities.ipynb


In [2]:
# Today's date in two spellings: dashed (YYYY-MM-DD) and compact (YYYYMMDD)
run_day = date.today()
todaysdate_dash = run_day.isoformat()
todaysdate = run_day.strftime("%Y%m%d")

# To run for another day, override the compact form by hand, e.g.:
# todaysdate = "20230330"
print(todaysdate)

20230426


In [3]:
# Team lookup table (team names, codes, and the number FanGraphs uses in its URLs).
# Only the three team-code columns are needed in this notebook.
team_map_file = os.path.join(baseball_path, "Utilities", "Team Map.csv")
team_code_columns = ['DKTEAM', 'BBREFTEAM', 'SFBBTEAM']
team_map = pd.read_csv(team_map_file)[team_code_columns]

In [4]:
# Today's posted lineups, downloaded as CSV for the dashed date
lineups = pd.read_csv(f"https://baseballmonster.com/Lineups.aspx?csv=1&d={todaysdate_dash}")

# Inner merge keeps only rows whose team code is in the team map and adds BBREFTEAM
lineups = lineups.merge(team_map, left_on='team code', right_on='SFBBTEAM', how='inner')

# Fill missings: hand-entered MLB id for a player the feed does not identify
lineups[' mlb id'] = np.where(lineups[' player name'] == "Masataka Yoshida", 807799, lineups[' mlb id'])

# Check missings: print every player that still has no id, then give them the
# 999999 placeholder. The result is assigned back to the column because
# fillna(inplace=True) on a column selection does not update the parent frame
# when pandas Copy-on-Write is active.
missing_id = lineups[' mlb id'].isna()
for player_name in lineups.loc[missing_id, ' player name']:
    print(player_name)
lineups[' mlb id'] = lineups[' mlb id'].fillna(999999)

# Keep only what create_matchups merges on, under the names it expects
lineups = lineups[[' mlb id', ' batting order', 'BBREFTEAM']].rename(
    columns={' mlb id': 'key_mlbam', ' batting order': 'batting_order_fill'}
)

# DK Salaries

In [5]:
# This reads in the most recently downloaded salary file and saves it 
def create_dk_salaries():
    """Copy the newest downloaded DraftKings salary file into the salaries folder.

    Picks the most recently modified ``DKSalaries*.csv`` in ``download_path``,
    drops the "Caleb Smith" rows, overrides Shohei Ohtani's ``ID`` with 134045,
    and writes the result to
    ``<baseball_path>/A7. Matchups - 1. Salaries/DKSalaries_<todaysdate>.csv``.
    The full table and the Ohtani row(s) are printed for a visual check.

    Raises:
        FileNotFoundError: if no ``DKSalaries*.csv`` file exists in ``download_path``.
    """
    # Find all DK downloads; glob already returns full paths
    dk_files = glob.glob(os.path.join(download_path, 'DKSalaries*.csv'))
    if not dk_files:
        raise FileNotFoundError(f"No DKSalaries*.csv file found in {download_path}")
    latest = max(dk_files, key=os.path.getmtime)

    dk_name = "DKSalaries_" + todaysdate + ".csv"

    dk_salaries = pd.read_csv(latest)
    print(dk_salaries)

    # There are two Ca Smiths on Pit. Kept Canaan.
    # .copy() so the ID assignment below acts on an independent frame, not a filtered view.
    dk_salaries = dk_salaries[dk_salaries['Name'] != "Caleb Smith"].copy()

    dk_salaries['ID'] = np.where(dk_salaries['Name'] == "Shohei Ohtani", 134045, dk_salaries['ID'])

    dk_salaries.to_csv(os.path.join(baseball_path, "A7. Matchups - 1. Salaries", dk_name))

    print(dk_salaries.query("Name == 'Shohei Ohtani'"))


create_dk_salaries()

     Position                    Name + ID              Name        ID  \
0          RP   Jeffrey Springs (27647887)   Jeffrey Springs  27647887   
1          SP      Zack Wheeler (27647888)      Zack Wheeler  27647888   
2          SP      Max Scherzer (27647889)      Max Scherzer  27647889   
3          SP  Shane McClanahan (27647891)  Shane McClanahan  27647891   
4          RP   Spencer Strider (27647890)   Spencer Strider  27647890   
...       ...                          ...               ...       ...   
1007       OF     Luis Gonzalez (27647843)     Luis Gonzalez  27647843   
1008        C      Tres Barrera (27647851)      Tres Barrera  27647851   
1009       1B     Ronald Guzman (27647862)     Ronald Guzman  27647862   
1010       OF     Oscar Mercado (27647878)     Oscar Mercado  27647878   
1011       2B      Ford Proctor (27647880)      Ford Proctor  27647880   

     Roster Position  Salary                      Game Info TeamAbbrev  \
0                  P   11500   HOU@TB

In [6]:
# This reads in the saved salary file and keeps the relevant variables. It also creates a list of games.
# DraftKings team codes in 'Game Info' and the codes they are rewritten to
_DK_GAME_INFO_CODES = {"CWS": "CHW", "KC": "KCR", "SD": "SDP", "SF": "SFG", "TB": "TBR", "WAS": "WSN"}


def _normalize_game_info(game_info):
    """Return the Game Info column with team codes remapped and '@', ':' and '/' cleaned."""
    # Only match a code that stands alone between non-letters, so text that is
    # already converted (e.g. "KCR") is not expanded a second time ("KCRR").
    code_patterns = {
        rf"(?<![A-Z]){dk_code}(?![A-Z])": new_code
        for dk_code, new_code in _DK_GAME_INFO_CODES.items()
    }
    game_info = game_info.replace(code_patterns, regex=True)
    # Punctuation is handled in a second pass so it cannot interfere with the code matching
    return game_info.replace({"@": "_", ":": "", "/": ""}, regex=True)


def clean_dk_salaries(date=todaysdate):
    """Load a saved salary file and return the merge columns plus the list of games.

    Args:
        date: Compact date string used in the file name ``DKSalaries_<date>.csv``.
            Defaults to the value of ``todaysdate`` at the time this cell was run.

    Returns:
        A tuple ``(dk_salaries_cut, matchups)``:
        ``dk_salaries_cut`` is a DataFrame with columns ``First2``, ``Last5``,
        ``BBREFTEAM``, ``Salary`` and ``ID``; ``matchups`` is a list of the unique
        cleaned ``Game Info`` strings, excluding missing values, 'Postponed' and
        'Cancelled'.
    """
    # Read in DK Salaries
    dk_name = "DKSalaries_" + date + ".csv"
    dk_salaries = pd.read_csv(os.path.join(baseball_path, "A7. Matchups - 1. Salaries", dk_name))

    # Clean game info
    dk_salaries['Game Info'] = _normalize_game_info(dk_salaries['Game Info'])

    # Merge with team map to get baseball reference names
    dk_salaries = dk_salaries.merge(team_map, left_on='TeamAbbrev', right_on='DKTEAM', how='left')

    # Convert to Baseball Reference team code
    dk_salaries['TeamAbbrev'] = dk_salaries['BBREFTEAM']

    # Change Ohtani's number
    dk_salaries['ID'] = np.where(dk_salaries['Name'] == "Shohei Ohtani", 134045, dk_salaries['ID'])

    # Clean names (name_clean comes from Utilities; First2/Last5 are read from its result)
    dk_salaries = name_clean(dk_salaries)

    # This is all we need to merge
    dk_salaries_cut = dk_salaries[['First2', 'Last5', 'BBREFTEAM', 'Salary', 'ID']]

    # List of all matchups; entries that are not real games are left out
    not_games = {'Postponed', 'Cancelled'}
    matchups = [
        game for game in dk_salaries['Game Info'].dropna().unique().tolist()
        if game not in not_games
    ]

    return dk_salaries_cut, matchups

# FanGraphs Replacements

In [7]:
def fg_batter_replace(df, min_pa=40):
    """Replace small-sample batter stats with the FanGraphs imputations.

    For the left (``l``) and right (``r``) splits, every row whose
    ``pa_b_<side>`` is below ``min_pa`` (a missing count is treated as 0) has
    both ``<stat>_b_<side>`` and ``<stat>_b_long_<side>`` overwritten with the
    FanGraphs column ``<stat>_<side>``.

    Args:
        df: Batter DataFrame holding the ``pa_b_*``, ``*_b_*``, ``*_b_long_*``
            and FanGraphs ``*_l`` / ``*_r`` columns.
        min_pa: Plate-appearance count below which a row is replaced.

    Returns:
        The same DataFrame object, modified in place.
    """
    stat_list = ['so', 'b1', 'b2', 'b3', 'hr', 'bb', 'hbp', 'lo', 'po', 'go', 'fo', 'woba', 'obp', 'slg']

    for side in ("l", "r"):
        pa_col = f"pa_b_{side}"
        # Assign the filled column back: fillna(inplace=True) on a column
        # selection does not update df when pandas Copy-on-Write is active.
        df[pa_col] = df[pa_col].fillna(0)
        small_sample = df[pa_col] < min_pa

        for stat in stat_list:
            fangraphs = df[f"{stat}_{side}"]
            for target in (f"{stat}_b_{side}", f"{stat}_b_long_{side}"):
                df[target] = np.where(small_sample, fangraphs, df[target])

    return df

In [8]:
def fg_pitcher_replace(df, min_pa=40):
    """Replace small-sample pitcher stats with the FanGraphs imputations.

    For the left (``l``) and right (``r``) splits, every row whose
    ``pa_p_<side>`` is below ``min_pa`` (a missing count is treated as 0) has
    both ``<stat>_p_<side>`` and ``<stat>_p_long_<side>`` overwritten with the
    FanGraphs column ``<stat>_<side>``.

    Args:
        df: Pitcher DataFrame holding the ``pa_p_*``, ``*_p_*``, ``*_p_long_*``
            and FanGraphs ``*_l`` / ``*_r`` columns.
        min_pa: Plate-appearance count below which a row is replaced.

    Returns:
        The same DataFrame object, modified in place.
    """
    stat_list = ['so', 'b1', 'b2', 'b3', 'hr', 'bb', 'hbp', 'lo', 'po', 'go', 'fo', 'woba', 'obp', 'slg']

    for side in ("l", "r"):
        pa_col = f"pa_p_{side}"
        # Assign the filled column back: fillna(inplace=True) on a column
        # selection does not update df when pandas Copy-on-Write is active.
        df[pa_col] = df[pa_col].fillna(0)
        small_sample = df[pa_col] < min_pa

        for stat in stat_list:
            fangraphs = df[f"{stat}_{side}"]
            for target in (f"{stat}_p_{side}", f"{stat}_p_long_{side}"):
                df[target] = np.where(small_sample, fangraphs, df[target])

    return df

In [9]:
def fill_ins(df):
    """Fill missing name keys and, where present, missing pitcher workload columns.

    ``First2`` / ``Last5`` gaps are filled from ``firstName`` / ``lastName``:
    the first 2 / 5 characters, lower-cased, passed through ``remove_accents``
    (from Utilities), with everything except letters, digits and spaces removed.
    Missing ``outs``, ``avgFaced`` and ``starter_api`` values are then filled
    with 9, 15 and 1.

    Args:
        df: Player DataFrame with ``firstName``, ``lastName``, ``First2`` and
            ``Last5`` columns.

    Returns:
        The same DataFrame object, modified in place.
    """
    def _name_key(names, length):
        # astype(str) makes sure remove_accents always receives a string
        key = names.str.slice(0, length).str.lower().astype(str)
        key = key.map(remove_accents)
        # regex=True is required: since pandas 2.0 str.replace treats the
        # pattern as literal text by default and would strip nothing.
        return key.str.replace('[^a-zA-Z0-9 ]', '', regex=True)

    # Assign back instead of fillna(inplace=True) on a column selection, which
    # does not update df when pandas Copy-on-Write is active.
    df['First2'] = df['First2'].fillna(_name_key(df['firstName'], 2))
    df['Last5'] = df['Last5'].fillna(_name_key(df['lastName'], 5))

    # These columns are not on every sheet. Stop at the first one that is
    # absent, which is what the previous try/except around the three fills did.
    for column, default in (('outs', 9), ('avgFaced', 15), ('starter_api', 1)):
        if column not in df.columns:
            break
        df[column] = df[column].fillna(default)

    return df

# Matchup

In [10]:
def clean_matchups(df):
    """Set starter leverage, derive the batting order, flag small samples, drop 'Unnamed' columns.

    The input frame is updated in place with ``Leverage``, ``batting_order`` and
    ``imp``; the returned frame is an independent copy without the columns whose
    name starts with 'Unnamed'.

    Args:
        df: Batter or pitcher DataFrame with ``starter``, ``Leverage``, ``order``
            and either ``pa_b_long_r`` (checked first) or ``pa_p_long_r``.

    Returns:
        A new DataFrame with the cleaned columns.
    """
    # Add RP leverage to starting pitcher leverage (1 if starter)
    df['Leverage'] = np.where(df['starter'] == 1, 1, df['Leverage'])

    # Determine batting order: order 100, 200, ..., 900 -> 1..9, anything else NaN
    order_to_slot = {slot * 100: slot for slot in range(1, 10)}
    df['batting_order'] = df['order'].map(order_to_slot).astype(float)

    # Imputed flag: pick the plate-appearance column by what the sheet contains
    # rather than by catching whatever exception the batter column raises
    pa_column = 'pa_b_long_r' if 'pa_b_long_r' in df.columns else 'pa_p_long_r'
    df['imp'] = np.where(df[pa_column] < 40, 1, 0)

    # Delete unnamed columns; .copy() so callers can add or overwrite columns
    # on the result without writing through a slice of df
    return df.loc[:, ~df.columns.str.startswith('Unnamed')].copy()

In [11]:
# Columns shared by the batter and pitcher output sheets, in output order
_SHARED_COLUMNS = [
    'fullName', 'firstName', 'lastName', 'Salary', 'batting_order', 'starter', 'Leverage', 'batSide', 'pitchHand',
    'position', 'BBREFTEAM',
    'ID', 'id', 'key_mlbam', 'key_fangraphs', 'key_bbref_minors', 'key_bbref',
    'name_first', 'name_last', 'First2', 'Last5', 'Name', 'order',
    'status', 'venue_id', 'game_date', 'game_type', 'game_num', 'summary',
    'weather', 'wind', 'missing',
]
# Stat prefixes of the per-split columns (<stat>_<role>_<side> and <stat>_<role>_long_<side>)
_SPLIT_STATS = ['so', 'b1', 'b2', 'b3', 'hr', 'bb', 'hbp', 'lo', 'po', 'go', 'fo', 'pa', 'ab', 'woba', 'slg', 'obp']
# Stat prefixes of the FanGraphs columns (<stat>_<side>)
_FG_STATS = ['b1', 'b2', 'b3', 'bb', 'fo', 'go', 'hbp', 'hr', 'lo', 'po', 'so', 'woba', 'obp', 'slg']
_BATTER_EXTRAS = ['b_L', 'sba_imp', 'sbr', 'obp', 'slg', 'woba', 'b1_rate', 'b2_rate', 'b3_rate', 'hr_rate',
                  'bb_rate', 'hbp_rate', 'so_rate', 'sba_2b', 'sba_3b', 'sb_2b', 'sb_3b']
_PITCHER_EXTRAS = ['p_L', 'outs', 'avgFaced', 'H/9', 'HR/9', 'K/9', 'BB/9']


def _output_columns(role, hand_column, extras):
    """Build the ordered output column list for one sheet ('b' = batters, 'p' = pitchers)."""
    columns = list(_SHARED_COLUMNS)
    for side in ('l', 'r'):
        columns.append(f"{hand_column}_{side}")
        columns += [f"{stat}_{role}_{side}" for stat in _SPLIT_STATS]
        columns += [f"{stat}_{role}_long_{side}" for stat in _SPLIT_STATS]
    columns += extras
    for side in ('l', 'r'):
        columns += [f"{stat}_{side}" for stat in _FG_STATS]
    columns.append('imp')
    return columns


def _load_team_sheets(date, team):
    """Read the 'Batters' and 'Pitchers' sheets of one team's daily stats workbook."""
    team_file = os.path.join(baseball_path, "A5. Stats - 2. Teams", "Daily" + date, team + date + ".xlsx")
    # One read_excel call for both sheets instead of opening the workbook twice
    sheets = pd.read_excel(team_file, sheet_name=['Batters', 'Pitchers'])
    return sheets['Batters'], sheets['Pitchers']


def _prepare_players(df, fg_replace, dk_salaries_cut):
    """Impute stats, fill names, attach salary and today's lineup row for one sheet."""
    # Replace stats with FG stats if there are too few PAs, then fill in missings
    df = fill_ins(fg_replace(df))

    # Merge with DK Salaries
    df = df.merge(dk_salaries_cut, on=['First2', 'Last5', 'BBREFTEAM'], how='left')

    # Create name variable so Sims reads in right objects
    df['Name'] = df['fullName']

    # Since the small samples are getting fixed, imp is all 0 (eventually get rid of it).
    # assign() returns a new frame, so nothing is written through a slice.
    df = clean_matchups(df).assign(imp=0)

    # Add lineups
    return df.merge(lineups, on=['key_mlbam', 'BBREFTEAM'], how='left')


def _finalize_batters(df, columns):
    """Set batting_order from the lineup feed, keep the output columns, sort by order."""
    # Missing or non-numeric lineup entries (such as "SP") become NaN instead of
    # raising; float dtype matches what the previous int-then-NaN conversion produced.
    df['batting_order'] = pd.to_numeric(df['batting_order_fill'], errors='coerce').astype(float)
    return df[columns].sort_values('batting_order')


def _finalize_pitchers(df, columns):
    """Mark the lineup's "SP" as starter with leverage 1, keep the output columns, sort by leverage."""
    is_starter = df['batting_order_fill'] == "SP"
    df['batting_order'] = df['batting_order_fill']
    df['starter'] = np.where(is_starter, 1, 0)
    df['Leverage'] = np.where(is_starter, 1, df['Leverage'])
    return df[columns].sort_values('Leverage')


def create_matchups(date=todaysdate):
    """Write one matchup workbook per game on the slate.

    For every game returned by ``clean_dk_salaries(date)``, the away and home
    team workbooks from ``A5. Stats - 2. Teams/Daily<date>`` are combined with
    the salaries and the module-level ``lineups`` table, and saved to
    ``A7. Matchups - 2. Matchups/Matchups<date>/<matchup>.xlsx`` with the sheets
    AwayBatters, AwayPitchers, HomeBatters and HomePitchers. The matchup name
    and any lineup warnings are printed.

    Args:
        date: Compact date string used in the folder and file names. Defaults
            to the value of ``todaysdate`` at the time this cell was run.
    """
    # Read in DK Salaries data
    dk_salaries_cut, matchups = clean_dk_salaries(date)

    matchup_dir = os.path.join(baseball_path, "A7. Matchups - 2. Matchups", "Matchups" + date)
    batter_columns = _output_columns('b', 'batSide', _BATTER_EXTRAS)
    pitcher_columns = _output_columns('p', 'pitchHand', _PITCHER_EXTRAS)

    for matchup in matchups:
        # Create the output folder if needed; an existing folder is not an error
        os.makedirs(matchup_dir, exist_ok=True)

        # The matchup string starts with "<away>_<home>", three characters per code
        teams = {'away': matchup[0:3], 'home': matchup[4:7]}

        batters = {}
        pitchers = {}
        for side, team in teams.items():
            raw_batters, raw_pitchers = _load_team_sheets(date, team)
            batters[side] = _finalize_batters(
                _prepare_players(raw_batters, fg_batter_replace, dk_salaries_cut), batter_columns)
            pitchers[side] = _finalize_pitchers(
                _prepare_players(raw_pitchers, fg_pitcher_replace, dk_salaries_cut), pitcher_columns)

        print(matchup)
        # Batting slots 1..9 sum to 45; anything else means the lineup is incomplete
        for side in ('away', 'home'):
            order_sum = batters[side]['batting_order'].sum()
            if order_sum != 45:
                print("The sum of the {} team's batting order is {}.".format(side, order_sum))
        for side in ('away', 'home'):
            leverage = pitchers[side]['Leverage']
            if not leverage.eq(1).any():
                print("The {} team is missing a starting pitcher.".format(side))
            if not leverage.eq(4).any():
                print("The {} team is missing a closer.".format(side))

        # Create file named after matchup and write all four sheets in one pass
        matchup_file = os.path.join(matchup_dir, matchup + ".xlsx")
        with pd.ExcelWriter(matchup_file, engine='openpyxl') as writer:
            batters['away'].to_excel(writer, sheet_name='AwayBatters')
            pitchers['away'].to_excel(writer, sheet_name='AwayPitchers')
            batters['home'].to_excel(writer, sheet_name='HomeBatters')
            pitchers['home'].to_excel(writer, sheet_name='HomePitchers')

In [12]:
# Build the matchup workbooks for today's date; prints each matchup and any lineup warnings
create_matchups(todaysdate)
# TODO: would be nice to sort these by game time

HOU_TBR 04262023 0640PM ET
SEA_PHI 04262023 0640PM ET
WSN_NYM 04262023 0710PM ET
MIA_ATL 04262023 0720PM ET
SDP_CHC 04262023 0740PM ET
LAD_PIT 04262023 0635PM ET
The sum of the away team's batting order is 39.0.
STL_SFG 04262023 0945PM ET
The sum of the away team's batting order is 37.0.
OAK_LAA 04262023 0938PM ET


# Run All

In [13]:
# # Loop over dates that we have fangraphs projections for
# for filename in os.listdir(r"C:\Users\james\Documents\MLB\Data\Salaries Scraped"):
#     date = filename[11:19]
#     print(date)
#     try:
#         create_matchups(date)
#     except:
#         print("Can't do")

In [14]:
# Stamp the notebook with the date and time of this run
run_finished = datetime.datetime.now()
print(f"Code was last run on: {run_finished.date()} at {run_finished:%H:%M:%S}.")

Code was last run on: 2023-04-26 at 18:10:47.
