# Matchups
Version 1.1 <br>
Created 4/4/2022 <br>
Runtime: 10 seconds
Description: This reads in DK CSV and creates matchup files, with sheets for each team's hitters and pitchers <br>

Version 1.0 Creates matchup files <br>
Version 1.1 Creates date-specific matchup folder 

# Download from DK Now!

In [1]:
import numpy as np
import pandas as pd
import os
import xlrd
import unidecode
import datetime
from datetime import date
from pathlib import Path
import os
import glob

from distutils.dir_util import copy_tree

baseball_path = r"C:\Users\james\Documents\MLB\Data"
download_path = r"C:\Users\james\Downloads"

In [2]:
todaysdate = date.today()
todaysdate_dash = str(todaysdate)
todaysdate = todaysdate_dash.replace("-", "")

# todaysdate = "20221008"
print(todaysdate)

20221014


In [3]:
# Empty the matchup folder
[f.unlink() for f in Path(os.path.join(baseball_path, "Matchups")).glob("*") if f.is_file()] 

[None, None, None]

In [4]:
# Clean names for consistency
def name_clean(df):    
    df['Name_Adjusted'] = df['Name'].apply(unidecode.unidecode)  # remove accents
    df['Name_Adjusted'] = df['Name_Adjusted'].str.replace('[^a-zA-Z0-9 ]', '')

    df['Name_Adjusted'] = np.where(df['Name_Adjusted'] == "Josh HSmith", "Josh Smith", df['Name_Adjusted'])
    
    # Separate first and last names
    df['First'] = df['Name_Adjusted'].str.split(" ").str[0] 
    df['First'] = df['First'].str.lower()
    df['Last'] = df['Name_Adjusted'].str.split(" ").str[1]
    df['Last'] = df['Last'].str.lower()
    
    # Take first 2 letters of first name and first 5 of last as a sort of merge code
    df['First2'] = df['First'].str.slice(0,2)
    df['Last5'] = df['Last'].str.slice(0,5)
    
    return df

In [5]:
# This reads in a map of team name, codes, and the number Fangraphs uses in their URLs
team_map = pd.read_csv(os.path.join(baseball_path, "Utilities", "Team Map.csv"))

# We just need teams right now
team_map = team_map[['DKTEAM', 'BBREFTEAM', 'SFBBTEAM']]
team_map

Unnamed: 0,DKTEAM,BBREFTEAM,SFBBTEAM
0,ARI,ARI,ARI
1,ATL,ATL,ATL
2,BAL,BAL,BAL
3,BOS,BOS,BOS
4,CHC,CHC,CHC
5,CWS,CHW,CHW
6,CIN,CIN,CIN
7,CLE,CLE,CLE
8,COL,COL,COL
9,DET,DET,DET


In [6]:
# Find all DK downloads
dk_files = glob.glob(os.path.join(download_path, 'DKSalaries*.csv'))
dk_files.sort(key=os.path.getmtime)

latest = dk_files[-1]

print(latest)

dk_name = "DKSalaries_" + todaysdate + ".csv"

# Clean DK salaries for merge
dk_salaries = pd.read_csv(os.path.join(download_path, latest))
dk_salaries.to_csv(os.path.join(baseball_path, "Salaries", dk_name))

dk_salaries['Game Info'] = dk_salaries['Game Info'].replace({"CWS":"CHW", "KC": "KCR", "SD": "SDP", "SF":"SFG", "TB":"TBR", "WAS":"WSN", "@": "_", ":": "", "/": ""}, regex=True)
dk_salaries = dk_salaries.merge(team_map, left_on='TeamAbbrev', right_on='DKTEAM', how='left')

# Convert to Baseball Reference team code
dk_salaries['TeamAbbrev'] = dk_salaries['BBREFTEAM']

# Change Ohtani's number
dk_salaries['ID'] = np.where((dk_salaries['Name'] == "Shohei Ohtani"), 134045, dk_salaries['ID'])
         
# Clean names
dk_salaries = name_clean(dk_salaries)

# This is all we need to merge. 
dk_salaries_cut = dk_salaries[['First2', 'Last5', 'TeamAbbrev', 'Salary', 'ID']]


# We also want a separate game info df with teams, time, who's home, date
game_info = dk_salaries[['Game Info']]

del dk_salaries

C:\Users\james\Downloads\DKSalaries (6).csv


  df['Name_Adjusted'] = df['Name_Adjusted'].str.replace('[^a-zA-Z0-9 ]', '')


In [7]:
# Read in lineup from website (don't need to download anymore)
todayslineup = "https://baseballmonster.com/Lineups.aspx?csv=1&d=" + todaysdate_dash
lineups = pd.read_csv(todayslineup)

# Calculate number of games played
lineups['games'] = lineups.groupby('team code')[' game_number'].transform(max)
# Keep last game
lineups = lineups[lineups['games'] == lineups[' game_number']]


# Clean daily lineups for merge

# Change name variable for compatibility with cleaner
lineups.rename(columns={' player name': 'Name'}, inplace=True)

# Clean lineup names
lineups = name_clean(lineups)

# Pitcher Ohtani is missing an ID for some reason
lineups[' mlb id'] = np.where((lineups['Name'] == "Shohei (P) Ohtani)") & (lineups[' batting order'] == "SP"), 660271, lineups[' mlb id'])

# Merge team codes onto lineups to get baseball reference team code
lineups = lineups.merge(team_map, left_on='team code', right_on='SFBBTEAM', how='left')

# Keep only relevant variables
lineups = lineups[[' mlb id', ' batting order', 'DKTEAM', 'BBREFTEAM', 'First2', 'Last5']]

  df['Name_Adjusted'] = df['Name_Adjusted'].str.replace('[^a-zA-Z0-9 ]', '')


In [8]:
pa_list = ['so', 'b1', 'b2', 'b3', 'hr', 'bb', 'hbp', 'lo','po', 'go', 'fo']

In [9]:
# This simply determines if a pitcher is a starter that day
def determine_leverage(df):
    # Batting order will either say SP for starters or a number for Ohtani
    df['Leverage'] = np.where(~df[' batting order'].isna(), 1, df['Leverage'])
    number_of_starters = len(df[df['Leverage'] == 1])
    print(number_of_starters)
    if number_of_starters > 1:
        df['Leverage'] = np.where(df['Name'] == "Shohei Ohtani", 0, df['Leverage'])
    df['Leverage'].fillna(0, inplace=True)
    
    return df

In [10]:
# This is the list of all matchups
matchups = game_info['Game Info'].unique()
print(type(matchups))
matchups = matchups.tolist()
try:
    matchups.remove('Postponed')
except:
    pass
try:
    matchups.remove('Cancelled')
except:
    pass

print(matchups)

for matchup in matchups:
    print(matchup)
    away = matchup[0:3]
    home = matchup[4:7]

    away_file = away + ".xlsx"
    home_file = home + ".xlsx"
    
    # away_batters.drop(columns='key_mlbam_y', inplace=True)
    # home_batters.drop(columns='key_mlbam_y', inplace=True)
    # away_batters.rename(columns={'key_mlbam_x':'key_mlbam'}, inplace=True)
    # home_batters.rename(columns={'key_mlbam_x':'key_mlbam'}, inplace=True)
    
    # Read in away batter data
    away_batters = pd.read_excel(os.path.join(baseball_path, "Rosters", away_file), sheet_name='Batters')
    # Fill in missing key_mlbam variable
    away_batters['key_mlbam'].fillna(away_batters['key_fangraphs'], inplace=True)   
    # Merge with lineups
    # Try using ID
    away_batters = away_batters.merge(lineups, left_on='key_mlbam', right_on=' mlb id', how='left', suffixes=('', '_1'))  
    # If that doesn't work, use First2 Last5
    away_batters = away_batters.merge(lineups, left_on=['First2', 'Last5', 'TeamAbbrev'], right_on=['First2', 'Last5', 'BBREFTEAM'], how='left', suffixes=('', '_2'))
    # And then fill in missings     
    away_batters[' batting order'].fillna(away_batters[' batting order_2'], inplace=True)
    # Merge with Salaries
    away_batters = away_batters.merge(dk_salaries_cut, on=['First2', 'Last5', 'TeamAbbrev'], how='left')
    
    away_batters['Salary'] = np.where(away_batters['Salary'] == 0, 99999, away_batters['Salary'])


    # Create batting order
    # Remove pitchers (shouldn't be necessary next year)
    away_batters[' batting order'] = pd.to_numeric(away_batters[' batting order'], errors='coerce')
    away_batters.sort_values(by=[' batting order'], inplace=True)
    away_batters.reset_index(inplace=True)


    # Read in home batter data
    home_batters = pd.read_excel(os.path.join(baseball_path, "Rosters", home_file), sheet_name='Batters')
    # Fill in missing key_mlbam variable
    home_batters['key_mlbam'].fillna(home_batters['key_fangraphs'], inplace=True)
    # Merge with lineups
    # Try using ID
    home_batters = home_batters.merge(lineups, left_on='key_mlbam', right_on=' mlb id', how='left', suffixes=('', '_1'))  
    # If that doesn't work, use First2 Last5
    home_batters = home_batters.merge(lineups, left_on=['First2', 'Last5', 'TeamAbbrev'], right_on=['First2', 'Last5', 'BBREFTEAM'], how='left', suffixes=('', '_2'))
    # And then fill in missings     
    home_batters[' batting order'].fillna(home_batters[' batting order_2'], inplace=True)
    # Merge with salaries
    home_batters = home_batters.merge(dk_salaries_cut, on=['First2', 'Last5', 'TeamAbbrev'], how='left')
    
    home_batters['Salary'] = np.where(home_batters['Salary'] == 0, 99999, home_batters['Salary'])

    # Create batting order
    # Remove pitchers 
    home_batters[' batting order'] = pd.to_numeric(home_batters[' batting order'], errors='coerce')
    home_batters.sort_values(by=[' batting order'], inplace=True)
    home_batters.reset_index(inplace=True)

    
    
    # Read in away pitcher data
    away_pitchers = pd.read_excel(os.path.join(baseball_path, "Rosters", away_file), sheet_name='Pitchers')
    # Fill in missing key_mlbam variable
    away_pitchers['key_mlbam'].fillna(away_pitchers['key_fangraphs'], inplace=True)
    # Merge with lineups
    # Try using ID
    away_pitchers = away_pitchers.merge(lineups, left_on='key_mlbam', right_on=' mlb id', how='left', suffixes=('', '_1'))
    # If that doesn't work, use First2 Last5
    away_pitchers = away_pitchers.merge(lineups, left_on=['First2', 'Last5', 'TeamAbbrev'], right_on=['First2', 'Last5', 'BBREFTEAM'], how='left', suffixes=('', '_2'))
    # And then fill in missings
    away_pitchers[' batting order'].fillna(away_pitchers[' batting order_2'], inplace=True)    
    # Merge with Salaries
    away_pitchers = away_pitchers.merge(dk_salaries_cut, on=['First2', 'Last5', 'TeamAbbrev'], how='left')
    
    away_pitchers['Salary'] = np.where(away_pitchers['Salary'] == 0, 99999, away_pitchers['Salary'])
    away_pitchers = determine_leverage(away_pitchers)

    # Read in home pitcher data
    home_pitchers = pd.read_excel(os.path.join(baseball_path, "Rosters", home_file), sheet_name='Pitchers')
    # Fill in missing key_mlbam variable
    home_pitchers['key_mlbam'].fillna(home_pitchers['key_fangraphs'], inplace=True)
    # Merge with lineups
    # Try using ID
    home_pitchers = home_pitchers.merge(lineups, left_on='key_mlbam', right_on=' mlb id', how='left', suffixes=('', '_1'))
    # If that doesn't work, use First2 Last5
    home_pitchers = home_pitchers.merge(lineups, left_on=['First2', 'Last5', 'TeamAbbrev'], right_on=['First2', 'Last5', 'BBREFTEAM'], how='left', suffixes=('', '_2'))
    # And then fill in missings
    home_pitchers[' batting order'].fillna(home_pitchers[' batting order_2'], inplace=True)    
    # Merge with Salaries
    home_pitchers = home_pitchers.merge(dk_salaries_cut, on=['First2', 'Last5', 'TeamAbbrev'], how='left')
    
    home_pitchers['Salary'] = np.where(home_pitchers['Salary'] == 0, 99999, home_pitchers['Salary'])
    home_pitchers = determine_leverage(home_pitchers)


    matchup_file = matchup + ".xlsx"
    file_name = os.path.join(baseball_path, "Matchups", matchup_file)

    # Rename batting order variable (need to get rid of spaces to use in player objects)
    away_batters.rename(columns={' batting order': "batting_order"}, inplace=True)
    home_batters.rename(columns={' batting order': "batting_order"}, inplace=True)

    # Drop unneeded variables
    home_batters.drop(columns=['index', 'name_nick', 'name_last', 'name_first', ' mlb id', 'First2_1', 'Last5_1', ' mlb id_2', ' batting order_2', 'Name_Chadwick'], axis=1, inplace=True)
    away_batters.drop(columns=['index', 'name_nick', 'name_last', 'name_first', ' mlb id', 'First2_1', 'Last5_1', ' mlb id_2', ' batting order_2', 'Name_Chadwick'], axis=1, inplace=True)

    # Move batting order first
    # Home batters
    first_column = home_batters.pop('batting_order')
    home_batters.insert(0, 'batting_order', first_column)
    # Away batters
    first_column = away_batters.pop('batting_order')
    away_batters.insert(0, 'batting_order', first_column)
    
    # Move leverage first
    # Home pitchers
    first_column = home_pitchers.pop('Leverage')
    home_pitchers.insert(0, 'Leverage', first_column)
    # Away pitchers
    first_column = away_pitchers.pop('Leverage')
    away_pitchers.insert(0, 'Leverage', first_column)
    
    away_batters['ID'].fillna(away_batters['key_fangraphs'], inplace=True)
    home_batters['ID'].fillna(home_batters['key_fangraphs'], inplace=True)
    away_pitchers['ID'].fillna(away_pitchers['key_fangraphs'], inplace=True)
    home_pitchers['ID'].fillna(home_pitchers['key_fangraphs'], inplace=True)
    
    away_batters['Salary'].fillna(999999, inplace=True)
    home_batters['Salary'].fillna(999999, inplace=True)
    away_pitchers['Salary'].fillna(999999, inplace=True)
    home_pitchers['Salary'].fillna(999999, inplace=True)
    
    # If this is not 0, we're missing a batter
    print(45 - away_batters['batting_order'].sum())
    print(45 - home_batters['batting_order'].sum())
    
    
#     # Fill in with averages of team (ideally, you want to do averages of non-starters) Fix later
#     bat_list = ['so_l', 'b1_l', 'b2_l', 'b3_l', 'hr_l', 'bb_l', 'hbp_l', 'lo_l', 'po_l', 'go_l', 'fo_l', 
#                  'so_r', 'b1_r', 'b2_r', 'b3_r', 'hr_r', 'bb_r', 'hbp_r', 'lo_r', 'po_r', 'go_r', 'fo_r', 
#                  'sba_2b', 'sba_3b', 'sba_2b', 'sba_3b']
    
#     pit_list = ['so_l', 'b1_l', 'b2_l', 'b3_l', 'hr_l', 'bb_l', 'hbp_l', 'lo_l', 'po_l', 'go_l', 'fo_l', 
#              'so_r', 'b1_r', 'b2_r', 'b3_r', 'hr_r', 'bb_r', 'hbp_r', 'lo_r', 'po_r', 'go_r', 'fo_r', 
#              'avgouts', 'starter_sample']
    
#     for stat in bat_list:
#         away_batters[stat].fillna(away_batters[stat].mean(), inplace=True)
#         home_batters[stat].fillna(home_batters[stat].mean(), inplace=True)
    
#     for stat in pit_list:
#         away_pitchers[stat].fillna(away_pitchers[stat].mean(), inplace=True)
#         home_pitchers[stat].fillna(home_pitchers[stat].mean(), inplace=True)
                 
    
    # Write to Excel
    away_batters.to_excel(file_name, sheet_name="AwayBatters", engine='openpyxl')

    with pd.ExcelWriter(file_name, mode='a', engine='openpyxl') as writer:  
        away_pitchers.to_excel(writer, sheet_name='AwayPitchers')

    with pd.ExcelWriter(file_name, mode='a', engine='openpyxl') as writer:  
        home_batters.to_excel(writer, sheet_name='HomeBatters')

    with pd.ExcelWriter(file_name, mode='a', engine='openpyxl') as writer:  
        home_pitchers.to_excel(writer, sheet_name='HomePitchers')
        
print("Done")

<class 'numpy.ndarray'>
['CLE_NYY 10142022 0107PM ET', 'ATL_PHI 10142022 0437PM ET', 'LAD_SDP 10142022 0837PM ET']
CLE_NYY 10142022 0107PM ET
1
1
0.0
0.0
ATL_PHI 10142022 0437PM ET
1
1
0.0
0.0
LAD_SDP 10142022 0837PM ET
1
1
0.0
0.0
Done


# Notes:
At some point, you lost any indicator for whether a batter is a switch hitter. Now, we're just using the hand fairly randomly assigned in the statcast data. Fix this.
Basically only matters for filling in a switch hitters stats when you have little data

In [11]:
directory = "Matchups" + todaysdate

try:
    os.mkdir(os.path.join(baseball_path, "Matchups Archive", directory))
except:
    pass

In [12]:
from_directory = os.path.join(baseball_path, "Matchups")
to_directory = os.path.join(baseball_path, "Matchups Archive", directory)

copy_tree(from_directory, to_directory)

['C:\\Users\\james\\OneDrive\\Documents\\MLB\\Data\\Matchups Archive\\Matchups20221014\\ATL_PHI 10142022 0437PM ET.xlsx',
 'C:\\Users\\james\\OneDrive\\Documents\\MLB\\Data\\Matchups Archive\\Matchups20221014\\CLE_NYY 10142022 0107PM ET.xlsx',
 'C:\\Users\\james\\OneDrive\\Documents\\MLB\\Data\\Matchups Archive\\Matchups20221014\\LAD_SDP 10142022 0837PM ET.xlsx']

In [13]:
# This creates the folder where game sims will end up 
directory = "Matchups" + todaysdate

try:
    os.mkdir(os.path.join(baseball_path, "Player Sims", directory))
except:
    pass

This could be cleaned up to write to the date-specific folder only, without copying. This whole code is pretty gross and could benefit from some cleaning

Normal site doens't work in the playoffs. May have to enter manually.
https://www.rotowire.com/baseball/daily-lineups.php

In [14]:
print("Code was last run on: " + str(datetime.date.today()))

Code was last run on: 2022-10-14
