# Utilities

In [None]:
# Clean names
# Unaccented base letter -> the accented lowercase variants it stands in for.
_ACCENT_GROUPS = {
    'a': 'àáâãäå',
    'e': 'èéêë',
    'i': 'ìíîï',
    'o': 'òóôõö',
    'u': 'ùúûü',
    'n': 'ñ',
}

# Single translation table covering each variant in both lowercase and uppercase.
_ACCENT_TABLE = str.maketrans({
    cased_variant: cased_base
    for base, variants in _ACCENT_GROUPS.items()
    for variant in variants
    for cased_variant, cased_base in ((variant, base), (variant.upper(), base.upper()))
})


def remove_accents(old):
    """Return ``old`` with common accented Latin letters replaced by plain ASCII letters.

    Handles the a/e/i/o/u variants listed in ``_ACCENT_GROUPS`` plus ñ, in both
    lowercase and uppercase (e.g. "Ángel Peña" -> "Angel Pena"). Uppercase
    matters because callers strip every non-ASCII-alphanumeric character
    afterwards, which would otherwise delete a leading "Á" entirely.

    Characters outside that set (e.g. "ç") are returned unchanged.
    """
    return old.translate(_ACCENT_TABLE)

In [None]:
# Clean names for consistency
# This is really only used now to clean DK Salaries
# You should add players to this if they're not merging on salary information
def name_clean(df, name_col):
    """Add a normalized ``'Name'`` column to ``df`` built from ``df[name_col]``.

    Steps, in order:
      1. apply the manual spelling fixes below (exact, whole-name matches),
      2. replace accented letters via ``remove_accents``,
      3. drop every character that is not an ASCII letter, digit, or space,
      4. remove the generational suffixes Jr / Sr / III / II,
      5. strip leading and trailing whitespace.

    Parameters:
        df: DataFrame holding the raw names; it is modified in place.
        name_col: label of the column containing the raw names.

    Returns:
        The same ``df`` object, with the ``'Name'`` column added or overwritten.
    """
    # Spellings that differ between sources. Add players here if they are not
    # merging on salary information.
    manual_fixes = {
        "Kike Hernandez": "Enrique Hernandez",
        "Michael A. Taylor": "Michael Taylor",
        # Note: to get all the de la Cruz's of the world right as last names,
        # the Ji Mans are joined into a single token so they stay the first name.
        "Ji Man Choi": "Ji-Man Choi",
        # He technically has no dash, but it is needed so the last name is treated properly.
        "Ji Hwan Bae": "Ji-Hwan Bae",
        "Hyun Jin Ryu": "Hyun-Jin Ryu",
    }
    names = df[f'{name_col}'].replace(manual_fixes)

    # Element-wise on the column rather than row-wise over the whole frame.
    names = names.apply(remove_accents)

    # regex=True is spelled out: the default for Series.str.replace changed
    # between pandas versions, and as a literal this pattern would match nothing.
    # This also removes periods, so no separate "." replacement is needed.
    names = names.str.replace(r'[^a-zA-Z0-9 ]', '', regex=True)

    # Longest suffix first: removing "II" before "III" would leave a stray "I".
    for suffix in ("Jr", "Sr", "III", "II"):
        names = names.str.replace(suffix, "", regex=False)

    df['Name'] = names.str.strip()

    return df

In [None]:
# Clean FanGraphs ID
# Once had individual manual replacements, now could probably be done without
def fix_fangraphs(chadwick):
    """Convert ``chadwick['key_fangraphs']`` to strings and strip ``.<digit>`` fragments.

    The column is cast to ``str`` and every occurrence of a period followed by
    one digit is removed, so a float-looking ID such as ``"12345.0"`` becomes
    ``"12345"``. The frame is modified in place and also returned.
    """
    ids_as_text = chadwick['key_fangraphs'].astype('str')
    chadwick['key_fangraphs'] = ids_as_text.str.replace(r'\.\d', "", regex=True)
    return chadwick

In [None]:
# Reads in select variables from the Chadwick Register
# You should add keys to this if they're not merging with FanGraphs data
def read_chadwick(keep_list):
    """Download the Chadwick Register and return selected columns as one DataFrame.

    The register is split across sixteen CSV files (suffixes 0-9 and a-f); each
    is read from GitHub, reduced to ``keep_list``, and stripped of rows with no
    ``key_mlbam`` before everything is concatenated.

    Parameters:
        keep_list: column labels to keep. The code below also reads
            ``key_mlbam``, ``key_fangraphs``, ``name_last`` and ``name_first``,
            so those must be included.

    Returns:
        DataFrame with the kept columns plus an ``index`` column (the per-file
        row labels, kept because ``reset_index()`` is called without
        ``drop=True``). ``key_fangraphs`` is cleaned by ``fix_fangraphs``;
        missing last/first names become "Missing"/"Mr"; both name columns have
        accents replaced and all non-alphanumeric characters removed.
    """
    url_template = "https://raw.githubusercontent.com/chadwickbureau/register/master/data/people-{}.csv"

    # One dataframe per file suffix
    dataframe_list = []
    for char in "0123456789abcdef":
        # Read in that csv, keeping relevant variables
        df = pd.read_csv(url_template.format(char), low_memory=False, encoding='utf-8')[keep_list]
        # Drop if missing key_mlbam
        df = df.dropna(subset=['key_mlbam'], axis=0)
        dataframe_list.append(df)

    # Append all dataframes together
    chadwick = pd.concat(dataframe_list, axis=0).reset_index()

    # Edit missing fangraphs IDs (if all else fails)
    chadwick = fix_fangraphs(chadwick)

    for column, placeholder in (('name_last', "Missing"), ('name_first', "Mr")):
        # Assign the result back: calling fillna(inplace=True) on a column
        # selection is chained assignment and does not update the frame under
        # pandas Copy-on-Write.
        names = chadwick[column].fillna(placeholder)
        # Remove accents element-wise (much cheaper than a row-wise apply).
        names = names.map(remove_accents)
        # Remove non-alphanumeric characters; this also covers the spaces and
        # periods that were previously removed in separate steps.
        chadwick[column] = names.str.replace(r"[^a-zA-Z0-9]+", "", regex=True)

    # Return big dataframe
    return chadwick

In [None]:
# Searches for player IDs
def new_ids(player, team, website, timeout=10):
    """Look up a player's ID on ``website`` by scraping the first Google result.

    Searches Google for "<player> <team> player page <website>", takes the first
    link inside the element with id ``search``, and extracts the ID from its URL.

    Parameters:
        player: player name used in the search query.
        team: team name used in the search query.
        website: site name appended to the query. ``"fangraphs"`` selects the
            sixth "/"-separated piece of the link; anything else takes the text
            after the last "-" (intended for RotoWire and MLB.com links).
        timeout: seconds to wait for Google before ``requests`` gives up. Without
            a timeout the request could block indefinitely.

    Returns:
        The ID as a string. IDs starting with "sa" (minor leaguer codes) are
        wrapped in single quotes.

    NOTE(review): if the results page has no ``search`` element or no link in
    it, the lookups below fail with AttributeError/TypeError; a link with an
    unexpected shape gives IndexError. Callers should be prepared for that.
    """
    # Google player plus website
    search = player + ' ' + team + ' player page ' + website
    url = 'https://www.google.com/search'

    headers = {
        'Accept' : '*/*',
        'Accept-Language': 'en-US,en;q=0.5',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82',
    }
    parameters = {'q': search}

    # Get info from URL
    content = requests.get(url, headers=headers, params=parameters, timeout=timeout).text
    soup = BeautifulSoup(content, 'html.parser')
    text = soup.find(id = 'search')
    first_link = text.find('a')

    # FanGraph's ID is found a little differently
    if website == "fangraphs":
        website_id = first_link['href'].split("/")[5]

    # This should work for RotoWire and MLB.com
    else:
        website_id = first_link['href'].split("-")[-1]

    # If it's a minor leaguer code, add quotes
    if website_id.startswith("sa"):
        website_id = "'" + website_id + "'"

    return website_id

In [None]:
# Reads in contest history
def contest_history(entry_min=8, date_min="20220301", date_max="20991231"):
    """Load the DraftKings contest entry history, filtered to MLB contests.

    Reads ``draftkings-contest-entry-history.csv`` from the ``Utilities`` folder
    under ``baseball_path`` and keeps one row per ``Contest_Key``.

    Parameters:
        entry_min: minimum ``Contest_Entries`` (inclusive).
        date_min: lower date bound as a YYYYMMDD string (exclusive).
        date_max: upper date bound as a YYYYMMDD string (exclusive).

    Returns:
        DataFrame with a fresh 0..n-1 index and an added ``date`` column
        (``Contest_Date_EST`` formatted as YYYYMMDD).
    """
    csv_path = os.path.join(baseball_path, "Utilities", "draftkings-contest-entry-history.csv")
    entries = pd.read_csv(csv_path)

    # MLB contests that are large enough, one row per contest
    entries = entries[entries['Sport'] == "MLB"]
    entries = entries[entries['Contest_Entries'] >= entry_min]
    entries = entries.drop_duplicates('Contest_Key')

    # YYYYMMDD strings compare correctly as text
    entries['date'] = pd.to_datetime(entries['Contest_Date_EST']).dt.strftime('%Y%m%d')
    entries = entries[entries['date'] > date_min]
    entries = entries[entries['date'] < date_max]

    return entries.reset_index(drop=True)

In [None]:
# Calculate the share of each outcome as a share of either outs or not outs
def pa_share(df, hand=""):
    """Rescale outcome rates so they are shares of "reached base" or of "out".

    Works on four column groups at once, identified by the suffixes
    ``_b``, ``_p``, ``_b_long`` and ``_p_long`` (each followed by ``hand``).
    For every group it:
      * sums the reach-base rates (b1, b2, b3, hr, bb, hbp) into ``safe_share<group>``,
      * sets ``out_share<group>`` to ``1 - safe_share<group>``,
      * divides each reach-base rate by the safe share, and
      * divides each out rate (so, fo, go, lo, po) by the out share.

    ``df`` is modified in place and returned.

    NOTE: a later definition of ``pa_share`` in this file has a different
    signature and replaces this one when both are executed.
    """
    reach_base_stats = ('b1', 'b2', 'b3', 'hr', 'bb', 'hbp')
    out_stats = ('so', 'fo', 'go', 'lo', 'po')
    groups = [f'_b{hand}', f'_p{hand}', f'_b_long{hand}', f'_p_long{hand}']

    # Share reaching base: start every total at zero, then add stat by stat
    for group in groups:
        df[f'safe_share{group}'] = 0

    for stat in reach_base_stats:
        for group in groups:
            total_col = f'safe_share{group}'
            df[total_col] = df[total_col] + df[f'{stat}{group}']

    # Share out is whatever is left
    for group in groups:
        df[f'out_share{group}'] = 1 - df[f'safe_share{group}']

    # Reach-base stats as a share of reaching base, then out stats as a share of outs
    for stats, share_name in ((reach_base_stats, 'safe_share'), (out_stats, 'out_share')):
        for stat in stats:
            for group in groups:
                column = f'{stat}{group}'
                df[column] = df[column] / df[f'{share_name}{group}']

    return df

In [None]:
# Calculate the share of each outcome as a share of either outs or not outs
def pa_share(df, position="_b", period="", hand=""):
    """Rescale outcome rates so they are shares of "reached base" or of "out".

    Operates on the single column group whose names end in
    ``<position><period><hand>`` (e.g. ``b1_b``, ``so_p_long``):
      * sums the reach-base rates (b1, b2, b3, hr, bb, hbp) into ``safe_share<suffix>``,
      * sets ``out_share<suffix>`` to ``1 - safe_share<suffix>``,
      * divides each reach-base rate by the safe share, and
      * divides each out rate (so, fo, go, lo, po) by the out share.

    ``df`` is modified in place and returned.

    NOTE: an earlier definition of ``pa_share(df, hand="")`` exists in this
    file; this one replaces it when both are executed.
    """
    suffix = f'{position}{period}{hand}'
    safe_col = f'safe_share{suffix}'
    out_col = f'out_share{suffix}'

    reach_base_cols = [f'{stat}{suffix}' for stat in ('b1', 'b2', 'b3', 'hr', 'bb', 'hbp')]
    out_cols = [f'{stat}{suffix}' for stat in ('so', 'fo', 'go', 'lo', 'po')]

    # Share reaching base: running total, added one stat at a time
    df[safe_col] = 0
    for column in reach_base_cols:
        df[safe_col] = df[safe_col] + df[column]

    # Share out is whatever is left
    df[out_col] = 1 - df[safe_col]

    # Reach-base stats as a share of reaching base
    for column in reach_base_cols:
        df[column] = df[column] / df[safe_col]

    # Out stats as a share of outs
    for column in out_cols:
        df[column] = df[column] / df[out_col]

    return df

In [None]:
# Read in schedule to extract gamePKs based on teams and dates
def read_schedule():
    """Fetch the 2022, 2023 and 2024 schedules from ``statsapi`` as one list.

    Each season is requested separately with ``statsapi.schedule`` and the
    results are concatenated in chronological order.
    """
    # (start_date, end_date) per season, MM/DD/YYYY
    season_windows = (
        ("03/04/2022", "11/05/2022"),
        ("03/04/2023", "11/01/2023"),
        ("03/04/2024", "11/01/2024"),
    )

    # Fetch every season first, then add 'em together
    per_season = [
        statsapi.schedule(start_date=first_day, end_date=last_day)
        for first_day, last_day in season_windows
    ]
    games = per_season[0] + per_season[1] + per_season[2]

    return games

In [None]:
# Read in games (either from Stats API or pickle)
def read_and_save_games(team_map, generate=False):
    """Build the game-level DataFrame from the Stats API or from ``games.pkl``.

    Parameters:
        team_map: DataFrame with at least ``FULLNAME`` and ``BBREFTEAM`` columns,
            used to translate full team names into abbreviations.
        generate: when equal to ``True``, call ``read_schedule()`` and overwrite
            ``games.pkl`` under ``baseball_path``; otherwise load that pickle.

    Returns:
        DataFrame with one row per ``game_id`` (last occurrence kept), excluding
        game types S, A and E and games whose status is "Cancelled". Adds
        ``date`` (game_date without dashes), ``year``, ``away_team`` and
        ``home_team``; drops ``away_name``/``home_name``; scores are cast to int.
    """
    pickle_path = os.path.join(baseball_path, "games.pkl")

    if generate == True:
        # This may generate an HTTP error, particularly in 2022. Trying multiple times usually works.
        games = read_schedule()
        with open(pickle_path, "wb") as file:
            pickle.dump(games, file)
    else:
        # NOTE: pickle.load can execute arbitrary code; only load a games.pkl written by this function.
        with open(pickle_path, "rb") as file:
            games = pickle.load(file)

    schedule = pd.DataFrame(games)

    # Date (YYYYMMDD) and year derived from the dashed game_date string
    schedule['date'] = schedule['game_date'].str.replace("-","")
    schedule['year'] = schedule['game_date'].str[0:4]

    # Exclude spring training, all-star games, exhibitions, and cancelled games
    schedule = schedule.query('game_type != "S" and game_type != "A" and game_type != "E" and status != "Cancelled"').reset_index(drop=True)

    # Merge in BBREFTEAM once for the away side, then once for the home side;
    # the second merge produces the _x (away) / _y (home) suffixes used below.
    abbreviations = team_map[['FULLNAME', 'BBREFTEAM']]
    for name_column in ('away_name', 'home_name'):
        schedule = schedule.merge(abbreviations, left_on=name_column, right_on='FULLNAME', how='left')

    schedule = schedule.rename(columns={'BBREFTEAM_x':'away_team', 'BBREFTEAM_y':'home_team'})
    schedule = schedule.drop(columns=['FULLNAME_x', 'FULLNAME_y', 'away_name', 'home_name'])

    # Convert scores to numeric
    for score_column in ('away_score', 'home_score'):
        schedule[score_column] = schedule[score_column].astype('int')

    # Keep the last row for each game_id
    schedule = schedule.drop_duplicates('game_id', keep='last')
    schedule = schedule.reset_index(drop=True)

    return schedule

In [None]:
def pause_code(start_time='2023-08-09T07:24:30'):
    """Sleep until ``start_time``, an ISO-8601 timestamp compared against local ``now()``.

    Prints the number of seconds remaining (negative if the time has already
    passed) and then sleeps for that long. If ``start_time`` is in the past the
    function returns immediately instead of raising.

    NOTE: a later definition of ``pause_code`` in this file (with an extra
    ``timezone`` parameter) replaces this one when both are executed.
    """
    pause_until = datetime.datetime.fromisoformat(start_time)  # or whatever timestamp you need
    # Sample the clock once so the printed value and the sleep duration agree.
    seconds_left = (pause_until - datetime.datetime.now()).total_seconds()
    print(seconds_left)
    # time.sleep() raises ValueError on a negative duration, so clamp at zero.
    time.sleep(max(seconds_left, 0))

In [6]:
def pause_code(start_time='2023-08-09T07:24:30', timezone='EST'):
    """Block until ``start_time``, interpreted as America/New_York wall-clock time.

    Prints the target time and how long remains, then waits in steps of at most
    one second so Ctrl+C is noticed promptly. If the target time has already
    passed, the countdown is shown as zero and the function returns right away.

    Parameters:
        start_time: ISO-8601 timestamp without a UTC offset; it is localized to
            America/New_York. (NOTE(review): pytz's ``localize`` presumably
            rejects timestamps that already carry an offset -- confirm.)
        timezone: currently ignored; the zone is always America/New_York. Kept
            so existing calls that pass it continue to work.

    Returns:
        None, both after a completed wait and after a KeyboardInterrupt (in
        which case "Program interrupted by user." is printed).

    NOTE(review): an earlier version computed today's/yesterday's date strings
    into local variables after the wait. They were never returned or stored, so
    they had no effect and were removed; recompute dates in the caller if they
    are needed after the pause.
    """
    est_timezone = pytz.timezone('America/New_York')  # US Eastern, daylight saving aware

    # Convert start_time to an aware datetime in Eastern time, then to UTC
    naive_datetime = datetime.datetime.fromisoformat(start_time)
    est_start_time = est_timezone.localize(naive_datetime)
    utc_start_time = est_start_time.astimezone(pytz.utc)

    total_seconds = (utc_start_time - datetime.datetime.now(pytz.utc)).total_seconds()

    # Clamp for display: floor division on a negative duration would print
    # misleading values such as "-2 hours, 59 minutes".
    display_seconds = max(total_seconds, 0)
    hours = int(display_seconds // 3600)
    minutes = int((display_seconds % 3600) // 60)
    seconds = int(display_seconds % 60)

    est_time_str = est_start_time.strftime("%I:%M%p")
    time_until_str = f"{est_time_str}. {hours} hours, {minutes} minutes, and {seconds} seconds."

    print("Time until", time_until_str)

    # Re-check the clock on every pass instead of counting sleeps: a counter
    # drifts (each pass takes slightly more than a second) and could overshoot
    # the target by up to a second.
    try:
        while True:
            remaining = (utc_start_time - datetime.datetime.now(pytz.utc)).total_seconds()
            if remaining <= 0:
                break
            time.sleep(min(1, remaining))
    except KeyboardInterrupt:
        print("Program interrupted by user.")
        return

Idea: a universal function that takes over team_map's job of renaming teams?
For example, look at the column of team abbreviations and normalize the variants (e.g. CHW or CWS -> CWS).