# Utilities

In [16]:
# import re
# import pandas as pd
# import numpy as np
# import requests
# from bs4 import BeautifulSoup
# import os

# import warnings
# warnings.simplefilter(action="ignore")

# baseball_path = r"C:\Users\james\Documents\MLB\Data2"


In [2]:
# Clean names
def remove_accents(old):
    """Return ``old`` with accents stripped from its letters.

    The string is canonically decomposed (NFD), every combining mark is
    dropped, and the result is recomposed (NFC). This covers the lowercase
    vowels and ``ñ`` that were handled before, and additionally uppercase
    and other accented letters (``Á``, ``É``, ``Ñ``, ``ç`` ...), which
    previously passed through untouched.

    Parameters
    ----------
    old : str
        Text to clean.

    Returns
    -------
    str
        ``old`` without diacritical marks. Characters that have no canonical
        decomposition (for example ``ø``) are returned unchanged.
    """
    # Imported locally so the function carries its own dependency.
    import unicodedata

    decomposed = unicodedata.normalize('NFD', old)
    # combining() is non-zero only for combining marks such as accents.
    stripped = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    return unicodedata.normalize('NFC', stripped)

In [3]:
# Clean names for consistency
# This is really only used now to clean DK Salaries
# You should add players to this if they're not merging on salary information
def name_clean(df):
    """Standardise player names and derive short merge keys from them.

    ``df`` is modified in place and also returned. It must contain a ``Name``
    column of strings. The following columns are written:

    * ``Name``          - with a handful of manual spelling fixes applied
    * ``Name_Adjusted`` - accents and punctuation removed, and the
      generational suffixes Jr / Sr / II / III dropped
    * ``First``/``Last`` - lowercase, space-free first word / remainder
    * ``First2``/``Last5`` - first 2 letters of ``First`` and first 5 letters
      of ``Last``, used as a sort of merge code

    NOTE(review): the first/last split assumes at least one name contains a
    space, so that the split produces two columns - confirm for tiny inputs.
    """
    manual_fixes = {
        "Kike Hernandez": "Enrique Hernandez",
        "Michael A. Taylor": "Michael Taylor",
        # Note: to get all the de la Cruz's of the world right as last names,
        # we need to manually join the Ji Mans into one word so they're the
        # first name
        "Ji Man Choi": "Ji-Man Choi",
        # he technically has no dash, but we need it so it treats last name properly
        "Ji Hwan Bae": "Ji-Hwan Bae",
        "Hyun Jin Ryu": "Hyun-Jin Ryu",
    }
    df['Name'] = df['Name'].replace(manual_fixes)

    # Column-wise map; no need to build a row object per player.
    adjusted = df['Name'].apply(remove_accents)
    # regex=True is spelled out: the pandas default for str.replace differs
    # between versions, and as a literal this pattern would never match.
    adjusted = adjusted.str.replace(r'[^a-zA-Z0-9 ]', '', regex=True)
    # Drop suffixes as whole words only, so "III" is not reduced to "I" by an
    # earlier "II" removal and letters inside a name are left alone.
    adjusted = adjusted.str.replace(r'\b(?:Jr|Sr|III|II)\b', '', regex=True)
    df['Name_Adjusted'] = adjusted

    # Separate first and last names: first word vs. everything after it
    df[['First', 'Last']] = df['Name_Adjusted'].str.split(" ", n=1, expand=True)
    for part in ('First', 'Last'):
        df[part] = df[part].str.lower().str.replace(' ', '', regex=False)

    # Take first 2 letters of first name and first 5 of last as a sort of merge code
    df['First2'] = df['First'].str.slice(0, 2)
    df['Last5'] = df['Last'].str.slice(0, 5)

    return df

In [4]:
# Clean FanGraphs ID
# This once held individual manual replacements; it could probably be removed now
def fix_fangraphs(chadwick):
    """Store ``key_fangraphs`` as text without a float-style decimal tail.

    The column is cast to ``str`` and every "dot followed by one digit"
    sequence is removed (so ``"12345.0"`` becomes ``"12345"``). ``chadwick``
    is updated in place and returned.
    """
    ids_as_text = chadwick['key_fangraphs'].astype('str')
    chadwick['key_fangraphs'] = ids_as_text.str.replace(r'\.\d', "", regex=True)
    return chadwick

In [7]:
# Reads in select variables from the Chadwick Register
# You should add keys to this if they're not merging with FanGraphs data
def read_chadwick(keep_list):
    """Download the Chadwick Register and return selected, cleaned columns.

    The register is published as sixteen CSV files (suffixes ``0``-``9`` and
    ``a``-``f``). Each file is read, reduced to ``keep_list``, and stripped of
    rows without ``key_mlbam``; the pieces are then concatenated.

    Parameters
    ----------
    keep_list : list of str
        Columns to keep. Must include ``key_mlbam``, ``key_fangraphs``,
        ``name_last`` and ``name_first``, because all four are used below.

    Returns
    -------
    pandas.DataFrame
        The combined register. ``reset_index()`` is called without ``drop``,
        so each row's position within its source file is kept in an ``index``
        column. ``key_fangraphs`` is passed through ``fix_fangraphs``. Missing
        last/first names become "Missing"/"Mr", and both name columns are
        reduced to unaccented ASCII letters and digits.
    """
    url_template = "https://raw.githubusercontent.com/chadwickbureau/register/master/data/people-{}.csv"

    # One dataframe per file suffix
    dataframe_list = []
    for char in "0123456789abcdef":
        # Read in that csv, keeping relevant variables
        df = pd.read_csv(url_template.format(char), low_memory=False, encoding='utf-8')[keep_list]
        # Drop if missing key_mlbam
        dataframe_list.append(df.dropna(subset=['key_mlbam'], axis=0))
    # Append all dataframes together
    chadwick = pd.concat(dataframe_list, axis=0).reset_index()

    # Edit missing fangraphs IDs (if all else fails)
    chadwick = fix_fangraphs(chadwick)

    placeholders = {'name_last': "Missing", 'name_first': "Mr"}
    for column, placeholder in placeholders.items():
        # Assign the filled column back: an in-place fillna on a column
        # selection is not guaranteed to update the parent frame.
        names = chadwick[column].fillna(placeholder)
        # regex=False: "." must be a literal dot, not "any character"
        names = names.str.replace(" ", "", regex=False)
        names = names.str.replace(".", "", regex=False)
        # Remove accents
        names = names.apply(remove_accents)
        # Remove non-alpha numeric characters
        chadwick[column] = names.str.replace(r"[^a-zA-Z0-9]+", "", regex=True)

    # Return big dataframe
    return chadwick

Unnamed: 0,index,key_uuid,key_person,key_retro,key_mlbam,key_bbref,key_fangraphs,name_last,name_first
0,0,000007d9-a2b6-47fd-bc61-e582f190e74a,000007d9,,472542.0,,,Garcia,Adolfito
1,1,000018b8-a25e-45b8-b060-5c4877cb84cc,000018b8,,572962.0,,,Kaupang,Stephen
2,12,0001534a-3266-4c2f-9fdd-76d2a5094ad2,0001534a,,664660.0,,,Ramirez,Dagin
3,19,000207da-aeaa-4ac4-ab4d-834407e1df1d,000207da,,670448.0,,,Dexter,Sam
4,22,00022ebb-4e9a-4106-9ffa-6dacdf73e34e,00022ebb,,808910.0,,,Webb,Bryan


In [20]:
# Searches for player IDs
def new_ids(player, team, website):
    """Look up a player's ID on ``website`` from the first Google result.

    A Google search for "<player> <team> player page <website>" is issued and
    the ID is cut out of the first link found inside the results container.

    Parameters
    ----------
    player, team : str
        Player and team names used in the search query.
    website : str
        Site name appended to the query. ``"fangraphs"`` selects the
        FanGraphs URL layout; any other value uses the "last dash-separated
        piece" rule (intended for RotoWire and MLB.com).

    Returns
    -------
    str
        The extracted ID. IDs starting with ``"sa"`` (minor leaguer codes)
        are wrapped in single quotes.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or exceeds the timeout.
    AttributeError, TypeError, IndexError
        If the page has no results container, no link, or a link that does
        not have the expected shape.
    """
    # Google player plus team plus website
    search = player + ' ' + team + ' player page ' + website
    url = 'https://www.google.com/search'

    headers = {
        'Accept' : '*/*',
        'Accept-Language': 'en-US,en;q=0.5',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82',
    }
    parameters = {'q': search}

    # Get info from URL. Without a timeout a stalled connection would block
    # forever.
    response = requests.get(url, headers=headers, params=parameters, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    results = soup.find(id='search')
    href = results.find('a')['href']

    # FanGraph's ID is found a little differently
    if website == "fangraphs":
        website_id = href.split("/")[5]
    # This should work for RotoWire and MLB.com
    else:
        website_id = href.split("-")[-1]

    # If it's a minor leaguer code, add quotes
    if website_id.startswith("sa"):
        website_id = "'" + website_id + "'"

    return website_id

In [19]:
# Reads in contest history
def contest_history(entry_min=8, date_min="20220301", date_max="20991231"):
    """Load the DraftKings contest entry history, filtered to MLB contests.

    Reads ``Utilities/draftkings-contest-entry-history.csv`` under
    ``baseball_path``. Only rows with ``Sport == "MLB"`` and at least
    ``entry_min`` contest entries are kept, and only the first row per
    ``Contest_Key``. A ``date`` column (``YYYYMMDD`` text derived from
    ``Contest_Date_EST``) is added, and rows are kept only when that date is
    strictly between ``date_min`` and ``date_max``. The result has a fresh
    0..n-1 index.
    """
    csv_path = os.path.join(baseball_path, "Utilities", "draftkings-contest-entry-history.csv")
    entries = pd.read_csv(csv_path)

    # Sport and contest-size filters, then one row per contest
    wanted = (entries['Sport'] == "MLB") & (entries['Contest_Entries'] >= entry_min)
    entries = entries[wanted].drop_duplicates('Contest_Key')

    # YYYYMMDD strings compare in chronological order
    entries['date'] = pd.to_datetime(entries['Contest_Date_EST']).dt.strftime('%Y%m%d')
    in_window = (entries['date'] > date_min) & (entries['date'] < date_max)

    return entries[in_window].reset_index(drop=True)

In [None]:
def pause_code(start_time='2023-08-09T07:24:30'):
    """Sleep until the local clock reaches ``start_time``.

    Parameters
    ----------
    start_time : str
        Target timestamp in ISO format, parsed with
        ``datetime.datetime.fromisoformat`` and compared against
        ``datetime.datetime.now()``.

    The number of seconds left is printed (negative when the target is
    already in the past). If the target has passed, the function returns
    immediately instead of raising from ``time.sleep``.
    """
    pause_until = datetime.datetime.fromisoformat(start_time)  # or whatever timestamp you need
    # Compute the delay once so the printed value and the sleep agree.
    remaining = (pause_until - datetime.datetime.now()).total_seconds()
    print(remaining)
    # time.sleep rejects negative durations, so clamp at zero.
    time.sleep(max(remaining, 0))