# DFS Golf Analysis
This program is meant to read from various sources and explore possibilities of modelling golfer performance at various PGA courses to make money on DraftKings or FanDuel.

First import the necessary libraries:

In [1]:
import pandas as pd
import numpy as np

# Input necessary setup variables
engine = 'FanDuel'   # Enter FanDuel or DraftKings

Prep for metadata by creating dictionaries for all the location codes that are encountered in the data.  These will be used later to create cleaner location information about each tournament which will likely be used as a feature for estimating player performance.

In [2]:
USabbrevs = {'al': 'Alabama','ak': 'Alaska', 'az': 'Arizona', 'ar': 'Arkansas', 'ca': 'California', 'co': 'Colorado',
 'ct': 'Connecticut', 'de': 'Delaware', 'fl': 'Florida', 'ga': 'Georgia', 'hi': 'Hawaii', 'id': 'Idaho', 'il': 'Illinois',
 'in': 'Indiana', 'ia': 'Iowa', 'ks': 'Kansas', 'ky': 'Kentucky', 'la': 'Louisiana', 'me': 'Maine', 'md': 'Maryland',
 'ma': 'Massachusetts', 'mi': 'Michigan', 'mn': 'Minnesota', 'ms': 'Mississippi', 'mo': 'Missouri', 'mt': 'Montana',
 'ne': 'Nebraska', 'nv': 'Nevada', 'nh': 'New Hampshire', 'nj': 'New Jersey', 'nm': 'New Mexico', 'ny': 'New York',
 'nc': 'North Carolina', 'nd': 'North Dakota', 'oh': 'Ohio', 'ok': 'Oklahoma', 'or': 'Oregon', 'pa': 'Pennsylvania',
 'ri': 'Rhode Island', 'sc': 'South Carolina', 'sd': 'South Dakota', 'tn': 'Tennessee', 'tx': 'Texas', 'ut': 'Utah',
 'vt': 'Vermont', 'va': 'Virginia', 'wa': 'Washington', 'wv': 'West Virginia', 'wi': 'Wisconsin', 'wy': 'Wyoming'}
CANabbrevs = {'on': 'Ontario'}
UKabbrevs = {'england': 'England', 'nir': 'Northern Ireland', 'eng': 'England'}
trans = {'jpn':'Japan','mex':'Mexico', 'pur':'Puerto Rico','aus':'Australia','bah':'Bahamas','ber':'Bermuda',
         'chn':'China','kor':'South Korea','can':'Canada','dom':'Dominican Republic','mas':'Malaysia'}

# Combine all dicts
all_abbrevs = {}
_ = [all_abbrevs.update(d) for d in (USabbrevs, CANabbrevs, UKabbrevs, trans)]

# Prep Location Breakdown for tournament data
def loc_breakdown(row):  
    # Look for a match with the dictionaries listed at the top of this notebook
    if row['end_loc'] in USabbrevs.values():  # located in USA
        row['City'] = row['beg_loc']
        row['State'] = row['end_loc']
        row['Country'] = 'United States'
    elif row['end_loc'] in CANabbrevs.values():  # located in Canada
        row['City'] = row['beg_loc']
        row['State'] = row['end_loc']
        row['Country'] = 'Canada'
    elif row['end_loc'] in UKabbrevs.values():  # located in United Kingdom
        row['City'] = row['beg_loc']
        row['State'] = row['end_loc']
        row['Country'] = 'United Kingdom'
    elif row['end_loc'] in trans.values():    # located elsewhere
        row['City'] = row['beg_loc']
        row['Country'] = row['end_loc']
    else:
        pass
    return row

# Prep Zipcodes for use in tournament data
def find_zip(row):
    if row['Country']=='United States':
        try:
            return zips[(zips.state_name==row['State']) & (zips.city==row['City'])].index[0]
        except:
            return np.nan
    else:
        return np.nan

Replace missing cities with actual city name from the USPS website.  Read in csv file with zip code data for use in creating a location region feature

In [3]:
missing_cities = {'Kapalua': 'Lahaina', 'Ft. Worth': 'Fort Worth', 'Auburn/Opelika': 'Opelika', 'St. Louis': 'Saint Louis',
                  'Hilton Head': 'Hilton Head Island', 'Avondale': 'Westwego', 'Erin': 'Hartford', 'Blaine': 'Minneapolis',
                  'McKinney': 'Mckinney', 'Kiawah Island': 'Johns Island'}

# Read and modify the zips
zips = pd.read_csv('uszips.csv', index_col='zip')

FileNotFoundError: [Errno 2] No such file or directory: 'uszips.csv'

Create functions for easy extraction of data from various internet sources which include:
    [sportsdata.io](https://sportsdata.io/developers/api-documentation/golf#)

The data can be retreived in the following formats:
    - entire season data
    - specific player data
    - data from all players
    
Add new functions here as new data sources are discovered

In [4]:
# Setup calls to data
api = 'de4dc63e16ee485b9df3bb79146bdcc1'

# Individual seasons
def season_data(season):
    return pd.read_json('https://api.sportsdata.io/golf/v2/json/Tournaments/{}?key={}'.format(str(season),api))

# Individual players
def player_data(player_id):
    return pd.read_json('https://api.sportsdata.io/golf/v2/json/Player/{}?key={}'.format(str(player_id),api))

# All players
def all_players_data():
    players = pd.read_json('https://api.sportsdata.io/golf/v2/json/Players?key={}'.format(api)).replace({None: np.nan})
    players = players[players['DraftKingsName'].notnull() & players['FanDuelName'].notnull()]  # strip out the nulls
    for col in players.select_dtypes(include=np.float).columns:
        players[col] = players[col].astype(pd.Int32Dtype())  # convert float columns to ints

    cols_to_drop = ['FantasyAlarmPlayerID','FantasyDraftName','FantasyDraftPlayerID','PhotoUrl',
                    'RotoWirePlayerID', 'RotoworldPlayerID', 'SportRadarPlayerID', 'YahooPlayerID']
    if engine.lower() == 'draftkings':
        cols_to_drop.extend(['FanDuelName','FanDuelPlayerID'])
        players = players.drop(cols_to_drop,axis=1).set_index('DraftKingsName')
    elif engine.lower() == 'fanduel':
        cols_to_drop.extend(['DraftKingsName','DraftKingsPlayerID'])
        players = players.drop(cols_to_drop,axis=1).set_index('FanDuelName')
    
    return players

# All tournaments from current season (same as season_data(2020))
def tournament_data():
    col_order=['StartDate', 'StartDateTime', 'EndDate', 'City', 'State', 'Country', 'Location', 'ZipCode', 'TimeZone', 
               'Covered', 'Format', 'IsInProgress', 'IsOver', 'Name', 'Par', 'Purse', 'Rounds', 'TournamentID', 'Venue', 
               'Yards', 'Canceled']  # modify the order that the data is shown
    tourn = (pd.read_json('https://api.sportsdata.io/golf/v2/json/Tournaments?key={}'.format(api))
               .replace({None: np.nan})  # replace Nones with NaNs
               .dropna(subset=['Location'])  # drop row with NaN in column Location
               .loc[:,col_order])        # only use columns listed above
    
    # Convert the dates to datetime
    tourn.EndDate = pd.to_datetime(tourn.EndDate)
    
    # Add columns for location breakdown
    tourn['beg_loc'] = tourn['Location'].str.extract('^([A-Za-z0-9 /\.]+),')   # Extract before comma
    tourn['end_loc'] = tourn['Location'].str.extract(', ([A-Za-z0-9 ]+)$')  # Extract after last comma
    tourn['end_loc'] = tourn['end_loc'].str.lower().replace(all_abbrevs)  # Replace with adjustments
    
    # Convert data to proper columns and drop unnecessary columns
    tourn = tourn.apply(loc_breakdown, axis=1).drop(['Location','beg_loc','end_loc'],axis=1)
    
    # Fill in zip code column
    tourn['City'] = tourn['City'].replace(missing_cities)  # replace in missing_cities so all zip codes can be found
    tourn['ZipCode'] = tourn.apply(find_zip, axis=1).astype('Int64')
    return tourn
    


### Test tournament data function here

In [5]:
tournament_data()

Unnamed: 0,StartDate,StartDateTime,EndDate,City,State,Country,ZipCode,TimeZone,Covered,Format,IsInProgress,IsOver,Name,Par,Purse,Rounds,TournamentID,Venue,Yards,Canceled
11,2022-08-25T00:00:00,,2022-08-28,Atlanta,Georgia,United States,,America/New York,1.0,Stroke,False,False,TOUR Championship,70.0,,"[{'TournamentID': 509, 'RoundID': 15873, 'Numb...",509,East Lake Golf Club,7346.0,0.0
12,2022-08-18T00:00:00,,2022-08-21,Wilmington,Delaware,United States,,America/New York,1.0,Stroke,False,False,BMW Championship,,11500000.0,"[{'TournamentID': 508, 'RoundID': 15869, 'Numb...",508,Wilmington Country Club (South Course),,0.0
13,2022-08-11T00:00:00,,2022-08-14,Memphis,Tennessee,United States,,America/Chicago,1.0,Stroke,False,False,FedEx St. Jude Championship,70.0,11500000.0,"[{'TournamentID': 507, 'RoundID': 15865, 'Numb...",507,TPC Southwind,7237.0,0.0
14,2022-08-04T00:00:00,,2022-08-07,Greensboro,North Carolina,United States,,America/New York,1.0,Stroke,False,False,Wyndham Championship,70.0,7300000.0,"[{'TournamentID': 506, 'RoundID': 15861, 'Numb...",506,Sedgefield Country Club,7127.0,0.0
15,2022-07-28T00:00:00,,2022-07-31,Detroit,Michigan,United States,,America/New York,1.0,Stroke,False,False,Rocket Mortgage Classic,72.0,8400000.0,"[{'TournamentID': 505, 'RoundID': 15857, 'Numb...",505,Detroit Golf Club,7340.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,2014-11-06T00:00:00,,2014-11-09,,,,,,1.0,,False,True,World Golf Championships-HSBC Champions,72.0,8500000.0,"[{'TournamentID': 42, 'RoundID': 159, 'Number'...",42,Sheshan International GC,7261.0,0.0
414,2014-10-30T00:00:00,,2014-11-02,,,,,,1.0,,False,True,CIMB Classic,72.0,7000000.0,"[{'TournamentID': 50, 'RoundID': 191, 'Number'...",50,Kuala Lumpur G&CC,6985.0,0.0
415,2014-10-23T00:00:00,,2014-10-26,Sea Island,Georgia,United States,,,1.0,,False,True,The McGladrey Classic,70.0,5600000.0,"[{'TournamentID': 51, 'RoundID': 195, 'Number'...",51,Sea Island Resort (Seaside),7005.0,0.0
416,2014-10-16T00:00:00,,2014-10-19,Las Vegas,Nevada,United States,,,1.0,,False,True,Shriners Hospitals for Children Open,71.0,6200000.0,"[{'TournamentID': 36, 'RoundID': 135, 'Number'...",36,TPC Summerlin,7255.0,0.0
