# This notebook creates a dataset for min/max winning scores
- min win df score
- max win df score
- median team score (real game scores)
- 75th percentile team score (real game scores)
- number of slate games
- median df score for each player position
- 75th percentile df score for each player position
- median df score of top 50% of players for each position over the previous W weeks
- 75th percentile df score of top 50% for each position over the previous W weeks

In [24]:
# Load all daily fantasy contest data
from datetime import date
from functools import partial
import re
from typing import Optional

import pandas as pd

from fantasy_py import ContestStyle
from fantasy_py.lineup.strategy import GeneralPrizePool, FiftyFifty


# the datasets to generate, dict mapping sport to dict with keys sport, min_date, max_date, historic data filename
SPORT_CFGS = {
    'mlb': {
        'min_date': date(2019, 1, 1),
        'max_date': date(2021, 1, 1),
        'db_filename': "/home/delano/working/fantasy/mlb_hist_20082020.scored.db",
        'cost_pos_drop': {'DH', 'RP'},
        'cost_pos_rename': {'SP': 'P'},
    },
    'nfl': {
        'min_date': date(2020, 8, 1),
        'max_date': date(2021, 1, 1),
        'db_filename': "/home/delano/working/fantasy/nfl_hist_2009-2020.db",
    },
    'nba': {
        'min_date': date(2019, 8, 1),
        'max_date': date(2020, 8, 1),
        'db_filename': "/home/delano/working/fantasy/nba_hist_20082009-20192020.scored.db",
    },
    'nhl': {
        'min_date': date(2017, 8, 1),
        'max_date': date(2021, 4, 1),
        'db_filename': "/home/delano/working/fantasy/nhl_hist_20072008-20192020.scored.db",
        'cost_pos_rename': {'LW': 'W', 'RW': 'W'},
    },
}


# days to use to identify top players going into a slate
TOP_PLAYER_DAYS = 21
# players above this percentil over the last TOP_PLAYER_DAYS are considered top players
TOP_PLAYER_PERCENTILE = .70


def get_stat_names(sport, service_abbr):
    return (
        f"'{service_abbr}_score'"
        if sport != 'nfl' else
        f"'{service_abbr}_score_off','{service_abbr}_score_def'"
    )


# fanduel/draftkings/yahoo
SERVICES = [
    'draftkings',
    'fanduel',
    'yahoo',
]

STYLES = [
    ContestStyle.CLASSIC,
    ContestStyle.SHOWDOWN,
]

# GeneralPrizePool/FiftyFifty
CONTEST_TYPES = [
    FiftyFifty,
    GeneralPrizePool,
]

In [2]:
def infer_contest_style(service, title) -> ContestStyle:
    if service == 'draftkings':
        if ('Showdown' in title or
            re.match('.*.{2,3} vs .{2,3}\)', title)):
           return ContestStyle.SHOWDOWN
        return ContestStyle.CLASSIC
    if service == 'fanduel':
        if '@' in (title or ''):
            return ContestStyle.SHOWDOWN
        return ContestStyle.CLASSIC
    if service == 'yahoo':
        if (' Cup ' in title or 
            ' to 1st]' in title or 
            ' 50/50' in title or
            'QuickMatch vs ' in title or 
            'H2H vs ' in title or
            '-Team' in title or   # N-team contests are classic
            'Freeroll' in title or
            'Quadruple Up' in title or
            'Guaranteed' in title):
           return ContestStyle.CLASSIC
    raise NotImplementedError(f"Could not infer contest style for {service=} {title=}")    
    
    
def infer_contest_type(service, title) -> str:
    if service == 'draftkings':
        if re.match('.* vs\. [^)]+$', title):
            return 'H2H'
        return FiftyFifty.NAME if 'Double Up' in title else GeneralPrizePool.NAME
    if service == 'fanduel':
        if 'Head-to-head' in (title or ''):
            return 'H2H'
        if (title or '').startswith('50/50'):
            return FiftyFifty.NAME
        return GeneralPrizePool.NAME
    if service == 'yahoo':
        if (' QuickMatch vs ' in title or 
            'H2H vs ' in title):
            return 'H2H'
        if ' 50/50' in title:
            return FiftyFifty.NAME
        if (' Cup ' in title or 
            ' to 1st]' in title or 
            'Freeroll' in title or
            'Quadruple Up' in title or
            '-Team' in title or                        # multi-team games are GPP if not caught by 50/50
            title.endswith('Team Winner Takes All') or        # treat winner takes all like a gpp
            'Guaranteed' in title):
           return GeneralPrizePool.NAME
    raise NotImplementedError(f"Could not infer contest type for {service=} {title=}")
    

def get_contest_df(service, sport, style, contest_type, min_date, max_date) -> pd.DataFrame:
    contest_df = pd.read_csv(service + ".contest.csv", parse_dates=['date']) \
                   .query('sport == @sport and @min_date <= date < @max_date') \
                   [['contest_id', 'date', 'title', 'top_score', 'last_winning_score']]
    contest_df.date = contest_df.date.dt.normalize()
    contest_df = contest_df.where(contest_df.notnull(), None)

    # add style and type
    #     with pd.option_context('max_rows', 1000, 'max_colwidth', 100):
    #         display(contest_df)
    contest_df['style'] = contest_df.title.map(
        partial(infer_contest_style, service)
    )
    contest_df['type'] = contest_df.title.map(
        partial(infer_contest_type, service)
    )
    queries = []
    if style is not None:
        # print(f"Filtering for {style=}")
        queries.append('style == @style')
    if contest_type is not None:
        # print(f"Filtering for {contest_type=}")
        queries.append('type == @contest_type.NAME')
    if len(queries) > 0:
        contest_df = contest_df.query(' and '.join(queries))
    # return contest_df.set_index('contest_id')
    return contest_df


# contest_df = get_contest_df(SERVICE, SPORT, STYLE, CONTEST_TYPE, MIN_DATE, MAX_DATE)
# with pd.option_context('max_rows', 1000, 'max_columns', 100):
#    display(contest_df.sort_values(['style', 'type']))

In [3]:
SERVICE_ABBR = {
    'fanduel': 'fd',
    'draftkings': 'dk',
    'yahoo': 'y'
}


def get_draft_df(service, sport, style, min_date, max_date) -> pd.DataFrame:
    draft_df = pd.read_csv(service + ".draft.csv", parse_dates=['date']) \
                 .query('sport == @sport and @min_date <= date < @max_date')
    assert len(draft_df) > 0, "no draft data found"

    draft_df['service'] = draft_df.contest.map(lambda contest: contest.split('-', 1)[0])
    service_abbr = SERVICE_ABBR[service]
    draft_df = draft_df.query('service == @service_abbr and team_abbr.notnull()') \
        [['position', 'name', 'team_abbr', 'contest_id']]
    
    return draft_df


# draft_df = get_draft_df(SERVICE, SPORT, STYLE, MIN_DATE, MAX_DATE)
# display(draft_df)

In [4]:
from fantasy_py import FANTASY_SERVICE_DOMAIN, lineup, util


def create_team_contest_df(contest_df, draft_df, service, sport):
    service_cls = util.CLSRegistry.get_class(FANTASY_SERVICE_DOMAIN, service)
    abbr_remaps = service_cls.get_team_abbr_remapping(sport)

    # add team/lineup draft data
    team_contest_df = pd.merge(contest_df, draft_df, on='contest_id')
    team_contest_df.team_abbr = team_contest_df.team_abbr.map(
        lambda abbr: abbr_remaps.get(abbr) or abbr
    )
    
    return team_contest_df
    
    
# team_contest_df = create_team_contest_df(contest_df, draft_df, SERVICE, SPORT)
# print(f"{len(team_contest_df.contest_id.unique())} contests")
# display(team_contest_df)

In [5]:
import os


def common_title(title_series: pd.Series) -> str:
    """ the title of a contest will be the common prefix amongst all the possible contest titles """
    title_list = title_series.tolist()
    if None in title_list:
        return ""
    return os.path.commonprefix(title_list)


def create_teams_contest_df(tc_df):
    """ group contests together and create team sets used in each contest """
    tc_df = pd.DataFrame(
        tc_df.groupby(
            ['contest_id', 'date', 'style', 'type']
        ).agg(
            {'team_abbr': set,
             'title': common_title,
             'top_score': lambda score: score.mean(),
             'last_winning_score': lambda score: score.mean()}
        )
    ).reset_index()
    tc_df = tc_df.rename(columns={'team_abbr': 'teams'})
    tc_df['draft_team_count'] = tc_df.teams.map(len)
    return tc_df


# teams_contest_df = create_teams_contest_df(team_contest_df)
# display(f"{len(teams_contest_df)} team sets")
# display(teams_contest_df)

In [6]:
# load slate data from db
import sqlite3
import pandas as pd


def get_slate_df(db_filename, service, style, min_date, max_date) -> Optional[pd.DataFrame]:
    conn = sqlite3.connect(db_filename)
    sql = f"""
    select distinct daily_fantasy_slate.id as slate_id, date, 
        daily_fantasy_slate.name as slate_name, style as contest_style, abbr
    from daily_fantasy_slate 
        join daily_fantasy_cost on daily_fantasy_slate.id = daily_fantasy_cost.daily_fantasy_slate_id
        join team on team_id = team.id
    where service = '{service}' and date between '{min_date}' and '{max_date}'
    """

    if style is not None:
        sql += f" and style = '{style.name}'"

    # print(sql)
    db_df = pd.read_sql_query(sql, conn, parse_dates=['date'])
    # with pd.option_context('max_rows', 100):
    #     display(db_df)
    conn.close()
    if len(db_df) == 0:
        return None

    # get team sets
    slate_db_df = pd.DataFrame(
        db_df.groupby(
            ['slate_id', 'date', 'slate_name', 'contest_style']
        ).agg(
            {'abbr': set}
        )
    ).reset_index()
    
    try:
        slate_db_df = slate_db_df.set_index('date') \
                                 .rename(columns={'abbr': 'teams'})
    except Exception as ex:
        raise ValueError("Error processing slate db df", slate_db_df) from ex
        
    slate_db_df['team_count'] = slate_db_df.teams.map(len)
    return slate_db_df


# slate_db_df = get_slate_df(DB_FILENAME, SERVICE, STYLE, MIN_DATE, MAX_DATE)
# with pd.option_context('max_rows', 100):
#     display(slate_db_df)

In [7]:
import numpy as np
from typing import Optional


def get_slate_id(contest_row, slate_db_df) -> pd.Series:
    """ 
    guesses the db slate id contest_row
    returns - series of (slate_id, number of teams playing in slate)
    """
    try:
        date_slates = slate_db_df.loc[[contest_row.date]].sort_values('team_count')
    except KeyError as ke:
        # print(f"Key error/No slate found for {contest_row.date}")
        return pd.Series({'slate_id': None, 'team_count': None})
    try:
        slates = date_slates.query("@contest_row.teams <= teams")
    except Exception as e:
        print(f"Unhandled exception querying for teams date {contest_row.date}")
        # display(date_slates)
        raise
        
    slates_found = len(slates)
    if slates_found == 0:
        # print(f"On {contest_row.date} the {len(date_slates)} db slates don't match contest teams {contest_row.teams}. "
        #       "DB slate team sets were:")
        # with pd.option_context('max_colwidth', None):
        #     display(date_slates[['slate_name', 'teams']])
        return pd.Series({'slate_id': None, 'team_count': None})
    #     if slates_found > 1:
    #         # display(slates)
    #         slates = slates.head(1)
    #         print(f"{slates_found} slates matched contest {contest_row.date} '{contest_row.title}'. "
    #               f"Using '{slates.iloc[0].slate_name}'")
    return slates.iloc[0][['slate_id', 'team_count']]
    
# slate_ids_df = teams_contest_df.apply(get_slate_id, axis=1)
# display(slate_ids_df)

In [8]:
# slate game score info

def create_team_score_df(db_filename, slate_ids_str, top_player_percentile) -> Optional[pd.DataFrame]:
    conn = sqlite3.connect(db_filename)
    sql = f"""
    select distinct daily_fantasy_slate.id as slate_id, game.id as game_id, game.score_home, game.score_away
    from daily_fantasy_slate
        join daily_fantasy_cost on daily_fantasy_slate.id = daily_fantasy_cost.daily_fantasy_slate_id
        join game on (game.date = daily_fantasy_slate.date and
                      game.season = daily_fantasy_slate.season and 
                      (daily_fantasy_cost.team_id in (game.away_team_id, game.home_team_id)))
    where daily_fantasy_slate.id in ({slate_ids_str})
    """

    # print(sql)
    db_team_score_df = pd.read_sql_query(sql, conn, parse_dates=['date'])
    conn.close()
    # display(db_team_score_df)
    if len(db_team_score_df) == 0:
        return None

    team_score_df = db_team_score_df.melt(id_vars=['slate_id', 'game_id'], value_vars=['score_home', 'score_away']) \
              .groupby(['slate_id']) \
              .agg({'value': ['median', lambda x: np.percentile(x, top_player_percentile * 100)]})
    team_score_df.columns = ['team-med', f'team-{top_player_percentile * 100}th_pctl']
    return team_score_df


# for mlb double headers this will cause inaccuracy for players that played in both games
# slate_ids_str = ','.join(map(str, slate_ids_df.slate_id.dropna()))
# team_score_df = create_team_score_df(DB_FILENAME, slate_ids_str, TOP_PLAYER_PERCENTILE)
# display(team_score_df)

In [35]:
# get position scores

def get_exploded_pos_df(db_filename, sport, service_abbr, slate_ids_str, 
                        cost_pos_drop: Optional[set], cost_pos_rename: Optional[dict]):
    conn = sqlite3.connect(db_filename)
    stat_names = get_stat_names(sport, service_abbr)

    # for mlb double headers this will cause inaccuracy for players that played in both games
    sql = f"""
    select daily_fantasy_slate.id as slate_id, positions as cost_positions, player_position.abbr as stat_position, 
        value as score, daily_fantasy_cost.team_id, daily_fantasy_cost.player_id
    from daily_fantasy_slate
        join daily_fantasy_cost on daily_fantasy_slate.id = daily_fantasy_cost.daily_fantasy_slate_id
        join game on (game.date = daily_fantasy_slate.date and
                      game.season = daily_fantasy_slate.season and 
                      (daily_fantasy_cost.team_id in (game.away_team_id, game.home_team_id)))
        join calculation_datum on (calculation_datum.game_id = game.id and 
                                   calculation_datum.player_id is daily_fantasy_cost.player_id and
                                   calculation_datum.team_id = daily_fantasy_cost.team_id)
        join statistic on calculation_datum.statistic_id = statistic.id
        join player on daily_fantasy_cost.player_id = player.id
        join player_position on player.player_position_id = player_position.id
    where daily_fantasy_slate.id in ({slate_ids_str}) and
        statistic.name in ({stat_names})
    """
    # print(sql)

    db_df = pd.read_sql_query(sql, conn, parse_dates=['date'])
    conn.close()
    
    db_df['position'] = db_df.apply(
        lambda row: row.stat_position if 'Unknown' in row.cost_positions else row.cost_positions,
        axis=1
    )

    db_exploded_pos_df = db_df.assign(position=db_df.position.str.split('/')) \
                 .explode('position')
    
    if cost_pos_drop is not None:
        db_exploded_pos_df = db_exploded_pos_df.query('position not in @cost_pos_drop')
    if cost_pos_rename is not None:
        for old_pos, new_pos in cost_pos_rename.items():
            db_exploded_pos_df.loc[db_exploded_pos_df.position == old_pos, 'position'] = new_pos
    return db_exploded_pos_df


def get_position_scores(db_exploded_pos_df, top_player_percentile):
    db_pos_scores_df = db_exploded_pos_df[['slate_id', 'position', 'score']] \
                 .groupby(['slate_id', 'position']) \
                 .agg(['median', lambda x: np.percentile(x, top_player_percentile * 100)]) 
    db_pos_scores_df.columns = ['med-dfs', f'{top_player_percentile * 100}th-pctl-dfs']
    db_pos_scores_df = db_pos_scores_df.reset_index(level='position') \
                 .pivot(columns='position', values=['med-dfs', f'{top_player_percentile * 100}th-pctl-dfs'])
    return db_pos_scores_df


# db_exploded_pos_df = get_exploded_pos_df(DB_FILENAME, SPORT, SERVICE_ABBR[SERVICE], slate_ids_str)
# display(db_exploded_pos_df)
# db_pos_scores_df = get_position_scores(db_exploded_pos_df, TOP_PLAYER_PERCENTILE)
# display(db_pos_scores_df)    

In [10]:
def get_player_scores(db_filename, db_exploded_pos_df,
                      sport, service_abbr, top_player_days, min_date, max_date):
    """ Get top player scores (e.g. players that are likely to be highly drafted) """
    conn = sqlite3.connect(db_filename)
    stat_names = get_stat_names(sport, service_abbr)

    sql = f"""
    select game.date, calculation_datum.player_id, calculation_datum.team_id, calculation_datum.value as score 
    from game
        join calculation_datum on calculation_datum.game_id = game.id
        join statistic on calculation_datum.statistic_id = statistic.id
    where statistic.name in ({stat_names}) 
        and date between date('{min_date}', '-{top_player_days} days') and '{max_date}'
    """
    # print(sql)
    db_df = pd.read_sql_query(sql, conn, parse_dates=['date'])
    conn.close()
    # display(db_df)

    db_filtered_df = db_df.query(
        '(player_id in @db_exploded_pos_df.player_id) '
        'or (player_id.isnull() and team_id in @db_exploded_pos_df.team_id)'
    )
    return db_filtered_df

    
# db_filtered_df = get_player_scores(DB_FILENAME, SPORT, SERVICE_ABBR[SERVICE], TOP_PLAYER_DAYS, MIN_DATE, MAX_DATE)
# display(db_filtered_df)

In [11]:
def create_predict_df(teams_contest_df, slate_ids_df, team_score_df, db_pos_scores_df):
    predict_df = pd.concat(
        [teams_contest_df[['date', 'style', 'type', 'top_score', 'last_winning_score']],
         slate_ids_df],
        axis='columns',
    ).join(team_score_df, on='slate_id') \
     .join(db_pos_scores_df, on='slate_id')
    return predict_df


# predict_df = create_predict_df(teams_contest_df, slate_ids_df, team_score_df, db_pos_scores_df)
# with pd.option_context('max_columns', 100):
#     display(predict_df)
    
    
# filename = f"{SPORT}-{SERVICE}-{STYLE.name}-{CONTEST_TYPE.NAME}.csv"
# print(f"Writing data to file '{filename}'")
# predict_df.to_csv(filename, index=False)

In [37]:
def generate_dataset(sport, cfg, service, style, contest_type) -> pd.DataFrame:
    filename = f"{sport}-{service}-{style.name}-{contest_type.NAME}.csv"
    print(f"Creating data for file '{filename}'")

    db_filename = cfg['db_filename']
    min_date = cfg['min_date']
    max_date = cfg['max_date']
    
    contest_df = get_contest_df(service, sport, style, contest_type, min_date, max_date)

    draft_df = get_draft_df(service, sport, style, min_date, max_date)
    # display(draft_df)

    team_contest_df = create_team_contest_df(contest_df, draft_df, service, sport)
    # print(f"{len(team_contest_df.contest_id.unique())} contests")
    # display(team_contest_df)

    teams_contest_df = create_teams_contest_df(team_contest_df)
    # display(f"{len(teams_contest_df)} team sets")
    # display(teams_contest_df)

    slate_db_df = get_slate_df(db_filename, service, style, min_date, max_date)
    if slate_db_df is None:
        raise ValueError("No slates found")
    # with pd.option_context('max_rows', 100):
    #     display(slate_db_df)
    
    slate_ids_df = teams_contest_df.apply(get_slate_id, axis=1, args=(slate_db_df, ))
    # display(slate_ids_df)

    if len(slate_ids_df) == 0:
        raise ValueError("No slates ids found (A)")
    
    try:
        slate_ids_str = ','.join(map(str, slate_ids_df.slate_id.dropna()))
    except Exception as ex:
        raise ValueError("Something wrong with slate_ids_df", slate_ids_df) from ex
        
    if len(slate_ids_str) == 0:
        raise ValueError("No slate ids found (B)")
    team_score_df = create_team_score_df(db_filename, slate_ids_str, TOP_PLAYER_PERCENTILE)
    # display(team_score_df)

    db_exploded_pos_df = get_exploded_pos_df(
        db_filename, sport, SERVICE_ABBR[service], slate_ids_str,
        cfg.get('cost_pos_drop'), cfg.get('cost_pos_rename'),
    )
    # display(db_exploded_pos_df)
    db_pos_scores_df = get_position_scores(db_exploded_pos_df, TOP_PLAYER_PERCENTILE)
    # display(db_pos_scores_df)    

    db_filtered_df = get_player_scores(db_filename, db_exploded_pos_df,
                                       sport, SERVICE_ABBR[service], TOP_PLAYER_DAYS, min_date, max_date)
    # display(db_filtered_df)

    predict_df = create_predict_df(teams_contest_df, slate_ids_df, team_score_df, db_pos_scores_df)
    # with pd.option_context('max_columns', 100):
    #     display(predict_df)

    filename = f"{sport}-{service}-{style.name}-{contest_type.NAME}.csv"
    print(f"Writing data to file '{filename}'")
    predict_df.to_csv(filename, index=False)
    return predict_df

In [39]:
for sport in SPORT_CFGS:
    for service in SERVICES:
        for style in STYLES:
            for contest_type in CONTEST_TYPES:
                print(f"Processing {sport}, {service}, {style}, {contest_type}")
                try:
                    generate_dataset(sport, SPORT_CFGS[sport], service, style, contest_type)
                except ValueError as ex:
                    failure = ex
                    print(f"Error for {sport}, {service}, {style}, {contest_type}: {ex}")
                    
# try:                
#     sport = 'nhl'
#     df = generate_dataset(sport, SPORT_CFGS[sport], 'draftkings', ContestStyle.CLASSIC, GeneralPrizePool)
#     with pd.option_context('max_rows', 1000, 'max_columns', 100):
#         display(df)
# except ValueError as ex:
#     display(ex.args)
#     raise

Processing mlb, draftkings, classic, <class 'fantasy_py.lineup.strategy.bet_lineup.FiftyFifty'>
Creating data for file 'mlb-draftkings-CLASSIC-FIFTY_FIFTY.csv'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


Writing data to file 'mlb-draftkings-CLASSIC-FIFTY_FIFTY.csv'
Processing mlb, draftkings, classic, <class 'fantasy_py.lineup.strategy.bet_lineup.GeneralPrizePool'>
Creating data for file 'mlb-draftkings-CLASSIC-GPP.csv'
Writing data to file 'mlb-draftkings-CLASSIC-GPP.csv'
Processing mlb, draftkings, showdown, <class 'fantasy_py.lineup.strategy.bet_lineup.FiftyFifty'>
Creating data for file 'mlb-draftkings-SHOWDOWN-FIFTY_FIFTY.csv'
Writing data to file 'mlb-draftkings-SHOWDOWN-FIFTY_FIFTY.csv'
Processing mlb, draftkings, showdown, <class 'fantasy_py.lineup.strategy.bet_lineup.GeneralPrizePool'>
Creating data for file 'mlb-draftkings-SHOWDOWN-GPP.csv'
Writing data to file 'mlb-draftkings-SHOWDOWN-GPP.csv'
Processing mlb, fanduel, classic, <class 'fantasy_py.lineup.strategy.bet_lineup.FiftyFifty'>
Creating data for file 'mlb-fanduel-CLASSIC-FIFTY_FIFTY.csv'
Writing data to file 'mlb-fanduel-CLASSIC-FIFTY_FIFTY.csv'
Processing mlb, fanduel, classic, <class 'fantasy_py.lineup.strategy.bet_