# This notebook creates a dataset for min/max winning scores
- min win df score
- max win df score
- median team score (real game scores)
- 75th percentile team score (real game scores)
- number of slate games
- median df score for each player position
- 75th percentile df score for each player position
- median df score of top 50% of players for each position over the previous W weeks
- 75th percentile df score of top 50% for each position over the previous W weeks

In [37]:
# Load all daily fantasy contest data
from datetime import date
from functools import partial
import re
from typing import Optional, Literal, Union

import pandas as pd

from fantasy_py import ContestStyle
from fantasy_py.lineup.strategy import GeneralPrizePool, FiftyFifty


# The datasets to generate: a dict mapping sport abbreviation to that sport's configuration dict.
# Keys found in the configuration dicts below:
#   db_filename     - path to the sqlite database holding the sport's historic data
#   min_date        - start of the date range; either a single date, or (see 'nhl') a dict
#                     mapping service name to a date
#                     NOTE(review): the None key looks like the default for all other services - confirm
#   max_date        - end of the date range
#   cost_pos_drop   - (optional) set of cost positions
#                     presumably passed to get_exploded_pos_df as the positions to discard - confirm
#   cost_pos_rename - (optional) dict of cost position -> replacement position name
#                     presumably passed to get_exploded_pos_df - confirm
#   services        - (optional) list of service names
#                     NOTE(review): not read by any code visible in this section
SPORT_CFGS = {
    'mlb': {
        'min_date': date(2019, 1, 1),
        'max_date': date(2021, 1, 1),
        'db_filename': "/home/delano/working/fantasy/mlb_hist_20082020.scored.db",
        'cost_pos_drop': {'DH', 'RP'},
        'cost_pos_rename': {'SP': 'P'},
    },
    'nfl': {
        'min_date': date(2020, 8, 1),
        'max_date': date(2021, 1, 1),
        'db_filename': "/home/delano/working/fantasy/nfl_hist_2009-2020.db",
    },
    'nba': {
        'min_date': date(2019, 8, 1),
        'max_date': date(2020, 8, 1),
        'db_filename': "/home/delano/working/fantasy/nba_hist_20082009-20192020.scored.db",
    },
    'nhl': {
        'min_date': {'draftkings': date(2019, 10, 9),   # dk changed scoring formula for nhl
                     None: date(2017, 8, 1)},
        'max_date': date(2021, 4, 1),
        'db_filename': "/home/delano/working/fantasy/nhl_hist_20072008-20192020.scored.db",
        'cost_pos_rename': {'LW': 'W', 'RW': 'W'},
    },
    'lol': {
        'db_filename': "/home/delano/working/fantasy/lol_hist_2014-2020.scored.db",
        'min_date': date(2020, 1, 1),
        'max_date': date(2021, 1, 1),
        'services': ['draftkings', 'fanduel'],
    }
}


# days to use to identify top players going into a slate
TOP_PLAYER_DAYS = 21
# players above this percentile over the last TOP_PLAYER_DAYS days are considered top players
TOP_PLAYER_PERCENTILE = .70


def get_stat_names(sport, service_abbr: Literal['dk', 'fd', 'y'], as_str=False) -> Union[str, list[str]]:
    """
    Look up the fantasy score statistic names for a sport on a service.

    sport - sport abbreviation. 'nfl' has separate offense and defense score stats,
        'lol' has a match score stat, every other sport has a single score stat
    service_abbr - service abbreviation, used as the prefix of every stat name
    as_str - if true return the names as one string of comma separated, single quoted
        names (usable inside an sql 'in (...)' clause), otherwise return a list of names
    """
    suffixes_by_sport = {
        'nfl': ('score_off', 'score_def'),
        'lol': ('match_score',),
    }
    names = [
        f'{service_abbr}_{suffix}'
        for suffix in suffixes_by_sport.get(sport, ('score',))
    ]

    if not as_str:
        return names
    return "'" + "','".join(names) + "'"

# daily fantasy service names: fanduel/draftkings/yahoo
# NOTE(review): presumably the services datasets are generated for - not referenced in the visible code
SERVICES = [
    'draftkings',
    'fanduel',
    'yahoo',
]

# contest styles (the values returned by infer_contest_style)
STYLES = [
    ContestStyle.CLASSIC,
    ContestStyle.SHOWDOWN,
]

# contest type classes: GeneralPrizePool/FiftyFifty
# their NAME attributes are the values infer_contest_type returns for non head-to-head contests
CONTEST_TYPES = [
    FiftyFifty,
    GeneralPrizePool,
]

In [24]:
def infer_contest_style(service, title) -> ContestStyle:
    """
    Infer the style (classic or showdown) of a contest from its title.

    service - name of the service running the contest: 'draftkings', 'fanduel' or 'yahoo'
    title - the contest title. A missing title (None, NaN or any other non-string) is
        treated as an empty title for every service, so it never matches a title pattern.

    returns - the inferred ContestStyle
    raises - NotImplementedError if the service is not supported, or if it is a yahoo
        contest whose title matches none of the known classic naming patterns
    """
    text = title if isinstance(title, str) else ''

    if service == 'draftkings':
        # showdown contests either say so or name a single matchup, e.g. '(ABC vs XYZ)'
        if 'Showdown' in text or re.match(r'.*.{2,3} vs .{2,3}\)', text):
            return ContestStyle.SHOWDOWN
        return ContestStyle.CLASSIC

    if service == 'fanduel':
        if '@' in text:
            return ContestStyle.SHOWDOWN
        return ContestStyle.CLASSIC

    if service == 'yahoo':
        classic_markers = (
            ' Cup ',
            ' to 1st]',
            ' 50/50',
            'QuickMatch vs ',
            'H2H vs ',
            '-Team',   # N-team contests are classic
            'Freeroll',
            'Quadruple Up',
            'Guaranteed',
        )
        if any(marker in text for marker in classic_markers):
            return ContestStyle.CLASSIC

    raise NotImplementedError(f"Could not infer contest style for {service=} {title=}")
    
    
def infer_contest_type(service, title) -> str:
    """
    Infer the type of a contest from its title.

    service - name of the service running the contest: 'draftkings', 'fanduel' or 'yahoo'
    title - the contest title. A missing title (None, NaN or any other non-string) is
        treated as an empty title for every service, so it never matches a title pattern.

    returns - 'H2H' for head-to-head contests, otherwise FiftyFifty.NAME or
        GeneralPrizePool.NAME
    raises - NotImplementedError if the service is not supported, or if it is a yahoo
        contest whose title matches none of the known naming patterns
    """
    text = title if isinstance(title, str) else ''

    if service == 'draftkings':
        # head-to-head titles end with 'vs. <opponent>'
        if re.match(r'.* vs\. [^)]+$', text):
            return 'H2H'
        return FiftyFifty.NAME if 'Double Up' in text else GeneralPrizePool.NAME

    if service == 'fanduel':
        if 'Head-to-head' in text:
            return 'H2H'
        if text.startswith('50/50'):
            return FiftyFifty.NAME
        return GeneralPrizePool.NAME

    if service == 'yahoo':
        # same markers as infer_contest_style, so every title with a head-to-head
        # style also gets a type (no leading space required before 'QuickMatch')
        if 'QuickMatch vs ' in text or 'H2H vs ' in text:
            return 'H2H'
        if ' 50/50' in text:
            return FiftyFifty.NAME
        gpp_markers = (
            ' Cup ',
            ' to 1st]',
            'Freeroll',
            'Quadruple Up',
            '-Team',        # multi-team games are GPP if not caught by 50/50
            'Guaranteed',
        )
        if (any(marker in text for marker in gpp_markers) or
                text.endswith('Team Winner Takes All')):   # treat winner takes all like a gpp
            return GeneralPrizePool.NAME

    raise NotImplementedError(f"Could not infer contest type for {service=} {title=}")
    

def add_bet_links(service, contest_df: pd.DataFrame) -> pd.DataFrame:
    """
    Placeholder that is not implemented: both arguments are ignored and None is returned.
    """
    return None
    
    
def get_contest_df(service, sport, style, contest_type, min_date, max_date) -> pd.DataFrame:
    """
    Build the contest dataframe for a service from '<service>.contest.csv' and
    '<service>.betting.csv'.

    sport - only contests for this sport are kept
    style - if not None, only contests whose inferred style equals this are kept
    contest_type - if not None, only contests whose inferred type equals
        contest_type.NAME are kept
    min_date - inclusive start of the contest date range
    max_date - exclusive end of the contest date range

    returns - dataframe with contest id, date, title, top score, last winning score,
        entries, inferred style and type, plus the betting link (left joined on contest_id)
    """
    kept_columns = ['contest_id', 'date', 'title', 'top_score', 'last_winning_score', 'entries']
    all_contests_df = pd.read_csv(service + ".contest.csv", parse_dates=['date'])
    contest_df = all_contests_df.query('sport == @sport and @min_date <= date < @max_date')
    contest_df = contest_df[kept_columns]
    contest_df['date'] = contest_df['date'].dt.normalize()
    # replace missing values with None
    contest_df = contest_df.where(contest_df.notnull(), None)

    # infer style and type from each contest title
    contest_df['style'] = contest_df['title'].map(
        lambda title: infer_contest_style(service, title)
    )
    contest_df['type'] = contest_df['title'].map(
        lambda title: infer_contest_type(service, title)
    )

    # only filter on the criteria that were requested
    filters = []
    if style is not None:
        filters.append('style == @style')
    if contest_type is not None:
        filters.append('type == @contest_type.NAME')
    if filters:
        contest_df = contest_df.query(' and '.join(filters))

    # one betting link per contest
    betting_df = pd.read_csv(service + ".betting.csv")
    link_df = betting_df.drop_duplicates('contest_id').set_index('contest_id')[['link']]
    return contest_df.merge(link_df, how='left', on='contest_id')


# contest_df = get_contest_df("draftkings", "nhl", ContestStyle.CLASSIC, FiftyFifty, date(2019, 1, 1), date(2020, 1, 1))
# with pd.option_context('max_rows', 1000, 'max_columns', 100, 'max_colwidth', 99999):
#    display(contest_df.sort_values(['style', 'type']))

In [25]:
# Maps a service name to its abbreviation. The abbreviation is used as the prefix of the
# service's score statistic names (see get_stat_names) and is compared against the part of
# the draft data's 'contest' value that precedes the first '-' (see get_draft_df).
SERVICE_ABBR = {
    'fanduel': 'fd',
    'draftkings': 'dk',
    'yahoo': 'y'
}


def get_draft_df(service, sport, style, min_date, max_date) -> pd.DataFrame:
    """
    Load the drafted players for a service's contests from '<service>.draft.csv'.

    sport - only draft rows for this sport are kept
    style - only used in the failure message when no draft data is found
    min_date - inclusive start of the date range
    max_date - exclusive end of the date range

    returns - dataframe of position, name, upper cased team_abbr and contest_id for
        rows that belong to the service and have a team abbreviation
    """
    all_drafts_df = pd.read_csv(service + ".draft.csv", parse_dates=['date'])
    draft_df = all_drafts_df.query('sport == @sport and @min_date <= date < @max_date')
    assert len(draft_df) > 0, (
        f"no draft data found for {sport=}, {service=}, {style=}, {min_date=}, {max_date=}"
    )

    # the service abbreviation is the part of the contest value before the first '-'
    draft_df['service'] = [
        contest.partition('-')[0] for contest in draft_df['contest']
    ]
    draft_df['team_abbr'] = draft_df['team_abbr'].str.upper()

    service_abbr = SERVICE_ABBR[service]
    service_draft_df = draft_df.query('service == @service_abbr and team_abbr.notnull()')
    return service_draft_df[['position', 'name', 'team_abbr', 'contest_id']]


# draft_df = get_draft_df(SERVICE, SPORT, STYLE, MIN_DATE, MAX_DATE)
# display(draft_df)

In [26]:
from fantasy_py import FANTASY_SERVICE_DOMAIN, lineup, util


def create_team_contest_df(contest_df, draft_df, service, sport):
    """
    Join contests to their draft rows and apply the service's team abbreviation remapping.

    contest_df, draft_df - merged on 'contest_id' (pandas default inner merge)
    service - service name used to look up the service class
    sport - sport whose team abbreviation remapping is requested from the service class

    returns - the merged dataframe; each team_abbr is replaced by its remapped value
        when the remapping has a truthy entry for it
    """
    remapping = util.CLSRegistry.get_class(FANTASY_SERVICE_DOMAIN, service) \
                    .get_team_abbr_remapping(sport)

    def _remap(abbr):
        # keep the original abbreviation when there is no (truthy) remapped value
        return remapping.get(abbr) or abbr

    merged_df = contest_df.merge(draft_df, on='contest_id')
    merged_df['team_abbr'] = merged_df['team_abbr'].map(_remap)
    return merged_df
    
    
# team_contest_df = create_team_contest_df(contest_df, draft_df, SERVICE, SPORT)
# print(f"{len(team_contest_df.contest_id.unique())} contests")
# display(team_contest_df)

In [27]:
import os


def common_title(title_series: pd.Series) -> str:
    """
    the title of a contest will be the common prefix amongst all the possible contest titles

    title_series - the candidate titles

    returns - the longest prefix shared by every title, or "" if any title is missing
        (None or NaN) or if there are no titles
    """
    # isna() catches NaN as well as None; a NaN title would make commonprefix raise TypeError
    if title_series.isna().any():
        return ""
    return os.path.commonprefix(title_series.tolist())


def create_teams_contest_df(tc_df):
    """
    Collapse the per-team contest rows into one row per contest.

    Rows are grouped by contest id, date, style, type, link and entries. For each group
    the team abbreviations become a set (column 'teams'), the title becomes the common
    title of the group and the top / last winning scores are averaged. The number of
    teams in each set is added as 'draft_team_count'.
    """
    group_keys = ['contest_id', 'date', 'style', 'type', 'link', 'entries']

    def _mean(scores):
        return scores.mean()

    aggregations = {
        'team_abbr': set,
        'title': common_title,
        'top_score': _mean,
        'last_winning_score': _mean,
    }
    grouped_df = tc_df.groupby(group_keys).agg(aggregations)

    teams_df = pd.DataFrame(grouped_df).reset_index() \
                                       .rename(columns={'team_abbr': 'teams'})
    teams_df['draft_team_count'] = teams_df['teams'].map(len)
    return teams_df


# teams_contest_df = create_teams_contest_df(team_contest_df)
# display(f"{len(teams_contest_df)} team sets")
# display(teams_contest_df)

In [28]:
# load slate data from db
import sqlite3
import pandas as pd


def get_slate_df(db_filename, service, style, min_date, max_date) -> Optional[pd.DataFrame]:
    """
    Load the slates of a service, with the set of teams playing in each, from the database.

    db_filename - path to the sqlite database
    service - only slates for this service are returned
    style - if not None, only slates whose style equals style.name are returned
    min_date - inclusive start of the slate date range
    max_date - exclusive end of the slate date range

    returns - None if no slates were found, otherwise a dataframe indexed by slate date
        with columns slate_id, slate_name, contest_style, teams (set of team
        abbreviations) and team_count (size of that set)
    raises - ValueError if the grouped slate data can't be indexed by date
    """
    # values are bound as query parameters instead of being formatted into the sql text
    sql = """
    select distinct daily_fantasy_slate.id as slate_id, date,
        daily_fantasy_slate.name as slate_name, style as contest_style, abbr
    from daily_fantasy_slate
        join daily_fantasy_cost on daily_fantasy_slate.id = daily_fantasy_cost.daily_fantasy_slate_id
        join team on team_id = team.id
    where service = ? and date between ? and date(?, '-1 days')
    """
    params = [service, str(min_date), str(max_date)]

    if style is not None:
        sql += " and style = ?"
        params.append(style.name)

    conn = sqlite3.connect(db_filename)
    try:
        db_df = pd.read_sql_query(sql, conn, params=params, parse_dates=['date'])
    finally:
        # always release the connection, even if the query fails
        conn.close()

    if len(db_df) == 0:
        return None

    # get team sets
    slate_db_df = pd.DataFrame(
        db_df.groupby(
            ['slate_id', 'date', 'slate_name', 'contest_style']
        ).agg(
            {'abbr': set}
        )
    ).reset_index()

    try:
        slate_db_df = slate_db_df.set_index('date') \
                                 .rename(columns={'abbr': 'teams'})
    except Exception as ex:
        raise ValueError("Error processing slate db df", slate_db_df) from ex

    slate_db_df['team_count'] = slate_db_df.teams.map(len)
    return slate_db_df


# slate_db_df = get_slate_df(DB_FILENAME, SERVICE, STYLE, MIN_DATE, MAX_DATE)
# with pd.option_context('max_rows', 100):
#     display(slate_db_df)

In [29]:
import numpy as np
from typing import Optional


def get_slate_id(contest_row, slate_db_df) -> pd.Series:
    """
    guess which db slate a contest was played on

    contest_row - contest with a 'date' and the set of 'teams' drafted in the contest
    slate_db_df - slates indexed by date, with 'teams', 'team_count' and 'slate_id' columns

    returns - series of (slate_id, team_count) for the slate with the fewest teams among
        the slates on the contest date selected by the teams query; both values are None
        if there is no slate on that date or no slate is selected
    """
    def _no_slate() -> pd.Series:
        return pd.Series({'slate_id': None, 'team_count': None})

    try:
        # slates on the contest date, smallest first
        candidate_slates = slate_db_df.loc[[contest_row.date]].sort_values('team_count')
    except KeyError:
        return _no_slate()

    try:
        matching_slates = candidate_slates.query("@contest_row.teams <= teams")
    except Exception:
        print(f"Unhandled exception querying for teams date {contest_row.date}")
        raise

    if len(matching_slates) == 0:
        return _no_slate()

    # when several slates match use the first, i.e. the one with the fewest teams
    return matching_slates.iloc[0][['slate_id', 'team_count']]
    
# slate_ids_df = teams_contest_df.apply(get_slate_id, axis=1)
# display(slate_ids_df)

In [30]:
# slate game score info

def create_team_score_df(db_filename, slate_ids_str, top_player_percentile) -> Optional[pd.DataFrame]:
    """
    Summarize the real game scores of the teams playing in each slate.

    db_filename - path to the sqlite database
    slate_ids_str - comma separated slate ids, inserted as-is into the sql 'in (...)' clause
    top_player_percentile - fraction (e.g. .7) of the percentile to calculate

    returns - None if no games were found, otherwise a dataframe indexed by slate_id with
        the median ('team-med') and the requested percentile
        ('team-<top_player_percentile * 100>th_pctl') of all home and away scores
        of the slate's games
    """
    # games have a date equal to the slate date or a datetime prior to 6am on the following date
    sql = f"""
    select distinct daily_fantasy_slate.id as slate_id, game.id as game_id,
           game.score_home, game.score_away
    from daily_fantasy_slate
        join daily_fantasy_cost on daily_fantasy_slate.id = daily_fantasy_cost.daily_fantasy_slate_id
        join game on ((game.date = daily_fantasy_slate.date or
                       game.dt between daily_fantasy_slate.date and datetime(daily_fantasy_slate.date, '+1 days', '+6 hours')) and
                      game.season = daily_fantasy_slate.season and
                      (daily_fantasy_cost.team_id in (game.away_team_id, game.home_team_id)))
    where daily_fantasy_slate.id in ({slate_ids_str})
    """

    conn = sqlite3.connect(db_filename)
    try:
        db_team_score_df = pd.read_sql_query(sql, conn, parse_dates=['date'])
    finally:
        # always release the connection, even if the query fails
        conn.close()

    if len(db_team_score_df) == 0:
        return None

    # one row per (game, home/away) score, then summarize the scores of each slate
    team_score_df = db_team_score_df.melt(id_vars=['slate_id', 'game_id'], value_vars=['score_home', 'score_away']) \
              .groupby(['slate_id']) \
              .agg({'value': ['median', lambda x: np.percentile(x, top_player_percentile * 100)]})
    team_score_df.columns = ['team-med', f'team-{top_player_percentile * 100}th_pctl']
    return team_score_df


# for mlb double headers this will cause inaccuracy for players that played in both games
# slate_ids_str = ','.join(map(str, slate_ids_df.slate_id.dropna()))
# team_score_df = create_team_score_df(DB_FILENAME, slate_ids_str, TOP_PLAYER_PERCENTILE)
# display(team_score_df)

In [31]:
# get position scores

def get_exploded_pos_df(db_filename, sport, service_abbr, slate_ids_str, 
                        cost_pos_drop: Optional[set], cost_pos_rename: Optional[dict]) -> Optional[pd.DataFrame]:
    """
    Load the fantasy scores of the players/teams in the requested slates, one row per
    position the player/team is eligible for.

    db_filename - path to the sqlite database
    sport, service_abbr - select the score statistic names (see get_stat_names)
    slate_ids_str - comma separated slate ids, inserted as-is into the sql 'in (...)' clause
    cost_pos_drop - if not None, rows whose position is in this set are dropped
    cost_pos_rename - if not None, dict of position -> new position name, applied in
        dict order after the drop

    returns - None if no scores were found, otherwise a dataframe with slate_id,
        cost_positions, stat_position, score, team_id, player_id and position. position
        comes from the '/' separated cost positions (one row per position), or from the
        stat position when the cost positions are None.
    """
    stat_names = get_stat_names(sport, service_abbr, as_str=True)

    # for mlb double headers this query will cause inaccuracy for players that played in both games
    # games have a date equal to the slate date or must have a datetime starting prior to 6am on the following date
    sql = f"""
    select daily_fantasy_slate.id as slate_id, positions as cost_positions, 
        player_position.abbr as stat_position, 
        value as score, daily_fantasy_cost.team_id, daily_fantasy_cost.player_id
    from daily_fantasy_slate
        join daily_fantasy_cost on 
           daily_fantasy_slate.id = daily_fantasy_cost.daily_fantasy_slate_id
        join game on (
           (game.date = daily_fantasy_slate.date or 
            game.dt between daily_fantasy_slate.date and datetime(daily_fantasy_slate.date, '+1 days', '+6 hours')) and
           game.season = daily_fantasy_slate.season and 
           (daily_fantasy_cost.team_id in (game.away_team_id, game.home_team_id))
        )
        join calculation_datum on (
            calculation_datum.game_id = game.id and 
            calculation_datum.player_id is daily_fantasy_cost.player_id and
            calculation_datum.team_id = daily_fantasy_cost.team_id
        )
        join statistic on calculation_datum.statistic_id = statistic.id
        join player on daily_fantasy_cost.player_id = player.id
        join player_position on player.player_position_id = player_position.id
    where daily_fantasy_slate.id in ({slate_ids_str}) and
        statistic.name in ({stat_names})
    """

    conn = sqlite3.connect(db_filename)
    try:
        db_df = pd.read_sql_query(sql, conn, parse_dates=['date'])
    finally:
        # always release the connection, even if the query fails
        conn.close()

    if len(db_df) == 0:
        return None

    # TODO: only need to test for 'Unknown' so long as it is still stored in DB as a cost position value
    # fall back to the stat position when there are no cost positions
    db_df['position'] = db_df.apply(
        lambda row: row.stat_position if row.cost_positions is None else row.cost_positions,
        axis=1
    )

    # one row per position
    db_exploded_pos_df = db_df.assign(position=db_df.position.str.split('/')) \
                              .explode('position')

    if cost_pos_drop is not None:
        db_exploded_pos_df = db_exploded_pos_df.query('position not in @cost_pos_drop')
    if cost_pos_rename is not None:
        # work on an explicit copy so the assignments below never target a view of
        # the filtered frame (avoids pandas' chained assignment warning)
        db_exploded_pos_df = db_exploded_pos_df.copy()
        for old_pos, new_pos in cost_pos_rename.items():
            db_exploded_pos_df.loc[db_exploded_pos_df.position == old_pos, 'position'] = new_pos
    return db_exploded_pos_df


def get_position_scores(db_exploded_pos_df, top_player_percentile):
    """
    Summarize the fantasy scores of every position in every slate.

    db_exploded_pos_df - dataframe with 'slate_id', 'position' and 'score' columns
    top_player_percentile - fraction (e.g. .7) of the percentile to calculate

    returns - dataframe indexed by slate_id whose columns are (statistic, position)
        pairs; the statistics are 'med-dfs' (median score) and
        '<top_player_percentile * 100>th-pctl-dfs' (percentile score)
    """
    median_col = 'med-dfs'
    pctl_col = f'{top_player_percentile * 100}th-pctl-dfs'

    def _percentile(scores):
        return np.percentile(scores, top_player_percentile * 100)

    scores_df = db_exploded_pos_df[['slate_id', 'position', 'score']]
    summary_df = scores_df.groupby(['slate_id', 'position']).agg(['median', _percentile])
    summary_df.columns = [median_col, pctl_col]

    # move position out of the index and spread it across the columns
    by_slate_df = summary_df.reset_index(level='position')
    return by_slate_df.pivot(columns='position', values=[median_col, pctl_col])


# SPORT = 'lol'
# SERVICE = 'draftkings'

# db_exploded_pos_df = get_exploded_pos_df(DB_FILENAME, SPORT, SERVICE_ABBR[SERVICE], slate_ids_str)
# display(db_exploded_pos_df)
# db_pos_scores_df = get_position_scores(db_exploded_pos_df, TOP_PLAYER_PERCENTILE)
# display(db_pos_scores_df)    

In [32]:
def get_player_scores(db_filename, db_exploded_pos_df,
                      sport, service_abbr, top_player_days, min_date, max_date):
    """
    Get top player scores (e.g. players that are likely to be highly drafted)

    db_filename - path to the sqlite database
    db_exploded_pos_df - dataframe whose player_id and team_id columns select the
        players/teams to return scores for
    sport, service_abbr - select the score statistic names (see get_stat_names)
    top_player_days - number of days before min_date to start loading scores from
    min_date, max_date - scores are loaded for game dates from top_player_days days
        before min_date up to the day before max_date

    returns - dataframe of date, player_id, team_id and score, limited to rows whose
        player_id is in db_exploded_pos_df, or that have no player_id and a team_id
        that is in db_exploded_pos_df
    """
    stat_names = get_stat_names(sport, service_abbr, as_str=True)

    sql = f"""
    select game.date, calculation_datum.player_id, calculation_datum.team_id, calculation_datum.value as score 
    from game
        join calculation_datum on calculation_datum.game_id = game.id
        join statistic on calculation_datum.statistic_id = statistic.id
    where statistic.name in ({stat_names}) 
        and date between date('{min_date}', '-{top_player_days} days') and date('{max_date}', '-1 days')
    """

    conn = sqlite3.connect(db_filename)
    try:
        db_df = pd.read_sql_query(sql, conn, parse_dates=['date'])
    finally:
        # always release the connection, even if the query fails
        conn.close()

    db_filtered_df = db_df.query(
        '(player_id in @db_exploded_pos_df.player_id) '
        'or (player_id.isnull() and team_id in @db_exploded_pos_df.team_id)'
    )
    return db_filtered_df

    
# db_filtered_df = get_player_scores(DB_FILENAME, SPORT, SERVICE_ABBR[SERVICE], TOP_PLAYER_DAYS, MIN_DATE, MAX_DATE)
# display(db_filtered_df)

In [33]:
from argparse import Namespace
import math
from typing import Optional

from fantasy_py import db, FANTASY_SERVICE_DOMAIN
from fantasy_py.lineup import gen_lineups, LineupSolutions, MultipleScoreError
from fantasy_py.lineup.knapsack import MixedIntegerKnapsackSolver
from fantasy_py.lineup.do_gen_lineup import lineup_plan_helper
from fantasy_py.util import CLSRegistry

mse = None

def best_possible_lineup_score(
    db_filename, sport, service_abbr,
    slate_id,
    best_score_cache: Optional[dict[int, Optional[float]]] = None
) -> Optional[float]:
    """ 
    calculate the best possible fantasy score for the requested slate
    used as a map function for a pandas series.

    db_filename - path to the database containing the slate
    sport, service_abbr - select the score statistic names (see get_stat_names)
        used as the points of players/teams
    slate_id - id of the slate. Any real number is accepted, including numpy
        integer/float scalars; it is truncated to an int.
    best_score_cache - cache of slate ids mapped to their score. this will be 
        searched and possibly updated to include the score for the requested slate
        (failed calculations are cached as None)

    returns - None if slate_id is not a number (or is NaN), if the slate is not in the
        database or if there is an error calculating the best possible score
    """
    import numbers

    # numbers.Real (unlike a plain int/float check) also accepts numpy integer scalars,
    # which is what pandas yields when mapping over an integer or object column
    if not isinstance(slate_id, numbers.Real) or math.isnan(slate_id):
        return None

    slate_id = int(slate_id)
    if best_score_cache is not None:
        if slate_id in best_score_cache:
            return best_score_cache[slate_id]
        print(f"{slate_id=} not in best score cache")

    db_obj = db.get_db_obj(db_filename)

    # slate date
    with db_obj.session_scoped() as session:
        slate = session.query(db.DailyFantasySlate) \
                       .filter(db.DailyFantasySlate.id == slate_id) \
                       .one_or_none()
        if slate is None:
            print(f"Error: Unable to find {slate_id=} in database")
            return None

        # copy what is needed while the session is still open
        game_date = slate.date
        slate_name = slate.name
        service = slate.service

    print(f"Generating best historic lineup for {game_date} slate '{slate_name}' ({slate_id})")

    # TODO: the following should also take slate_id
    # get the starters
    starters = db_obj.db_manager.get_starters(
        service,
        games_date=game_date,
        db_obj=db_obj,
    ).filter_by_slate(slate_name)

    # TODO: most of the following should be defaults for the args object and should not be required here
    args = Namespace(
        starters_stale_mins=9999999, 
        cache_dir=None,
        drop_games=None,
        no_fail=False,
        service=service,
        match_threshold=.5,
        slate=slate_name,
        no_default_lineup_plans=False,
        lineup_plan_paths=None,
        model_ids=None,
    )
    args, fca = db_obj.db_manager.gen_lineups_preprocess(db_obj, args, None, game_date, starters=starters)

    service_cls = CLSRegistry.get_class(FANTASY_SERVICE_DOMAIN, service)

    args, _ = lineup_plan_helper(
        args, db_obj, starters, service_cls, []
    )
    constraints = service_cls.get_constraints(db_obj.db_manager.ABBR,
                                              slate=starters.slates[args.slate])
    assert constraints is not None

    solver = MixedIntegerKnapsackSolver(
        constraints.lineup_constraints,
        constraints.budget,
        totals_func=constraints.totals_func,
        fill_all_positions=constraints.fill_all_positions,
    )

    season = db_obj.db_manager.season_for_date(game_date)
    pts_stats = get_stat_names(sport, service_abbr)

    # best effort: a failure to generate the lineup is reported and results in a None score
    try:
        lineups, _, _ = gen_lineups(
            db_obj, fca, args.model_ids, solver, service_cls, 
            1,       # of lineups
            season,
            slate=slate_name, 
            slate_info=starters.slates[slate_name],
            use_pts=True, use_pts_stat_names=pts_stats,
            skip_predictions=True,
            slate_date=game_date,
        )
        score = lineups[0].fpts
    except Exception as ex:
        print(f"Error calculating best lineup for {slate_id=} on {game_date}. ", ex)
        score = None

    if best_score_cache is not None:
        best_score_cache[slate_id] = score
    return score

# bpls = best_possible_lineup_score(
#     "/home/delano/working/fantasy/lol_hist_2014-2020.scored.db",
#     'lol', 'dk',
#     119,
# )
# print(bpls.args)

In [34]:
def create_predict_df(teams_contest_df, slate_ids_df, team_score_df, db_pos_scores_df, top_lineup_scores) -> pd.DataFrame:
    """
    Assemble the prediction dataframe.

    The contest columns (date, style, type, top score, last winning score, link), the
    top lineup scores and the slate ids are placed side by side (aligned on index);
    the team scores and then the player position scores are joined on 'slate_id'.
    """
    contest_columns = ['date', 'style', 'type', 'top_score', 'last_winning_score', 'link']
    side_by_side_df = pd.concat(
        [teams_contest_df[contest_columns], top_lineup_scores, slate_ids_df],
        axis=1,
    )
    with_team_scores_df = side_by_side_df.join(team_score_df, on='slate_id')
    return with_team_scores_df.join(db_pos_scores_df, on='slate_id')


# predict_df = create_predict_df(teams_contest_df, slate_ids_df, team_score_df, db_pos_scores_df)
# with pd.option_context('max_columns', 100):
#     display(predict_df)
    
    
# filename = f"{SPORT}-{SERVICE}-{STYLE.name}-{CONTEST_TYPE.NAME}.csv"
# print(f"Writing data to file '{filename}'")
# predict_df.to_csv(filename, index=False)

In [35]:
import json
from contextlib import contextmanager
from typing import Literal

TopScoreCacheMode = Literal['default', 'overwrite', 'missing']

@contextmanager
def best_score_cache(sport: str, top_score_cache_mode: TopScoreCacheMode) -> dict[int, Optional[float]]:
    """
    context manager yielding the best score cache (slate id -> best score) of a sport

    The cache is stored in the json file '<sport>-slate.top_score.json' in the current
    directory. If the yielded dict differs from what was loaded when the context exits,
    it is written back to the file. This also happens when the body of the 'with' block
    raises, so scores that were already calculated are not lost.

    top_score_cache_mode - how to use an existing cache file:
        'default'=load everything,
        'overwrite'=ignore the file content and start with an empty cache,
        'missing'=load everything except cached failures (scores of None)

    raises - ValueError if the cache file exists and the mode is not one of the above
    """
    top_score_cache_filename = sport + "-slate.top_score.json"
    orig_top_score_dict: dict[int, Optional[float]] = {}

    if os.path.isfile(top_score_cache_filename):
        if top_score_cache_mode in ('default', 'missing'):
            with open(top_score_cache_filename, 'r') as f:
                cache_data = json.load(f)
            # json object keys are strings, slate ids are ints
            for slate_id, score in cache_data.items():
                if top_score_cache_mode == 'missing' and score is None:
                    continue
                orig_top_score_dict[int(slate_id)] = score
        elif top_score_cache_mode == 'overwrite':
            print(f"Overwriting existing best score cache data at '{top_score_cache_filename}'")
        else:
            raise ValueError("Unexpected top score cache mode", top_score_cache_mode)
    else:
        print(f"Best score cache data not found! '{top_score_cache_filename}'")

    # make a copy so that we can figure out if there are updates
    # TODO: for diff, can probably do this more efficiently by comparing a hash of the before and after
    top_score_dict = dict(orig_top_score_dict)

    try:
        yield top_score_dict
    finally:
        if orig_top_score_dict != top_score_dict:
            print(f"Writing updated best score values to cache '{top_score_cache_filename}'")
            with open(top_score_cache_filename, 'w') as f:
                json.dump(top_score_dict, f)
    # else:
    # print("No change to best score cache.")


def generate_dataset(
    sport, cfg, service, style, contest_type,
    min_date=None, max_date=None, max_count: Optional[int] = None,
    top_score_cache_mode: TopScoreCacheMode = 'default',
) -> pd.DataFrame:
    """
    Build the prediction dataframe for one sport / service / style / contest
    type combination, write it to
    '<sport>-<service>-<style.name>-<contest_type.NAME>.csv' in the current
    working directory, and return it.

    cfg - sport configuration dict. 'db_filename' is always read; 'min_date'
        and 'max_date' are read when the matching argument is None;
        'cost_pos_drop' and 'cost_pos_rename' are optional
    min_date - inclusive, defaults to cfg['min_date']
    max_date - not inclusive, defaults to cfg['max_date']
    max_count - the contest dataframe is truncated to its first max_count rows
    top_score_cache_mode -
        'default'=load and use the cache,
        'overwrite'=overwrite all existing cache data if any exists
        'missing'=use all existing valid cache data, any cached failures will be rerun

    raises ValueError when no slates, slate ids, team scores or positional
        data can be found for the request
    """
    dates_both_given = min_date is not None and max_date is not None
    assert not dates_both_given or min_date < max_date, \
        "invalidate date range. max_date must be greater than min_date. Or one must be None"

    # destination of the final csv; the inputs are not reassigned below
    csv_filename = f"{sport}-{service}-{style.name}-{contest_type.NAME}.csv"

    hist_db = cfg['db_filename']
    min_date = cfg['min_date'] if min_date is None else min_date
    max_date = cfg['max_date'] if max_date is None else max_date

    # contest results, optionally truncated to the first max_count rows
    contests = get_contest_df(service, sport, style, contest_type, min_date, max_date)
    if contests is not None:
        contests = contests.head(max_count)

    drafts = get_draft_df(service, sport, style, min_date, max_date)
    per_team_contests = create_team_contest_df(contests, drafts, service, sport)
    teams_contest_df = create_teams_contest_df(per_team_contests)

    slates = get_slate_df(hist_db, service, style, min_date, max_date)
    if slates is None:
        raise ValueError("No slates found for", service, style, min_date, max_date)

    # one row per teams-contest row, produced by get_slate_id
    slate_ids_df = teams_contest_df.apply(get_slate_id, axis=1, args=(slates, ))
    if len(slate_ids_df) == 0:
        raise ValueError("No slates ids found (based on teams contest df)")

    try:
        # comma separated ids, needed for the subsequent sql queries
        known_ids = slate_ids_df.slate_id.dropna().astype(int)
        slate_ids_str = ','.join(map(str, known_ids))
    except Exception as ex:
        raise ValueError("Something wrong with slate_ids_df", slate_ids_df) from ex

    if not slate_ids_str:
        raise ValueError("No slate ids found after removing Nones")

    team_score_df = create_team_score_df(hist_db, slate_ids_str, TOP_PLAYER_PERCENTILE)
    if team_score_df is None:
        raise ValueError("Empty team score df")

    service_abbr = SERVICE_ABBR[service]
    exploded_pos_df = get_exploded_pos_df(
        hist_db, sport, service_abbr, slate_ids_str,
        cfg.get('cost_pos_drop'), cfg.get('cost_pos_rename'),
    )
    if exploded_pos_df is None:
        raise ValueError("No exploded positional data returned!")

    pos_scores_df = get_position_scores(exploded_pos_df, TOP_PLAYER_PERCENTILE)

    # the result is not used further down; the call is kept as in the
    # original flow
    db_filtered_df = get_player_scores(
        hist_db, exploded_pos_df,
        sport, service_abbr, TOP_PLAYER_DAYS, min_date, max_date
    )

    # best possible lineup score per slate id, backed by the on-disk cache
    with best_score_cache(sport, top_score_cache_mode) as top_score_dict:
        score_slate = partial(
            best_possible_lineup_score,
            hist_db, sport, service_abbr,
            best_score_cache=top_score_dict,
        )
        top_lineup_scores = slate_ids_df.slate_id.map(score_slate)

    top_lineup_scores.name = 'best-possible-score'
    predict_df = create_predict_df(
        teams_contest_df, slate_ids_df, team_score_df, pos_scores_df, top_lineup_scores
    )

    print(f"Writing data to file '{csv_filename}'")
    predict_df.to_csv(csv_filename, index=False)
    return predict_df

In [39]:
# cache mode handed to every generate_dataset call in this cell
TOP_SCORE_CACHE_MODE: TopScoreCacheMode = 'default'

for sport in ['lol']: # SPORT_CFGS:
    cfg_min_date = SPORT_CFGS[sport]['min_date']
    cfg_max_date = SPORT_CFGS[sport]['max_date']
    for service in SPORT_CFGS[sport].get('services', SERVICES):
        # a date setting is either one value for every service or a dict
        # keyed by service name, with the None key as the fallback
        if isinstance(cfg_min_date, dict):
            min_date = cfg_min_date.get(service, cfg_min_date.get(None))
        else:
            min_date = cfg_min_date
        if isinstance(cfg_max_date, dict):
            max_date = cfg_max_date.get(service, cfg_max_date.get(None))
        else:
            max_date = cfg_max_date

        for style in STYLES:
            for contest_type in CONTEST_TYPES:
                print(f"Processing {sport}, {service}, {style}, {contest_type}")
                try:
                    df = generate_dataset(
                        sport, SPORT_CFGS[sport], service, style, contest_type,
                        min_date=min_date,
                        max_date=max_date,
                        top_score_cache_mode=TOP_SCORE_CACHE_MODE,
                    )
                    display(df)
                except ValueError as ex:
                    # keep the most recent failure around for inspection and
                    # carry on with the next combination
                    failure = ex
                    print(f"********************* Error for {sport}, {service}, {style}, {contest_type}: {ex}")
                                        
# try:                
#     sport = 'lol'
#     df = generate_dataset(
#         sport, SPORT_CFGS[sport], 'draftkings', 
#         ContestStyle.CLASSIC, GeneralPrizePool,
#         min_date=date(2020, 6, 12),
#         max_date=date(2020, 6, 13),
#         max_count=1,
#         top_score_cache_mode=TOP_SCORE_CACHE_MODE,
#     )
#     with pd.option_context('max_rows', 1000, 'max_columns', 100, 'max_colwidth', 9999):
#         display(df)
# except Exception as ex_:
#     ex = ex_
#     raise
# print("Done!")

Processing lol, draftkings, classic, <class 'fantasy_py.lineup.strategy.bet_lineup.FiftyFifty'>
Writing data to file 'lol-draftkings-CLASSIC-FIFTY_FIFTY.csv'


Unnamed: 0,date,style,type,top_score,last_winning_score,link,best-possible-score,slate_id,team_count,team-med,...,"(med-dfs, ADCB)","(med-dfs, JNG)","(med-dfs, MID)","(med-dfs, SUP)","(med-dfs, TOP)","(70.0th-pctl-dfs, ADCB)","(70.0th-pctl-dfs, JNG)","(70.0th-pctl-dfs, MID)","(70.0th-pctl-dfs, SUP)","(70.0th-pctl-dfs, TOP)"
0,2020-06-23,classic,FIFTY_FIFTY,594.09,561.47,https://www.draftkings.com/contest/gamecenter/...,634.210197,137.0,4.0,1.5,...,68.259748,61.530099,63.079799,42.600050,52.870249,81.205638,79.20218,67.809728,50.402182,68.43627
1,2020-06-24,classic,FIFTY_FIFTY,639.59,582.59,https://www.draftkings.com/contest/gamecenter/...,,,,,...,,,,,,,,,,
2,2020-06-27,classic,FIFTY_FIFTY,213.20,111.44,https://www.draftkings.com/contest/gamecenter/...,,,,,...,,,,,,,,,,
3,2020-06-28,classic,FIFTY_FIFTY,240.26,183.86,https://www.draftkings.com/contest/gamecenter/...,,,,,...,,,,,,,,,,
4,2020-06-29,classic,FIFTY_FIFTY,458.53,422.58,https://www.draftkings.com/contest/gamecenter/...,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2020-08-30,classic,FIFTY_FIFTY,953.50,649.70,https://www.draftkings.com/contest/gamecenter/...,1039.279748,255.0,4.0,2.5,...,147.370151,112.190000,140.040000,89.069749,114.560000,153.790272,120.03799,153.313990,100.053548,117.87800
96,2020-09-05,classic,FIFTY_FIFTY,990.12,913.06,https://www.draftkings.com/contest/gamecenter/...,,,,,...,,,,,,,,,,
97,2020-09-05,classic,FIFTY_FIFTY,953.81,883.40,https://www.draftkings.com/contest/gamecenter/...,,,,,...,,,,,,,,,,
98,2020-09-06,classic,FIFTY_FIFTY,1158.96,989.86,https://www.draftkings.com/contest/gamecenter/...,,,,,...,,,,,,,,,,


Processing lol, draftkings, classic, <class 'fantasy_py.lineup.strategy.bet_lineup.GeneralPrizePool'>




Writing data to file 'lol-draftkings-CLASSIC-GPP.csv'


Unnamed: 0,date,style,type,top_score,last_winning_score,link,best-possible-score,slate_id,team_count,team-med,...,"(med-dfs, ADCB)","(med-dfs, JNG)","(med-dfs, MID)","(med-dfs, SUP)","(med-dfs, TOP)","(70.0th-pctl-dfs, ADCB)","(70.0th-pctl-dfs, JNG)","(70.0th-pctl-dfs, MID)","(70.0th-pctl-dfs, SUP)","(70.0th-pctl-dfs, TOP)"
0,2020-04-29,classic,GPP,296.33,0.00,https://www.draftkings.com/contest/gamecenter/...,,,,,...,,,,,,,,,,
1,2020-04-29,classic,GPP,298.87,296.33,https://www.draftkings.com/contest/gamecenter/...,,,,,...,,,,,,,,,,
2,2020-06-12,classic,GPP,250.37,209.50,https://www.draftkings.com/contest/gamecenter/...,,,,,...,,,,,,,,,,
3,2020-06-12,classic,GPP,644.56,640.81,https://www.draftkings.com/contest/gamecenter/...,656.560000,116.0,4.0,1.0,...,63.130000,53.51,57.44,42.260000,41.96,96.530000,70.24000,82.38000,74.400000,69.936
4,2020-06-12,classic,GPP,253.13,244.10,https://www.draftkings.com/contest/gamecenter/...,290.160000,119.0,4.0,0.5,...,25.160000,21.48,26.72,14.620000,22.89,27.200000,25.01200,34.33200,20.756000,30.824
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118,2020-08-30,classic,GPP,1007.66,736.47,https://www.draftkings.com/contest/gamecenter/...,1039.279748,255.0,4.0,2.5,...,147.370151,112.19,140.04,89.069749,114.56,153.790272,120.03799,153.31399,100.053548,117.878
119,2020-09-05,classic,GPP,1019.63,950.57,https://www.draftkings.com/contest/gamecenter/...,,,,,...,,,,,,,,,,
120,2020-09-05,classic,GPP,987.77,897.52,https://www.draftkings.com/contest/gamecenter/...,,,,,...,,,,,,,,,,
121,2020-09-06,classic,GPP,1158.96,989.86,https://www.draftkings.com/contest/gamecenter/...,,,,,...,,,,,,,,,,


Processing lol, draftkings, showdown, <class 'fantasy_py.lineup.strategy.bet_lineup.FiftyFifty'>
********************* Error for lol, draftkings, showdown, <class 'fantasy_py.lineup.strategy.bet_lineup.FiftyFifty'>: No slates ids found (based on teams contest df)
Processing lol, draftkings, showdown, <class 'fantasy_py.lineup.strategy.bet_lineup.GeneralPrizePool'>
********************* Error for lol, draftkings, showdown, <class 'fantasy_py.lineup.strategy.bet_lineup.GeneralPrizePool'>: No slates ids found (based on teams contest df)
Processing lol, fanduel, classic, <class 'fantasy_py.lineup.strategy.bet_lineup.FiftyFifty'>




Writing data to file 'lol-fanduel-CLASSIC-FIFTY_FIFTY.csv'


Unnamed: 0,date,style,type,top_score,last_winning_score,link,best-possible-score,slate_id,team_count,team-med,...,"(med-dfs, ADCB)","(med-dfs, JNG)","(med-dfs, MID)","(med-dfs, SUP)","(med-dfs, TOP)","(70.0th-pctl-dfs, ADCB)","(70.0th-pctl-dfs, JNG)","(70.0th-pctl-dfs, MID)","(70.0th-pctl-dfs, SUP)","(70.0th-pctl-dfs, TOP)"
0,2020-07-31,classic,FIFTY_FIFTY,663.64,523.88,https://www.fanduel.com/entry/ACQTIUJTG,,,,,...,,,,,,,,,,
1,2020-07-26,classic,FIFTY_FIFTY,630.05,574.08,https://www.fanduel.com/entry/AEGKAQCUS,,,,,...,,,,,,,,,,
2,2020-07-26,classic,FIFTY_FIFTY,211.87,159.45,https://www.fanduel.com/entry/AFIFBZFXA,,,,,...,,,,,,,,,,
3,2020-07-19,classic,FIFTY_FIFTY,204.41,171.79,https://www.fanduel.com/entry/AGXWWXWWX,,,,,...,,,,,,,,,,
4,2020-07-24,classic,FIFTY_FIFTY,190.1,105.2,https://www.fanduel.com/entry/AJPKUMCXQ,,,,,...,,,,,,,,,,
5,2020-07-18,classic,FIFTY_FIFTY,312.49,232.35,https://www.fanduel.com/entry/ALTBFBLFC,,,,,...,,,,,,,,,,
6,2020-07-24,classic,FIFTY_FIFTY,575.54,514.25,https://www.fanduel.com/entry/AOYLIDULC,585.3807,346.0,8.0,1.5,...,67.4098,57.339999,62.210049,54.139602,55.540149,68.73996,73.14378,74.290188,57.499518,65.449832
7,2020-06-24,classic,FIFTY_FIFTY,621.03,564.89,https://www.fanduel.com/entry/AVJRZDWFH,,,,,...,,,,,,,,,,
8,2020-07-31,classic,FIFTY_FIFTY,216.82,189.66,https://www.fanduel.com/entry/AYOSOZXKL,,,,,...,,,,,,,,,,
9,2020-07-19,classic,FIFTY_FIFTY,673.6,643.72,https://www.fanduel.com/entry/BFNLHRGDF,698.649902,340.0,8.0,1.5,...,63.519801,51.049949,62.6599,35.7798,49.89995,83.18596,87.37799,80.25198,74.67397,78.256


Processing lol, fanduel, classic, <class 'fantasy_py.lineup.strategy.bet_lineup.GeneralPrizePool'>




Writing data to file 'lol-fanduel-CLASSIC-GPP.csv'


Unnamed: 0,date,style,type,top_score,last_winning_score,link,best-possible-score,slate_id,team_count,team-med,...,"(med-dfs, ADCB)","(med-dfs, JNG)","(med-dfs, MID)","(med-dfs, SUP)","(med-dfs, TOP)","(70.0th-pctl-dfs, ADCB)","(70.0th-pctl-dfs, JNG)","(70.0th-pctl-dfs, MID)","(70.0th-pctl-dfs, SUP)","(70.0th-pctl-dfs, TOP)"
0,2020-07-04,classic,GPP,178.02,104.02,https://www.fanduel.com/entry/ABAZCWVAF,,,,,...,,,,,,,,,,
1,2020-07-31,classic,GPP,663.64,553.5,https://www.fanduel.com/entry/AOYNPNMJG,,,,,...,,,,,,,,,,
2,2020-06-26,classic,GPP,229.92,112.7,https://www.fanduel.com/entry/AQOOFQQQK,,,,,...,,,,,,,,,,
3,2020-07-24,classic,GPP,561.55,520.93,https://www.fanduel.com/entry/ARPYYGDGQ,585.3807,346.0,8.0,1.5,...,67.4098,57.339999,62.210049,54.139602,55.540149,68.73996,73.14378,74.290188,57.499518,65.449832
4,2020-06-20,classic,GPP,670.54,563.26,https://www.fanduel.com/entry/ASEGPNCBS,,,,,...,,,,,,,,,,
5,2020-06-14,classic,GPP,286.51,152.37,https://www.fanduel.com/entry/AUVUETBCK,,,,,...,,,,,,,,,,
6,2020-08-01,classic,GPP,249.49,183.11,https://www.fanduel.com/entry/AYAWAWBDG,,,,,...,,,,,,,,,,
7,2020-04-29,classic,GPP,738.64,616.32,https://www.fanduel.com/entry/BDKDJFNHR,,,,,...,,,,,,,,,,
8,2020-07-30,classic,GPP,675.29,530.69,https://www.fanduel.com/entry/BDXTOPBOB,677.37,354.0,8.0,1.5,...,59.65,50.5803,68.260248,36.090048,49.559799,78.85373,69.27601,90.92603,56.24405,65.97998
9,2020-07-17,classic,GPP,247.36,200.14,https://www.fanduel.com/entry/BFNMCELNL,,,,,...,,,,,,,,,,


Processing lol, fanduel, showdown, <class 'fantasy_py.lineup.strategy.bet_lineup.FiftyFifty'>
********************* Error for lol, fanduel, showdown, <class 'fantasy_py.lineup.strategy.bet_lineup.FiftyFifty'>: No slates ids found (based on teams contest df)
Processing lol, fanduel, showdown, <class 'fantasy_py.lineup.strategy.bet_lineup.GeneralPrizePool'>
********************* Error for lol, fanduel, showdown, <class 'fantasy_py.lineup.strategy.bet_lineup.GeneralPrizePool'>: No slates ids found (based on teams contest df)
