# This notebook creates a dataset for min/max winning scores
- min win df score
- max win df score
- median team score (real game scores)
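- upper-percentile team score (real game scores); the percentile is set by `TOP_PLAYER_PERCENTILE` (currently the 70th)
- number of slate games
- median df score for each player position
- upper-percentile df score for each player position (same `TOP_PLAYER_PERCENTILE`)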
- median df score of top 50% of players for each position over the previous W weeks
- 75th percentile df score of top 50% for each position over the previous W weeks

In [59]:
# Load all daily fantasy contest data
from datetime import date
import re
from typing import Optional

import pandas as pd

from fantasy_py import ContestStyle
from fantasy_py.lineup.strategy import GeneralPrizePool, FiftyFifty

# Date window applied to the contest/draft CSV rows and to the db queries.
MIN_DATE: date   # contests on or after this date
MAX_DATE: date   # contests BEFORE (exclusive of) this date

# number of days of history used to identify top players going into a slate
TOP_PLAYER_DAYS = 21
# players above this percentile over the last TOP_PLAYER_DAYS are considered top players;
# the same value is also used as the upper percentile for the team/position score columns
TOP_PLAYER_PERCENTILE = .70

# positions to ignore; the default (empty set) is to use everything
COST_POS_DROP = set()
# rename positions in cost data, mapping old position name -> new position name
COST_POS_RENAME = {}

# Alternative per-sport configurations are kept commented out below;
# exactly one SPORT block should be active at a time.

# SPORT = 'nfl'
# MIN_DATE = date(2020, 8, 1)
# MAX_DATE = date(2021, 4, 1)

# Active configuration: these assignments override the defaults above.
SPORT = 'mlb'
MIN_DATE = date(2019, 1, 1)
MAX_DATE = date(2021, 1, 1)
DB_FILENAME = f"/home/delano/working/fantasy/mlb_hist_20082020.scored.db"
COST_POS_DROP = {'DH', 'RP'}
COST_POS_RENAME = {'SP': 'P'}

# SPORT = 'nba'
# MIN_DATE = date(2019, 8, 1)
# MAX_DATE = date(2020, 8, 1)

# SPORT = 'nhl'
# MIN_DATE = date(2020, 8, 1)
# MAX_DATE = date(2021, 4, 1)

# fantasy service to analyze: fanduel/draftkings/yahoo
SERVICE = 'draftkings'
# contest style to keep; None disables the style filter
STYLE: Optional[ContestStyle] = ContestStyle.CLASSIC
# contest type to keep (GeneralPrizePool/FiftyFifty); None disables the type filter
CONTEST_TYPE = FiftyFifty

def infer_contest_style(title) -> ContestStyle:
    """
    Infer the contest style from a contest title, using naming rules
    specific to the configured SERVICE.

    title - contest title as it appears in the contest csv
    returns - ContestStyle.SHOWDOWN or ContestStyle.CLASSIC
    raises - NotImplementedError if SERVICE is not handled, or if no
        yahoo rule matches the title
    """
    if SERVICE == 'draftkings':
        # showdown titles either say so or end with a '(AAA vs BBB)' matchup;
        # raw string so that '\)' is a regex escape, not an invalid str escape
        if ('Showdown' in title or
                re.match(r'.*.{2,3} vs .{2,3}\)', title)):
            return ContestStyle.SHOWDOWN
        return ContestStyle.CLASSIC
    if SERVICE == 'fanduel':
        return ContestStyle.SHOWDOWN if '@' in title else ContestStyle.CLASSIC
    if SERVICE == 'yahoo':
        classic_markers = (
            ' Cup ',
            ' to 1st]',
            ' 50/50',
            'QuickMatch vs ',
            'H2H vs ',
            '-Team',   # N-team contests are classic
            'Freeroll',
            'Quadruple Up',
        )
        if (any(marker in title for marker in classic_markers) or
                title.endswith('Guaranteed [No Management Fee]')):
            return ContestStyle.CLASSIC
    raise NotImplementedError(f"Could not infer contest style for {SERVICE=} {title=}")
    
def infer_contest_type(title) -> str:
    """
    Infer the contest type from a contest title, using naming rules
    specific to the configured SERVICE.

    title - contest title as it appears in the contest csv
    returns - 'H2H', FiftyFifty.NAME or GeneralPrizePool.NAME
    raises - NotImplementedError if SERVICE is not handled, or if no
        yahoo rule matches the title
    """
    if SERVICE == 'draftkings':
        # head to head titles end with 'vs. <opponent>';
        # raw string so that '\.' is a regex escape, not an invalid str escape
        if re.match(r'.* vs\. [^)]+$', title):
            return 'H2H'
        return FiftyFifty.NAME if 'Double Up' in title else GeneralPrizePool.NAME
    if SERVICE == 'fanduel':
        if 'Head-to-head' in title:
            return 'H2H'
        return FiftyFifty.NAME if title.startswith('50/50') else GeneralPrizePool.NAME
    if SERVICE == 'yahoo':
        if ' QuickMatch vs ' in title or 'H2H vs ' in title:
            return 'H2H'
        if ' 50/50' in title:
            return FiftyFifty.NAME
        gpp_markers = (' Cup ', ' to 1st]', 'Freeroll', 'Quadruple Up')
        gpp_suffixes = (
            '-Team',                            # multi-team games are GPP if not caught by 50/50
            'Team Winner Takes All',            # treat winner takes all like a gpp
            'Guaranteed [No Management Fee]',
        )
        if (any(marker in title for marker in gpp_markers) or
                title.endswith(gpp_suffixes)):
            return GeneralPrizePool.NAME
    raise NotImplementedError(f"Could not infer contest type for {SERVICE=} {title=}")
    

# Load the service's contests, keeping only the configured sport and date window
contest_df = (
    pd.read_csv(SERVICE + ".contest.csv", parse_dates=['date'])
    .query('sport == @SPORT and @MIN_DATE <= date < @MAX_DATE')
    [['contest_id', 'date', 'title', 'top_score', 'last_winning_score']]
)
# drop the time of day so contests can be matched to slates by date
contest_df['date'] = contest_df['date'].dt.normalize()

# add style and type, both inferred from the contest title
contest_df['style'] = contest_df['title'].map(infer_contest_style)
contest_df['type'] = contest_df['title'].map(infer_contest_type)

# optional filters, each one is skipped when its setting is None
queries = []
if STYLE is not None:
    print(f"Filtering for {STYLE=}")
    queries.append('style == @STYLE')
if CONTEST_TYPE is not None:
    print(f"Filtering for {CONTEST_TYPE=}")
    queries.append('type == @CONTEST_TYPE.NAME')
if queries:
    contest_df = contest_df.query(' and '.join(queries))

# with pd.option_context('max_rows', 1000, 'max_colwidth', 100):
display(contest_df.sort_values(['style', 'type']))

Filtering for STYLE=<ContestStyle.CLASSIC: 'classic'>
Filtering for CONTEST_TYPE=<class 'fantasy_py.lineup.strategy.bet_lineup.FiftyFifty'>


Unnamed: 0,contest_id,date,title,top_score,last_winning_score,style,type
104,94463668,2020-10-16,MLB $1 Double Up,168.05,108.7,classic,FIFTY_FIFTY
159,91671215,2020-09-10,MLB $1 Double Up (Night),90.95,68.95,classic,FIFTY_FIFTY
175,91369755,2020-09-08,MLB $1 Double Up (Night),126.25,95.3,classic,FIFTY_FIFTY
208,91052666,2020-09-04,MLB $1 Double Up,164.45,137.25,classic,FIFTY_FIFTY
225,90921967,2020-09-02,MLB $1 Double Up,167.4,116.2,classic,FIFTY_FIFTY
227,90868059,2020-09-01,MLB $1 Double Up,205.45,167.6,classic,FIFTY_FIFTY
240,90725946,2020-08-30,MLB $1 Double Up,165.7,134.7,classic,FIFTY_FIFTY
268,90488711,2020-08-26,MLB $1 Double Up,187.55,135.55,classic,FIFTY_FIFTY
273,90476696,2020-08-25,MLB $1 Double Up,154.95,121.5,classic,FIFTY_FIFTY
277,90339911,2020-08-23,MLB $1 Double Up,131.5,91.5,classic,FIFTY_FIFTY


In [60]:
# Load draft data for the configured sport and date window
draft_df = (
    pd.read_csv(SERVICE + ".draft.csv", parse_dates=['date'])
    .query('sport == @SPORT and @MIN_DATE <= date < @MAX_DATE')
)
display(draft_df)
assert len(draft_df) > 0, "no draft data found"

# the text before the first '-' of the contest name is the service abbreviation
draft_df['service'] = draft_df.contest.map(lambda contest: contest.partition('-')[0])

# abbreviation used for the configured service in the draft data
_service_abbrs = {'fanduel': 'fd', 'draftkings': 'dk', 'yahoo': 'y'}
if SERVICE not in _service_abbrs:
    raise NotImplementedError()
SERVICE_ABBR = _service_abbrs[SERVICE]

# keep only this service's rows that have a team, and only the columns needed later
draft_df = (
    draft_df.query('service == @SERVICE_ABBR and team_abbr.notnull()')
    [['position', 'name', 'team_abbr', 'contest_id']]
)
display(draft_df)

Unnamed: 0,position,name,team_abbr,draft_pct,contest,date,sport,contest_id
2272,CPT,T. Gonsolin,LAD,6.7,dk-mlb-20201021-MLB Showdown $2.5K Quarter Juk...,2020-10-21 20:08:00,mlb,94886275
2273,,T. Gonsolin,LAD,19.9,dk-mlb-20201021-MLB Showdown $2.5K Quarter Juk...,2020-10-21 20:08:00,mlb,94886275
2274,CPT,B. Snell,TB,16.3,dk-mlb-20201021-MLB Showdown $2.5K Quarter Juk...,2020-10-21 20:08:00,mlb,94886275
2275,,B. Snell,TB,39.9,dk-mlb-20201021-MLB Showdown $2.5K Quarter Juk...,2020-10-21 20:08:00,mlb,94886275
2276,CPT,J. Turner,LAD,5.4,dk-mlb-20201021-MLB Showdown $2.5K Quarter Juk...,2020-10-21 20:08:00,mlb,94886275
...,...,...,...,...,...,...,...,...
55808,2B,T. La Stella,LAA,5.2,dk-mlb-20190409-MLB $7.5K Quarter Jukebox [Jus...,2019-04-09 19:05:00,mlb,71238529
55809,3B,J. McNeil,NYM,4.9,dk-mlb-20190409-MLB $7.5K Quarter Jukebox [Jus...,2019-04-09 19:05:00,mlb,71238529
55810,SS,D. Swanson,ATL,7.3,dk-mlb-20190409-MLB $7.5K Quarter Jukebox [Jus...,2019-04-09 19:05:00,mlb,71238529
55811,OF,S. Piscotty,OAK,9.2,dk-mlb-20190409-MLB $7.5K Quarter Jukebox [Jus...,2019-04-09 19:05:00,mlb,71238529


Unnamed: 0,position,name,team_abbr,contest_id
2272,CPT,T. Gonsolin,LAD,94886275
2273,,T. Gonsolin,LAD,94886275
2274,CPT,B. Snell,TB,94886275
2275,,B. Snell,TB,94886275
2276,CPT,J. Turner,LAD,94886275
...,...,...,...,...
55808,2B,T. La Stella,LAA,71238529
55809,3B,J. McNeil,NYM,71238529
55810,SS,D. Swanson,ATL,71238529
55811,OF,S. Piscotty,OAK,71238529


In [61]:
from fantasy_py import FANTASY_SERVICE_DOMAIN, lineup, util

# look up the class registered for the configured service and ask it for
# its team abbreviation remapping for the sport
service_cls = util.CLSRegistry.get_class(FANTASY_SERVICE_DOMAIN, SERVICE)
abbr_remaps = service_cls.get_team_abbr_remapping(SPORT)


def fix_team_abbr(abbr) -> str:
    """
    make team abbreviations consistent with DB

    abbr - team abbreviation from the draft data, must be a str
    returns - the remapped abbreviation if abbr_remaps has a truthy entry
        for it, otherwise abbr unchanged
    """
    assert isinstance(abbr, str), "expected all players to have a team abbr!"
    remapped = abbr_remaps.get(abbr)
    return remapped if remapped else abbr

# attach the drafted players/teams to each contest (one row per contest + drafted player)
team_contest_df = contest_df.merge(draft_df, on='contest_id')

# normalize team abbreviations so they can be compared against the db
team_contest_df['team_abbr'] = team_contest_df['team_abbr'].map(fix_team_abbr)
print(f"{len(team_contest_df.contest_id.unique())} contests")
display(team_contest_df)

37 contests


Unnamed: 0,contest_id,date,title,top_score,last_winning_score,style,type,position,name,team_abbr
0,94463668,2020-10-16,MLB $1 Double Up,168.05,108.7,classic,FIFTY_FIFTY,P,F. Valdez,HOU
1,94463668,2020-10-16,MLB $1 Double Up,168.05,108.7,classic,FIFTY_FIFTY,P,B. Snell,TB
2,94463668,2020-10-16,MLB $1 Double Up,168.05,108.7,classic,FIFTY_FIFTY,C,T. d'Arnaud,ATL
3,94463668,2020-10-16,MLB $1 Double Up,168.05,108.7,classic,FIFTY_FIFTY,1B,F. Freeman,ATL
4,94463668,2020-10-16,MLB $1 Double Up,168.05,108.7,classic,FIFTY_FIFTY,2B,M. Brosseau,TB
...,...,...,...,...,...,...,...,...,...,...
1399,71312509,2019-04-10,MLB $1 Double Up,208.95,149.2,classic,FIFTY_FIFTY,2B,C. Owings,KC
1400,71312509,2019-04-10,MLB $1 Double Up,208.95,149.2,classic,FIFTY_FIFTY,1B,K. Morales,OAK
1401,71312509,2019-04-10,MLB $1 Double Up,208.95,149.2,classic,FIFTY_FIFTY,2B,C. Pinder,OAK
1402,71312509,2019-04-10,MLB $1 Double Up,208.95,149.2,classic,FIFTY_FIFTY,SS,J. Segura,PHI


In [62]:
import os

# group contests together and create team sets used in each contest
def common_title(titles):
    """
    titles - pandas series of contest titles
    returns - the longest prefix shared by all the titles, '' if there is none
    """
    title_list = titles.tolist()
    return os.path.commonprefix(title_list)

# one row per contest: the set of drafted teams, the shared title and the scores
teams_contest_df = (
    team_contest_df
    .groupby(['contest_id', 'date', 'style', 'type'])
    .agg({
        'team_abbr': set,
        'title': common_title,
        'top_score': lambda score: score.mean(),
        'last_winning_score': lambda score: score.mean(),
    })
    .reset_index()
    .rename(columns={'team_abbr': 'teams'})
)
# number of distinct teams that appear in the contest's draft data
teams_contest_df['draft_team_count'] = teams_contest_df['teams'].map(len)

display(f"{len(teams_contest_df)} team sets")
display(teams_contest_df)

'37 team sets'

Unnamed: 0,contest_id,date,style,type,teams,title,top_score,last_winning_score,draft_team_count
0,71312509,2019-04-10,classic,FIFTY_FIFTY,"{PIT, MIN, WAS, ARI, LAA, SEA, HOU, PHI, NYM, ...",MLB $1 Double Up,208.95,149.2,13
1,71447595,2019-04-13,classic,FIFTY_FIFTY,"{ARI, CIN, TEX, CLE, LAD, HOU, SD, ATL, NYM, S...",MLB $1 Double Up,147.65,106.2,13
2,72194788,2019-05-03,classic,FIFTY_FIFTY,"{NYY, BAL, CLE, SF, BOS, KC, OAK, CIN, LAD, PH...",MLB $1 Double Up [3 Entry Max],161.6,116.5,21
3,72232166,2019-05-04,classic,FIFTY_FIFTY,"{TOR, PIT, WAS, ARI, TB, CIN, LAD, SF, HOU, SD...",MLB $1 Double Up,206.1,136.3,15
4,72307906,2019-05-06,classic,FIFTY_FIFTY,"{TOR, MIN, WAS, BAL, TB, ARI, ATL, LAD, HOU, P...",MLB $1 Double Up [3 Entry Max],171.4,92.05,18
5,72373658,2019-05-07,classic,FIFTY_FIFTY,"{SF, HOU, BOS, KC, CIN, LAD, PHI, PIT, MIN, TB...",MLB $1 Double Up,191.2,87.5,19
6,75218402,2019-08-20,classic,FIFTY_FIFTY,"{BAL, SF, HOU, STL, KC, ARI, CIN, LAD, PHI, MI...",MLB $1 Double Up,167.4,101.15,19
7,75609990,2019-09-03,classic,FIFTY_FIFTY,"{TOR, PIT, ARI, ATL, CLE, LAA, LAD, CHC, BOS, ...",MLB $1 Double Up,163.35,114.7,14
8,75748632,2019-09-06,classic,FIFTY_FIFTY,"{SF, HOU, BOS, KC, OAK, CIN, ARI, LAD, PHI, PI...",MLB $1 Double Up,132.1,84.2,21
9,76011104,2019-09-10,classic,FIFTY_FIFTY,"{TOR, KC, ARI, TB, CIN, ATL, STL, CLE, LAD, CH...",MLB $1 Double Up,136.35,79.65,17


In [63]:
# load slate data from db
import sqlite3
import pandas as pd

# One row per (slate, team) for the configured service and date window.
# MAX_DATE is exclusive (matching the contest/draft csv filters), so the
# upper bound uses '<' instead of the inclusive 'between'.
sql = f"""
select distinct daily_fantasy_slate.id as slate_id, date, 
    daily_fantasy_slate.name as slate_name, style as contest_style, abbr
from daily_fantasy_slate 
    join daily_fantasy_cost on daily_fantasy_slate.id = daily_fantasy_cost.daily_fantasy_slate_id
    join team on team_id = team.id
where service = '{SERVICE}' and date >= '{MIN_DATE}' and date < '{MAX_DATE}'
"""

if STYLE is not None:
    sql += f" and style = '{STYLE.name}'"

# print(sql)
conn = sqlite3.connect(DB_FILENAME)
try:
    db_df = pd.read_sql_query(sql, conn, parse_dates=['date'])
finally:
    # always release the db connection, even if the query fails
    conn.close()
# with pd.option_context('max_rows', 100):
#     display(db_df)

# get team sets: collapse to one row per slate with the set of team abbreviations
slate_db_df = (
    db_df
    .groupby(['slate_id', 'date', 'slate_name', 'contest_style'])
    .agg({'abbr': set})
    .reset_index()
)
# index by date so the slates for a contest date can be looked up with .loc
slate_db_df = slate_db_df.set_index('date').rename(columns={'abbr': 'teams'})
slate_db_df['team_count'] = slate_db_df.teams.map(len)
with pd.option_context('max_rows', 100):
    display(slate_db_df)

Unnamed: 0_level_0,slate_id,slate_name,contest_style,teams,team_count
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-03-20,1983,Unnamed-CLASSIC-Slate-25456,CLASSIC,"{CHW, PIT, MIN, ARI, CIN, TB, CLE, LAA, COL, L...",18
2019-03-21,1986,Unnamed-CLASSIC-Slate-25456,CLASSIC,"{CHW, PIT, MIN, ARI, CIN, TB, CLE, LAA, COL, L...",18
2019-03-28,1987,Unnamed-CLASSIC-Slate-25456,CLASSIC,"{CHW, PIT, MIN, ARI, CIN, TB, CLE, LAA, COL, L...",18
2019-03-28,1988,(Early),CLASSIC,"{NYY, TOR, WAS, DET, BAL, COL, PHI, SF, ATL, N...",12
2019-03-28,1990,(All Day),CLASSIC,"{NYY, BAL, CLE, HOU, SF, STL, KC, OAK, ARI, CI...",28
...,...,...,...,...,...
2020-10-13,4592,Unnamed-CLASSIC-Slate-40698,CLASSIC,"{TB, HOU, LAD, ATL}",4
2020-10-14,4595,Unnamed-CLASSIC-Slate-40779,CLASSIC,"{TB, HOU, LAD, ATL}",4
2020-10-15,4598,Unnamed-CLASSIC-Slate-40850,CLASSIC,"{TB, HOU, LAD, ATL}",4
2020-10-16,4601,Unnamed-CLASSIC-Slate-40863,CLASSIC,"{TB, HOU, LAD, ATL}",4


In [64]:
import numpy as np
from typing import Optional


def get_slate_id(contest_row) -> Optional[pd.Series]:
    """
    Guess the db slate that contest_row was played on.

    Candidate slates are the rows of slate_db_df for the contest's date.
    A slate matches when the contest's drafted teams are a subset of the
    slate's teams. If several match, the one with the fewest teams is used.

    returns - series of (slate_id, number of teams playing in slate),
        or None if there are no slates on the contest date or none match
    """
    try:
        # sorted so that the smallest slate comes first
        date_slates = slate_db_df.loc[[contest_row.date]].sort_values('team_count')
    except KeyError:
        print(f"Key error finding slates for {contest_row.date}")
        return None

    try:
        slates = date_slates.query("@contest_row.teams <= teams")
    except Exception:
        print(f"Unhandled exception querying for teams date {contest_row.date}")
        # display(date_slates)
        raise

    slates_found = len(slates)
    if not slates_found:
        print(f"On {contest_row.date} the {len(date_slates)} db slates don't match contest teams {contest_row.teams}. "
              "DB slate team sets were:")
        with pd.option_context('max_colwidth', None):
            display(date_slates[['slate_name', 'teams']])
        return None

    # first row is the matching slate with the fewest teams
    best_slate = slates.iloc[0]
    if slates_found > 1:
        # display(slates)
        print(f"{slates_found} slates matched contest {contest_row.date} '{contest_row.title}'. "
              f"Using '{best_slate.slate_name}'")
    return best_slate[['slate_id', 'team_count']]


# one row per contest, in the same order and with the same index as teams_contest_df
slate_ids_df = teams_contest_df.apply(get_slate_id, axis='columns')
display(slate_ids_df)

2 slates matched contest 2019-04-10 00:00:00 'MLB $1 Double Up'. Using 'Unnamed-CLASSIC-Slate-26158'
2 slates matched contest 2019-04-13 00:00:00 'MLB $1 Double Up'. Using 'Unnamed-CLASSIC-Slate-26250'
2 slates matched contest 2019-05-04 00:00:00 'MLB $1 Double Up'. Using 'Unnamed-CLASSIC-Slate-26828'
2 slates matched contest 2020-09-08 00:00:00 'MLB $1 Double Up (Night)'. Using 'Unnamed-CLASSIC-Slate-39402'
2 slates matched contest 2020-09-10 00:00:00 'MLB $1 Double Up (Night)'. Using '(Night)'


Unnamed: 0,slate_id,team_count
0,2111,18
1,2139,14
2,2331,26
3,2338,20
4,2365,18
5,2370,26
6,3286,30
7,3407,24
8,3432,30
9,3463,28


In [65]:
# slate game score info

# for mlb double headers this will cause inaccuracy for players that played in both games
# comma separated slate ids of the contests that were matched to a slate
slate_ids_str = ','.join(map(str, slate_ids_df.slate_id.dropna()))
sql = f"""
select distinct daily_fantasy_slate.id as slate_id, game.id as game_id, game.score_home, game.score_away
from daily_fantasy_slate
    join daily_fantasy_cost on daily_fantasy_slate.id = daily_fantasy_cost.daily_fantasy_slate_id
    join game on (game.date = daily_fantasy_slate.date and
                  game.season = daily_fantasy_slate.season and 
                  (daily_fantasy_cost.team_id in (game.away_team_id, game.home_team_id)))
where daily_fantasy_slate.id in ({slate_ids_str})
"""

# print(sql)
conn = sqlite3.connect(DB_FILENAME)
try:
    # the query selects no date column, so there is nothing to pass to parse_dates
    db_team_score_df = pd.read_sql_query(sql, conn)
finally:
    # always release the db connection, even if the query fails
    conn.close()
# display(db_team_score_df)

# stack home and away scores into one column, then summarize the scores of each slate
team_score_df = (
    db_team_score_df
    .melt(id_vars=['slate_id', 'game_id'], value_vars=['score_home', 'score_away'])
    .groupby(['slate_id'])
    .agg({'value': ['median', lambda x: np.percentile(x, TOP_PLAYER_PERCENTILE * 100)]})
)
team_score_df.columns = ['team-med', f'team-{TOP_PLAYER_PERCENTILE * 100}th_pctl']
display(team_score_df)

Unnamed: 0_level_0,team-med,team-70.0th_pctl
slate_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2111,5.0,6.0
2139,3.5,4.7
2331,3.0,6.0
2338,5.5,8.0
2365,4.0,5.0
2370,4.5,6.0
3286,4.0,5.0
3407,5.0,6.0
4197,5.5,8.0
4225,5.0,7.0


In [66]:
# get position scores

# name(s) of the fantasy score statistic for the service; nfl has separate
# offense and defense scores
stat_names = (
    f"'{SERVICE_ABBR}_score'"
    if SPORT != 'nfl' else
    f"'{SERVICE_ABBR}_score_off','{SERVICE_ABBR}_score_def'"
)

# for mlb double headers this will cause inaccuracy for players that played in both games
sql = f"""
select daily_fantasy_slate.id as slate_id, positions as position, value as score, 
    daily_fantasy_cost.team_id, daily_fantasy_cost.player_id
from daily_fantasy_slate
    join daily_fantasy_cost on daily_fantasy_slate.id = daily_fantasy_cost.daily_fantasy_slate_id
    join game on (game.date = daily_fantasy_slate.date and
                  game.season = daily_fantasy_slate.season and 
                  (daily_fantasy_cost.team_id in (game.away_team_id, game.home_team_id)))
    join calculation_datum on (calculation_datum.game_id = game.id and 
                               calculation_datum.player_id is daily_fantasy_cost.player_id and
                               calculation_datum.team_id = daily_fantasy_cost.team_id)
    join statistic on calculation_datum.statistic_id = statistic.id
where daily_fantasy_slate.id in ({slate_ids_str}) and
    statistic.name in ({stat_names})
"""
# print(sql)

conn = sqlite3.connect(DB_FILENAME)
try:
    # the query selects no date column, so there is nothing to pass to parse_dates
    db_df = pd.read_sql_query(sql, conn)
finally:
    # always release the db connection, even if the query fails
    conn.close()

# positions are '/' separated, so split them to one row per (player, position),
# then drop the positions configured in COST_POS_DROP.
# copy() so that the renames below are written to an independent frame
db_exploded_pos_df = (
    db_df.assign(position=db_df.position.str.split('/'))
    .explode('position')
    .query('position not in @COST_POS_DROP')
    .copy()
)
for old_pos, new_pos in COST_POS_RENAME.items():
    db_exploded_pos_df.loc[db_exploded_pos_df.position == old_pos, 'position'] = new_pos
display(db_exploded_pos_df)

# median and upper percentile score for each slate + position
db_pos_scores_df = (
    db_exploded_pos_df[['slate_id', 'position', 'score']]
    .groupby(['slate_id', 'position'])
    .agg(['median', lambda x: np.percentile(x, TOP_PLAYER_PERCENTILE * 100)])
)
db_pos_scores_df.columns = ['med-dfs', f'{TOP_PLAYER_PERCENTILE * 100}th-pctl-dfs']
# one row per slate, with a (statistic, position) column for every position
db_pos_scores_df = (
    db_pos_scores_df
    .reset_index(level='position')
    .pivot(columns='position', values=['med-dfs', f'{TOP_PLAYER_PERCENTILE * 100}th-pctl-dfs'])
)
display(db_pos_scores_df)

Unnamed: 0,slate_id,position,score,team_id,player_id
0,2111,1B,3.0,1,929
1,2111,C,2.0,1,1091
2,2111,1B,16.0,1,2226
2,2111,2B,16.0,1,2226
3,2111,OF,35.0,1,2260
...,...,...,...,...,...
4845,4426,P,9.9,23,3743
4846,4426,C,7.0,23,3851
4847,4426,OF,10.0,23,3905
4848,4426,2B,0.0,23,4014


Unnamed: 0_level_0,med-dfs,med-dfs,med-dfs,med-dfs,med-dfs,med-dfs,med-dfs,70.0th-pctl-dfs,70.0th-pctl-dfs,70.0th-pctl-dfs,70.0th-pctl-dfs,70.0th-pctl-dfs,70.0th-pctl-dfs,70.0th-pctl-dfs
position,1B,2B,3B,C,OF,P,SS,1B,2B,3B,C,OF,P,SS
slate_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
2111,4.0,3.0,4.0,2.0,5.0,15.1,6.0,8.5,8.8,9.4,5.2,10.0,25.73,8.0
2139,5.5,3.0,5.0,2.5,3.5,13.9,5.0,8.1,8.0,9.2,8.4,8.0,16.25,7.2
2331,4.0,4.0,4.0,2.0,4.0,15.125,4.5,5.7,10.4,8.1,6.8,6.0,23.4755,9.5
2338,6.0,3.0,5.0,4.0,5.0,10.1,7.0,12.4,9.0,13.2,7.0,9.0,16.42,11.4
2365,4.5,3.0,3.0,2.5,3.0,22.75,3.0,7.0,5.0,7.0,6.4,6.9,26.86,11.4
2370,5.0,2.0,3.0,3.5,5.0,11.95,3.0,8.2,6.1,6.3,5.6,9.3,22.546,5.0
3286,4.5,3.0,4.0,2.0,4.5,15.425,5.0,8.5,7.0,6.8,5.0,9.0,18.7,10.2
3407,3.0,2.0,5.0,3.0,3.0,13.643,3.0,6.1,6.5,11.6,9.0,6.0,17.2524,6.8
4197,5.0,3.0,5.0,3.0,5.0,1.0,5.0,8.6,9.0,9.3,5.8,10.0,8.445,8.6
4225,6.0,3.0,5.0,3.0,7.0,16.5,4.0,8.0,7.0,7.0,4.6,10.0,25.28,6.0


In [67]:
# Get position score for top players (e.g. players that are likely to be highly drafted)

# Scores for every game from TOP_PLAYER_DAYS before MIN_DATE up to, but not
# including, MAX_DATE. MAX_DATE is exclusive everywhere else in the notebook,
# so the upper bound uses '<' instead of the inclusive 'between'.
sql = f"""
select game.date, calculation_datum.player_id, calculation_datum.team_id, calculation_datum.value as score 
from game
    join calculation_datum on calculation_datum.game_id = game.id
    join statistic on calculation_datum.statistic_id = statistic.id
where statistic.name in ({stat_names}) 
    and date >= date('{MIN_DATE}', '-{TOP_PLAYER_DAYS} days') and date < '{MAX_DATE}'
"""
# print(sql)
conn = sqlite3.connect(DB_FILENAME)
try:
    db_df = pd.read_sql_query(sql, conn, parse_dates=['date'])
finally:
    # always release the db connection, even if the query fails
    conn.close()
# display(db_df)

# keep only players that appear in the slate cost data; rows without a
# player id are kept when their team appears in the slate cost data
db_filtered_df = db_df.query(
    '(player_id in @db_exploded_pos_df.player_id) '
    'or (player_id.isnull() and team_id in @db_exploded_pos_df.team_id)'
)
display(db_filtered_df)

Unnamed: 0,date,player_id,team_id,score
6,2019-03-30,12,3,2.0
7,2019-04-03,12,3,0.0
8,2019-04-07,12,3,14.0
9,2019-04-08,12,3,19.0
10,2019-04-09,12,3,0.0
...,...,...,...,...
67493,2020-09-12,4157,11,7.0
67494,2020-09-19,4157,11,4.0
67495,2020-09-23,4157,11,30.0
67496,2020-09-25,4157,11,6.0


In [68]:
# contest info and matched slate side by side (both share the teams_contest_df index),
# then add the slate's team score and position score summaries
predict_df = (
    pd.concat(
        [
            teams_contest_df[['date', 'style', 'type', 'top_score', 'last_winning_score']],
            slate_ids_df,
        ],
        axis=1,
    )
    .join(team_score_df, on='slate_id')
    .join(db_pos_scores_df, on='slate_id')
)

display(predict_df)



Unnamed: 0,date,style,type,top_score,last_winning_score,slate_id,team_count,team-med,team-70.0th_pctl,"(med-dfs, 1B)",...,"(med-dfs, OF)","(med-dfs, P)","(med-dfs, SS)","(70.0th-pctl-dfs, 1B)","(70.0th-pctl-dfs, 2B)","(70.0th-pctl-dfs, 3B)","(70.0th-pctl-dfs, C)","(70.0th-pctl-dfs, OF)","(70.0th-pctl-dfs, P)","(70.0th-pctl-dfs, SS)"
0,2019-04-10,classic,FIFTY_FIFTY,208.95,149.2,2111,18,5.0,6.0,4.0,...,5.0,15.1,6.0,8.5,8.8,9.4,5.2,10.0,25.73,8.0
1,2019-04-13,classic,FIFTY_FIFTY,147.65,106.2,2139,14,3.5,4.7,5.5,...,3.5,13.9,5.0,8.1,8.0,9.2,8.4,8.0,16.25,7.2
2,2019-05-03,classic,FIFTY_FIFTY,161.6,116.5,2331,26,3.0,6.0,4.0,...,4.0,15.125,4.5,5.7,10.4,8.1,6.8,6.0,23.4755,9.5
3,2019-05-04,classic,FIFTY_FIFTY,206.1,136.3,2338,20,5.5,8.0,6.0,...,5.0,10.1,7.0,12.4,9.0,13.2,7.0,9.0,16.42,11.4
4,2019-05-06,classic,FIFTY_FIFTY,171.4,92.05,2365,18,4.0,5.0,4.5,...,3.0,22.75,3.0,7.0,5.0,7.0,6.4,6.9,26.86,11.4
5,2019-05-07,classic,FIFTY_FIFTY,191.2,87.5,2370,26,4.5,6.0,5.0,...,5.0,11.95,3.0,8.2,6.1,6.3,5.6,9.3,22.546,5.0
6,2019-08-20,classic,FIFTY_FIFTY,167.4,101.15,3286,30,4.0,5.0,4.5,...,4.5,15.425,5.0,8.5,7.0,6.8,5.0,9.0,18.7,10.2
7,2019-09-03,classic,FIFTY_FIFTY,163.35,114.7,3407,24,5.0,6.0,3.0,...,3.0,13.643,3.0,6.1,6.5,11.6,9.0,6.0,17.2524,6.8
8,2019-09-06,classic,FIFTY_FIFTY,132.1,84.2,3432,30,,,,...,,,,,,,,,,
9,2019-09-10,classic,FIFTY_FIFTY,136.35,79.65,3463,28,,,,...,,,,,,,,,,
