# Model Testing for Daily Fantasy Scores
Predict for the minimum and maximum winning scores for a slate

In [12]:
import pandas as pd

from fantasy_py import ContestStyle
from fantasy_py.lineup.strategy import GeneralPrizePool, FiftyFifty


# model_cols = None
model_cols = {'best-possible-score'}

# normal test run
# TRAIN_TIME = 600
# PER_RUN_TIME = 120

# short test run
TRAIN_TIME = 120
PER_RUN_TIME_LIMIT = 30


SPORT = 'mlb'
SERVICE = 'draftkings'
STYLE = ContestStyle.CLASSIC
CONTEST_TYPE = FiftyFifty  #  FiftyFifty

filename = f"{SPORT}-{SERVICE}-{STYLE.name}-{CONTEST_TYPE.NAME}.csv"

df = pd.read_csv(filename)
print(f"{len(df)} rows of data loaded")
nan_slate_rows = len(df.query('slate_id.isnull()'))
nan_best_score_rows = len(df.query('`best-possible-score`.isnull()'))
if nan_slate_rows > 0 or nan_best_score_rows > 0:
    print(f"dropping {nan_slate_rows + nan_best_score_rows} rows due to {nan_slate_rows=} {nan_best_score_rows=}")
    df = df.dropna()

with pd.option_context('max_rows', 1000, 'max_columns', 100):
    print(f"{len(df)} rows")
display(df)

37 rows of data loaded
37 rows


Unnamed: 0,date,style,type,top_score,last_winning_score,link,best-possible-score,slate_id,team_count,team-med,...,"('med-dfs', 'OF')","('med-dfs', 'P')","('med-dfs', 'SS')","('70.0th-pctl-dfs', '1B')","('70.0th-pctl-dfs', '2B')","('70.0th-pctl-dfs', '3B')","('70.0th-pctl-dfs', 'C')","('70.0th-pctl-dfs', 'OF')","('70.0th-pctl-dfs', 'P')","('70.0th-pctl-dfs', 'SS')"
0,2019-04-10,classic,FIFTY_FIFTY,208.95,149.2,https://www.draftkings.com/contest/gamecenter/...,287.4,6980,18,5.0,...,5.0,15.1,6.0,8.5,8.8,9.4,5.2,10.0,25.73,8.0
1,2019-04-13,classic,FIFTY_FIFTY,147.65,106.2,https://www.draftkings.com/contest/gamecenter/...,229.65,7008,14,3.5,...,3.5,13.9,5.0,8.1,8.0,9.2,8.4,8.0,16.25,7.2
2,2019-05-03,classic,FIFTY_FIFTY,161.6,116.5,https://www.draftkings.com/contest/gamecenter/...,306.3,7200,26,3.0,...,4.0,15.125,4.5,5.7,10.4,8.1,6.8,6.0,23.4755,9.5
3,2019-05-04,classic,FIFTY_FIFTY,206.1,136.3,https://www.draftkings.com/contest/gamecenter/...,296.842,7207,20,5.5,...,5.0,10.1,7.0,12.4,9.0,13.2,7.0,9.0,16.42,11.4
4,2019-05-06,classic,FIFTY_FIFTY,171.4,92.05,https://www.draftkings.com/contest/gamecenter/...,260.608,7234,18,4.0,...,3.0,22.75,3.0,7.0,5.0,7.0,6.4,6.9,26.86,11.4
5,2019-05-07,classic,FIFTY_FIFTY,191.2,87.5,https://www.draftkings.com/contest/gamecenter/...,331.9,7239,26,4.5,...,5.0,11.95,3.0,8.2,6.1,6.3,5.6,9.3,22.546,5.0
6,2019-08-20,classic,FIFTY_FIFTY,167.4,101.15,https://www.draftkings.com/contest/gamecenter/...,299.643,8155,30,4.0,...,4.5,15.425,5.0,8.5,7.0,6.8,5.0,9.0,18.7,10.2
7,2019-09-03,classic,FIFTY_FIFTY,163.35,114.7,https://www.draftkings.com/contest/gamecenter/...,279.55,8276,24,5.0,...,3.0,13.643,3.0,6.1,6.5,11.6,9.0,6.0,17.2524,6.8
8,2019-09-06,classic,FIFTY_FIFTY,132.1,84.2,https://www.draftkings.com/contest/gamecenter/...,287.0,8301,30,4.0,...,3.0,14.6,4.0,8.4,8.0,7.0,5.0,7.0,18.3492,8.4
9,2019-09-10,classic,FIFTY_FIFTY,136.35,79.65,https://www.draftkings.com/contest/gamecenter/...,319.45,8332,28,4.0,...,3.0,12.85,5.0,5.6,9.1,5.0,5.0,8.0,21.14,9.0


In [2]:
from typing import Optional
from sklearn.model_selection import train_test_split


COLS_TO_IGNORE = {
    'date', 'style', 'type', 'link', 'entries', 'slate_id', 
    'top_score', 'last_winning_score',
}


def generate_train_test(df, train_size: float = .5, 
                        random_state: Optional[int] = None,
                        model_cols: Optional[set[str]] = None) -> tuple:
    """ 
    create regression train test data 
    model_cols - if none then use all available columns
    return (X-train, X-test y-top-train, y-top-test, y-last-win-train, y-last-win-test)
    """
    x_cols = []
    assert (model_cols is None) or model_cols <= set(df.columns), \
        "Requested model columns not a subset of available data columns"
    for col in df.columns:
        if col in COLS_TO_IGNORE:
            continue
        assert col[0] == '(' or col.startswith('team') or col == 'best-possible-score', \
            f"Unexpected data column named '{col}'"
        
        if (model_cols is None) or col in model_cols:
            x_cols.append(col)

    X = df[x_cols]
    # display(X)
    y_top = df.top_score
    # display(y_top)
    y_last_win = df.last_winning_score
    # display(y_last_win)
    
    return train_test_split(X, y_top, y_last_win, 
                            random_state=random_state,
                            train_size=train_size)

In [3]:
import os
import shutil

import autosklearn.regression
import sklearn


def automl(X_train, y_train, X_test, y_test, model_name, 
           train_time=60, per_run_time_limit=10,
           overwrite: bool = False,
           seed=1):
    """ overwrite - overwrite the output folder """
    output_folder = '/tmp/autosklearn_regression_' + model_name
    if overwrite and os.path.isdir(output_folder):
        shutil.rmtree(output_folder)
    automl_model = autosklearn.regression.AutoSklearnRegressor(
        time_left_for_this_task=train_time,
        per_run_time_limit=per_run_time_limit,
        output_folder=output_folder,
        seed=seed,
    )

    automl_model.fit(X_train, y_train, dataset_name=model_name)
    # print(automl_model.show_models())
    predictions = automl_model.predict(X_test)
    print("R2 score:", sklearn.metrics.r2_score(y_test, predictions))
    return automl_model

In [13]:
(X_train, X_test, y_top_train, y_top_test,
 y_last_win_train, y_last_win_test) =  generate_train_test(
    df,
    model_cols=model_cols,
    random_state=5,
)

# with pd.option_context('max_rows', 1000, 'max_columns', 100, 'max_colwidth', 9999):
#     display(
#         # train and test input data
#         'x-train', X_train, 
#         'x-test', X_test, 
#         # answers for top score train/test
#         'y-top-train', y_top_train, 
#         'y-top-test', y_top_test,
#         # answers for min winning score train/test
#         'y-last-win-train', y_last_win_train, 
#         'y-last-win-test', y_last_win_test
#     )

In [14]:
automl(X_train, y_top_train, X_test, y_top_test, 'top-score', 
       train_time=TRAIN_TIME,
       per_run_time_limit=PER_RUN_TIME_LIMIT,
       seed=1,
       overwrite=True)

R2 score: -0.926984414798868


AutoSklearnRegressor(output_folder='/tmp/autosklearn_regression_top-score',
                     per_run_time_limit=60, time_left_for_this_task=120)

In [6]:
automl(X_train, y_last_win_train, X_test, y_last_win_test, 'last-win-score', 
       train_time=TRAIN_TIME,
       per_run_time_limit=PER_RUN_TIME_LIMIT,
       seed=1,
       overwrite=True)

R2 score: -0.05515051127275372


AutoSklearnRegressor(output_folder='/tmp/autosklearn_regression_last-win-score',
                     per_run_time_limit=10, time_left_for_this_task=60)

In [49]:
from sklearn.decomposition import PCA

pca = PCA(n_components=5)
pca.fit(X_train)

print(f"Explained variance = {pca.explained_variance_ratio_}")
print(f"Singular varlues = {pca.singular_values_}")

# print("Original X:")
# display(X_train)

print("Transformed Xs")
X_train_pca = pd.DataFrame(pca.transform(X_train))
display(X_train_pca)
X_test_pca = pd.DataFrame(pca.transform(X_test))
display(X_test_pca)

Explained variance = [0.9423851  0.02649168 0.01734253 0.00454297 0.00386129]
Singular varlues = [408.05745849  68.41665717  55.35582866  28.33199078  26.12000581]
Original X:


Unnamed: 0,best-possible-score,team_count,team-med,team-70.0th_pctl,"('med-dfs', 'C')","('med-dfs', 'D')","('med-dfs', 'G')","('med-dfs', 'W')","('70.0th-pctl-dfs', 'C')","('70.0th-pctl-dfs', 'D')","('70.0th-pctl-dfs', 'G')","('70.0th-pctl-dfs', 'W')"
258,238.9,12.0,3.0,3.7,3.00,4.2,11.90,5.80,9.93,7.43,20.58,8.88
154,173.1,4.0,1.5,2.3,3.65,4.2,17.70,5.15,5.98,8.24,19.55,7.51
23,218.4,10.0,3.0,3.3,4.40,6.5,15.60,4.50,8.07,9.90,16.50,8.00
60,150.2,4.0,1.5,2.0,4.10,5.9,18.40,3.00,6.00,9.34,24.78,6.64
142,230.4,16.0,3.0,3.5,6.75,4.3,13.70,5.90,10.56,7.12,15.10,9.30
...,...,...,...,...,...,...,...,...,...,...,...,...
35,261.3,20.0,3.0,3.0,4.30,5.0,9.65,4.50,6.55,8.40,18.76,8.00
32,250.8,8.0,3.5,5.0,3.00,5.6,8.40,4.50,7.22,7.24,17.20,10.54
200,184.8,10.0,2.0,2.0,4.30,4.3,17.90,2.80,8.95,7.08,22.80,5.80
4,250.5,18.0,4.0,4.9,6.50,5.8,7.70,4.50,12.30,9.03,14.40,9.50


Transformed Xs


Unnamed: 0,0,1,2,3,4
0,-6.348810,0.216293,2.350460,1.924244,-0.515879
1,60.105021,-1.759445,1.706308,-2.386864,-2.638106
2,14.269629,0.247834,1.376713,-3.491300,-0.451407
3,82.907545,-5.975647,0.826259,1.375580,-1.639225
4,1.399137,-0.477603,-3.148208,-3.387667,2.441951
...,...,...,...,...,...
143,-29.719787,-1.412140,-2.512183,1.663809,-2.602365
144,-17.761006,6.963158,4.295379,1.375505,-3.480777
145,47.758689,-6.575068,0.012790,0.352788,0.042416
146,-19.028974,3.672087,-4.807381,0.009856,3.467133


Unnamed: 0,0,1,2,3,4
0,-28.562620,2.013060,3.307976,-3.577933,2.983817
1,-12.261288,6.772219,0.605038,4.205241,-4.712396
2,23.107597,4.682049,-0.847878,-1.470208,1.407366
3,49.853750,-2.087133,3.463115,0.542351,-4.243498
4,-21.744577,0.167369,2.596975,-0.364618,0.043587
...,...,...,...,...,...
143,24.558535,-1.415190,3.105355,-1.979301,-1.490218
144,42.028962,1.746529,2.904716,-2.102316,3.162213
145,12.824936,-3.685046,6.910365,1.329225,-0.430057
146,53.880990,5.218917,-2.094020,-0.766024,-2.000308


In [50]:
automl(X_train_pca, y_top_train, X_test_pca, y_top_test, 'top-score-pca', 
       train_time=TRAIN_TIME,
       per_run_time_limit=PER_RUN_TIME_LIMIT,
       seed=1,
       overwrite=True)

[(1.000000, MyDummyRegressor(config=1, init_params={'instance': None}, random_state=1)),
]
R2 score: -0.05090301775451689

[(1.000000, MyDummyRegressor(config=1, init_params={'instance': None}, random_state=1)),
]
R2 score: -0.0340993091600299


AutoSklearnRegressor(output_folder='/tmp/autosklearn_regression_last-win-score-pca',
                     per_run_time_limit=10, time_left_for_this_task=60)

In [51]:
automl(X_train_pca, y_last_win_train, X_test_pca, y_last_win_test, 'last-win-score-pca', 
       train_time=TRAIN_TIME,
       per_run_time_limit=PER_RUN_TIME_LIMIT,
       seed=1,
       overwrite=True)

[(1.000000, MyDummyRegressor(config=1, init_params={'instance': None}, random_state=1)),
]
R2 score: -0.0340993091600299


AutoSklearnRegressor(output_folder='/tmp/autosklearn_regression_last-win-score-pca',
                     per_run_time_limit=10, time_left_for_this_task=60)