# Model Testing for Daily Fantasy Scores
Predict for the minimum and maximum winning scores for a slate

In [35]:
import pandas as pd

from fantasy_py import ContestStyle
from fantasy_py.lineup.strategy import GeneralPrizePool, FiftyFifty

# normal test run
# TRAIN_TIME = 600
# PER_RUN_TIME_LIMIT = 120
# STYLE = ContestStyle.CLASSIC

# short test run
# TRAIN_TIME = 120
# PER_RUN_TIME_LIMIT = 30


# model_cols = {'best-possible-score'}
# SPORT = 'mlb'
# SERVICE = 'fanduel'
# CONTEST_TYPE = GeneralPrizePool

def load_csv(sport, service, style: ContestStyle, contest_type) -> pd.DataFrame:
    filename = f"{sport}-{service}-{style.name}-{contest_type.NAME}.csv"
    print(f"loading {filename=}")

    df = pd.read_csv(filename)
    print(f"{len(df)} rows of data loaded")
    nan_slate_rows = len(df.query('slate_id.isnull()'))
    nan_best_score_rows = len(df.query('`best-possible-score`.isnull()'))
    if nan_slate_rows > 0 or nan_best_score_rows > 0:
        print(f"dropping {nan_slate_rows + nan_best_score_rows} rows due to {nan_slate_rows=} {nan_best_score_rows=}")
        df = df.dropna()
    return df

# df = load_csv(SPORT, SERVICE, STYLE, CONTEST_TYPE)
# with pd.option_context('max_rows', 1000, 'max_columns', 100):
#     print(f"{len(df)} rows")
# display(df)

In [47]:
from typing import Optional
from sklearn.model_selection import train_test_split


COLS_TO_IGNORE = {
    'date', 'style', 'type', 'link', 'entries', 'slate_id', 
    'top_score', 'last_winning_score',
}


def generate_train_test(df, train_size: float = .5, 
                        random_state: Optional[int] = None,
                        model_cols: Optional[set[str]] = None) -> Optional[tuple]:
    """ 
    create regression train test data 
    model_cols - if none then use all available columns
    return (X-train, X-test y-top-train, y-top-test, y-last-win-train, y-last-win-test)
    """
    x_cols = []
    assert (model_cols is None) or model_cols <= set(df.columns), \
        "Requested model columns not a subset of available data columns"
    for col in df.columns:
        if col in COLS_TO_IGNORE:
            continue
        assert col[0] == '(' or col.startswith('team') or col == 'best-possible-score', \
            f"Unexpected data column named '{col}'"
        
        if (model_cols is None) or col in model_cols:
            x_cols.append(col)

    X = df[x_cols]
    if len(X) == 0:
        return None
    # display(X)
    y_top = df.top_score
    # display(y_top)
    y_last_win = df.last_winning_score
    # display(y_last_win)
    
    return train_test_split(X, y_top, y_last_win, 
                            random_state=random_state,
                            train_size=train_size)

In [37]:
import os
import shutil

import autosklearn.regression
import sklearn


def automl(X_train, y_train, X_test, y_test, model_name, 
           train_time=60, per_run_time_limit=10,
           overwrite: bool = False,
           error_graph=False,
           seed=1):
    """ 
    overwrite - overwrite the output folder 
    error_graph - if true then graph the errors
    
    return the model
    """
    output_folder = '/tmp/autosklearn_regression_' + model_name
    if overwrite and os.path.isdir(output_folder):
        shutil.rmtree(output_folder)
    automl_model = autosklearn.regression.AutoSklearnRegressor(
        time_left_for_this_task=train_time,
        per_run_time_limit=per_run_time_limit,
        output_folder=output_folder,
        seed=seed,
    )

    automl_model.fit(X_train, y_train, dataset_name=model_name)
    return automl_model

In [38]:
# (X_train, X_test, y_top_train, y_top_test,
#  y_last_win_train, y_last_win_test) =  generate_train_test(
#     df,
#     model_cols=model_cols,
#     random_state=5,
# )

# with pd.option_context('max_rows', 1000, 'max_columns', 100, 'max_colwidth', 9999):
    #     display(
    #         # train and test input data
    #         'x-train', X_train, 
    #         'x-test', X_test, 
    #         # answers for top score train/test
    #         'y-top-train', y_top_train, 
    #         'y-top-test', y_top_test,
    #         # answers for min winning score train/test
    #         'y-last-win-train', y_last_win_train, 
    #         'y-last-win-test', y_last_win_test
    #     )

In [39]:
from math import sqrt

import matplotlib.pyplot as plt


def error_report(model, X_test, y_test, desc: str):
    print(desc)
    # print(model.show_models())
    predictions = model.predict(X_test)
    print("R2 score:", sklearn.metrics.r2_score(y_test, predictions))
    print("RMSE score:", sqrt(sklearn.metrics.mean_squared_error(y_test, predictions)))
    print("MAE score:", sqrt(sklearn.metrics.mean_absolute_error(y_test, predictions)))

    plot_data = pd.DataFrame({
        'truth': y_test,
        'prediction': predictions
    })
    plot_data['error'] = plot_data.prediction - plot_data.truth
    # display(plot_data)
    
    fig, axs = plt.subplots(1,2, figsize=(10, 5))
    fig.suptitle(desc)
    for ax in axs:
        ax.axis('equal')
    
    min_v = min(plot_data.truth.min(), plot_data.prediction.min())
    max_v = max(plot_data.truth.max(), plot_data.prediction.max())

    axs[0].plot((min_v, max_v), 
                (min_v, max_v), 
                '-g', linewidth=1) 
    plot_data.plot(kind='scatter', x='truth', y='prediction', ax=axs[0])

    axs[1].yaxis.set_label_position("right")
    axs[1].plot((min_v, max_v), 
                (0, 0), 
                '-g', linewidth=1) 
    plot_data.plot(kind='scatter', x='truth', y='error', ax=axs[1])

In [40]:
# model_name = 'top-score'

# model_top = automl(
#     X_train, y_top_train, X_test, y_top_test, model_name, 
#     train_time=TRAIN_TIME,
#     per_run_time_limit=PER_RUN_TIME_LIMIT,
#     seed=1,
#     overwrite=True
# )
# error_report(model_top, X_test, y_top_test, 
#              f"{SPORT}-{SERVICE}-{STYLE.name}-{CONTEST_TYPE.NAME}-{model_name}: {model_cols=}")

In [41]:
# model_name = 'last-win-score'

# model_last_win = automl(
#     X_train, y_last_win_train, X_test, y_last_win_test, model_name, 
#     train_time=TRAIN_TIME,
#     per_run_time_limit=PER_RUN_TIME_LIMIT,
#     seed=1,
#     overwrite=True
# )
# error_report(model_last_win, X_test, y_last_win_test, 
#              f"{SPORT}-{SERVICE}-{STYLE.name}-{CONTEST_TYPE.NAME}-{model_name}: {model_cols=}")

In [42]:
from sklearn.decomposition import PCA

def pca_data(X_train, X_test) -> tuple[pd.DataFrame, pd.DataFrame]: 
    pca = PCA(n_components=5)
    pca.fit(X_train)

    print(f"Explained variance = {pca.explained_variance_ratio_}")
    print(f"Singular varlues = {pca.singular_values_}")

    # print("Original X:")
    # display(X_train)

    # print("Transformed Xs")
    X_train_pca = pd.DataFrame(pca.transform(X_train))
    # display(X_train_pca)
    X_test_pca = pd.DataFrame(pca.transform(X_test))
    # display(X_test_pca)
    
    return X_train_pca, X_test_pca

In [43]:
# model_name = 'top-score-pca'
# model_top_pca = automl(
#     X_train_pca, y_top_train, X_test_pca, y_top_test, model_name, 
#     train_time=TRAIN_TIME,
#     per_run_time_limit=PER_RUN_TIME_LIMIT,
#     seed=1,
#     overwrite=True
# )
# error_report(model_top_pca, X_test_pca, y_top_test, 
#              f"{SPORT}-{SERVICE}-{STYLE.name}-{CONTEST_TYPE.NAME}-{model_name}: {model_cols=}")

In [44]:
# model_name = 'last-win-score-pca'
# model_last_win_pca = automl(
#     X_train_pca, y_last_win_train, X_test_pca, y_last_win_test, model_name, 
#     train_time=TRAIN_TIME,
#     per_run_time_limit=PER_RUN_TIME_LIMIT,
#     seed=1,
#     overwrite=True
# )
# error_report(model_last_win_pca, X_test_pca, y_last_win_test, 
#              f"{SPORT}-{SERVICE}-{STYLE.name}-{CONTEST_TYPE.NAME}-{model_name}: {model_cols=}")

In [None]:
# normal test run
TRAIN_TIME = 600
PER_RUN_TIME_LIMIT = 120
STYLE = ContestStyle.CLASSIC

SPORT = 'nba'
SERVICE = 'fanduel'

models = {}

for contest_type in [GeneralPrizePool, FiftyFifty]:
    df = load_csv(SPORT, SERVICE, STYLE, contest_type)
    with pd.option_context('max_rows', 1000, 'max_columns', 100):
        print(f"{len(df)} rows")
    display(df)        

    # generate 6 models, top and last winning score models 
    # using 1) all data columns, 2) pca reduction of all data and 3) just the best possible score
    for model_cols in [None, {'best-possible-score'}]:
        model_data = generate_train_test(
            df,
            model_cols=model_cols,
            random_state=5,
        )
        
        if model_data is None or len(model_data[0]) < 5:
            print("Not enough training data available!")
            continue
            
        (X_train, X_test, y_top_train, y_top_test,
         y_last_win_train, y_last_win_test) = model_data
        
        model_ys = [
            ('top-score', y_top_train, y_top_test), 
            ('last-win-score', y_last_win_train, y_last_win_test), 
        ]
        # models for top and last winning score
        for model_name, y_train, y_test in model_ys:
            print()
            print(f"training {model_name=} {model_cols=}")
            model = automl(
                X_train, y_train, X_test, y_test, model_name, 
                train_time=TRAIN_TIME,
                per_run_time_limit=PER_RUN_TIME_LIMIT,
                seed=1,
                overwrite=True
            )
            model_desc = f"{SPORT}-{SERVICE}-{STYLE.name}-{contest_type.NAME}-{model_name}: {model_cols=}"
            error_report(model, X_test, y_test, model_desc)
            models[model_desc] = model
            
        # only pca models for when using all input data columns
        if model_cols == {'best-possible-score'}:
            continue
                
        X_train_pca, X_test_pca = pca_data(X_train, X_test)
        display("PCA data", X_train_pca, X_test_pca)    
        
        for model_name, y_train, y_test in model_ys:
            model_name += '-pca'
            print(f"training {model_name=} {model_cols=}")
            model = automl(
                X_train_pca, y_train, X_test_pca, y_test, model_name, 
                train_time=TRAIN_TIME,
                per_run_time_limit=PER_RUN_TIME_LIMIT,
                seed=1,
                overwrite=True
            )
            model_desc = f"{SPORT}-{SERVICE}-{STYLE.name}-{contest_type.NAME}-{model_name}: {model_cols=}"
            error_report(model, X_test_pca, y_test, model_desc)
            models[model_desc] = model
            
print("Done")

loading filename='nba-fanduel-CLASSIC-GPP.csv'
38 rows of data loaded
38 rows


Unnamed: 0,date,style,type,top_score,last_winning_score,link,best-possible-score,slate_id,team_count,team-med,...,"('med-dfs', 'C')","('med-dfs', 'PF')","('med-dfs', 'PG')","('med-dfs', 'SF')","('med-dfs', 'SG')","('70.0th-pctl-dfs', 'C')","('70.0th-pctl-dfs', 'PF')","('70.0th-pctl-dfs', 'PG')","('70.0th-pctl-dfs', 'SF')","('70.0th-pctl-dfs', 'SG')"
0,2019-10-29,classic,GPP,333.8,256.0,https://www.fanduel.com/entry/ACCBQBZGM,357.25,6719,6,107.5,...,20.4,26.5,19.9,14.6,16.1,23.45,35.18,24.64,21.06,23.34
1,2019-10-27,classic,GPP,405.4,320.2,https://www.fanduel.com/entry/AEFRESUNV,411.0,6712,8,119.5,...,21.05,19.7,25.8,18.0,12.8,30.98,27.5,35.7,23.14,21.72
2,2019-12-13,classic,GPP,402.7,318.9,https://www.fanduel.com/entry/AFWXUUGOG,453.0,6860,18,110.0,...,18.8,22.2,16.1,17.35,16.85,27.6,27.99,22.7,28.3,25.66
3,2019-11-19,classic,GPP,383.9,383.9,https://www.fanduel.com/entry/AHMNEMODE,389.0,6782,8,113.0,...,29.8,29.55,12.0,24.2,18.45,31.56,36.27,17.21,31.78,32.15
4,2019-10-28,classic,GPP,393.1,307.8,https://www.fanduel.com/entry/AMUDHQRDI,439.0,6716,22,104.5,...,26.5,17.3,17.1,20.75,20.7,34.44,24.43,25.32,25.69,27.8
5,2019-11-10,classic,GPP,402.2,318.4,https://www.fanduel.com/entry/AQAWZWZZJ,430.25,6757,14,112.5,...,25.55,14.2,23.4,15.7,18.7,33.78,16.8,30.48,23.92,25.1
6,2019-11-05,classic,GPP,392.1,293.8,https://www.fanduel.com/entry/AUURIZHRE,400.5,6738,12,110.5,...,25.2,22.3,18.8,17.95,19.7,30.6,28.52,24.24,22.49,22.32
7,2019-12-20,classic,GPP,380.6,380.6,https://www.fanduel.com/entry/AWZVCNOYW,430.5,6882,20,111.5,...,22.05,21.4,18.3,18.4,15.4,28.85,27.3,26.0,24.28,25.0
8,2020-01-18,classic,GPP,454.3,345.2,https://www.fanduel.com/entry/AXNJTEEKL,452.25,6987,18,115.5,...,23.2,17.75,17.65,16.8,18.3,30.12,26.67,31.06,27.45,28.34
9,2019-11-20,classic,GPP,389.3,389.3,https://www.fanduel.com/entry/BAFDQLUNA,439.25,6784,22,104.5,...,22.2,19.1,21.25,17.7,19.25,25.9,31.43,25.23,23.6,25.82



training model_name='top-score' model_cols=None
nba-fanduel-CLASSIC-GPP-top-score: model_cols=None
R2 score: 0.2932414157168677
RMSE score: 23.667681668712376
MAE score: 3.8275693135691706

training model_name='last-win-score' model_cols=None
nba-fanduel-CLASSIC-GPP-last-win-score: model_cols=None
R2 score: -0.7406924577566956
RMSE score: 64.39086447914765
MAE score: 7.678269512966548
Explained variance = [0.8407337  0.05341273 0.03425763 0.02664039 0.01444173]
Singular varlues = [135.17553643  34.07149935  27.28648083  24.06241054  17.71652299]


'PCA data'

Unnamed: 0,0,1,2,3,4
0,14.769518,-11.549619,-7.475629,-5.616018,-2.533813
1,31.11668,6.72484,-6.625561,1.641092,9.451388
2,36.120447,19.798526,-9.752971,6.138447,-2.604215
3,-28.380708,-6.511024,3.536096,1.595921,6.161089
4,33.607584,0.593502,-0.009515,0.227469,0.924329
5,-14.875859,6.415105,8.290096,4.296557,2.259364
6,33.607584,0.593502,-0.009515,0.227469,0.924329
7,-6.676256,-3.080116,-6.843391,-1.62242,1.485448
8,-28.380708,-6.511024,3.536096,1.595921,6.161089
9,-6.211125,3.113754,1.74194,-3.391269,-5.458839


Unnamed: 0,0,1,2,3,4
0,36.120447,19.798526,-9.752971,6.138447,-2.604215
1,-28.182723,6.774693,2.851398,-2.089087,-4.906184
2,-6.676256,-3.080116,-6.843391,-1.62242,1.485448
3,-12.02176,-11.826573,-1.194102,19.458821,-4.874601
4,-14.875859,6.415105,8.290096,4.296557,2.259364
5,24.624491,3.768904,0.906789,0.194624,-3.006185
6,-6.676256,-3.080116,-6.843391,-1.62242,1.485448
7,68.307209,7.731451,4.179065,-6.567364,-8.371033
8,-4.95595,-6.798816,3.969407,0.206886,4.368332
9,68.307209,7.731451,4.179065,-6.567364,-8.371033


training model_name='top-score-pca' model_cols=None
nba-fanduel-CLASSIC-GPP-top-score-pca: model_cols=None
R2 score: 0.34804979239564315
RMSE score: 22.731462968816587
MAE score: 3.7191737909118063
training model_name='last-win-score-pca' model_cols=None
nba-fanduel-CLASSIC-GPP-last-win-score-pca: model_cols=None
R2 score: -1.0282135700365211
RMSE score: 69.50564438297941
MAE score: 7.929751690304831

training model_name='top-score' model_cols={'best-possible-score'}
