# Model Testing for Daily Fantasy Scores

In [1]:
import pandas as pd

from fantasy_py import ContestStyle
from fantasy_py.lineup.strategy import GeneralPrizePool, FiftyFifty


SPORT = 'mlb'
SERVICE = 'draftkings'
STYLE = ContestStyle.CLASSIC
CONTEST_TYPES = GeneralPrizePool

filename = f"{SPORT}-{SERVICE}-{STYLE.name}-{FiftyFifty.NAME}.csv"

df = pd.read_csv(filename)
with pd.option_context('max_rows', 1000, 'max_columns', 100):
    print(f"{len(df)} rows")
    display(df)

37 rows


Unnamed: 0,date,style,type,top_score,last_winning_score,slate_id,team_count,team-med,team-70.0th_pctl,"('med-dfs', '1B')","('med-dfs', '2B')","('med-dfs', '3B')","('med-dfs', 'C')","('med-dfs', 'OF')","('med-dfs', 'P')","('med-dfs', 'SS')","('70.0th-pctl-dfs', '1B')","('70.0th-pctl-dfs', '2B')","('70.0th-pctl-dfs', '3B')","('70.0th-pctl-dfs', 'C')","('70.0th-pctl-dfs', 'OF')","('70.0th-pctl-dfs', 'P')","('70.0th-pctl-dfs', 'SS')"
0,2019-04-10,classic,FIFTY_FIFTY,208.95,149.2,6980,18,5.0,6.0,4.0,3.0,4.0,2.0,5.0,15.1,6.0,8.5,8.8,9.4,5.2,10.0,25.73,8.0
1,2019-04-13,classic,FIFTY_FIFTY,147.65,106.2,7008,14,3.5,4.7,5.5,3.0,5.0,2.5,3.5,13.9,5.0,8.1,8.0,9.2,8.4,8.0,16.25,7.2
2,2019-05-03,classic,FIFTY_FIFTY,161.6,116.5,7200,26,3.0,6.0,4.0,4.0,4.0,2.0,4.0,15.125,4.5,5.7,10.4,8.1,6.8,6.0,23.4755,9.5
3,2019-05-04,classic,FIFTY_FIFTY,206.1,136.3,7207,20,5.5,8.0,6.0,3.0,5.0,4.0,5.0,10.1,7.0,12.4,9.0,13.2,7.0,9.0,16.42,11.4
4,2019-05-06,classic,FIFTY_FIFTY,171.4,92.05,7234,18,4.0,5.0,4.5,3.0,3.0,2.5,3.0,22.75,3.0,7.0,5.0,7.0,6.4,6.9,26.86,11.4
5,2019-05-07,classic,FIFTY_FIFTY,191.2,87.5,7239,26,4.5,6.0,5.0,2.0,3.0,3.5,5.0,11.95,3.0,8.2,6.1,6.3,5.6,9.3,22.546,5.0
6,2019-08-20,classic,FIFTY_FIFTY,167.4,101.15,8155,30,4.0,5.0,4.5,3.0,4.0,2.0,4.5,15.425,5.0,8.5,7.0,6.8,5.0,9.0,18.7,10.2
7,2019-09-03,classic,FIFTY_FIFTY,163.35,114.7,8276,24,5.0,6.0,3.0,2.0,5.0,3.0,3.0,13.643,3.0,6.1,6.5,11.6,9.0,6.0,17.2524,6.8
8,2019-09-06,classic,FIFTY_FIFTY,132.1,84.2,8301,30,4.0,5.0,5.0,3.5,5.0,2.0,3.0,14.6,4.0,8.4,8.0,7.0,5.0,7.0,18.3492,8.4
9,2019-09-10,classic,FIFTY_FIFTY,136.35,79.65,8332,28,4.0,5.0,3.0,3.0,3.0,2.0,3.0,12.85,5.0,5.6,9.1,5.0,5.0,8.0,21.14,9.0


In [2]:
from sklearn.model_selection import train_test_split

COLS_TO_IGNORE = {'date', 'style', 'type', 'top_score', 'last_winning_score', 'slate_id'}

def generate_train_test(df, train_size: float = .5) -> tuple:
    """ 
    create regression train test data 
    return (X-train, X-test y-top-train, y-top-test, y-last-win-train, y-last-win-test)
    """
    x_cols = []
    for col in df.columns:
        if col in COLS_TO_IGNORE:
            continue
        assert col[0] == '(' or col.startswith('team'), \
            f"Unexpected data column named '{col}'"
        x_cols.append(col)

    X = df[x_cols]
    # display(X)
    y_top = df.top_score
    # display(y_top)
    y_last_win = df.last_winning_score
    # display(y_last_win)
    
    return train_test_split(X, y_top, y_last_win, train_size=train_size)


(X_train, X_test, y_top_train, y_top_test,
 y_last_win_train, y_last_win_test) =  generate_train_test(df)

display(X_train, X_test, y_top_train, y_top_test,
        y_last_win_train, y_last_win_test)

Unnamed: 0,team_count,team-med,team-70.0th_pctl,"('med-dfs', '1B')","('med-dfs', '2B')","('med-dfs', '3B')","('med-dfs', 'C')","('med-dfs', 'OF')","('med-dfs', 'P')","('med-dfs', 'SS')","('70.0th-pctl-dfs', '1B')","('70.0th-pctl-dfs', '2B')","('70.0th-pctl-dfs', '3B')","('70.0th-pctl-dfs', 'C')","('70.0th-pctl-dfs', 'OF')","('70.0th-pctl-dfs', 'P')","('70.0th-pctl-dfs', 'SS')"
36,4,5.5,7.0,6.0,5.0,2.0,4.5,3.0,7.15,12.0,8.0,7.9,3.0,8.1,5.0,7.81,14.7
5,26,4.5,6.0,5.0,2.0,3.0,3.5,5.0,11.95,3.0,8.2,6.1,6.3,5.6,9.3,22.546,5.0
20,8,4.5,5.0,4.0,7.5,7.0,2.0,3.0,8.308,6.0,8.0,10.7,8.8,4.3,7.1,13.91,12.8
4,18,4.0,5.0,4.5,3.0,3.0,2.5,3.0,22.75,3.0,7.0,5.0,7.0,6.4,6.9,26.86,11.4
17,30,4.0,5.0,4.5,3.0,3.0,2.0,3.0,12.25,3.0,7.0,5.0,6.5,3.0,6.0,17.8,8.0
29,20,5.0,6.0,7.0,3.0,3.0,4.0,5.0,16.15,5.0,8.0,6.7,7.0,5.0,9.0,21.96,7.0
26,26,4.5,6.1,5.0,3.0,4.5,5.0,5.0,6.704,4.5,9.0,6.0,7.0,7.0,9.0,13.9129,7.1
33,18,5.0,6.0,6.0,3.0,5.5,2.0,5.0,13.0755,6.5,8.7,7.3,7.0,5.0,7.3,21.525,10.6
32,20,3.5,5.0,3.0,3.0,3.0,2.5,3.5,16.642,2.0,7.0,5.0,5.0,5.0,7.0,21.7,4.1
34,6,7.5,9.5,9.0,3.5,10.0,6.0,9.0,3.604,10.5,11.2,7.5,14.2,10.6,13.0,3.7,13.5


Unnamed: 0,team_count,team-med,team-70.0th_pctl,"('med-dfs', '1B')","('med-dfs', '2B')","('med-dfs', '3B')","('med-dfs', 'C')","('med-dfs', 'OF')","('med-dfs', 'P')","('med-dfs', 'SS')","('70.0th-pctl-dfs', '1B')","('70.0th-pctl-dfs', '2B')","('70.0th-pctl-dfs', '3B')","('70.0th-pctl-dfs', 'C')","('70.0th-pctl-dfs', 'OF')","('70.0th-pctl-dfs', 'P')","('70.0th-pctl-dfs', 'SS')"
23,14,6.0,6.1,8.0,5.0,7.0,3.0,5.0,12.008,8.0,14.8,8.0,8.0,9.6,9.6,13.572,15.7
22,24,5.0,7.0,6.0,3.0,5.0,3.0,7.0,16.5,4.0,8.0,7.0,7.0,4.6,10.0,25.28,6.0
18,4,6.0,6.1,12.0,4.0,5.5,0.0,4.0,17.475,5.0,18.9,12.4,8.0,1.6,7.0,25.34,6.0
25,22,5.5,6.3,5.0,5.0,5.0,4.0,4.5,11.4,5.0,11.0,7.0,7.0,7.6,10.0,14.7936,8.4
15,10,3.5,4.3,6.0,4.0,4.0,3.5,3.0,8.143,4.0,7.6,7.0,5.0,5.7,7.1,25.14,5.0
30,20,5.0,7.0,7.0,3.5,4.0,3.0,4.0,12.15,5.0,9.0,7.0,5.0,9.5,8.0,18.421,7.0
3,20,5.5,8.0,6.0,3.0,5.0,4.0,5.0,10.1,7.0,12.4,9.0,13.2,7.0,9.0,16.42,11.4
7,24,5.0,6.0,3.0,2.0,5.0,3.0,3.0,13.643,3.0,6.1,6.5,11.6,9.0,6.0,17.2524,6.8
10,26,5.0,8.1,5.0,6.0,3.0,3.0,5.0,9.9215,5.0,8.7,8.0,7.0,6.7,8.0,16.285,8.0
19,8,3.5,4.0,4.0,5.0,7.0,3.5,5.0,18.6285,3.0,5.2,7.0,12.8,5.0,8.0,30.79,10.0


36    168.05
5     191.20
20    159.60
4     171.40
17    156.90
29    187.55
26    184.90
33    164.45
32    167.40
34    126.25
21    195.00
1     147.65
27    131.50
8     132.10
31    205.45
14    169.60
16    135.85
6     167.40
Name: top_score, dtype: float64

23    149.05
22    145.75
18    138.95
25    182.55
15    146.10
30    165.70
3     206.10
7     163.35
10    224.25
19    169.50
24    161.90
12    149.00
28    154.95
11    153.45
9     136.35
2     161.60
13    164.25
35     90.95
0     208.95
Name: top_score, dtype: float64

36    108.70
5      87.50
20    102.60
4      92.05
17    103.95
29    135.55
26    114.90
33    137.25
32    116.20
34     95.30
21    122.60
1     106.20
27     91.50
8      84.20
31    167.60
14    124.85
16     97.60
6     101.15
Name: last_winning_score, dtype: float64

23     86.15
22     80.85
18    115.80
25    132.05
15    104.00
30    134.70
3     136.30
7     114.70
10    142.25
19    106.25
24    106.90
12    116.00
28    121.50
11    114.95
9      79.65
2     116.50
13     99.25
35     68.95
0     149.20
Name: last_winning_score, dtype: float64

In [3]:
import autosklearn.regression
import sklearn

TRAIN_TIME = 180
PER_RUN_TIME = None

def automl(X_train, y_train, X_test, y_test, model_name):
    automl_model = autosklearn.regression.AutoSklearnRegressor(
        time_left_for_this_task=TRAIN_TIME,
        per_run_time_limit=PER_RUN_TIME,
        output_folder=f'/tmp/autosklearn_regression_{model_name}',
    )

    automl_model.fit(X_train, y_train, dataset_name=model_name)
    print(automl_model.show_models())
    predictions = automl_model.predict(X_test)
    print("R2 score:", sklearn.metrics.r2_score(y_test, predictions))
    return automl_model

In [5]:
automl(X_train, y_top_train, X_test, y_top_test, 'top-score')

[(0.420000, SimpleRegressionPipeline({'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'no_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'no_coalescense', 'data_preprocessing:numerical_transformer:imputation:strategy': 'most_frequent', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'minmax', 'feature_preprocessor:__choice__': 'polynomial', 'regressor:__choice__': 'gaussian_process', 'feature_preprocessor:polynomial:degree': 3, 'feature_preprocessor:polynomial:include_bias': 'True', 'feature_preprocessor:polynomial:interaction_only': 'True', 'regressor:gaussian_process:alpha': 0.283161627129086, 'regressor:gaussian_process:thetaL': 7.245332579977274e-08, 'regressor:gaussian_process:thetaU': 36.28453043772396},
dataset_properties={
  'task': 4,
  'sparse': False,
  'multioutput': False,
  'target_type': 'regression',
  'signed': False})),
(0.400000, SimpleRegressionPipeline({'data_preprocessing:cate

AutoSklearnRegressor(output_folder='/tmp/autosklearn_regression_top-score',
                     per_run_time_limit=18, time_left_for_this_task=180)

In [6]:
automl(X_train, y_last_win_train, X_test, y_last_win_test, 'last-win-score')

[(0.680000, SimpleRegressionPipeline({'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'one_hot_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'minority_coalescer', 'data_preprocessing:numerical_transformer:imputation:strategy': 'median', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'none', 'feature_preprocessor:__choice__': 'extra_trees_preproc_for_regression', 'regressor:__choice__': 'mlp', 'data_preprocessing:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction': 0.010000000000000004, 'feature_preprocessor:extra_trees_preproc_for_regression:bootstrap': 'True', 'feature_preprocessor:extra_trees_preproc_for_regression:criterion': 'friedman_mse', 'feature_preprocessor:extra_trees_preproc_for_regression:max_depth': 'None', 'feature_preprocessor:extra_trees_preproc_for_regression:max_features': 0.969236220539822, 'feature_preprocessor:extra_trees_preproc_for_regression:m

AutoSklearnRegressor(output_folder='/tmp/autosklearn_regression_last-win-score',
                     per_run_time_limit=18, time_left_for_this_task=180)

In [8]:
from sklearn.decomposition import PCA

pca = PCA(n_components=5)
pca.fit(X_train)

print(f"Explained variance = {pca.explained_variance_ratio_}")
print(f"Singular varlues = {pca.singular_values_}")

print("Original X:")
display(X_train)

print("Transformed Xs")
X_train_pca = pd.DataFrame(pca.transform(X_train))
display(X_train_pca)
X_test_pca = pd.DataFrame(pca.transform(X_test))
display(X_test_pca)

Explained variance = [0.61102445 0.18160576 0.07276474 0.05371082 0.03275761]
Singular varlues = [42.79966357 23.3332841  14.76969603 12.68942548  9.90985261]
Original X:


Unnamed: 0,team_count,team-med,team-70.0th_pctl,"('med-dfs', '1B')","('med-dfs', '2B')","('med-dfs', '3B')","('med-dfs', 'C')","('med-dfs', 'OF')","('med-dfs', 'P')","('med-dfs', 'SS')","('70.0th-pctl-dfs', '1B')","('70.0th-pctl-dfs', '2B')","('70.0th-pctl-dfs', '3B')","('70.0th-pctl-dfs', 'C')","('70.0th-pctl-dfs', 'OF')","('70.0th-pctl-dfs', 'P')","('70.0th-pctl-dfs', 'SS')"
36,4,5.5,7.0,6.0,5.0,2.0,4.5,3.0,7.15,12.0,8.0,7.9,3.0,8.1,5.0,7.81,14.7
5,26,4.5,6.0,5.0,2.0,3.0,3.5,5.0,11.95,3.0,8.2,6.1,6.3,5.6,9.3,22.546,5.0
20,8,4.5,5.0,4.0,7.5,7.0,2.0,3.0,8.308,6.0,8.0,10.7,8.8,4.3,7.1,13.91,12.8
4,18,4.0,5.0,4.5,3.0,3.0,2.5,3.0,22.75,3.0,7.0,5.0,7.0,6.4,6.9,26.86,11.4
17,30,4.0,5.0,4.5,3.0,3.0,2.0,3.0,12.25,3.0,7.0,5.0,6.5,3.0,6.0,17.8,8.0
29,20,5.0,6.0,7.0,3.0,3.0,4.0,5.0,16.15,5.0,8.0,6.7,7.0,5.0,9.0,21.96,7.0
26,26,4.5,6.1,5.0,3.0,4.5,5.0,5.0,6.704,4.5,9.0,6.0,7.0,7.0,9.0,13.9129,7.1
33,18,5.0,6.0,6.0,3.0,5.5,2.0,5.0,13.0755,6.5,8.7,7.3,7.0,5.0,7.3,21.525,10.6
32,20,3.5,5.0,3.0,3.0,3.0,2.5,3.5,16.642,2.0,7.0,5.0,5.0,5.0,7.0,21.7,4.1
34,6,7.5,9.5,9.0,3.5,10.0,6.0,9.0,3.604,10.5,11.2,7.5,14.2,10.6,13.0,3.7,13.5


Transformed Xs


Unnamed: 0,0,1,2,3,4
0,18.341885,6.547666,2.678228,7.765265,-4.242604
1,-8.92433,-2.68564,-1.972634,-1.624404,-2.741024
2,11.476102,5.190728,-1.685804,2.568469,5.325043
3,-9.462413,11.986542,0.901511,-2.716041,0.265731
4,-9.729448,-5.86406,0.562513,3.065216,0.225982
5,-5.311378,3.726851,-0.317206,-2.679876,-2.584067
6,-1.229408,-8.910878,-0.183164,0.520774,-2.516518
7,-1.594565,3.918366,1.57609,-1.083838,0.218999
8,-7.62009,4.098605,-4.928258,0.365058,-1.681433
9,24.136348,-2.925694,3.083343,-7.491935,-1.111789


Unnamed: 0,0,1,2,3,4
0,8.427121,2.736029,5.474016,-3.013582,-0.876617
1,-9.780692,1.915202,-0.081244,-4.911562,-1.268905
2,4.062031,15.588733,-5.442373,-5.767996,-1.678858
3,-0.068813,-3.272602,0.733418,-1.223057,-1.670899
4,1.770385,7.774198,-7.716267,0.712445,-2.026692
5,-1.530532,0.50844,-0.5607,-0.605982,-4.040712
6,2.753958,-2.325221,3.053041,-4.426833,0.598014
7,-4.642899,-2.310327,-1.302245,-2.025995,1.331846
8,-3.313322,-5.631527,1.210139,1.069569,-1.314321
9,-2.188826,16.595517,-4.267899,-7.364406,5.574959


In [9]:
automl(X_train_pca, y_top_train, X_test_pca, y_top_test, 'top-score-pca')
automl(X_train_pca, y_last_win_train, X_test_pca, y_last_win_test, 'last-win-score-pca')

[(0.600000, SimpleRegressionPipeline({'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'no_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'no_coalescense', 'data_preprocessing:numerical_transformer:imputation:strategy': 'median', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'standardize', 'feature_preprocessor:__choice__': 'extra_trees_preproc_for_regression', 'regressor:__choice__': 'mlp', 'feature_preprocessor:extra_trees_preproc_for_regression:bootstrap': 'True', 'feature_preprocessor:extra_trees_preproc_for_regression:criterion': 'mse', 'feature_preprocessor:extra_trees_preproc_for_regression:max_depth': 'None', 'feature_preprocessor:extra_trees_preproc_for_regression:max_features': 0.4311529077698709, 'feature_preprocessor:extra_trees_preproc_for_regression:max_leaf_nodes': 'None', 'feature_preprocessor:extra_trees_preproc_for_regression:min_samples_leaf': 2, 'feature_preprocessor:extra_tree

AutoSklearnRegressor(output_folder='/tmp/autosklearn_regression_last-win-score-pca',
                     per_run_time_limit=18, time_left_for_this_task=180)