# Model Testing for Daily Fantasy Scores
Predict the minimum and maximum winning scores for a slate.

In [None]:
import logging

# Log line layout: timestamp (left-justified, minimum width 15), then the message.
FORMAT = '%(asctime)-15s :: %(message)s'
logging.basicConfig(format=FORMAT)
# Logger used by the functions defined later in this notebook.
LOGGER = logging.getLogger('dfscore')
# INFO level so the progress messages logged during training are emitted.
LOGGER.setLevel(logging.INFO)
LOGGER.info("logger ready")

In [None]:
from itertools import product
from datetime import datetime
import os

import pandas as pd

from generate_train_test import generate_train_test, load_csv
from automl import create_automl_model, error_report


# Columns (and their order) selected from the evaluation results before they
# are written to csv by eval_results_to_csv.
EVAL_RESULT_COL_ORDER = [
    'Sport', 'Service', 'Type', 'Style', 'y', 'R2',
    'RMSE', 'MAE', 'ModelType', 'Date', 'Params'
]

def eval_results_to_csv(eval_results: list[dict], model_name: str,
                        csv_folder: str = "eval_results") -> pd.DataFrame:
    """
    write evaluation results to a csv file and return them as a dataframe

    Only the EVAL_RESULT_COL_ORDER columns are kept and the rows are sorted by
    Sport, Service, Type, Style, y and ModelType.

    eval_results: evaluation result dicts, each must contain every
        EVAL_RESULT_COL_ORDER key
    model_name: base name of the csv file, '.csv' is appended unless the name
        already ends with it
    csv_folder: folder the csv file is written to, created if it is missing

    returns the dataframe that was written, or None (after logging a warning)
    when eval_results is empty
    """
    if not eval_results:
        # Logger.warn is a deprecated alias of Logger.warning
        LOGGER.warning("No evaluation results to save")
        return None

    df = pd.DataFrame(eval_results)[EVAL_RESULT_COL_ORDER] \
        .sort_values(['Sport', 'Service', 'Type', 'Style', 'y', 'ModelType'])
    # makedirs also creates missing parent folders and, with exist_ok, does not
    # fail if the folder appears between a check and the creation
    os.makedirs(csv_folder, exist_ok=True)
    # avoid writing 'name.csv.csv' when the caller already included the extension
    filename = model_name if model_name.endswith(".csv") else model_name + ".csv"
    results_filepath = os.path.join(csv_folder, filename)
    df.to_csv(results_filepath, index=False)
    LOGGER.info(f"Evaluation results written to '{results_filepath}'")
    return df


def finalize_error_results(results, shared_results_dict, model_name, automl_params, model_cols) -> dict:
    """
    add model information to an error report, in place, and return it

    'y' is set to the part of model_name before its first '-', 'Params' to a
    copy of automl_params (plus 'model_cols' when model_cols is truthy), and
    finally every entry of shared_results_dict is merged in.
    """
    results['y'] = model_name.partition('-')[0]

    # work on a copy so the caller's automl_params is never modified
    params = dict(automl_params)
    if model_cols:
        params['model_cols'] = model_cols
    results['Params'] = params

    for key, value in shared_results_dict.items():
        results[key] = value
    return results


def evaluate_models(
    sport, service, style, contest_type, framework, automl_params,
    pca_components=5, seed=0, data_folder="data"
) -> tuple[dict, list]:
    """
    train and evaluate up to 6 models for one sport/service/style/contest type

    for the top score and last winning score evaluate
    1) a standard model using all features,
    2) a pca model that reduces the features to pca_components components
    3) a model based on best score

    The evaluation results of every model are also written to their own csv
    file with eval_results_to_csv.

    A feature set without enough training data is logged and skipped, models
    already trained for other feature sets are kept.

    returns tuple of (models, evaluation results), models maps a model
    description to the fitted model. (None, None) is returned when no model
    could be trained.
    """
    models = {}
    eval_results = []
    shared_results_dict = {
        'Sport': sport,
        'Service': service,
        'Style': style.name,
        'Type': contest_type.NAME,
        'ModelType': framework,
        'Date': datetime.now().strftime("%Y%m%d"),
    }
    desc_prefix = f"{sport}-{service}-{style.name}-{contest_type.NAME}"
    df = load_csv(sport, service, style, contest_type, data_folder=data_folder)

    # top and last winning score models using
    # 1) all data columns, 2) pca reduction of all data and 3) just the best possible score
    for model_cols in [None, {'best-possible-score'}]:
        model_data = generate_train_test(
            df,
            model_cols=model_cols,
            random_state=seed,
        )

        if model_data is None or len(model_data[0]) < 5:
            LOGGER.error("Not enough training data available!")
            # skip this feature set instead of returning, so models that were
            # already trained (and written to csv) are not thrown away
            continue

        (X_train, X_test, y_top_train, y_top_test,
         y_last_win_train, y_last_win_test) = model_data

        model_ys = [
            ('top-score', y_top_train, y_top_test),
            ('last-win-score', y_last_win_train, y_last_win_test),
        ]

        # pca models only when using multiple data columns
        single_column = model_cols is not None and len(model_cols) == 1
        pca_options = [False] if single_column else [False, True]

        # all standard models are trained before the pca models
        for use_pca in pca_options:
            for y_name, y_train, y_test in model_ys:
                model_name = y_name + '-pca' if use_pca else y_name
                LOGGER.info("training model=%s cols=%s", model_name, model_cols)
                pca_kwargs = {'pca_components': pca_components} if use_pca else {}
                model, fit_params = create_automl_model(
                    model_name,
                    seed=seed,
                    framework=framework,
                    **pca_kwargs,
                    **automl_params
                )
                model.fit(X_train, y_train, **fit_params)

                # NOTE: the description is the key in models and the csv file
                # name, so the two historical formats are kept exactly as is
                if use_pca:
                    model_desc = f"{desc_prefix}-{model_name}-{framework}: {model_cols=}"
                else:
                    model_desc = f"{desc_prefix}-{model_name}-{framework}-ftrs:{model_cols}"

                results = error_report(model, X_test, y_test, model_desc)
                # the target name without the pca suffix identifies 'y'
                results = finalize_error_results(
                    results, shared_results_dict, y_name,
                    automl_params, model_cols
                )
                if use_pca:
                    results['Params']['n_components'] = pca_components
                    results['ModelType'] += '-pca'
                eval_results_to_csv([results], model_desc)
                eval_results.append(results)

                models[model_desc] = model

    if not models:
        return None, None
    return models, eval_results


def multi_run(model_params: dict, styles, sports, services, contest_types, seed):
    """
    evaluate models for every framework and sport/service/style/contest type combination

    model_params maps a framework name to the parameters passed to
    evaluate_models for that framework. A missing data file is logged and the
    combination skipped. Any other exception (or a keyboard interrupt) stops
    the run, is logged and is returned together with what was gathered so far.

    returns tuple of (models, evaluation results, exception or None)
    """
    LOGGER.info("starting multirun")
    all_models = {}
    all_eval_results = []
    try:
        for framework, framework_params in model_params.items():
            combinations = product(sports, services, styles, contest_types)
            for sport, service, style, contest_type in combinations:
                try:
                    trained, scores = evaluate_models(
                        sport, service, style, contest_type,
                        framework, framework_params, seed=seed,
                    )
                    if trained is not None:
                        all_models.update(trained)
                        all_eval_results.extend(scores)
                    else:
                        LOGGER.warning(
                            "No models generated for %s-%s-%s-%s",
                            sport, service, style.name, contest_type.NAME,
                        )
                except FileNotFoundError as missing_file:
                    LOGGER.error(
                        "Data file required for modeling not found",
                        exc_info=missing_file,
                    )
    except (Exception, KeyboardInterrupt) as failure:
        # hand back the partial results along with what stopped the run
        LOGGER.error("Unhandled exception! ", exc_info=failure)
        return all_models, all_eval_results, failure
    LOGGER.info("finished multirun")
    return all_models, all_eval_results, None


In [None]:
import pandas as pd

from fantasy_py import ContestStyle
from fantasy_py.lineup.strategy import GeneralPrizePool, FiftyFifty

# normal test run
# multi_run evaluates every combination of the sports, services, styles and
# contest types listed here, once per framework in AUTOML_PARAMS.
STYLES = [ContestStyle.CLASSIC, ContestStyle.SHOWDOWN]
SPORTS = ['nhl', 'nfl', 'mlb', 'nba', 'lol']
# passed to multi_run as its seed argument
RANDOM_SEED = 0
SERVICES = ['fanduel', 'draftkings', 'yahoo']
CONTEST_TYPES = [FiftyFifty, GeneralPrizePool]

# Keyed by framework name; each value is forwarded by evaluate_models as
# keyword arguments to create_automl_model.
AUTOML_PARAMS = {
    # disabled framework configuration
    # 'skautoml': {
    #     'per_run_time_limit': 120,
    #     'max_train_time': 600,
    #     'n_jobs': 4,
    # },
    'tpot': {
        'population_size': 100, 
        'n_jobs': 4,
        'verbosity': 2,
        'max_train_time': 1200,
        'generations': 100,
        'early_stop': 10,
        'template': 'Selector-Transformer-Regressor',
    }
}

# Runs at cell execution; the third value is the exception that stopped the
# run early, or None if the run finished.
(models, eval_results, unhandled_exception) = multi_run(AUTOML_PARAMS, STYLES, SPORTS, SERVICES, CONTEST_TYPES, RANDOM_SEED)

In [None]:
# display(models)
# same columns as EVAL_RESULT_COL_ORDER, name kept for anything that still refers to it
eval_result_col_order = EVAL_RESULT_COL_ORDER

# eval_results_to_csv selects and sorts the columns itself, writes the combined
# csv and returns None when there are no results. The '.csv' extension is
# appended by eval_results_to_csv, so it is not part of the name passed here.
eval_results_df = eval_results_to_csv(
    eval_results,
    "all_eval_results")

if unhandled_exception:
    import traceback
    # format_exc() only reports an exception that is currently being handled,
    # so format the exception object returned by multi_run instead
    print("".join(traceback.format_exception(
        type(unhandled_exception),
        unhandled_exception,
        unhandled_exception.__traceback__,
        limit=None, chain=True)))

if eval_results_df is not None:
    with pd.option_context(
        'display.max_rows', 1000,
        'display.max_columns', 100,
        'display.max_colwidth', None
    ):
        display(eval_results_df)

    # tab separated copy of the results
    print(eval_results_df.to_csv(index=False, sep="\t"))
