# Serialize/export models to ONNX
Read the contents of results.tsv and serialize all the top models.

In [None]:
import pandas as pd
import numpy as np
import logging

from serialize import SerializeFailure, COL_SEP
import log

log.setup()

LOGGER = logging.getLogger('Model.Export')
LOGGER.info("logger ready")


# Serialization failures; declared here, not populated by this cell.
ONNX_FAILS: list[SerializeFailure] = []

# load the results
df = pd.read_csv(
    "results.tsv",
    index_col=False,
    sep=COL_SEP,
    usecols=['Sport', 'Service', 'Style', 'Type', 'Target', 'ModelType', 'R2', 'Date', 'Params'],
)

# Drop the separator row and everything after it. The separator is the first
# row whose 'Sport' value starts with '*'.
# na=False: rows with a missing 'Sport' must not be mistaken for the separator
# (a NaN in the mask would otherwise be treated as truthy).
separator_hits = np.flatnonzero(
    df['Sport'].str.startswith('*', na=False).to_numpy()
)
if separator_hits.size:
    seperator_idx = int(separator_hits[0])
else:
    # No separator present: there is nothing to drop, keep every row.
    LOGGER.warning(
        "No separator row (Sport starting with '*') found in results.tsv; keeping all %d rows",
        len(df),
    )
    seperator_idx = len(df)
df = df.iloc[:seperator_idx]

# df = df.query('ModelType == "tpot"')

with pd.option_context('display.max_rows', 1000, 'display.max_colwidth', 1000):
    display(df)

In [None]:
import ast
import os
import tempfile
import json

import numpy as np

from fantasy_py import ContestStyle, CLSRegistry, CONTEST_DOMAIN, lineup

from automl import create_automl_model, error_report
from serialize import serialize_model, SerializeFailure, SUPPORTED_EXPORT_MODELS, get_serialized_file_path
from generate_train_test import generate_train_test, load_csv


# PCA component count used for '-pca' model types when the model definition
# does not provide 'n_components'.
DEFAULT_PCA_COMPONENTS = 5


def train_export(
    sport, service, style: ContestStyle,
    contest_type: str, model_type: str,
    target,
    model_def_: dict,
    skip_fit=False,
    datapath=".",
    modelpath=".",
    overwrite=True,
    description_path=None,
):
    """Train a model from its stored definition and serialize it.

    Loads the CSV data for the sport/service/style/contest type, builds a
    train/test split, creates the automl model and, unless ``skip_fit`` is
    set, writes the evaluation result and the serialized model to disk.

    Args:
        sport: Sport identifier; used in the model name and for the CSV load.
        service: Service identifier; used in the model name and for the CSV load.
        style: Name of a ``ContestStyle`` member (looked up case-insensitively
            via ``ContestStyle[style.upper()]``).
        contest_type: Contest type name resolved through ``CLSRegistry``.
        model_type: Must start with ``'skautoml'`` or ``'tpot'``; a ``'-pca'``
            suffix adds a ``pca_components`` model parameter.
        target: ``'top'`` or ``'last'``; selects which y columns are used.
        model_def_: Model parameters. Copied, not mutated. ``random_state``,
            ``model_cols`` and (for ``-pca`` types) ``n_components`` are
            consumed here; everything else is forwarded to
            ``create_automl_model``.
        skip_fit: Dry run. No training data is passed to
            ``create_automl_model`` and nothing is written.
        datapath: Folder passed to ``load_csv`` as ``data_folder``.
        modelpath: Folder for the serialized model; created if missing.
        overwrite: When False, return early if the serialized model file
            already exists.
        description_path: Optional folder for the ``.score.json`` evaluation
            result and model description; created if missing.

    Raises:
        ValueError: If the CSV load is empty, no train/test data could be
            generated, or ``model_type`` / ``target`` is not recognised.
    """
    if not os.path.isdir(modelpath):
        LOGGER.info("Creating model path '%s'", modelpath)
        os.makedirs(modelpath, exist_ok=True)
    if description_path and not os.path.isdir(description_path):
        LOGGER.info("Creating model description path '%s'", description_path)
        os.makedirs(description_path, exist_ok=True)

    contest_style = ContestStyle[style.upper()]
    full_model_name = f'{sport}_{service}_{contest_style}_{contest_type}_{model_type}_{target}'
    LOGGER.info("Running train_export for %s", full_model_name)
    model_filepath = get_serialized_file_path(full_model_name, modelpath)

    if os.path.isfile(model_filepath) and not overwrite:
        LOGGER.info("Model '%s' already exists, skipping", model_filepath)
        return

    contest_type_cls = CLSRegistry.get_class(CONTEST_DOMAIN, contest_type)
    data_df = load_csv(sport, service, contest_style,
                       contest_type_cls, data_folder=datapath)
    # Explicit raise instead of assert so the check survives `python -O`.
    if len(data_df) == 0:
        raise ValueError("CSV load returned no data")

    # Work on a copy so the caller's dict is left untouched.
    model_def = dict(model_def_)
    random_state = model_def.pop("random_state", None)
    model_cols = model_def.pop('model_cols', None)
    train_test_data = generate_train_test(
        data_df,
        model_cols=model_cols,
        random_state=random_state,
    )
    if train_test_data is None:
        LOGGER.error("Failed to generate a train/test data set from data: %s", data_df)
        # Stop here; unpacking None below would only raise an unrelated TypeError.
        raise ValueError(
            f"Failed to generate a train/test data set for {full_model_name}"
        )
    (X_train, X_test, y_top_train, y_top_test,
     y_last_win_train, y_last_win_test) = train_test_data

    create_model_params = {
        'random_state': random_state,
    }
    if model_type.endswith('-pca'):
        create_model_params['pca_components'] = model_def.pop(
            'n_components', DEFAULT_PCA_COMPONENTS
        )

    # Only add the framework key: rebinding the dict here would discard the
    # random_state / pca_components entries set above.
    if model_type.startswith('skautoml'):
        create_model_params['framework'] = 'skautoml'
    elif model_type.startswith('tpot'):
        create_model_params['framework'] = 'tpot'
    else:
        raise ValueError(f"Don't know how to process model type {model_type}")

    if target == 'top':
        y_train = y_top_train
        y_test = y_top_test
    elif target == 'last':
        y_train = y_last_win_train
        y_test = y_last_win_test
    else:
        raise ValueError(f"Unexpected y of {target}")

    # add all remaining
    create_model_params.update(model_def)
    if not skip_fit:
        create_model_params['X_train'] = X_train
        create_model_params['y_train'] = y_train
    cam_result = create_automl_model(
        target,
        model_desc=full_model_name,
        X_test=X_test, y_test=y_test,
        **create_model_params,
    )

    if skip_fit:
        # Dry run: nothing was fit, so there is nothing to write.
        return

    if description_path:
        result_filepath = os.path.join(description_path, f"{full_model_name}.score.json")
        with open(result_filepath, 'w', encoding='utf-8') as fp:
            json.dump(cam_result['eval_result'], fp)

    serialize_model(cam_result['model'], model_type, X_train, y_train,
                    full_model_name,
                    model_folder=modelpath,
                    model_desc_folder=description_path)
    LOGGER.info("Model %s successfully exported", full_model_name)


In [None]:
from tqdm.notebook import tqdm

import traceback

# Passed to train_export as `overwrite`.
OVERWRITE = False
# Dry run: passed to train_export as `skip_fit` (skip fit and serialize).
DRYRUN = False

# Merged over each row's parsed Params before the model is trained.
PARAM_OVERRIDES = {
    'generations': 5,
    # 'early_stop': 1,
    # 'max_train_time': 60
}

pbar = tqdm(df.iterrows(), total=len(df))
for _, row in pbar:
    # Short human-readable label for the progress bar and log messages.
    model_desc = " ".join((
        f"sport={row.Sport}",
        f"service={row.Service}",
        f"style={row.Style}",
        f"type={row.Type}",
        f"y={row.Target}",
    ))
    pbar.set_postfix_str(model_desc)

    if row.ModelType not in SUPPORTED_EXPORT_MODELS:
        LOGGER.error(
            f"Failed to train+export model {model_desc} of type {row.ModelType=}. "
            "Export of this type is not supported."
        )
        continue

    # A row whose Params cannot be parsed aborts the whole run.
    try:
        model_def: dict = ast.literal_eval(row.Params)
    except Exception:
        LOGGER.exception("Failed to parse params: %s", row.Params)
        raise

    # Training/export failures are logged and printed; the loop then moves on
    # to the next row.
    try:
        model_def.update(PARAM_OVERRIDES)
        train_export(
            row.Sport,
            row.Service,
            row.Style,
            row.Type,
            row.ModelType,
            row.Target,
            model_def,
            skip_fit=DRYRUN,
            overwrite=OVERWRITE,
            datapath="data",
            modelpath="models",
            description_path="eval_results",
        )
    except SerializeFailure:
        LOGGER.exception(
            f"Failed to serialize: {model_desc} {row.ModelType=} {row.Params=}"
        )
        print(traceback.format_exc())
    except Exception:
        LOGGER.exception(
            f"Failed to train+export: {model_desc} {row.ModelType=} {row.Params=}"
        )
        print(traceback.format_exc())

print("Done!")
