# Use this notebook to serialize/export models to ONNX

In [None]:
import pandas as pd
import numpy as np

# Column separator for results.tsv; also used when writing the TPOT training
# data file during export.
COL_SEP = '\t'
# One record per failed ONNX export (exception, model, inputs, export code) so
# a failure can be inspected or retried from a later cell.
ONNX_FAILS: list[dict] = []

# load the results
df = pd.read_csv(
    "results.tsv",
    # quotechar="'",
    index_col=False,
    sep=COL_SEP,
    usecols=['Sport', 'Service', 'Style', 'Type', 'y', 'ModelType', 'Params'],
)

# Drop the separator row (the first row whose 'Sport' value starts with '*')
# and everything after it. If the file has no separator row, keep every row
# instead of failing with an IndexError.
separator_hits = np.where(df['Sport'].str.startswith('*'))[0]
seperator_idx = separator_hits[0] if len(separator_hits) else len(df)
df = df.iloc[:seperator_idx]

with pd.option_context('display.max_rows', 1000, 'display.max_colwidth', 1000):
    display(df)

In [None]:
import ast
import os
import tempfile

from skl2onnx import to_onnx
import numpy as np
from skl2onnx.common.data_types import FloatTensorType

from fantasy_py import ContestStyle, CLSRegistry, CONTEST_DOMAIN, lineup

from automl import create_automl_model, error_report
from generate_train_test import generate_train_test, load_csv


# Fallback 'pca_components' value used by train_export for model types ending
# in '-pca' when the model definition has no 'n_components' entry.
DEFAULT_PCA_COMPONENTS = 5


def serialize_model(
    model, model_type, X_train: pd.DataFrame, y: pd.Series, 
    full_model_name, tmp_path=None
):
    """
    Convert a model to ONNX and write it to '<full_model_name>.onnx'.

    Only model types starting with 'tpot' are supported. For those, the
    training data (X_train plus a 'target' column holding y) is written to
    'tpot-data.csv', the model's export code is generated with
    model.export(...), executed, and the resulting pipeline is converted with
    skl2onnx's to_onnx.

    :param model: model object; for tpot it must provide
        export(data_file_path=...) returning python source as a string
    :param model_type: model type name; must start with 'tpot'
    :param X_train: training features, written to the data file and passed
        to to_onnx as X
    :param y: training target, stored in the data file's 'target' column
    :param full_model_name: used as the ONNX model name and as the output
        filename stem
    :param tmp_path: directory for 'tpot-data.csv'; defaults to the system
        temp directory
    :raises NotImplementedError: if model_type does not start with 'tpot'
    :raises ValueError: if the export code did not define a usable pipeline

    On any exception a record describing the failure is appended to
    ONNX_FAILS and the exception is re-raised.
    """
    print(f"Serializing {full_model_name=}")
    export_code = None
    # bound up front so the failure record can reference it even when the
    # export never got far enough to create a pipeline
    exported_pipeline = None
    try:
        print(model_type)
        if model_type.startswith('tpot'):
            train_df = X_train.copy()
            train_df['target'] = y
            tpot_data_file = os.path.join(
                tmp_path or tempfile.gettempdir(),
                'tpot-data.csv'
            )
            train_df.to_csv(tpot_data_file, index=False, sep=COL_SEP)

            # The trailing lines alias exported_pipeline to exported_model;
            # the result is read back from the exec namespace under that name.
            export_code = model.export(data_file_path=tpot_data_file) \
                .replace("COLUMN_SEPARATOR", COL_SEP) + """
exported_model = exported_pipeline
print("Finished execution of export code!!!")
"""
            print("###### EXPORT CODE ######")
            print(export_code)
            print("#########################")
            print("running exported code...")

            # Run the generated code in its own namespace. Relying on exec()
            # writing into this function's locals() is not dependable: names
            # that are also real local variables get dropped, and on
            # Python 3.13+ locals() is only a snapshot.
            # NOTE: this executes code produced by model.export(); only use
            # with models created in this notebook.
            export_namespace: dict = {}
            exec(export_code, export_namespace)

            print(f"!!!{sorted(export_namespace.keys())=}")
            if 'exported_model' not in export_namespace:
                raise ValueError(
                    "exported_model not defined by the export code... "
                    f"{export_namespace.keys()}"
                )
            exported_pipeline = export_namespace['exported_model']
            if exported_pipeline is None:
                raise ValueError("exported_model is None")
        else:
            raise NotImplementedError()

        print(f"Converting to ONNX... {exported_pipeline=}")
        onnx_model = to_onnx(exported_pipeline, X=X_train, 
                             name=full_model_name,
                             final_types=[('variable1', FloatTensorType([1, 1]))])
        with open(full_model_name + ".onnx", "wb") as f:
            print(f"Exporting model to {full_model_name}.onnx")
            f.write(onnx_model.SerializeToString())
            print(f"Exported model to {full_model_name}.onnx")
    except Exception as ex:
        # keep everything needed to inspect or retry the failed export
        ONNX_FAILS.append({
            'ex': ex,
            'model': model, 
            'exported_pipeline': exported_pipeline,
            'name': full_model_name, 
            'X': X_train, 
            'y': y,
            'final_types': [('variable1', FloatTensorType([1, 1]))],
            'export_code': export_code,
        })
        raise


def train_export(
    sport, service, style: str,
    contest_type: str, model_type: str,
    y_type, model_def: dict,
    skip_fit=False,
):
    """
    Create a model, optionally fit it, and export it via serialize_model.

    :param sport: sport name, passed to load_csv and used in the model name
    :param service: service name, passed to load_csv and used in the model name
    :param style: name of a ContestStyle member (case-insensitive)
    :param contest_type: name used to look up the contest class in CLSRegistry
    :param model_type: must start with 'skautoml' or 'tpot'; a '-pca' suffix
        adds a 'pca_components' model parameter
    :param y_type: 'top' or 'last', selects which target split to train on
    :param model_def: model parameters. 'model_cols' is passed to
        generate_train_test and 'n_components' (for '-pca' types) becomes
        'pca_components'; everything else is forwarded to create_automl_model.
        The caller's dict is not modified.
    :param skip_fit: if true, the model is exported without being fit
    :raises ValueError: on an unknown model_type or y_type, or if no
        train/test data could be generated
    """
    # work on a copy so the pops below do not alter the caller's dict
    model_def = dict(model_def)

    contest_style = ContestStyle[style.upper()]
    contest_type_cls = CLSRegistry.get_class(CONTEST_DOMAIN, contest_type)
    full_model_name = f'{sport}_{service}_{contest_style}_{contest_type}_{model_type}_{y_type}'
    print(f"Exporting model '{full_model_name}'")

    data_df = load_csv(sport, service, contest_style, contest_type_cls)
    assert len(data_df) > 0, "CSV load returned no data"

    model_cols = model_def.pop('model_cols', None)
    train_test_data = generate_train_test(
        data_df,
        model_cols=model_cols,
        random_state=5,
    )
    if train_test_data is None:
        display("Failed to generate a train/test data set from...", data_df)
        # fail with a clear message instead of an unpacking TypeError below
        raise ValueError(
            f"Failed to generate a train/test data set for '{full_model_name}'"
        )
    (X_train, X_test, y_top_train, y_top_test,
     y_last_win_train, y_last_win_test) = train_test_data

    create_model_params = {}
    if model_type.endswith('-pca'):
        create_model_params['pca_components'] = model_def.pop(
            'n_components', DEFAULT_PCA_COMPONENTS
        )

    if model_type.startswith('skautoml'):
        create_model_params.update({
            'framework': 'skautoml',
            # 'overwrite': True,
        })
    elif model_type.startswith('tpot'):
        # update rather than reassign, so a 'pca_components' value set above
        # for '-pca' model types is not silently discarded
        # NOTE(review): assumes create_automl_model accepts pca_components
        # for the tpot framework - confirm
        create_model_params.update({
            'framework': 'tpot',
        })
    else:
        raise ValueError(f"Don't know how to process model type {model_type}")

    if y_type == 'top':
        y_train = y_top_train
        y_test = y_top_test
    elif y_type == 'last':
        y_train = y_last_win_train
        y_test = y_last_win_test
    else:
        raise ValueError(f"Unexpected y of {y_type}")

    # add all remaining
    create_model_params.update(model_def)
    model, fit_params = create_automl_model(
        full_model_name,
        seed=1,
        **create_model_params,
    )
    if not skip_fit:
        print("Training model...")
        model.fit(X_train, y_train, **fit_params)
        # 'model_cols' was popped from model_def above, so report the local value
        error_report(model, X_test, y_test,
                    f"{full_model_name}: model_cols={model_cols}")
    else:
        print("Skipping fit...")

    serialize_model(model, model_type, X_train, y_train, full_model_name)

In [None]:
from collections import namedtuple

# Run settings for this cell: SKIP_FIT exports without fitting, and
# PARAM_OVERRIDES is merged over each row's parsed Params before training.
SKIP_FIT = False
PARAM_OVERRIDES = {} # {'generations': 10, 'early_stop': 1}
row = None

# Pick the result rows to train and export
models_df = df.query(
    "Type == 'GPP' and Sport == 'nhl' "
    "and Service == 'draftkings' and ModelType == 'tpot'"
)
with pd.option_context('display.max_rows', 1000, 'display.max_colwidth', 1000):
    display(models_df)

# Alternative: build a single ad-hoc model definition instead of querying df
# models = df.iterrows()
# models_dict = {
#     'Sport': 'nhl', 
#     'Service': 'fanduel',
#     'Style': 'classic',
#     'Type': 'GPP', 
#     'ModelType': 'tpot', 
#     'y': 'top',
#     'Params': '{"generations": 100, "early_stop": 10, "population_size": 100, "n_jobs": 3}'
# }
# models = [
#     (None, namedtuple("test_model", models_dict.keys())(*models_dict.values()))
# ]

for _, row in models_df.iterrows():
    # Params holds a python dict literal
    try:
        model_def: dict = ast.literal_eval(row.Params)
    except Exception:
        print("Failed to parse params", row.Params)
        raise

    try:
        model_def.update(PARAM_OVERRIDES)
        train_export(
            row.Sport, row.Service, row.Style,
            row.Type, row.ModelType, row.y,
            model_def,
            skip_fit=SKIP_FIT,
        )
    except Exception:
        display(
            f"Failed to train+export: {row.Sport=} {row.Service=} {row.Style=} "
            f"{row.Type=} {row.ModelType=} {row.y=} {row.Params=}"
        )
        raise
    else:
        # stop after the first successful export; later rows are not processed
        break

print("Done!")


In [None]:
# Retry ONNX serialization for the most recent failure recorded in ONNX_FAILS
if not ONNX_FAILS:
    print("no previous errors found")
else:
    last_failure = ONNX_FAILS[-1]
    try:
        print("###### attempting to serialize last failed model!!! ####")
        serialize_model(
            last_failure['model'],
            'tpot',
            last_failure['X'],
            last_failure['y'],
            last_failure['name'],
        )
        print("#### serialization successful! ###")
    except Exception as retry_error:
        print("#### serializeation failed!!!", retry_error)
        raise
        # export_data_df = pd.read_csv('/tmp/tpot-data.csv', sep=COL_SEP, dtype=np.float64)
        # display("export df", export_data_df)
        # display("exported pipeline", ONNX_FAILS[-1].get('exported_pipeline'))
        # display("ONNX_FAILS[-1]", ONNX_FAILS[-1])