# Use this notebook to serialize/export models to ONNX

In [1]:
import pandas as pd
import numpy as np

# load the results
df = pd.read_csv(
    "results.txt", 
    quotechar="'",
    index_col=False, 
    delim_whitespace=True,
    usecols=['Sport', 'Service', 'Style', 'Type', 'y', 'ModelType', 'Params'],
)

# drop everything after the seperator
seperator_idx = np.where(df['Sport'].str.startswith('*'))[0][0]
df = df.iloc[:seperator_idx]

with pd.option_context('max_rows', 1000, 'max_columns', 100, 'max_colwidth', 100):
    display(df)

Unnamed: 0,Sport,Service,Type,Style,y,ModelType,Params
0,nhl,draftkings,FIFTY_FIFTY,classic,top,automl-pca,"{""train_time"": 600, ""per_run_time"": 120, ""model_cols"": null, ""n_components"": 5}"
1,nhl,draftkings,FIFTY_FIFTY,classic,last,automl-pca,"{""train_time"": 600, ""per_run_time"": 120, ""model_cols"": null, ""n_components"": 5}"
2,nhl,draftkings,GPP,classic,top,automl,"{""train_time"": 600, ""per_run_time"": 120, ""model_cols"": null}"
3,nhl,draftkings,GPP,classic,last,automl-pca,"{""train_time"": 120, ""per_run_time"": 30, ""model_cols"": null, ""n_components"": 5}"
4,nhl,fanduel,GPP,classic,top,automl,"{""train_time"": 600, ""per_run_time"": 120, ""model_cols"": null}"
5,nhl,fanduel,FIFTY_FIFTY,classic,top,automl,"{""train_time"": 600, ""per_run_time"": 120, ""model_cols"": null}"
6,nhl,fanduel,FIFTY_FIFTY,classic,last,automl,"{""train_time"": 600, ""per_run_time"": 120, ""model_cols"": null}"
7,nhl,yahoo,FIFTY_FIFTY,classic,top,automl,"{""train_time"": 600, ""per_run_time"": 120, ""model_cols"": ""best-possible-score""}"
8,nhl,yahoo,GPP,classic,top,automl-pca,"{""train_time"": 600, ""per_run_time"": 120, ""model_cols"": null, ""n_components"": 5}"
9,nhl,yahoo,GPP,classic,last,automl,"{""train_time"": 600, ""per_run_time"": 120, ""model_cols"": ""best-possible-score""}"


In [2]:
import json

from skl2onnx import convert_sklearn
import numpy as np
from skl2onnx.common.data_types import FloatTensorType

import fantasy_py.lineup.strategy
from fantasy_py import ContestStyle, CLSRegistry, CONTEST_DOMAIN

from automl import create_automl_model, error_report, get_df_types
from generate_train_test import generate_train_test, load_csv


PCA_COMPONENTS = 5


def func(sport, service, style, contest_type, model_type, params):
    contest_style = ContestStyle[style.upper()]
    contest_type = CLSRegistry.get_class(CONTEST_DOMAIN, contest_type)
    full_model_name = f'{sport}_{service}_{contest_style}_{contest_type}_{model_type}'
    print(f"Exporting model '{full_model_name}'")
    
    data_df = load_csv(sport, service, contest_style, contest_type)
    
    model_def = json.loads(params)
    
    (X_train, X_test, y_top_train, y_top_test,
     y_last_win_train, y_last_win_test) = generate_train_test(
        data_df,
        model_cols=model_def['model_cols'],
        random_state=5,
    )
    
    if model_type == 'automl-pca':
        pca_components = PCA_COMPONENTS
    elif model_type == 'automl':
        pca_components = None
    else:
        raise ValueError(f"Don't know how to process model type {model_type}")
    
    if row.y == 'top':
        y_train = y_top_train
        y_test = y_top_test
        model_name = 'top-score'
    elif row.y == 'last':
        y_train = y_last_win_train
        y_test = y_last_win_test
        model_name = 'last-win-score'
    else:
        raise ValueError(f"Unexpected y of {row.y}")
       
    sk_model, fit_params = create_automl_model(
        model_name, 
        train_time=30, # model_def['train_time'],
        per_run_time_limit=2, # model_def['per_run_time'],
        seed=1,
        overwrite=True,
        pca_components=pca_components,
    )
    print("Training model...")
    sk_model.fit(X_train, y_train, **fit_params)
    error_report(sk_model, X_test, y_test, 
                 full_model_name + f": model_cols={model_def['model_cols']}")
    
    print(f"Exporting model to {full_model_name}.onnx")
    df_types = get_df_types(X_train)
    display(df_types)
    onnx_model = convert_sklearn(sk_model, model_name, 
                                 df_types,
                                 final_types=[('variable1',FloatTensorType([1,1]))])
    with open(full_model_name + ".onnx", "wb") as f:
        f.write(onnx_model.SerializeToString())

        
for _, row in df.iterrows():
    func(row.Sport, row.Service, row.Style, row.Type, row.ModelType, row.Params)
    break
    
    
print("Done!")

Exporting model 'nhl_draftkings_classic_<class 'fantasy_py.lineup.strategy.bet_lineup.FiftyFifty'>_automl-pca'
loading filename='nhl-draftkings-CLASSIC-FIFTY_FIFTY.csv'
127 rows of data loaded
dropping 4 rows due to nan_slate_rows=2 nan_best_score_rows=2
Training model...
nhl_draftkings_classic_<class 'fantasy_py.lineup.strategy.bet_lineup.FiftyFifty'>_automl-pca: model_cols=None


AttributeError: 'PCA' object has no attribute 'predict'