# Generate Cost Prediction Model
Use TPOT (an AutoML library) to automatically search for a model that predicts cost.

In [None]:
# load data and create train/test data sets
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Seed passed to train_test_split here and reused for the TPOT search below,
# so repeated runs produce the same split.
RANDOM_STATE = 0

# The cost archive is located under the directory named by the
# FANTASY_ARCHIVE_BASE environment variable (KeyError if it is not set).
COST_FILENAME = os.path.join(
    os.environ["FANTASY_ARCHIVE_BASE"],
    "lol",
    "lol-cost.csv.gz",
)

# Keep only the model inputs plus the target, and drop any row that is
# missing one of them.
data_df = pd.read_csv(COST_FILENAME)[
    ["pos", "cost", "service", "pts-1", "pts-mean-5", "pts-mean-std"]
].dropna()

# Features are every retained column except the target ("cost").
X = data_df[["pos", "service", "pts-1", "pts-mean-5", "pts-mean-std"]]
target = data_df["cost"]

# train_test_split returns (X_train, X_test, y_train, y_test) in that order.
X_train, X_test, target_train, target_test = train_test_split(
    X,
    target,
    random_state=RANDOM_STATE,
)
display(X_train, X_test, target_train, target_test)

In [None]:
# create model and pipeline
from sklearn.pipeline import Pipeline
from tpot import TPOTRegressor
from tpot.config import regressor_config_dict
from sklearn2pmml.tpot import make_pmml_config
from sklearn.preprocessing import OneHotEncoder
from sklearn2pmml.decoration import ContinuousDomain, CategoricalDomain
from sklearn_pandas import DataFrameMapper

# Keyword arguments for TPOTRegressor: search budget, parallelism and seed.
model_kwargs = dict(
    population_size=100,
    n_jobs=6,
    verbosity=2,
    max_time_mins=90,  # minutes
    generations=100,
    early_stop=15,
    template="Selector-Transformer-Regressor",
    random_state=RANDOM_STATE,
    # presumably limits the search space to operators sklearn2pmml can export
    # -- confirm against the sklearn2pmml docs
    config_dict=make_pmml_config(regressor_config_dict.copy()),
)


def _categorical_steps():
    """Return a fresh [CategoricalDomain, OneHotEncoder] pair for one column."""
    return [CategoricalDomain(), OneHotEncoder(categories="auto")]


# Column -> transformer mapping consumed by DataFrameMapper: the numeric
# columns share one ContinuousDomain, each categorical column is one-hot
# encoded with its own transformer instances.
domains = [
    (
        ["pts-1", "pts-mean-5", "pts-mean-std"],
        ContinuousDomain(invalid_value_treatment="as_is"),
    ),
    *(([column], _categorical_steps()) for column in ("service", "pos")),
]

# preprocessing stage of pipeline
mapper = DataFrameMapper(domains)
feature_eng_pipeline = Pipeline([("mapper", mapper)])

model = TPOTRegressor(**model_kwargs)

In [None]:
# train model
# Fit the preprocessing pipeline on the training split only; the test split
# is transformed with the already-fitted pipeline.
train_features = feature_eng_pipeline.fit_transform(X_train)
X_train_transformed = train_features.astype(float)

test_features = feature_eng_pipeline.transform(X_test)
X_test_transformed = test_features.astype(float)

# Run the TPOT search on the float-cast training features.
model.fit(X_train_transformed, target_train)

In [None]:
# post processing steps to finalize/serialize model
from sklearn2pmml import make_pmml_pipeline, sklearn2pmml

# Output path of the serialized PMML model (relative to the working directory).
MODEL_FILEPATH = "lol-cost.pmml"

# Concatenate the preprocessing steps with the steps of the pipeline TPOT
# fitted (`fitted_pipeline_`) so the exported model accepts the raw columns.
combined_steps = [*feature_eng_pipeline.steps, *model.fitted_pipeline_.steps]
final_pipeline = Pipeline(combined_steps)

# Name the input and target fields after the training data's columns.
final_model = make_pmml_pipeline(
    final_pipeline,
    active_fields=X_train.columns,
    target_fields=[target_train.name],
)
sklearn2pmml(final_model, MODEL_FILEPATH, with_repr=True)

In [None]:
from math import sqrt

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from pypmml import Model as PmmlModel

# evaluate against test data
def error_report(
    model,
    X_test,
    y_test,
    y_fallback: "float | None" = None,
    desc: "str | None" = None,
    show_results=True,
) -> "tuple[dict | None, pd.Series | np.ndarray]":
    """
    Score a model against test data and optionally print/plot the results.

    Args:
        model: any object with a ``predict(X_test)`` method. If ``predict``
            returns a DataFrame, its first column is used as the predictions.
        X_test: features passed unchanged to ``model.predict``.
        y_test: true target values. Must be a pandas Series or numpy array
            when ``show_results`` is true.
        y_fallback: if not None, NaN predictions are replaced with this value
            before scoring. If None, NaN predictions are left as-is.
        desc: label used in printed messages and the figure title. Defaults
            to "unknown model" when not given.
        show_results: when true, print the scores and a truth/prediction/error
            table, and draw truth-vs-prediction and truth-vs-error scatter
            plots.

    Returns:
        ``(result, predictions)``. ``result`` is a dict with keys "R2",
        "RMSE" and "MAE" (each rounded to 4 decimals), or None if sklearn
        raised ValueError while computing the metrics.
    """
    # `import sklearn` alone does not guarantee the metrics submodule is
    # loaded, so import it explicitly before use.
    import sklearn.metrics

    label = desc or "unknown model"
    predictions = model.predict(X_test)

    if isinstance(predictions, pd.DataFrame):
        predictions = predictions[predictions.columns[0]]
    if y_fallback is not None and (nans := pd.isna(predictions)).any():
        nan_count = np.count_nonzero(nans)
        print(
            f"Predictions for {label} contain NaN. Filling {nan_count} of "
            f"{len(predictions)} predictions with fallback value {y_fallback:f}."
        )
        if isinstance(predictions, pd.Series):
            predictions = predictions.fillna(y_fallback)
        else:
            # ndarray (or other array-like) predictions have no fillna()
            predictions = np.where(nans, y_fallback, predictions)

    try:
        r2 = round(sklearn.metrics.r2_score(y_test, predictions), 4)
        rmse = round(sqrt(sklearn.metrics.mean_squared_error(y_test, predictions)), 4)
        # MAE is reported directly; only RMSE takes a square root
        mae = round(sklearn.metrics.mean_absolute_error(y_test, predictions), 4)
    except ValueError as e:
        print(f"Error calculating error metrics for {label}: {e}")
        return None, predictions

    result = {
        "R2": r2,
        "RMSE": rmse,
        "MAE": mae,
    }
    if show_results:
        print(f"**** Error Report for {label} ****: {result}")
        assert isinstance(predictions, (pd.Series, np.ndarray))
        assert isinstance(y_test, (pd.Series, np.ndarray))

        # Align truth and predictions positionally: both become Series with a
        # fresh 0..n-1 index so concat pairs them row by row.
        truth = pd.Series(y_test).reset_index(drop=True)
        pred = pd.Series(predictions).reset_index(drop=True)

        plot_data = pd.concat([truth, pred], axis=1)
        plot_data.columns = ["truth", "prediction"]
        plot_data["error"] = plot_data.prediction - plot_data.truth
        print(plot_data)

        fig, axs = plt.subplots(1, 2, figsize=(10, 5))
        fig.suptitle(f"{label} : {r2=} {rmse=} {mae=}")
        for ax in axs:
            ax.axis("equal")

        min_v = min(plot_data.truth.min(), plot_data.prediction.min())
        max_v = max(plot_data.truth.max(), plot_data.prediction.max())

        # left: truth vs prediction, with the y = x "perfect prediction" line
        axs[0].plot((min_v, max_v), (min_v, max_v), "-g", linewidth=1)
        plot_data.plot(kind="scatter", x="truth", y="prediction", ax=axs[0])

        # right: truth vs error, with the zero-error line
        axs[1].yaxis.set_label_position("right")
        axs[1].plot((min_v, max_v), (0, 0), "-g", linewidth=1)
        plot_data.plot(kind="scatter", x="truth", y="error", ax=axs[1])

    return result, predictions


# Reload the exported PMML file and score it on the held-out test split.
# The call is left as a bare expression so the notebook displays its result.
loaded_model = PmmlModel.load(MODEL_FILEPATH)
error_report(model=loaded_model, X_test=X_test, y_test=target_test)