# Advanced usage of surrogates

Here, we present an advanced use case of a data-driven problem.
There are four objectives in total: three are expensive to evaluate, and one is cheap.
Each of the three expensive objectives is approximated by its own surrogate model.
However, each of these objectives comes with a different dataset, so every surrogate is trained on different data.
The cheap objective is evaluated using an analytical function.

In [27]:
# Importing necessary modules

from pathlib import Path
from warnings import filterwarnings

import numpy as np
import plotly.express as ex
import polars as pl
from joblib import dump
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor

from desdeo.emo.hooks.archivers import NonDominatedArchive
from desdeo.emo.methods.EAs import nsga3
from desdeo.problem import Objective, ObjectiveTypeEnum, Problem, Variable

# Ignore all warnings from this point on (the "ignore" action is installed with no
# category or module restriction) -- presumably to keep the notebook output readable.
filterwarnings("ignore")


In [28]:
# Import the three datasets and calculate the bounds for the decision variables.
# The bounds are the intersection of the per-dataset ranges: for every variable the
# lower bound is the largest of the three minima and the upper bound is the smallest
# of the three maxima.
root = Path.cwd().parent.parent
data_dir = root / "datasets" / "MetallApplication"
ysdata = pl.read_csv(data_dir / "ysdata.csv", infer_schema_length=10000)
utsdata = pl.read_csv(data_dir / "utsdata.csv", infer_schema_length=10000)
elondata = pl.read_csv(data_dir / "elondata.csv", infer_schema_length=10000)

# Columns used as decision variables (inputs of the surrogate models).
use_cols = ["C", "Si", "Mn", "P", "S", "Mo", "Ni", "Al", "N", "Nb", "V", "B", "Ti", "Cr", "Ce", "Cu", "Zr"]

# Summarise every dataset exactly once; both bound computations read from these summaries.
summaries = [frame.describe() for frame in (ysdata, utsdata, elondata)]

# Element-wise maximum over the "min" rows of the three summaries.
lower_bounds = np.max(
    [summary.filter(pl.col("statistic") == "min")[use_cols] for summary in summaries],
    axis=0,
)

# Map variable name -> lower bound; strict=True fails loudly if the lengths ever disagree.
lower_bounds = dict(zip(use_cols, lower_bounds.flatten(), strict=True))

# Element-wise minimum over the "max" rows of the three summaries.
upper_bounds = np.min(
    [summary.filter(pl.col("statistic") == "max")[use_cols] for summary in summaries],
    axis=0,
)

# Map variable name -> upper bound.
upper_bounds = dict(zip(use_cols, upper_bounds.flatten(), strict=True))

In [29]:
# Train one surrogate per expensive objective and save each fitted model to disk.

# Create the output directory once, before the loop. exist_ok=True makes re-running
# the cell safe and avoids a separate check-then-create step.
model_dir = root / "surrogatemodels"
model_dir.mkdir(exist_ok=True)

# Each objective is paired with its own dataset and its own regressor class;
# strict=True guarantees that the three tuples have the same length.
for data, obj, technique in zip(
    (ysdata, utsdata, elondata),
    ("YS", "UTS", "ELON"),
    (ExtraTreesRegressor, GradientBoostingRegressor, ExtraTreesRegressor),
    strict=True):
    X = data[use_cols]  # decision variables
    y = data[obj]  # target column named after the objective

    model = technique(n_estimators=100)
    model.fit(X, y)

    # The file name matches the objective symbol, e.g. "YS.joblib".
    dump(model, model_dir / f"{obj}.joblib")

In [30]:
# Build the optimization problem.
# The three surrogate objectives (YS, UTS, ELON) are maximized; the carbon equivalent is minimized.

# One real-valued decision variable per input column, bounded by the ranges computed from the data.
Variables = [
    Variable(
        name=element,
        symbol=element,
        lowerbound=lower_bounds[element],
        upperbound=upper_bounds[element],
        variable_type="real",
    )
    for element in use_cols
]

# Analytical expression of the cheap objective, written in terms of the variable symbols.
carbon_eqv = "C + Mn/6 + (Cr + Mo + V)/5 + (Ni + Cu)/15"

Objectives = [
    # Expensive objectives: each one points at the surrogate model file saved under its own symbol.
    *(
        Objective(
            name=target,
            symbol=target,
            maximize=True,
            surrogates=[root / "surrogatemodels" / f"{target}.joblib"],
            objective_type=ObjectiveTypeEnum.surrogate,
        )
        for target in ("YS", "UTS", "ELON")
    ),
    # Cheap objective: evaluated directly from the expression above.
    Objective(
        name="Carbon Equivalent",
        symbol="CE",
        maximize=False,
        func=carbon_eqv,
    ),
]

problem = Problem(
    name="Metallurgical Application",
    description="A problem from the metallurgical domain.",
    variables=Variables,
    objectives=Objectives,
    # Empty lists are passed explicitly; the original author marked this line "Bug in code."
    # NOTE(review): presumably a workaround for an issue when these fields are omitted -- confirm.
    constraints=[],
    extra_funcs=[],
)

In [31]:
# Set up NSGA-III for the problem and attach a non-dominated archive to its publisher
solver, pub = nsga3(problem=problem)
archive = NonDominatedArchive(problem=problem, publisher=pub)
pub.auto_subscribe(archive)

# Run the optimization
results = solver()

# Plot the archived solutions as parallel coordinates over the four objective columns
objective_columns = ["CE", "YS", "UTS", "ELON"]
archived = archive.archive.to_pandas()
fig = ex.parallel_coordinates(archived[objective_columns])
fig.show()