# Optimization and Simulation with DR

## optunaによる単目的関数の最適化

### 1. ライブラリ類のインポートとファイルのロード

In [None]:
%matplotlib inline

import os
import copy
import warnings

import numpy as np
import pandas as pd
from pandas import DataFrame

import matplotlib.pyplot as plt
import datarobot as dr
from datarobot_predict.deployment import predict

import optuna

import japanize_matplotlib

# Outside a DataRobot Notebook (the DATAROBOT_NOTEBOOK_IMAGE variable is unset
# or empty), pull settings from a local .env file, overriding any values that
# are already present in the environment.
if os.getenv("DATAROBOT_NOTEBOOK_IMAGE", "") in (None, ""):
    print("not running in DataRobot Notebook")
    from dotenv import load_dotenv
    load_dotenv("../.env", override=True)

# NOTE(review): presumably the client reads its endpoint/token from the
# environment or a config file, since no arguments are passed — confirm.
client = dr.Client()

# Fixed seed, reused by the seeded samplers later in this notebook.
seed = 71
np.random.seed(seed)

# Silence every warning and matplotlib's "too many open figures" warning.
warnings.filterwarnings("ignore")
plt.rcParams["figure.max_open_warning"] = 0

# The return value is discarded here; the call is kept as in the original cell.
optuna.visualization.is_available()

# Widen the pandas display limits so large frames are shown in full.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000
pd.options.display.width = 1000
pd.options.display.max_colwidth = 1000
pd.options.display.precision = 8

In [None]:
# Names of the two target columns in the dataset.
targets = ["降伏強度", "引張強度"]

# Load the dataset from a path relative to the notebook's working directory.
df = pd.read_csv("../data/opt_steel_strength.csv")

In [None]:
# One series per target column.
y_train1, y_train2 = df["降伏強度"], df["引張強度"]
# Everything except the ID column and the two targets is a feature.
X_train = df.drop(columns=["ID", "降伏強度", "引張強度"])

### 2. 予測APIを用いる上での必要な情報を準備しておきます。

In [None]:
# Everything needed to call the prediction API is set up here.
dr.Client()

# IDs of the point-prediction deployments and of the quantile deployments
# (the latter are used later in this notebook for the "p25"/"p75" estimates).
deployment_ids = ["67bc6ab2f1b3f6b73d560ff8", "67bc6ab999dbbff523b1152a"]
deployment_quant_ids = ["67bc6ad2fd8c6942c3561092", "67bc6ad599dbbff523b11589"]

# Resolve each ID to its Deployment object, preserving the list order.
deployments = list(map(dr.Deployment.get, deployment_ids))
deployments_quant = list(map(dr.Deployment.get, deployment_quant_ids))

# Budget passed to study.optimize(timeout=...); the search could instead be
# bounded by the number of trials (n_trials).
timeout = 15

# Container for collected results.
result = []

### 3. `datarobot-predict`を使って予測APIを叩く

In [None]:
# Smoke test: send the first row of X_train to the first deployment once.
# The second element of the returned pair is discarded.
predictions, _ = predict(deployments[0], X_train.iloc[:1])
predictions

In [None]:
# Pull the predicted value out of the returned frame
# (row label 0, column "降伏強度_PREDICTION").
predictions.loc[0, "降伏強度_PREDICTION"]

In [None]:
# Helper for recording the outcome of a study.
def update_result(result, study, sampler, pruner, n_jobs):
    """Append a summary of the study's best trial to ``result``.

    Args:
        result: List that collects one summary dict per call; mutated in place.
        study: Object exposing ``best_trial`` (with ``value`` and ``params``)
            and ``trials``.
        sampler: Label stored under the ``"sampler"`` key.
        pruner: Label stored under the ``"pruner"`` key.
        n_jobs: Value stored under the ``"n_jobs"`` key.

    Returns:
        The same ``result`` list, after the new summary has been appended.
    """
    summary = dict(sampler=sampler, pruner=pruner, n_jobs=n_jobs)
    summary["値"] = study.best_trial.value
    summary["回数"] = len(study.trials)
    # Best-trial parameters go last; a parameter whose name matches one of the
    # keys above replaces that value.
    summary.update(study.best_trial.params)

    result.append(summary)
    print(summary)
    return result

### 4. 最適化する対象を設定します。この関数の戻り値が最大化されるように（`direction="maximize"`）、optunaが探索空間からいい感じの入力データを見つけてくれます。

In [None]:
# Standard deviation of the first target (not referenced by the cells visible here).
std_1 = y_train1.std()
# Per-feature summary statistics, one row per feature column.
stat = X_train.describe().T


def objective(trial):
    """Suggest one candidate row and return its predicted "降伏強度".

    Every column of ``X_train`` is sampled between 0.8 x its observed minimum
    and 1.2 x its observed maximum, in steps of 0.01. The resulting single-row
    frame is scored by the first deployment.
    """
    suggested = {}
    for col in X_train.columns:
        lower = stat.loc[col, "min"] * 0.8
        upper = stat.loc[col, "max"] * 1.2
        suggested[col] = trial.suggest_float(col, lower, upper, step=0.01)

    # Single-row frame with row label 0, columns in X_train's order.
    df_target = DataFrame(suggested, index=[0])

    scored = predict(deployments[0], df_target)[0]
    return scored.loc[0, "降伏強度_PREDICTION"]

### 5. 最適化を実行する

In [None]:
%%time
# NOTE(review): the client is instantiated again here; presumably this
# refreshes the global DataRobot client before the timed run — confirm.
dr.Client()
# No sampler is passed, so Optuna's default sampler is used; the objective's
# return value is maximized.
study = optuna.create_study(direction="maximize")
# Run trials under the `timeout` budget, collecting garbage after each trial.
study.optimize(objective, timeout=timeout, gc_after_trial=True)

In [None]:
# Keep a copy of this study's trials; it is reused later as CMA-ES source trials.
trails_bk = [copy.deepcopy(finished_trial) for finished_trial in study.trials]
# Best trial: parameters, objective value, trial number (one per line).
best_trial = study.best_trial
print(best_trial.params, best_trial.value, best_trial.number, sep="\n")

In [None]:
# Show which sampler class the study is using.
print(f"Sampler is {type(study.sampler).__name__}")

In [None]:
# Inspect the optimization history.
# optuna.visualization.plot_optimization_history(study)


def custom_plot_optimization_history(study):
    """Plot each trial's objective value and the best value so far.

    A matplotlib replacement for ``optuna.visualization.plot_optimization_history``.
    The "best so far" line follows the study's direction: running minimum for a
    minimizing study, running maximum otherwise.

    Args:
        study: A single-objective Optuna study with at least one trial.
    """
    # Extract trial numbers and objective values.
    trials = study.trials_dataframe()
    trial_numbers = trials["number"]
    values = trials["value"]

    # "Best" depends on the direction; a hard-coded cummax() would draw the
    # worst-so-far line for a minimizing study.
    if study.direction == optuna.study.StudyDirection.MINIMIZE:
        best_values = values.cummin()
    else:
        best_values = values.cummax()

    # Scatter of the individual trial values.
    plt.figure(figsize=(10, 6))
    plt.scatter(trial_numbers, values, label="Trial Values", color="blue")

    # Running best value.
    plt.plot(
        trial_numbers,
        best_values,
        color="red",
        linestyle="--",
        label="Best Value Until Each Trial",
    )

    # Labels, title, legend and grid.
    plt.xlabel("Trial Number")
    plt.ylabel("Value")
    plt.title("Optimization History")
    plt.legend()
    plt.grid(True)
    plt.show()


# Example usage
custom_plot_optimization_history(study)

### 6. `Sampler`を変更してみる

CMA-ES (一旦時間を倍にする)

In [None]:
%%time
# NOTE(review): client re-instantiated before the timed run — confirm it is needed.
dr.Client()
# Same maximization problem, now with the CMA-ES sampler (created without a seed).
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.CmaEsSampler())
# Twice the time budget of the first run.
study.optimize(objective, timeout=timeout*2, gc_after_trial=True)

In [None]:
# Best trial: parameters, objective value, trial number (one per line).
best_trial = study.best_trial
print(best_trial.params, best_trial.value, best_trial.number, sep="\n")

In [None]:
# Inspect the optimization history of the current study.
# optuna.visualization.plot_optimization_history(study)
custom_plot_optimization_history(study)

TPE Samplerの結果をベースに（`source_trials`によるウォームスタートで）CMA-ESで最適化した場合

In [None]:
%%time
# Feed the trials saved earlier (trails_bk, the TPE run per the original note)
# to CMA-ES as already-known results via source_trials.
sampler = optuna.samplers.CmaEsSampler(seed=seed, source_trials=trails_bk)
study = optuna.create_study(direction="maximize", sampler=sampler)
# Same time budget as the first run.
study.optimize(objective, timeout=timeout, gc_after_trial=True)

In [None]:
# Inspect the optimization history of the current study.
# optuna.visualization.plot_optimization_history(study)
custom_plot_optimization_history(study)

In [None]:
# See which variables matter most.
# optuna.visualization.plot_param_importances(study)
def custom_plot_param_importances(study):
    """Draw a horizontal bar chart of the study's parameter importances.

    Importances come from ``optuna.importance.get_param_importances``; bars are
    drawn in the order that mapping yields them, first entry at the top.
    """
    importances = optuna.importance.get_param_importances(study)
    names = list(importances)
    scores = [importances[name] for name in names]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(names, scores, color="skyblue")
    ax.set_xlabel("Importance")
    ax.set_title("Hyperparameter Importances")
    # Flip the y axis so the first entry is drawn at the top.
    ax.invert_yaxis()
    ax.grid(True)
    plt.show()


# Example usage
custom_plot_param_importances(study)

In [None]:
# 上位３変数とターゲットの関係を可視化してみる
# optuna.visualization.plot_parallel_coordinate(
#     study, params=["ニオブ", "チタン", "コバルト"]
# )

In [None]:
# Visualize how two of the parameters moved over the course of the search.
# https://qiita.com/maskot1977/items/ed698a67b091b5516ab4
x = "ニオブ"
y = "チタン"
fig, ax = plt.subplots(figsize=(5, 5))

# Fetch the trial list once and reuse it for every series below.
trial_history = study.trials
first_trial, last_trial = trial_history[0], trial_history[-1]

# Path through parameter space, in trial order.
ax.plot(
    [past.params[x] for past in trial_history],
    [past.params[y] for past in trial_history],
    alpha=0.4,
    marker="x",
)

# Mark the first trial, the last trial and the best parameters.
ax.scatter(first_trial.params[x], first_trial.params[y], marker=">", label="start", s=100)
ax.scatter(last_trial.params[x], last_trial.params[y], marker="s", label="end", s=100)
ax.scatter(study.best_params[x], study.best_params[y], marker="o", label="best", s=100)

# Equal scaling on both axes.
ax.axis("equal")
ax.set_xlabel(x)
ax.set_ylabel(y)
fig.tight_layout()
ax.grid()
ax.legend()
plt.show()

### 7. `Pruner`も変更してみる

In [None]:
# The objective now reports intermediate values so the pruner can decide
# whether to stop a trial early.
n_train_iter = 10


def objective(trial):
    """Suggest one candidate row and score it, reporting progress for pruning.

    Every column of ``X_train`` is sampled between 0.8 x its observed minimum
    and 1.2 x its observed maximum, in steps of 0.01. The same row is scored
    by the first deployment ``n_train_iter`` times; each prediction is reported
    as the intermediate value for that step.

    Raises:
        optuna.TrialPruned: when the pruner asks to stop after a reported step.
    """
    suggested = {
        col: trial.suggest_float(
            col,
            stat.loc[col, "min"] * 0.8,
            stat.loc[col, "max"] * 1.2,
            step=0.01,
        )
        for col in X_train.columns
    }
    # Single-row frame with row label 0, columns in X_train's order.
    df_target = DataFrame(suggested, index=[0])

    for step in range(n_train_iter):
        pred_1 = predict(deployments[0], df_target)[0].loc[0, "降伏強度_PREDICTION"]

        # Report the current prediction, then let the pruner decide.
        trial.report(pred_1, step)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return pred_1

In [None]:
%%time
# Each trial takes longer because it reports intermediate values for pruning,
# so fewer trials fit in the time budget and the benefit is harder to see.
dr.Client()
# Seeded TPE sampler combined with a Hyperband pruner whose resource range
# spans 1..n_train_iter reported steps.
sampler = optuna.samplers.TPESampler(seed=seed)
pruner = optuna.pruners.HyperbandPruner(min_resource=1, max_resource=n_train_iter)
study = optuna.create_study(direction="maximize",sampler=sampler, pruner=pruner)
study.optimize(objective, timeout=timeout, gc_after_trial=True)

In [None]:
%%time
# Raise the parallelism (n_jobs=2) to fit more trials into the same time budget.
dr.Client()
# Same seeded TPE sampler and Hyperband pruner as the previous run.
sampler = optuna.samplers.TPESampler(seed=seed)
pruner = optuna.pruners.HyperbandPruner(min_resource=1, max_resource=n_train_iter)
study = optuna.create_study(direction="maximize",sampler=sampler, pruner=pruner)
study.optimize(objective, timeout=timeout, gc_after_trial=True, n_jobs=2)

In [None]:
# Best trial: parameters, objective value, trial number (one per line).
best_trial = study.best_trial
print(best_trial.params, best_trial.value, best_trial.number, sep="\n")

In [None]:
# Inspect the optimization history of the current study.
# optuna.visualization.plot_optimization_history(study)
custom_plot_optimization_history(study)

### 8. 不確実性を確認

In [None]:
# Run the quantile predictions alongside the optimization.


def objective(trial):
    """Suggest one candidate row, score it, and attach quantile estimates.

    Every column of ``X_train`` is sampled between 0.8 x its observed minimum
    and 1.2 x its observed maximum, in steps of 0.01. The row is scored by the
    first point-prediction deployment (the returned objective) and by the two
    quantile deployments, whose outputs are stored on the trial as the user
    attributes ``"p25"`` and ``"p75"``.
    """
    suggested = {}
    for col in X_train.columns:
        lower = stat.loc[col, "min"] * 0.8
        upper = stat.loc[col, "max"] * 1.2
        suggested[col] = trial.suggest_float(col, lower, upper, step=0.01)
    # Single-row frame with row label 0, columns in X_train's order.
    df_target = DataFrame(suggested, index=[0])

    def _score(deployment):
        # Every deployment used here exposes its output in the same column.
        return predict(deployment, df_target)[0].loc[0, "降伏強度_PREDICTION"]

    pred_2 = _score(deployments[0])
    pred_p25 = _score(deployments_quant[0])
    pred_p75 = _score(deployments_quant[1])

    trial.set_user_attr("p25", pred_p25)
    trial.set_user_attr("p75", pred_p75)

    return pred_2

In [None]:
%%time
# NOTE(review): client re-instantiated before the timed run — confirm it is needed.
dr.Client()
# Seeded TPE sampler; no pruner is passed for this run.
sampler = optuna.samplers.TPESampler(seed=seed)
study = optuna.create_study(sampler=sampler,direction="maximize")
# Twice the base time budget; each trial makes three prediction calls.
study.optimize(objective, timeout=timeout*2, gc_after_trial=True)

In [None]:
# Collect all trials of the study into a DataFrame and preview the first rows.
df_trials = study.trials_dataframe()
df_trials.head()

In [None]:
import matplotlib.pyplot as plt

# "number" followed by every column whose name starts with "params_".
params_columns = [
    "number",
    *(name for name in df_trials.columns if name.startswith("params_")),
]


def plot_selected_column(column):
    """Plot the objective value and the p25-p75 band against one column.

    Trials are sorted by ``column``; the line shows the ``value`` column and the
    shaded band spans ``user_attrs_p25`` to ``user_attrs_p75``.

    Args:
        column: Name of a column in ``df_trials`` to use as the x axis.
    """
    ordered = df_trials.sort_values(column)
    x_values = ordered[column]

    fig, ax = plt.subplots()
    ax.plot(x_values, ordered["value"], label="予測")
    ax.fill_between(
        x_values,
        ordered["user_attrs_p25"],
        ordered["user_attrs_p75"],
        alpha=0.5,
        label="25%-75%",
    )
    ax.set_xlabel(column)
    ax.set_ylabel("Value")
    ax.legend()
    plt.show()


# Example usage: index 0 is "number", so index 4 selects the fourth
# "params_" column.
plot_selected_column(params_columns[4])

In [None]:
# This is the end of this code...