In [1]:
import gym


#定义环境
class MyWrapper(gym.Wrapper):

    def __init__(self):
        env = gym.make('CartPole-v1')
        super().__init__(env)
        self.env = env

    def reset(self):
        state, _ = self.env.reset()
        return state

    def step(self, action):
        state, reward, done, _, info = self.env.step(action)
        return state, reward, done, info


MyWrapper().reset()

array([-0.02186339, -0.00224868, -0.04336443, -0.00508288], dtype=float32)

In [2]:
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.monitor import Monitor

#创建训练环境和测试环境
env_train = make_vec_env(MyWrapper, n_envs=4)
env_test = Monitor(MyWrapper())

env_train, env_test

(<stable_baselines3.common.vec_env.dummy_vec_env.DummyVecEnv at 0x7f072dfdb340>,
 <Monitor<MyWrapper<TimeLimit<OrderEnforcing<PassiveEnvChecker<CartPoleEnv<CartPole-v1>>>>>>>)

In [3]:
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy


#测试超参数
def test_params(params):
    #定义一个模型
    model = PPO(
        policy='MlpPolicy',
        env=env_train,
        n_steps=1024,
        batch_size=64,
        #取超参数
        n_epochs=params['n_epochs'],
        #取超参数
        gamma=params['gamma'],
        gae_lambda=0.98,
        ent_coef=0.01,
        verbose=0,
    )

    #训练
    #取超参数
    model.learn(total_timesteps=params['total_timesteps'], progress_bar=True)

    #测试
    mean_reward, std_reward = evaluate_policy(model,
                                              env_test,
                                              n_eval_episodes=50,
                                              deterministic=True)

    #最终的分数就是简单的求差,这也是study要优化的数
    score = mean_reward - std_reward

    return score


test_params({'n_epochs': 2, 'gamma': 0.99, 'total_timesteps': 500})

Output()

71.1597838528824

In [4]:
import optuna
from optuna.samplers import TPESampler

#定义一个超参数学习器
study = optuna.create_study(sampler=TPESampler(),
                            study_name='PPO-LunarLander-v2',
                            direction='maximize')


#求最优超参数
def f(trial):
    #定义要找的超参数,并设置上下限
    params = {
        'n_epochs': trial.suggest_int('n_epochs', 3, 5),
        'gamma': trial.suggest_uniform('gamma', 0.99, 0.9999),
        'total_timesteps': trial.suggest_int('total_timesteps', 500, 2000),
    }

    #测试超参数
    return test_params(params)


study.optimize(f, n_trials=5)

#输出最佳分数和超参数
study.best_trial.values, study.best_trial.params

[32m[I 2023-01-17 17:10:12,768][0m A new study created in memory with name: PPO-LunarLander-v2[0m


Output()

  'gamma': trial.suggest_uniform('gamma', 0.99, 0.9999),


[32m[I 2023-01-17 17:10:26,030][0m Trial 0 finished with value: 102.40188282816155 and parameters: {'n_epochs': 5, 'gamma': 0.9963042134418639, 'total_timesteps': 580}. Best is trial 0 with value: 102.40188282816155.[0m


Output()

[32m[I 2023-01-17 17:10:39,938][0m Trial 1 finished with value: 76.35171294486264 and parameters: {'n_epochs': 3, 'gamma': 0.9901653456845795, 'total_timesteps': 1082}. Best is trial 0 with value: 102.40188282816155.[0m


Output()

[32m[I 2023-01-17 17:10:53,935][0m Trial 2 finished with value: 100.10383872771487 and parameters: {'n_epochs': 5, 'gamma': 0.9953326653547164, 'total_timesteps': 1338}. Best is trial 0 with value: 102.40188282816155.[0m


Output()

[32m[I 2023-01-17 17:11:00,310][0m Trial 3 finished with value: 88.37066386696605 and parameters: {'n_epochs': 3, 'gamma': 0.9988387230407516, 'total_timesteps': 1090}. Best is trial 0 with value: 102.40188282816155.[0m


Output()

[32m[I 2023-01-17 17:11:09,927][0m Trial 4 finished with value: 29.38088676826756 and parameters: {'n_epochs': 3, 'gamma': 0.9970100515891315, 'total_timesteps': 780}. Best is trial 0 with value: 102.40188282816155.[0m


([102.40188282816155],
 {'n_epochs': 5, 'gamma': 0.9963042134418639, 'total_timesteps': 580})

In [5]:
#用最优超参数训练一个模型
test_params(study.best_trial.params)

Output()

133.760702749365