In [1]:
import gymnasium as gym


#定义环境
class MyWrapper(gym.Wrapper):

    def __init__(self):
        env = gym.make('CartPole-v1')
        super().__init__(env)
        self.env = env

    def reset(self,**kwargs):

        return self.env.reset(**kwargs)

    def step(self, action):
        state, reward, terminated, truncated, info = self.env.step(action)
        return state, reward, terminated, truncated, info


env = MyWrapper()

env.reset()

(array([0.03998401, 0.0491744 , 0.02577092, 0.02973928], dtype=float32), {})

In [2]:
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.monitor import Monitor

#创建训练环境和测试环境
env_train = make_vec_env(MyWrapper, n_envs=4)
env_test = Monitor(MyWrapper())

env_train, env_test

  from .autonotebook import tqdm as notebook_tqdm


(<stable_baselines3.common.vec_env.dummy_vec_env.DummyVecEnv at 0x246d973f430>,
 <Monitor<MyWrapper<TimeLimit<OrderEnforcing<PassiveEnvChecker<CartPoleEnv<CartPole-v1>>>>>>>)

In [3]:
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy


#测试超参数
def test_params(params):
    #定义一个模型
    model = PPO(
        policy='MlpPolicy',
        env=env_train,
        n_steps=1024,
        batch_size=64,
        #取超参数
        n_epochs=params['n_epochs'],
        #取超参数
        gamma=params['gamma'],
        gae_lambda=0.98,
        ent_coef=0.01,
        verbose=0,
    )

    #训练
    #取超参数
    model.learn(total_timesteps=params['total_timesteps'], progress_bar=True)

    #测试
    mean_reward, std_reward = evaluate_policy(model,
                                              env_test,
                                              n_eval_episodes=50,
                                              deterministic=True)

    #最终的分数就是简单的求差,这也是study要优化的数
    score = mean_reward - std_reward

    return score


test_params({'n_epochs': 2, 'gamma': 0.99, 'total_timesteps': 500})

70.68603480598406

In [5]:
pip install optuna

Collecting optuna
  Using cached optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Using cached colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting sqlalchemy>=1.3.0 (from optuna)
  Using cached SQLAlchemy-2.0.31-cp310-cp310-win_amd64.whl.metadata (9.9 kB)
Collecting PyYAML (from optuna)
  Using cached PyYAML-6.0.1-cp310-cp310-win_amd64.whl.metadata (2.1 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Using cached Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Collecting greenlet!=0.4.17 (from sqlalchemy>=1.3.0->optuna)
  Using cached greenlet-3.0.3-cp310-cp310-win_amd64.whl.metadata (3.9 kB)
Using cached optuna-3.6.1-py3-none-any.whl (380 kB)
Downloading alembic-1.13.2-py3-none-any.whl (232 kB)
   ---------------------------------------- 0.0/233.0 kB ? eta -:--:--
   --------------------------------- ------ 194.6/233.0 kB 3.9 MB/s eta 0:00

In [6]:
import optuna
from optuna.samplers import TPESampler

#定义一个超参数学习器
study = optuna.create_study(sampler=TPESampler(),
                            study_name='PPO-LunarLander-v2',
                            direction='maximize')


#求最优超参数
def f(trial):
    #定义要找的超参数,并设置上下限
    params = {
        'n_epochs': trial.suggest_int('n_epochs', 3, 5),
        'gamma': trial.suggest_uniform('gamma', 0.99, 0.9999),
        'total_timesteps': trial.suggest_int('total_timesteps', 500, 2000),
    }

    #测试超参数
    return test_params(params)


study.optimize(f, n_trials=5)

#输出最佳分数和超参数
study.best_trial.values, study.best_trial.params

[I 2024-07-01 11:31:37,972] A new study created in memory with name: PPO-LunarLander-v2


  'gamma': trial.suggest_uniform('gamma', 0.99, 0.9999),


[I 2024-07-01 11:31:45,410] Trial 0 finished with value: 68.82521186467373 and parameters: {'n_epochs': 4, 'gamma': 0.9926381149121989, 'total_timesteps': 1963}. Best is trial 0 with value: 68.82521186467373.


[I 2024-07-01 11:31:50,253] Trial 1 finished with value: 52.436948236435164 and parameters: {'n_epochs': 3, 'gamma': 0.9987168640972581, 'total_timesteps': 781}. Best is trial 0 with value: 68.82521186467373.


[I 2024-07-01 11:31:56,874] Trial 2 finished with value: 102.72596640660879 and parameters: {'n_epochs': 3, 'gamma': 0.9973739438959514, 'total_timesteps': 800}. Best is trial 2 with value: 102.72596640660879.


[I 2024-07-01 11:32:03,385] Trial 3 finished with value: 78.95993109393571 and parameters: {'n_epochs': 4, 'gamma': 0.9951302055348885, 'total_timesteps': 1497}. Best is trial 2 with value: 102.72596640660879.


[I 2024-07-01 11:32:09,269] Trial 4 finished with value: 76.33734502481651 and parameters: {'n_epochs': 4, 'gamma': 0.9953253384383769, 'total_timesteps': 837}. Best is trial 2 with value: 102.72596640660879.


([102.72596640660879],
 {'n_epochs': 3, 'gamma': 0.9973739438959514, 'total_timesteps': 800})

In [7]:
#用最优超参数训练一个模型
test_params(study.best_trial.params)

42.7395675499197