In [5]:

import gymnasium as gym

#自定义一个Wrapper
class Pendulum(gym.Wrapper):

    def __init__(self,env):

        super().__init__(env)

    def reset(self,**kwargs):

        return self.env.reset(**kwargs)

    def step(self, action):
        state, reward, terminated, truncated, info = self.env.step(action)
        return state, reward, terminated, truncated, info




env = gym.make('Pendulum-v1')

env = Pendulum(env)

env.reset()


(array([-0.93968606,  0.34203818,  0.8378868 ], dtype=float32), {})

In [6]:
#测试一个环境
def test(env, wrap_action_in_list=False):
    print(env)

    state = env.reset()
    over = False
    step = 0

    while not over:
        action = env.action_space.sample()

        if wrap_action_in_list:
            action = [action]

        next_state, reward, terminated, truncated, info = env.step(action)

        if step % 20 == 0:
            print(step, state, action, reward)

        if step > 200:
            break

        state = next_state
        step += 1


test(env)

<Pendulum<TimeLimit<OrderEnforcing<PassiveEnvChecker<PendulumEnv<Pendulum-v1>>>>>>
0 (array([ 0.7251202, -0.6886223,  0.6319107], dtype=float32), {}) [0.09473597] -0.6169130829046054
20 [ 0.06014106  0.99818987 -4.3009367 ] [-1.0366311] -4.132849996695271
40 [-0.94985515 -0.3126903   6.765589  ] [-1.76569] -12.552977268837097
60 [-0.9994354   0.03359844 -6.3827085 ] [1.9138559] -13.737148499817884
80 [-0.9995127  -0.03121652  6.5411453 ] [1.4109446] -13.955056970091478
100 [-0.10480989 -0.9944923  -4.628027  ] [1.1031991] -4.951383044248425
120 [0.18274213 0.98316085 2.563205  ] [0.23092084] -2.580883952906665
140 [ 0.53790003 -0.84300864 -1.1046811 ] [-1.2574584] -1.129325812540723
160 [ 0.53609085  0.8441603  -0.50069   ] [1.3799686] -1.0369920930097003
180 [ 0.41210884 -0.9111346   1.4649147 ] [1.7209471] -1.5309415195420417
200 [ 0.5269767   0.84987974 -3.844894  ] [-0.4150266] -2.510255523893626


In [10]:
#修改最大步数
class StepLimitWrapper(gym.Wrapper):

    def __init__(self, env):
        super().__init__(env)
        self.current_step = 0

    def reset(self):
        self.current_step = 0
        return self.env.reset()

    def step(self, action):
        self.current_step += 1
        state, reward, terminated, truncated, info = self.env.step(action)

        #修改done字段
        if self.current_step >= 100:
            done = True

        return state, reward, terminated, truncated, info


test(StepLimitWrapper(env))

<StepLimitWrapper<Pendulum<TimeLimit<OrderEnforcing<PassiveEnvChecker<PendulumEnv<Pendulum-v1>>>>>>>
0 (array([ 0.615082  ,  0.7884631 , -0.31675813], dtype=float32), {}) [-0.588578] -0.8354004206778554
20 [ 0.47905335 -0.8777858   2.9411707 ] [0.90504074] -2.0133806712601556
40 [-0.6984637   0.71564555 -7.1090546 ] [0.4106314] -10.548580158206635
60 [0.10262237 0.9947204  4.372786  ] [-0.6475478] -4.067548470342698
80 [ 0.6596029 -0.7516143 -1.0035911] [-0.80827516] -0.8247332677756085
100 [ 0.5947022  0.8039461 -1.1527305] [-0.38490862] -1.0051983453082127
120 [ 0.2186914 -0.9757941  4.011166 ] [0.8221919] -3.4329938541571146
140 [-0.72549313  0.6882294  -6.4105854 ] [0.18571568] -9.786125442605494
160 [-0.97686565  0.21385406  6.6089096 ] [-0.78359425] -12.93029208921303
180 [-0.78576165 -0.61852944 -6.066613  ] [-0.47249308] -9.804856105694457
200 [-0.3480743  0.937467   4.3219748] [-0.3518394] -5.578750330684813


In [21]:
import numpy as np


#修改动作空间
class NormalizeActionWrapper(gym.Wrapper):

    def __init__(self, env):
        #获取动作空间
        action_space = env.action_space

        #动作空间必须是连续值
        assert isinstance(action_space, gym.spaces.Box)

        #重新定义动作空间,在正负一之间的连续值
        #这里其实只影响env.action_space.sample的返回结果
        #实际在计算时,还是正负2之间计算的
        env.action_space = gym.spaces.Box(low=-1,
                                          high=1,
                                          shape=action_space.shape,
                                          dtype=np.float32)

        super().__init__(env)

    def reset(self):
        return self.env.reset()

    def step(self, action):
        #重新缩放动作的值域
        action = action * 2.0

        if action > 2.0:
            action = 2.0

        if action < -2.0:
            action = -2.0

        return self.env.step(action)

env = gym.make('Pendulum-v1')
test(NormalizeActionWrapper(env))

<NormalizeActionWrapper<TimeLimit<OrderEnforcing<PassiveEnvChecker<PendulumEnv<Pendulum-v1>>>>>>
0 (array([ 0.98162663, -0.19081193,  0.11409011], dtype=float32), {}) [0.50907546] -0.0391981431290179
20 [-0.7588267  0.6512926 -7.3505583] [0.32093418] -11.319595516205856
40 [-0.48303744  0.8755997   6.1656275 ] [-0.98626465] -8.110666487439769
60 [ 0.84920865 -0.5280574  -0.62930065] [0.19350258] -0.34923407237200926
80 [ 0.36137825  0.9324193  -2.5541186 ] [-0.39311224] -2.095493200804351
100 [-4.7682617e-03 -9.9998862e-01  5.2046952e+00] [-0.503002] -5.192301253861725
120 [-0.73664594 -0.6762786  -6.538787  ] [-0.8995183] -10.03350072532808
140 [0.3133714 0.9496306 3.1349335] [-0.95959073] -2.554106075833723
160 [ 0.6556962  -0.7550248  -0.20849022] [-0.9087635] -0.7398589737489942
180 [ 0.526781    0.85000104 -1.834008  ] [-0.51753384] -1.3696600233936655
200 [ 0.15338081 -0.98816717  3.634122  ] [-0.571055] -3.3293326184681664


In [23]:
from gymnasium.wrappers import TimeLimit


#修改状态
class StateStepWrapper(gym.Wrapper):

    def __init__(self, env):

        #状态空间必须是连续值
        assert isinstance(env.observation_space, gym.spaces.Box)

        #增加一个新状态字段
        low = np.concatenate([env.observation_space.low, [0.0]])
        high = np.concatenate([env.observation_space.high, [1.0]])

        env.observation_space = gym.spaces.Box(low=low,
                                               high=high,
                                               dtype=np.float32)

        super().__init__(env)

        self.step_current = 0

    def reset(self):
        self.step_current = 0
        return np.concatenate([self.env.reset(), [0.0]])

    def step(self, action):
        self.step_current += 1
        state, reward, done, info = self.env.step(action)

        #根据step_max修改done
        if self.step_current >= 100:
            done = True

        return self.get_state(state), reward, done, info

    def get_state(self, state):
        #添加一个新的state字段
        state_step = self.step_current / 100

        return np.concatenate([state, [state_step]])

env = gym.make('Pendulum-v1')
test(StateStepWrapper(env))

<StateStepWrapper<TimeLimit<OrderEnforcing<PassiveEnvChecker<PendulumEnv<Pendulum-v1>>>>>>


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part.

In [19]:
from stable_baselines3 import A2C
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv

#使用Monitor Wrapper,会在训练的过程中输出rollout/ep_len_mean和rollout/ep_rew_mean,就是增加些日志
#gym升级到0.26以后失效了,可能是因为使用了自定义的wapper

env = gym.make('Pendulum-v1')
env = DummyVecEnv([lambda: Monitor(env)])

A2C('MlpPolicy', env, verbose=1).learn(1000)

Using cuda device
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 200       |
|    ep_rew_mean        | -1.34e+03 |
| time/                 |           |
|    fps                | 449       |
|    iterations         | 100       |
|    time_elapsed       | 1         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -1.41     |
|    explained_variance | -0.11     |
|    learning_rate      | 0.0007    |
|    n_updates          | 99        |
|    policy_loss        | -13.6     |
|    std                | 0.987     |
|    value_loss         | 43.3      |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 200       |
|    ep_rew_mean        | -1.31e+03 |
| time/                 |           |
|    fps                | 473       |
|    iterations         | 200       |
|    time_elapsed       | 2     

<stable_baselines3.a2c.a2c.A2C at 0x20ea78e17e0>

In [56]:
from stable_baselines3.common.vec_env import VecNormalize, VecFrameStack

env = gym.make('Pendulum-v1')

# VecNormalize,他会对state和reward进行Normalize 
# Wrap the environment in DummyVecEnv
env = DummyVecEnv([lambda: env])
env = VecNormalize(env)

# To test the environment, you can reset and take a random action
obs = env.reset()
print("Initial observation:", obs)

action = [env.action_space.sample()]
obs, reward, done, info = env.step(action)
print("Observation after one step:", obs)
print("Reward:", reward)
print("Done:", done)
print("Info:", info)


Initial observation: [[ 0.00128163 -0.00704087 -0.00235103]]
Observation after one step: [[-0.9634776 -0.2636338 -0.9998802]]
Reward: [-10.]
Done: [False]
Info: [{'TimeLimit.truncated': False}]


In [60]:
from stable_baselines3.common.vec_env import VecNormalize, VecFrameStack


n_envs = 3
# Use a lambda function to create the environments
env = DummyVecEnv([lambda: gym.make('CartPole-v1') for _ in range(n_envs)])
# VecNormalize,他会对state和reward进行Normalize 
# Wrap the environment in DummyVecEnv

env = VecNormalize(env)

# To test the environment, you can reset and take a random action
obs = env.reset()
print("Initial observation:", obs)

action = [env.action_space.sample() for _ in range(n_envs)]
print("Action:",action)
obs, reward, done, info = env.step(action)
print("Observation after one step:", obs)
print("Reward:", reward)
print("Done:", done)
print("Info:", info)


Initial observation: [[ 0.05012321  1.3666942  -0.8533298   0.64414805]
 [-1.135614   -0.4585393  -0.48197085 -1.3396527 ]
 [ 1.0856429  -0.9081749   1.3351983   0.69545174]]
Action: [1, 1, 1]
Observation after one step: [[ 0.1146841   1.3794004  -0.86218894 -0.9493386 ]
 [-1.2199925   0.8087723  -0.52086323 -1.1722691 ]
 [ 1.0906862   0.6635519   1.3682022  -0.8542514 ]]
Reward: [10. 10. 10.]
Done: [False False False]
Info: [{'TimeLimit.truncated': False}, {'TimeLimit.truncated': False}, {'TimeLimit.truncated': False}]


In [50]:
import gymnasium as gym
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3 import PPO

# Define the number of environments
n_envs = 4

# Use a lambda function to create the environments
envs = DummyVecEnv([lambda: gym.make('CartPole-v1') for _ in range(n_envs)])
envs = VecNormalize(envs)


# Initialize the PPO model
model = PPO('MlpPolicy', envs, verbose=1)

# Train the model
model.learn(total_timesteps=10000)

# Test the trained model
obs = envs.reset()
for _ in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = envs.step(action)
    envs.render()

# Close the environments
envs.close()


Using cuda device
-----------------------------
| time/              |      |
|    fps             | 2926 |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 8192 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1533        |
|    iterations           | 2           |
|    time_elapsed         | 10          |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.015060211 |
|    clip_fraction        | 0.279       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.679      |
|    explained_variance   | -0.133      |
|    learning_rate        | 0.0003      |
|    loss                 | 0.121       |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0234     |
|    value_loss           | 0.325       |
-----------------------------------------


