# gym


In [1]:
import gym
import numpy as np

# Create the CartPole-v1 environment used by the exploratory cells below.
e = gym.make("CartPole-v1")

In [2]:
# Start a new episode; the displayed result is an (observation, info) pair.
e.reset()

(array([ 0.00070169, -0.04000503,  0.04951924, -0.04128506], dtype=float32),
 {})

## 行动空间

In [3]:
# Show the set of actions the agent may choose from.
e.action_space

Discrete(2)

## 观测空间

In [4]:
# Show the bounds, shape and dtype of the observations the environment emits.
e.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

## 采取行动

In [5]:
# Apply action 0 for one step; the displayed result is the 5-tuple
# (observation, reward, terminated, truncated, info).
e.step(0)

(array([-9.8411765e-05, -2.3580082e-01,  4.8693545e-02,  2.6660129e-01],
       dtype=float32),
 1.0,
 False,
 False,
 {})

## 一个训练过程

In [6]:
import gym

# Play one CartPole-v1 episode with uniformly random actions and report how
# many steps it lasted and how much reward was collected.
env = gym.make("CartPole-v1")
total_rewards = 0.0
total_steps = 0
# reset() returns an (observation, info) pair, so unpack it instead of
# binding the whole tuple to `obs`.
obs, info = env.reset()

while True:
    action = env.action_space.sample()
    # The sampled action is already valid for the action space; no need to
    # wrap it in a NumPy array.
    obs, reward, terminated, truncated, info = env.step(action)
    total_rewards += reward
    total_steps += 1
    # An episode ends either in a terminal state or when it is cut short
    # (truncated); stepping after either one is not valid.
    if terminated or truncated:
        break

# Release the environment's resources once the episode is over.
env.close()

print("Episode done in %d step， total reward %.2f" % (total_steps, total_rewards))

Episode done in 30 step， total reward 30.00


## 包装器

Wrapper类继承自Env类。它的构造函数只有一个参数，即要被“包装”的Env类的实例。为了附加额外的功能，需要重新定义想扩展的方法，例如step()或reset()。唯一的要求就是需要调用超类中的原始方法。


ObservationWrapper：需要重新定义父类的observation(obs)方法。obs参数是被包装的环境给出的观察，这个方法需要返回给予智能体的观察。

RewardWrapper：它暴露了一个reward(rew)方法，可以修改给予智能体的奖励值。

ActionWrapper：需要覆盖action(act)方法，它能修改智能体传给被包装环境的动作。

In [7]:
import gym
import numpy as np

为了让它更实用，假设有一个场景，我们想要以10%的概率干涉智能体发出的动作流，将当前动作替换成随机动作。

In [8]:
class RandomActionWrapper(gym.ActionWrapper):
    """Action wrapper that occasionally overrides the agent's chosen action.

    Each time an action passes through, a uniform number in [0, 1) is drawn;
    when it falls below ``p`` the action is replaced by one sampled from the
    wrapped environment's action space.
    """

    def __init__(self, env, p=0.1):
        """Wrap ``env``; ``p`` is the replacement threshold (default 0.1)."""
        super().__init__(env)
        self.p = p

    def action(self, action):
        """Return ``action`` unchanged, or a random sample when the draw is below ``p``."""
        draw = np.random.uniform(0, 1, 1)
        # Common case first: the draw did not fall below the threshold.
        if not (draw < self.p):
            return action
        print("Action changed randomly!")
        return self.env.action_space.sample()

In [9]:
# Run one episode that always requests action 0, through the wrapper that may
# replace it with a random action, and report the collected reward.
env = RandomActionWrapper(gym.make("CartPole-v1"))

# reset() returns an (observation, info) pair, so unpack it.
obs, info = env.reset()
total_reward = 0.0

while True:
    obs, reward, terminated, truncated, info = env.step(0)
    total_reward += reward
    # Stop on either kind of episode end: terminal state or truncation.
    if terminated or truncated:
        break

# Release the environment's resources once the episode is over.
env.close()

print("Reward got: %.2f" % total_reward)

Reward got: 9.00


## 监控器 Monitor

In [10]:
import gym
import numpy as np

In [11]:
# Build CartPole-v1 with RGB-array rendering and wrap it in RecordVideo, using
# the "video" folder and the "mario" file-name prefix.
env = gym.wrappers.RecordVideo(
    gym.make("CartPole-v1", render_mode="rgb_array"),
    video_folder="video",
    name_prefix="mario",
)

  logger.warn(


In [12]:
# Play 10 random-action episodes on the recording environment and print the
# reward summed over all of them.
total_reward = 0.0

try:
    for episode in range(10):
        # Reset at the start of every episode (unpacking the (observation, info)
        # pair) so no extra, never-played episode is started after the last one.
        obs, info = env.reset()
        while True:
            obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
            total_reward += reward
            # Stop on either kind of episode end: terminal state or truncation.
            if terminated or truncated:
                break
finally:
    # Always close the environment, even if a step raises, so the recording
    # wrapper gets the chance to finish any video it is still writing.
    env.close()

print("Reward got: %.2f" % total_reward)

Moviepy - Building video D:\code\python\reinforcement_learnging\video\mario-episode-0.mp4.
Moviepy - Writing video D:\code\python\reinforcement_learnging\video\mario-episode-0.mp4



                                                                                                                       

Moviepy - Done !
Moviepy - video ready D:\code\python\reinforcement_learnging\video\mario-episode-0.mp4




Moviepy - Building video D:\code\python\reinforcement_learnging\video\mario-episode-1.mp4.
Moviepy - Writing video D:\code\python\reinforcement_learnging\video\mario-episode-1.mp4



                                                                                                                       

Moviepy - Done !
Moviepy - video ready D:\code\python\reinforcement_learnging\video\mario-episode-1.mp4
Moviepy - Building video D:\code\python\reinforcement_learnging\video\mario-episode-8.mp4.
Moviepy - Writing video D:\code\python\reinforcement_learnging\video\mario-episode-8.mp4



                                                                                                                       

Moviepy - Done !
Moviepy - video ready D:\code\python\reinforcement_learnging\video\mario-episode-8.mp4
Reward got: 197.00


