## 1. 依赖

In [37]:
# suppress all DeprecationWarnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import gym
from gym import envs
import time
from tqdm import tnrange
import numpy as np

## 2. 环境

In [27]:
# Id of the environment to create.
env_name = 'CartPole-v1'
# gym.make initializes an environment; it is the entry point of the gym library.
env = gym.make(env_name)
print(env)

<TimeLimit<OrderEnforcing<StepAPICompatibility<PassiveEnvChecker<CartPoleEnv<CartPole-v1>>>>>>


In [25]:
# List the ids of all environments currently registered with gym.
env_specs = envs.registry.values()
envs_ids = [env_spec.id for env_spec in env_specs]
print(envs_ids)

['CartPole-v0', 'CartPole-v1', 'MountainCar-v0', 'MountainCarContinuous-v0', 'Pendulum-v1', 'Acrobot-v1', 'LunarLander-v2', 'LunarLanderContinuous-v2', 'BipedalWalker-v3', 'BipedalWalkerHardcore-v3', 'CarRacing-v2', 'Blackjack-v1', 'FrozenLake-v1', 'FrozenLake8x8-v1', 'CliffWalking-v0', 'Taxi-v3', 'Reacher-v2', 'Reacher-v4', 'Pusher-v2', 'Pusher-v4', 'InvertedPendulum-v2', 'InvertedPendulum-v4', 'InvertedDoublePendulum-v2', 'InvertedDoublePendulum-v4', 'HalfCheetah-v2', 'HalfCheetah-v3', 'HalfCheetah-v4', 'Hopper-v2', 'Hopper-v3', 'Hopper-v4', 'Swimmer-v2', 'Swimmer-v3', 'Swimmer-v4', 'Walker2d-v2', 'Walker2d-v3', 'Walker2d-v4', 'Ant-v2', 'Ant-v3', 'Ant-v4', 'Humanoid-v2', 'Humanoid-v3', 'Humanoid-v4', 'HumanoidStandup-v2', 'HumanoidStandup-v4']


### 2.1 env成员
- 环境定义了动作空间及状态空间

In [13]:
# Action space (discrete for this environment: a gym.spaces.Discrete instance).
env.action_space

Discrete(2)

In [17]:
# Sample a random action from the action space.
# This is not the last expression of the cell, so its value is not displayed.
env.action_space.sample()
# Check whether a value belongs to the action space; this list is not a valid
# action here, so the cell displays False.
env.action_space.contains([0.5, 0.1])

False

In [15]:
# Observation space (continuous for this environment: a gym.spaces.Box instance).
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

In [20]:
# Lower bound of the observation space (evaluated but not displayed: only the
# last expression of a cell is shown).
env.observation_space.low
# Upper bound of the observation space (likewise not displayed).
env.observation_space.high
# Draw a random observation from the observation space; this is the value shown.
env.observation_space.sample()

array([ 1.3331118e+00, -6.9434040e+37, -3.4935260e-01,  7.5309936e+37],
      dtype=float32)

### 2.2 用action与env交互
- 单步更新通过 `env.step(action)` 完成：环境根据当前状态与动作给出奖励和下一状态：
$$
R(s_t, a_t) = r_t
$$

$$
P(s_t, a_t) = s_{t+1}
$$

In [None]:
# Roll out a single episode with a uniformly random policy.
score = 0
done = False
env.reset()

while True:
    # Draw the current state of the environment.
    env.render()
    # Policy: pick an action at random from the action space.
    action = env.action_space.sample()
    # Advance one step; the result is unpacked as observation, reward, done, info.
    observation, reward, done, info = env.step(action)
    # Add this step's reward to the episode total.
    score = score + reward
    # Stop as soon as the environment reports the episode is over.
    if done:
        break
print(f'total reward: {score}')

- 每次 epoch（即一个 episode）开始前都要重置环境，并重置 `done`、`score` 等记录变量

In [33]:
# Run five independent random-policy episodes, numbered 1 through 5.
for epoch in range(1, 6):
    # Fresh bookkeeping and a fresh environment state for every episode.
    score, done = 0, False
    env.reset()

    # One episode: act randomly until the environment reports done.
    while True:
        env.render()
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        score = score + reward
        if done:
            break
    print(f'epoch: {epoch}, total reward: {score}')

# Release the environment's resources once all episodes are finished.
env.close()

epoch: 1, total reward: 23.0
epoch: 2, total reward: 23.0
epoch: 3, total reward: 15.0
epoch: 4, total reward: 26.0
epoch: 5, total reward: 12.0


## 3. 渲染

In [63]:
# Play one random-policy episode and keep what render returns at every step.
frame = []
score = 0
done = False
env.reset()

while True:
    # Request the rendering in 'rgb_array' mode and store the returned value.
    frame.append(env.render(mode='rgb_array'))
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    score = score + reward
    # Pause half a second after each step.
    # NOTE(review): nothing in this cell displays the frames, so the pause only
    # slows the rollout - confirm whether it is still needed.
    time.sleep(0.5)
    if done:
        break
print(f'total reward: {score}')

total reward: 17.0


## 4. MountainCar-v0 例子

In [29]:
# Switch to the MountainCar-v0 environment; this rebinds the notebook-level
# `env_name` and `env` used by the cells below.
env_name = 'MountainCar-v0'
env = gym.make(env_name)

In [30]:
class SimpleAgent:
    """Fixed-rule, non-learning agent that maps (position, velocity) to an action."""

    def __init__(self, env):
        # The environment is accepted for interface uniformity; nothing is stored.
        pass

    def decide(self, observation):
        """Choose an action from a two-element observation.

        Args:
            observation: Iterable unpacked as ``(position, velocity)``.

        Returns:
            2 when ``velocity`` lies strictly between a position-dependent lower
            and upper bound, otherwise 0. (Presumably "push right" / "push left"
            in MountainCar-v0 - confirm against the environment docs.)
        """
        position, velocity = observation
        # The lower bound is the smaller of two polynomial curves in position.
        lower_candidates = (
            -0.09 * (position + 0.25) ** 2 + 0.03,
            0.3 * (position + 0.9) ** 4 - 0.008,
        )
        lower = min(lower_candidates)
        upper = -0.07 * (position + 0.38) ** 2 + 0.07
        inside_band = lower < velocity < upper
        return 2 if inside_band else 0

    def learn(self, *args):
        """Accept any arguments and do nothing; this agent has no learning step."""
        pass
    
# Instantiate the rule-based agent for the current environment.
agent = SimpleAgent(env)

In [31]:
def play(env, agent, render=False, train=False):
    """Run one episode of ``agent`` in ``env`` and return the summed reward.

    Both Gym calling conventions are accepted, so the function keeps working
    when the installed gym version changes:

    * legacy: ``reset() -> observation`` and
      ``step() -> (observation, reward, done, info)``
    * newer: ``reset() -> (observation, info)`` and
      ``step() -> (observation, reward, terminated, truncated, info)``

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` and, when
            ``render`` is true, ``render()``.
        agent: Object exposing ``decide(observation)`` and, when ``train`` is
            true, ``learn(observation, action, reward, done)``.
        render: If true, call ``env.render()`` before every step.
        train: If true, call ``agent.learn`` after every step.

    Returns:
        The sum of the rewards returned by ``env.step`` over the episode.
    """
    episode_reward = 0

    reset_result = env.reset()
    # Newer gym returns (observation, info) from reset(); keep only the
    # observation. A bare observation is passed through untouched.
    if (isinstance(reset_result, tuple) and len(reset_result) == 2
            and isinstance(reset_result[1], dict)):
        observation = reset_result[0]
    else:
        observation = reset_result

    while True:
        if render:
            env.render()
        action = agent.decide(observation)

        step_result = env.step(action)
        if len(step_result) == 5:
            # Newer API: the episode ends on either termination or truncation.
            next_observation, reward, terminated, truncated, info = step_result
            done = terminated or truncated
        else:
            next_observation, reward, done, info = step_result

        episode_reward += reward
        if train:
            # The agent is shown the observation it acted on, not the next one.
            agent.learn(observation, action, reward, done)

        if done:
            break

        observation = next_observation
    return episode_reward

In [24]:
env.seed(3)  # fix the random seed so the result is reproducible
# Play one rendered episode with the rule-based agent and report its total reward.
episode_reward = play(env, agent, render=True)
print(f'eposide reward: {episode_reward}')
env.close()

eposide reward: -105.0


In [47]:
# NOTE(review): play() calls env.reset() itself at the start of every episode,
# so this explicit reset looks redundant - confirm before removing.
env.reset()
# By convention results are usually averaged over 100 episodes; this cell runs
# 10000 episodes instead and reports the mean episode reward.
all_rewards = [play(env, agent, render=False) for _ in tnrange(10000)]
env.close()
print(f'mean reward: {np.mean(all_rewards)}')

  0%|          | 0/10000 [00:00<?, ?it/s]

mean reward: -106.1631
