RL依赖于智能体与环境交互，产生数据流，这就需要一个真实或者仿真的环境。gym就是这样一个重要的RL环境。

gym中最重要的三个函数：
- `reset()` 重新初始化
- `step()` 与环境交互：`s_next` `r`  `terminate`  `info=step(a)`
- `render()` 渲染函数，显示当前状态

In [1]:
import numpy as np
import gym
import random
import time

In [2]:
env = gym.make("Pendulum-v0")  #单摆

In [3]:
env.reset()

array([-0.98662724, -0.16299292, -0.40775522])

In [4]:
n=-0
while n<100:
    env.render() #显示出来（渲染环境）
    a = random.random() #产生0-1的随机数
    print("current torque is ",a)
    action = np.array([a])  #把随机数变成1维数组
    env.step(action) #和环境交互起来，这里action是力矩
    print(env.step(action))
    n += 1 


current torque is  0.17049684046028435
(array([-0.99402371, -0.10916435, -0.58239566]), -9.044026325425053, False, {})
current torque is  0.7006573023426795
(array([-0.99845081, -0.05564146, -0.51507214]), -9.396391360524838, False, {})
current torque is  0.737620078271055
(array([-0.99988241, -0.01533489, -0.36053422]), -9.681546525718321, False, {})
current torque is  0.5850716127794778
(array([-0.99996175,  0.00874589, -0.19735523]), -9.870980967974239, False, {})
current torque is  0.0029053010674632285
(array([-0.99963349,  0.02707205, -0.17622697]), -9.758805551100924, False, {})
current torque is  0.34916413975213245
(array([-0.9994355 ,  0.03359586, -0.02698831]), -9.66918865752635, False, {})
current torque is  0.32078248493915207
(array([-0.99967767,  0.02538796,  0.11790392]), -9.674325692073499, False, {})
current torque is  0.9061711914487758
(array([-0.99995829, -0.00913303,  0.41760642]), -9.80419977551902, False, {})
current torque is  0.10887281467756638
(array([-0.998

current torque is  0.14075155013325702
(array([-0.99918267,  0.04042269, -0.21369214]), -9.690200847819883, False, {})
current torque is  0.24810509818702542
(array([-0.99867935,  0.05137659, -0.07315093]), -9.574111697288524, False, {})
current torque is  0.9131229013061566
(array([-0.99946929,  0.0325751 ,  0.27401728]), -9.582831100080126, False, {})
current torque is  0.3365968526088471
(array([-0.99998539, -0.00540491,  0.41077769]), -9.787033088261046, False, {})
current torque is  0.6263804368094191
(array([-0.99825826, -0.05899533,  0.57181243]), -9.70473765167375, False, {})
current torque is  0.4572048655823645
(array([-0.99295794, -0.11846743,  0.59818729]), -9.35506556680811, False, {})
current torque is  0.6603027455491124
(array([-0.98402867, -0.17801002,  0.59596789]), -8.992069685453716, False, {})
current torque is  0.34492690071366494
(array([-0.9747162 , -0.22344646,  0.41350262]), -8.652051575485016, False, {})
current torque is  0.036092619554769056
(array([-0.9708

env.step的输出格式如下：
(array([-0.99967767,  0.02538796,  0.11790392]), -9.674325692073499, False, {})
- [-0.99967767,  0.02538796,  0.11790392] 是**下一个状态**
- -9.674325692073499是**回报**
- False：**不是终止状态**

env所在目录：
~\Anaconda3\Lib\site-packages\gym\envs
下面贴出单摆的env

In [5]:
import gym
from gym import spaces
from gym.utils import seeding
import numpy as np
from os import path

class PendulumEnv(gym.Env):
    metadata = {
        'render.modes' : ['human', 'rgb_array'],
        'video.frames_per_second' : 30
    }

    def __init__(self):
        self.max_speed=8
        self.max_torque=2.
        self.dt=.05
        self.viewer = None

        high = np.array([1., 1., self.max_speed])
        self.action_space = spaces.Box(low=-self.max_torque, high=self.max_torque, shape=(1,), dtype=np.float32)
        self.observation_space = spaces.Box(low=-high, high=high, dtype=np.float32)

        self.seed()

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def step(self,u):
        th, thdot = self.state # th := theta

        g = 10.
        m = 1.
        l = 1.
        dt = self.dt

        u = np.clip(u, -self.max_torque, self.max_torque)[0]
        self.last_u = u # for rendering
        costs = angle_normalize(th)**2 + .1*thdot**2 + .001*(u**2)

        newthdot = thdot + (-3*g/(2*l) * np.sin(th + np.pi) + 3./(m*l**2)*u) * dt
        newth = th + newthdot*dt
        newthdot = np.clip(newthdot, -self.max_speed, self.max_speed) #pylint: disable=E1111

        self.state = np.array([newth, newthdot])
        return self._get_obs(), -costs, False, {}

    def reset(self):
        high = np.array([np.pi, 1])
        self.state = self.np_random.uniform(low=-high, high=high)
        self.last_u = None
        return self._get_obs()

    def _get_obs(self):
        theta, thetadot = self.state
        return np.array([np.cos(theta), np.sin(theta), thetadot])

    def render(self, mode='human'):

        if self.viewer is None:
            from gym.envs.classic_control import rendering
            self.viewer = rendering.Viewer(500,500)
            self.viewer.set_bounds(-2.2,2.2,-2.2,2.2)
            rod = rendering.make_capsule(1, .2)
            rod.set_color(.8, .3, .3)
            self.pole_transform = rendering.Transform()
            rod.add_attr(self.pole_transform)
            self.viewer.add_geom(rod)
            axle = rendering.make_circle(.05)
            axle.set_color(0,0,0)
            self.viewer.add_geom(axle)
            fname = path.join(path.dirname(__file__), "assets/clockwise.png")
            self.img = rendering.Image(fname, 1., 1.)
            self.imgtrans = rendering.Transform()
            self.img.add_attr(self.imgtrans)

        self.viewer.add_onetime(self.img)
        self.pole_transform.set_rotation(self.state[0] + np.pi/2)
        if self.last_u:
            self.imgtrans.scale = (-self.last_u/2, np.abs(self.last_u)/2)

        return self.viewer.render(return_rgb_array = mode=='rgb_array')

    def close(self):
        if self.viewer:
            self.viewer.close()
            self.viewer = None

def angle_normalize(x):
    return (((x+np.pi) % (2*np.pi)) - np.pi)
