https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow/blob/master/contents/8_Actor_Critic_Advantage/AC_CartPole.py

结合了 Policy Gradient (Actor) 和 Function Approximation (Critic) 的方法. `Actor` 基于概率选行为, `Critic` 基于 `Actor` 的行为评判行为的得分, `Actor` 根据 `Critic` 的评分修改选行为的概率.

**Actor Critic 方法的优势**: 可以进行单步更新, 比传统的 Policy Gradient 要快.

**Actor Critic 方法的劣势**: 取决于 Critic 的价值判断, 但是 Critic 难收敛, 再加上 Actor 的更新, 就更难收敛. 为了解决收敛问题, Google DeepMind 提出了 `Actor Critic` 升级版 `Deep Deterministic Policy Gradient`. 后者融合了 DQN 的优势, 解决了收敛难的问题. 我们之后也会讲到 Deep Deterministic Policy Gradient. 不过那个是要以 `Actor Critic` 为基础, 懂了 `Actor Critic`, 后面那个就好懂了.

In [1]:
import numpy as np
import tensorflow as tf
import gym

In [3]:
# Print the gym and TensorFlow versions in use, for reference.
print(gym.__version__)
print(tf.__version__)

0.17.2
1.15.0


In [2]:
# Seed NumPy and TensorFlow before anything random is created.
np.random.seed(2)
tf.set_random_seed(2)  # reproducible

# Hyperparameters
OUTPUT_GRAPH = False  # when True, the training cell writes the TF graph to "logs/"
MAX_EPISODE = 3000  # number of training episodes
DISPLAY_REWARD_THRESHOLD = 200  # start rendering once the smoothed (running) episode reward exceeds this threshold
MAX_EP_STEPS = 1000   # maximum time step in one episode
RENDER = False  # rendering wastes time
GAMMA = 0.9     # reward discount in TD error
LR_A = 0.001    # learning rate for actor
LR_C = 0.01     # learning rate for critic

env = gym.make('CartPole-v0')
env.seed(1)  # reproducible
# Use the underlying environment without gym's wrappers.
# NOTE(review): presumably done to drop the built-in episode time limit, so that
# MAX_EP_STEPS is the only cap on episode length -- confirm against the gym version in use.
env = env.unwrapped

N_F = env.observation_space.shape[0]  # length of the observation vector
N_A = env.action_space.n  # number of discrete actions

`Actor` 在运用 Policy Gradient 的方法进行 Gradient ascent 的时候, 由 `Critic` 来告诉它, 这次的 Gradient ascent 是不是一次正确的 ascent, 如果这次的得分不好, 那么就不要 ascent 那么多.

In [7]:
# Chooses actions by sampling from a learned probability distribution
class Actor(object):
    """Stochastic policy network trained with a TD-error-weighted policy gradient.

    The network maps one state to a softmax distribution over the discrete
    actions. ``learn`` nudges the log-probability of the action that was taken
    up or down in proportion to the TD error supplied by the critic.
    """

    def __init__(self, sess, n_features, n_actions, lr=0.001):
        """Build the actor network and its training graph with TensorFlow.

        Args:
            sess: TensorFlow session used to run the graph.
            n_features: length of the state vector.
            n_actions: number of discrete actions.
            lr: learning rate for the Adam optimizer.
        """
        self.sess = sess

        # The graph processes exactly one transition at a time (batch size 1).
        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
        self.a = tf.placeholder(tf.int32, None, "act")
        self.td_error = tf.placeholder(tf.float32, None, "td_error")  # TD_error

        with tf.variable_scope('Actor'):
            l1 = tf.layers.dense(
                inputs=self.s,
                units=20,    # number of hidden units
                activation=tf.nn.relu,
                kernel_initializer=tf.random_normal_initializer(0., .1),    # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='l1'
            )

            self.acts_prob = tf.layers.dense(
                inputs=l1,
                units=n_actions,    # output units
                activation=tf.nn.softmax,   # get action probabilities
                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='acts_prob'
            )

        with tf.variable_scope('exp_v'):
            # Clip before taking the log: if the softmax output underflows to
            # exactly 0, log() yields -inf and the gradient becomes NaN, which
            # would corrupt the weights for the rest of training. Probabilities
            # above the lower bound pass through unchanged.
            prob = tf.clip_by_value(self.acts_prob[0, self.a], 1e-10, 1.0)
            log_prob = tf.log(prob)
            self.exp_v = tf.reduce_mean(log_prob * self.td_error)  # advantage (TD_error) guided loss

        with tf.variable_scope('train'):
            self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v)  # minimize(-exp_v) = maximize(exp_v)

    def learn(self, s, a, td):
        """Run one policy-gradient update for a single transition.

        ``s`` and ``a`` determine the direction of the gradient-ascent step;
        ``td`` comes from the critic and tells the actor whether (and how
        strongly) that direction is a good one.

        Args:
            s: 1-D state array; a leading batch axis is added here.
            a: index of the action that was taken in ``s``.
            td: TD error produced by the critic for this transition.

        Returns:
            The value of ``exp_v`` (log-probability times TD error) evaluated
            in the same ``sess.run`` call as the update.
        """
        s = s[np.newaxis, :]
        feed_dict = {self.s: s, self.a: a, self.td_error: td}
        _, exp_v = self.sess.run([self.train_op, self.exp_v], feed_dict)
        return exp_v

    def choose_action(self, s):
        """Sample an action for state ``s`` from the current policy.

        Args:
            s: 1-D state array; a leading batch axis is added here.

        Returns:
            The sampled action index as an integer.
        """
        s = s[np.newaxis, :]
        probs = self.sess.run(self.acts_prob, {self.s: s})   # get probabilities for all actions
        return np.random.choice(np.arange(probs.shape[1]), p=probs.ravel())   # return a int

In [4]:
class Critic(object):
    """State-value network V(s) trained on the one-step squared TD error.

    The TD error it computes, ``r + gamma * V(s') - V(s)``, is returned from
    ``learn`` so the actor can use it to weight its policy gradient.
    """

    def __init__(self, sess, n_features, lr=0.01, gamma=None):
        """Build the critic network and its training graph with TensorFlow.

        Args:
            sess: TensorFlow session used to run the graph.
            n_features: length of the state vector.
            lr: learning rate for the Adam optimizer.
            gamma: reward discount used in the TD target. When ``None`` the
                module-level ``GAMMA`` is used, which keeps existing callers
                that do not pass ``gamma`` working unchanged.
        """
        self.sess = sess
        self.gamma = GAMMA if gamma is None else gamma

        # The graph processes exactly one transition at a time (batch size 1).
        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
        # V(s') is fed in as a plain value, so no gradient flows through the
        # bootstrap target.
        self.v_ = tf.placeholder(tf.float32, [1, 1], "v_next")
        self.r = tf.placeholder(tf.float32, None, 'r')

        with tf.variable_scope('Critic'):
            l1 = tf.layers.dense(
                inputs=self.s,
                units=20,  # number of hidden units
                activation=tf.nn.relu,  # None
                # Original author's note: this should be linear to guarantee
                # convergence of the actor, but a linear approximator hardly
                # seems to learn the correct values.
                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='l1'
            )

            self.v = tf.layers.dense(
                inputs=l1,
                units=1,  # output units
                activation=None,
                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='V'
            )

        with tf.variable_scope('squared_TD_error'):
            self.td_error = self.r + self.gamma * self.v_ - self.v
            self.loss = tf.square(self.td_error)    # TD_error = (r+gamma*V_next) - V_eval
        with tf.variable_scope('train'):
            self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)

    def learn(self, s, r, s_):
        """Run one value-function update for a single transition.

        Args:
            s: 1-D array, the state before the step.
            r: reward received for the step.
            s_: 1-D array, the state after the step.

        Returns:
            The TD error for this transition, evaluated in the same
            ``sess.run`` call as the update.
        """
        s, s_ = s[np.newaxis, :], s_[np.newaxis, :]

        # Evaluate V(s') first, then feed it back in as a constant target.
        v_ = self.sess.run(self.v, {self.s: s_})
        td_error, _ = self.sess.run([self.td_error, self.train_op],
                                          {self.s: s, self.v_: v_, self.r: r})
        return td_error

In [5]:
# One TensorFlow session, passed to both the actor and the critic below.
sess = tf.Session()

In [8]:
# Build both networks on the shared session, each with its own learning rate.
actor = Actor(sess, n_features=N_F, n_actions=N_A, lr=LR_A)
critic = Critic(sess, n_features=N_F, lr=LR_C)     # we need a good teacher, so the teacher should learn faster than the actor

# Initialise all variables created while building the two networks.
sess.run(tf.global_variables_initializer())

Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.


In [9]:

if OUTPUT_GRAPH:
    # Write the graph for TensorBoard, then close the writer so the event file
    # is flushed to disk and the file handle is released.
    tf.summary.FileWriter("logs/", sess.graph).close()

# Exponentially smoothed episode reward. It is reset on every run of this cell
# so that a re-run does not start from the value left behind by an earlier run.
running_reward = None

for i_episode in range(MAX_EPISODE):
    s = env.reset()
    t = 0
    track_r = []  # per-step rewards of the current episode
    while True:
        if RENDER:
            env.render()

        a = actor.choose_action(s)

        s_, r, done, info = env.step(a)

        # Replace the reward of the terminating step with a penalty.
        if done:
            r = -20

        track_r.append(r)

        # Single-step update: the critic learns from the transition first, and
        # its TD error then scales the actor's policy-gradient step.
        td_error = critic.learn(s, r, s_)  # gradient = grad[r + gamma * V(s_) - V(s)]
        actor.learn(s, a, td_error)     # true_gradient = grad[logPi(s,a) * td_error]

        s = s_
        t += 1

        if done or t >= MAX_EP_STEPS:
            ep_rs_sum = sum(track_r)

            if running_reward is None:
                # First finished episode: nothing to smooth against yet.
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True  # rendering
            print("episode:", i_episode, "  reward:", int(running_reward))
            break

episode: 0   reward: -7
episode: 1   reward: -6
episode: 2   reward: -5
episode: 3   reward: -5
episode: 4   reward: -5
episode: 5   reward: -5
episode: 6   reward: -6
episode: 7   reward: -6
episode: 8   reward: -6
episode: 9   reward: -6
episode: 10   reward: -6
episode: 11   reward: -6
episode: 12   reward: -5
episode: 13   reward: -5
episode: 14   reward: -5
episode: 15   reward: -5
episode: 16   reward: -5
episode: 17   reward: -5
episode: 18   reward: -5
episode: 19   reward: -4
episode: 20   reward: -4
episode: 21   reward: -3
episode: 22   reward: -3
episode: 23   reward: -3
episode: 24   reward: -4
episode: 25   reward: -3
episode: 26   reward: -3
episode: 27   reward: -4
episode: 28   reward: -3
episode: 29   reward: -3
episode: 30   reward: -2
episode: 31   reward: -2
episode: 32   reward: -2
episode: 33   reward: -1
episode: 34   reward: -1
episode: 35   reward: -2
episode: 36   reward: -1
episode: 37   reward: -2
episode: 38   reward: 0
episode: 39   reward: 0
episode: 40 

episode: 318   reward: 161
episode: 319   reward: 158
episode: 320   reward: 154
episode: 321   reward: 151
episode: 322   reward: 144
episode: 323   reward: 141
episode: 324   reward: 138
episode: 325   reward: 133
episode: 326   reward: 128
episode: 327   reward: 126
episode: 328   reward: 123
episode: 329   reward: 119
episode: 330   reward: 115
episode: 331   reward: 115
episode: 332   reward: 115
episode: 333   reward: 115
episode: 334   reward: 119
episode: 335   reward: 120
episode: 336   reward: 121
episode: 337   reward: 136
episode: 338   reward: 137
episode: 339   reward: 144
episode: 340   reward: 174
episode: 341   reward: 215
episode: 342   reward: 255
episode: 343   reward: 292
episode: 344   reward: 327
episode: 345   reward: 331
episode: 346   reward: 364
episode: 347   reward: 396
episode: 348   reward: 426
episode: 349   reward: 424
episode: 350   reward: 453
episode: 351   reward: 480
episode: 352   reward: 481
episode: 353   reward: 464
episode: 354   reward: 444
e

episode: 632   reward: 116
episode: 633   reward: 117
episode: 634   reward: 118
episode: 635   reward: 118
episode: 636   reward: 117
episode: 637   reward: 117
episode: 638   reward: 116
episode: 639   reward: 117
episode: 640   reward: 119
episode: 641   reward: 119
episode: 642   reward: 119
episode: 643   reward: 119
episode: 644   reward: 118
episode: 645   reward: 117
episode: 646   reward: 118
episode: 647   reward: 118
episode: 648   reward: 116
episode: 649   reward: 115
episode: 650   reward: 111
episode: 651   reward: 109
episode: 652   reward: 108
episode: 653   reward: 114
episode: 654   reward: 114
episode: 655   reward: 111
episode: 656   reward: 110
episode: 657   reward: 112
episode: 658   reward: 122
episode: 659   reward: 122
episode: 660   reward: 138
episode: 661   reward: 153
episode: 662   reward: 166
episode: 663   reward: 161
episode: 664   reward: 156
episode: 665   reward: 154
episode: 666   reward: 153
episode: 667   reward: 153
episode: 668   reward: 150
e

episode: 936   reward: 117
episode: 937   reward: 115
episode: 938   reward: 115
episode: 939   reward: 115
episode: 940   reward: 114
episode: 941   reward: 113
episode: 942   reward: 112
episode: 943   reward: 112
episode: 944   reward: 112
episode: 945   reward: 113
episode: 946   reward: 113
episode: 947   reward: 112
episode: 948   reward: 113
episode: 949   reward: 113
episode: 950   reward: 114
episode: 951   reward: 118
episode: 952   reward: 126
episode: 953   reward: 129
episode: 954   reward: 130
episode: 955   reward: 130
episode: 956   reward: 131
episode: 957   reward: 131
episode: 958   reward: 132
episode: 959   reward: 133
episode: 960   reward: 136
episode: 961   reward: 137
episode: 962   reward: 136
episode: 963   reward: 136
episode: 964   reward: 135
episode: 965   reward: 133
episode: 966   reward: 131
episode: 967   reward: 130
episode: 968   reward: 126
episode: 969   reward: 123
episode: 970   reward: 122
episode: 971   reward: 122
episode: 972   reward: 121
e

episode: 1234   reward: 157
episode: 1235   reward: 156
episode: 1236   reward: 158
episode: 1237   reward: 160
episode: 1238   reward: 166
episode: 1239   reward: 168
episode: 1240   reward: 166
episode: 1241   reward: 166
episode: 1242   reward: 164
episode: 1243   reward: 161
episode: 1244   reward: 158
episode: 1245   reward: 155
episode: 1246   reward: 152
episode: 1247   reward: 150
episode: 1248   reward: 147
episode: 1249   reward: 144
episode: 1250   reward: 143
episode: 1251   reward: 141
episode: 1252   reward: 139
episode: 1253   reward: 137
episode: 1254   reward: 136
episode: 1255   reward: 133
episode: 1256   reward: 130
episode: 1257   reward: 128
episode: 1258   reward: 125
episode: 1259   reward: 125
episode: 1260   reward: 123
episode: 1261   reward: 122
episode: 1262   reward: 121
episode: 1263   reward: 120
episode: 1264   reward: 118
episode: 1265   reward: 116
episode: 1266   reward: 115
episode: 1267   reward: 114
episode: 1268   reward: 113
episode: 1269   rewa

episode: 1531   reward: 119
episode: 1532   reward: 117
episode: 1533   reward: 115
episode: 1534   reward: 111
episode: 1535   reward: 109
episode: 1536   reward: 108
episode: 1537   reward: 107
episode: 1538   reward: 106
episode: 1539   reward: 105
episode: 1540   reward: 102
episode: 1541   reward: 101
episode: 1542   reward: 101
episode: 1543   reward: 101
episode: 1544   reward: 99
episode: 1545   reward: 98
episode: 1546   reward: 99
episode: 1547   reward: 102
episode: 1548   reward: 107
episode: 1549   reward: 109
episode: 1550   reward: 111
episode: 1551   reward: 113
episode: 1552   reward: 115
episode: 1553   reward: 119
episode: 1554   reward: 120
episode: 1555   reward: 121
episode: 1556   reward: 165
episode: 1557   reward: 164
episode: 1558   reward: 168
episode: 1559   reward: 193
episode: 1560   reward: 233
episode: 1561   reward: 230
episode: 1562   reward: 268
episode: 1563   reward: 305
episode: 1564   reward: 303
episode: 1565   reward: 337
episode: 1566   reward:

episode: 1825   reward: 159
episode: 1826   reward: 158
episode: 1827   reward: 155
episode: 1828   reward: 153
episode: 1829   reward: 150
episode: 1830   reward: 149
episode: 1831   reward: 148
episode: 1832   reward: 145
episode: 1833   reward: 145
episode: 1834   reward: 143
episode: 1835   reward: 141
episode: 1836   reward: 138
episode: 1837   reward: 135
episode: 1838   reward: 132
episode: 1839   reward: 130
episode: 1840   reward: 128
episode: 1841   reward: 127
episode: 1842   reward: 126
episode: 1843   reward: 124
episode: 1844   reward: 122
episode: 1845   reward: 122
episode: 1846   reward: 120
episode: 1847   reward: 119
episode: 1848   reward: 118
episode: 1849   reward: 117
episode: 1850   reward: 116
episode: 1851   reward: 115
episode: 1852   reward: 114
episode: 1853   reward: 113
episode: 1854   reward: 111
episode: 1855   reward: 109
episode: 1856   reward: 107
episode: 1857   reward: 106
episode: 1858   reward: 105
episode: 1859   reward: 105
episode: 1860   rewa

episode: 2120   reward: 138
episode: 2121   reward: 134
episode: 2122   reward: 131
episode: 2123   reward: 127
episode: 2124   reward: 124
episode: 2125   reward: 119
episode: 2126   reward: 115
episode: 2127   reward: 110
episode: 2128   reward: 105
episode: 2129   reward: 99
episode: 2130   reward: 95
episode: 2131   reward: 90
episode: 2132   reward: 85
episode: 2133   reward: 81
episode: 2134   reward: 81
episode: 2135   reward: 81
episode: 2136   reward: 82
episode: 2137   reward: 85
episode: 2138   reward: 87
episode: 2139   reward: 92
episode: 2140   reward: 94
episode: 2141   reward: 95
episode: 2142   reward: 98
episode: 2143   reward: 100
episode: 2144   reward: 101
episode: 2145   reward: 107
episode: 2146   reward: 116
episode: 2147   reward: 121
episode: 2148   reward: 124
episode: 2149   reward: 126
episode: 2150   reward: 132
episode: 2151   reward: 150
episode: 2152   reward: 151
episode: 2153   reward: 150
episode: 2154   reward: 148
episode: 2155   reward: 146
episod

episode: 2418   reward: 63
episode: 2419   reward: 65
episode: 2420   reward: 64
episode: 2421   reward: 65
episode: 2422   reward: 62
episode: 2423   reward: 60
episode: 2424   reward: 62
episode: 2425   reward: 65
episode: 2426   reward: 69
episode: 2427   reward: 70
episode: 2428   reward: 71
episode: 2429   reward: 74
episode: 2430   reward: 79
episode: 2431   reward: 81
episode: 2432   reward: 87
episode: 2433   reward: 92
episode: 2434   reward: 96
episode: 2435   reward: 99
episode: 2436   reward: 99
episode: 2437   reward: 100
episode: 2438   reward: 99
episode: 2439   reward: 99
episode: 2440   reward: 95
episode: 2441   reward: 93
episode: 2442   reward: 92
episode: 2443   reward: 93
episode: 2444   reward: 94
episode: 2445   reward: 94
episode: 2446   reward: 95
episode: 2447   reward: 95
episode: 2448   reward: 94
episode: 2449   reward: 92
episode: 2450   reward: 91
episode: 2451   reward: 89
episode: 2452   reward: 88
episode: 2453   reward: 88
episode: 2454   reward: 89


episode: 2718   reward: 89
episode: 2719   reward: 88
episode: 2720   reward: 86
episode: 2721   reward: 85
episode: 2722   reward: 85
episode: 2723   reward: 86
episode: 2724   reward: 86
episode: 2725   reward: 87
episode: 2726   reward: 88
episode: 2727   reward: 89
episode: 2728   reward: 90
episode: 2729   reward: 90
episode: 2730   reward: 90
episode: 2731   reward: 91
episode: 2732   reward: 91
episode: 2733   reward: 92
episode: 2734   reward: 93
episode: 2735   reward: 93
episode: 2736   reward: 94
episode: 2737   reward: 94
episode: 2738   reward: 94
episode: 2739   reward: 94
episode: 2740   reward: 94
episode: 2741   reward: 94
episode: 2742   reward: 93
episode: 2743   reward: 91
episode: 2744   reward: 88
episode: 2745   reward: 88
episode: 2746   reward: 86
episode: 2747   reward: 83
episode: 2748   reward: 80
episode: 2749   reward: 80
episode: 2750   reward: 80
episode: 2751   reward: 79
episode: 2752   reward: 79
episode: 2753   reward: 80
episode: 2754   reward: 81
e