In [1]:
import numpy as np
import tensorflow as tf
import gym

In [2]:
# Seed NumPy's global RNG; Actor.choose_action samples actions with np.random.choice.
np.random.seed(2)
# Graph-level TensorFlow seed, set before any network is built.
# NOTE(review): weight initialisation is then expected to be repeatable as long as
# the graph is constructed in the same order — confirm for the installed TF version.
tf.set_random_seed(2)  # reproducible

In [3]:
# Hyperparameters
OUTPUT_GRAPH = False  # when True, the training cell writes the TF graph to "logs/"
MAX_EPISODE = 3000  # number of training episodes
DISPLAY_REWARD_THRESHOLD = 200  # rendering is switched on once the smoothed (running) episode reward exceeds this threshold
MAX_EP_STEPS = 1000   # maximum time steps in one episode
RENDER = False  # rendering wastes time; the training cell sets this to True when the threshold above is passed
GAMMA = 0.9     # reward discount in TD error
LR_A = 0.001    # learning rate for actor
LR_C = 0.01     # learning rate for critic

In [4]:
# Build the CartPole environment used for training.
env = gym.make('CartPole-v0')
env.seed(1)  # reproducible
# Keep only the underlying environment object.
# NOTE(review): presumably this removes gym's wrapper-imposed episode limit so that
# MAX_EP_STEPS is the effective cap on episode length — confirm against the installed gym.
env = env.unwrapped

In [5]:
N_F = env.observation_space.shape[0]  # length of the observation vector; passed as n_features to Actor and Critic
N_A = env.action_space.n  # number of discrete actions; passed as n_actions to Actor

In [6]:
class Actor(object):
    """Policy network that maps a single state to action probabilities.

    The graph feeds one state row of shape [1, n_features] through a 20-unit
    ReLU layer and an n_actions-unit softmax layer. Training maximises
    log(prob of the taken action) * td_error with Adam.

    Args:
        sess: TensorFlow session used for every run() call.
        n_features: length of the state vector.
        n_actions: number of discrete actions (width of the softmax layer).
        lr: learning rate handed to the Adam optimizer.
    """

    def __init__(self, sess, n_features, n_actions, lr=0.001):
        self.sess = sess

        # Graph inputs: one state row, the action that was taken, and the TD
        # error supplied by the caller.
        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
        self.a = tf.placeholder(tf.int32, None, "act")
        self.td_error = tf.placeholder(tf.float32, None, "td_error")

        with tf.variable_scope('Actor'):
            # Hidden layer: 20 ReLU units.
            hidden = tf.layers.dense(
                self.s,
                20,
                activation=tf.nn.relu,
                kernel_initializer=tf.random_normal_initializer(mean=0.0, stddev=0.1),
                bias_initializer=tf.constant_initializer(value=0.1, dtype=tf.float32),
                name='l1',
            )

            # Output layer: softmax over the actions, shape [1, n_actions].
            self.acts_prob = tf.layers.dense(
                hidden,
                n_actions,
                activation=tf.nn.softmax,
                kernel_initializer=tf.random_normal_initializer(mean=0.0, stddev=0.1),
                bias_initializer=tf.constant_initializer(value=0.1, dtype=tf.float32),
                name='acts_prob',
            )

        with tf.variable_scope('exp_v'):
            # Log-probability of the taken action, weighted by the TD error.
            chosen_prob = self.acts_prob[0, self.a]
            weighted_log_prob = tf.log(chosen_prob) * self.td_error
            self.exp_v = tf.reduce_mean(weighted_log_prob)

        with tf.variable_scope('train'):
            # Adam minimises the negated objective, i.e. it maximises exp_v.
            optimizer = tf.train.AdamOptimizer(lr)
            self.train_op = optimizer.minimize(-self.exp_v)

    def learn(self, s, a, td):
        """Run one training step and return the evaluated objective.

        Args:
            s: 1-D state array; a leading batch axis is added before feeding.
            a: the action that was taken in `s`.
            td: TD error used to weight the log-probability.

        Returns:
            The value of `exp_v` fetched in the same run as the training op.
        """
        state_row = s[np.newaxis, :]
        feeds = {self.s: state_row, self.a: a, self.td_error: td}
        fetches = [self.train_op, self.exp_v]
        _, objective = self.sess.run(fetches, feeds)
        return objective

    def choose_action(self, s):
        """Sample an action index from the policy's distribution for state `s`."""
        state_row = s[np.newaxis, :]
        # Probabilities for every action, shape [1, n_actions].
        probs = self.sess.run(self.acts_prob, {self.s: state_row})
        candidates = np.arange(probs.shape[1])
        return np.random.choice(candidates, p=probs.ravel())



In [7]:
class Critic(object):
    """State-value network trained on the squared one-step TD error.

    The graph feeds one state row of shape [1, n_features] through a 20-unit
    ReLU layer and a single linear output unit V(s). The TD error is
    r + GAMMA * V(s') - V(s), where V(s') is fed in through a placeholder,
    and Adam minimises its square.

    Args:
        sess: TensorFlow session used for every run() call.
        n_features: length of the state vector.
        lr: learning rate handed to the Adam optimizer.
    """

    def __init__(self, sess, n_features, lr=0.01):
        self.sess = sess

        # Graph inputs: current state row, value of the next state, and reward.
        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
        self.v_ = tf.placeholder(tf.float32, [1, 1], "v_next")
        self.r = tf.placeholder(tf.float32, None, 'r')

        with tf.variable_scope('Critic'):
            # Hidden layer: 20 ReLU units.
            # Note kept from the original author: a linear layer would be needed
            # to guarantee the actor's convergence, but a linear approximator
            # seems to hardly learn the correct values. ReLU is what is used here.
            hidden = tf.layers.dense(
                self.s,
                20,
                activation=tf.nn.relu,
                kernel_initializer=tf.random_normal_initializer(mean=0.0, stddev=0.1),
                bias_initializer=tf.constant_initializer(value=0.1, dtype=tf.float32),
                name='l1',
            )

            # Output layer: one linear unit holding the state value.
            self.v = tf.layers.dense(
                hidden,
                1,
                activation=None,
                kernel_initializer=tf.random_normal_initializer(mean=0.0, stddev=0.1),
                bias_initializer=tf.constant_initializer(value=0.1, dtype=tf.float32),
                name='V',
            )

        with tf.variable_scope('squared_TD_error'):
            # TD_error = (r + gamma * V_next) - V_eval
            td_target = self.r + GAMMA * self.v_
            self.td_error = td_target - self.v
            self.loss = tf.square(self.td_error)
        with tf.variable_scope('train'):
            optimizer = tf.train.AdamOptimizer(lr)
            self.train_op = optimizer.minimize(self.loss)

    def learn(self, s, r, s_):
        """Run one training step and return the TD error for the transition.

        Args:
            s: 1-D array for the current state.
            r: reward received for the transition.
            s_: 1-D array for the next state.

        Returns:
            The TD error fetched in the same run as the training op.
        """
        state_row = s[np.newaxis, :]
        next_state_row = s_[np.newaxis, :]

        # V(s') is evaluated in its own run and then fed back in as a constant.
        next_value = self.sess.run(self.v, {self.s: next_state_row})
        feeds = {self.s: state_row, self.v_: next_value, self.r: r}
        td_error, _ = self.sess.run([self.td_error, self.train_op], feeds)
        return td_error


In [8]:
# A single session is shared by both networks.
sess = tf.Session()

# The critic is given the larger learning rate (LR_C vs. LR_A): it acts as the
# teacher, so it should learn faster than the actor.
actor = Actor(sess, N_F, N_A, lr=LR_A)
critic = Critic(sess, N_F, lr=LR_C)

# Initialise the variables of both networks now that the graph is complete.
sess.run(tf.global_variables_initializer())


Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


In [9]:
if OUTPUT_GRAPH:
    tf.summary.FileWriter("logs/", sess.graph)

# Exponentially smoothed episode return, used for logging and for the render
# switch. It is reset every time this cell runs instead of being looked up in
# globals(), so re-executing the cell cannot silently continue from a value
# left behind by an earlier run.
running_reward = None

for i_episode in range(MAX_EPISODE):
    s = env.reset()
    t = 0
    track_r = []  # rewards collected during this episode
    while True:
        if RENDER:
            env.render()

        a = actor.choose_action(s)
        s_, r, done, info = env.step(a)

        if done:
            r = -20  # the terminating step gets a fixed penalty in place of the environment's reward

        track_r.append(r)

        # The critic is updated first; its TD error then weights the actor's update.
        td_error = critic.learn(s, r, s_)  # gradient = grad[r + gamma * V(s_) - V(s)]
        actor.learn(s, a, td_error)     # true_gradient = grad[logPi(s,a) * td_error]

        s = s_
        t += 1

        if done or t >= MAX_EP_STEPS:
            ep_rs_sum = sum(track_r)

            # The first episode seeds the average; later ones are blended in with weight 0.05.
            if running_reward is None:
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True  # rendering
            print("episode:", i_episode, "  reward:", int(running_reward))
            break


episode: 0   reward: -7
episode: 1   reward: -6
episode: 2   reward: -5
episode: 3   reward: -5
episode: 4   reward: -5
episode: 5   reward: -5
episode: 6   reward: -6
episode: 7   reward: -6
episode: 8   reward: -6
episode: 9   reward: -6
episode: 10   reward: -6
episode: 11   reward: -6
episode: 12   reward: -5
episode: 13   reward: -5
episode: 14   reward: -5
episode: 15   reward: -5
episode: 16   reward: -5
episode: 17   reward: -5
episode: 18   reward: -5
episode: 19   reward: -4
episode: 20   reward: -4
episode: 21   reward: -3
episode: 22   reward: -3
episode: 23   reward: -3
episode: 24   reward: -4
episode: 25   reward: -3
episode: 26   reward: -3
episode: 27   reward: -4
episode: 28   reward: -3
episode: 29   reward: -3
episode: 30   reward: -2
episode: 31   reward: -2
episode: 32   reward: -2
episode: 33   reward: -1
episode: 34   reward: -1
episode: 35   reward: -2
episode: 36   reward: -1
episode: 37   reward: -2
episode: 38   reward: 0
episode: 39   reward: 0
episode: 40 

episode: 316   reward: 145
episode: 317   reward: 142
episode: 318   reward: 140
episode: 319   reward: 133
episode: 320   reward: 127
episode: 321   reward: 125
episode: 322   reward: 123
episode: 323   reward: 121
episode: 324   reward: 119
episode: 325   reward: 119
episode: 326   reward: 117
episode: 327   reward: 115
episode: 328   reward: 113
episode: 329   reward: 110
episode: 330   reward: 108
episode: 331   reward: 104
episode: 332   reward: 103
episode: 333   reward: 103
episode: 334   reward: 103
episode: 335   reward: 98
episode: 336   reward: 96
episode: 337   reward: 96
episode: 338   reward: 96
episode: 339   reward: 97
episode: 340   reward: 107
episode: 341   reward: 112
episode: 342   reward: 113
episode: 343   reward: 113
episode: 344   reward: 114
episode: 345   reward: 115
episode: 346   reward: 118
episode: 347   reward: 123
episode: 348   reward: 141
episode: 349   reward: 153
episode: 350   reward: 165
episode: 351   reward: 165
episode: 352   reward: 176
episod

episode: 621   reward: 123
episode: 622   reward: 118
episode: 623   reward: 117
episode: 624   reward: 116
episode: 625   reward: 115
episode: 626   reward: 115
episode: 627   reward: 115
episode: 628   reward: 114
episode: 629   reward: 109
episode: 630   reward: 103
episode: 631   reward: 99
episode: 632   reward: 94
episode: 633   reward: 91
episode: 634   reward: 86
episode: 635   reward: 86
episode: 636   reward: 82
episode: 637   reward: 82
episode: 638   reward: 83
episode: 639   reward: 83
episode: 640   reward: 83
episode: 641   reward: 84
episode: 642   reward: 83
episode: 643   reward: 80
episode: 644   reward: 78
episode: 645   reward: 77
episode: 646   reward: 74
episode: 647   reward: 74
episode: 648   reward: 72
episode: 649   reward: 73
episode: 650   reward: 73
episode: 651   reward: 74
episode: 652   reward: 76
episode: 653   reward: 78
episode: 654   reward: 79
episode: 655   reward: 81
episode: 656   reward: 83
episode: 657   reward: 84
episode: 658   reward: 89
ep

episode: 927   reward: 153
episode: 928   reward: 156
episode: 929   reward: 165
episode: 930   reward: 168
episode: 931   reward: 169
episode: 932   reward: 169
episode: 933   reward: 166
episode: 934   reward: 166
episode: 935   reward: 165
episode: 936   reward: 163
episode: 937   reward: 161
episode: 938   reward: 154
episode: 939   reward: 147
episode: 940   reward: 143
episode: 941   reward: 136
episode: 942   reward: 133
episode: 943   reward: 131
episode: 944   reward: 148
episode: 945   reward: 150
episode: 946   reward: 148
episode: 947   reward: 141
episode: 948   reward: 133
episode: 949   reward: 127
episode: 950   reward: 127
episode: 951   reward: 130
episode: 952   reward: 133
episode: 953   reward: 134
episode: 954   reward: 128
episode: 955   reward: 127
episode: 956   reward: 125
episode: 957   reward: 119
episode: 958   reward: 113
episode: 959   reward: 108
episode: 960   reward: 104
episode: 961   reward: 106
episode: 962   reward: 111
episode: 963   reward: 116
e

episode: 1224   reward: 164
episode: 1225   reward: 175
episode: 1226   reward: 171
episode: 1227   reward: 163
episode: 1228   reward: 165
episode: 1229   reward: 158
episode: 1230   reward: 154
episode: 1231   reward: 146
episode: 1232   reward: 138
episode: 1233   reward: 132
episode: 1234   reward: 127
episode: 1235   reward: 120
episode: 1236   reward: 115
episode: 1237   reward: 111
episode: 1238   reward: 110
episode: 1239   reward: 109
episode: 1240   reward: 107
episode: 1241   reward: 104
episode: 1242   reward: 99
episode: 1243   reward: 95
episode: 1244   reward: 92
episode: 1245   reward: 92
episode: 1246   reward: 97
episode: 1247   reward: 98
episode: 1248   reward: 99
episode: 1249   reward: 99
episode: 1250   reward: 98
episode: 1251   reward: 98
episode: 1252   reward: 99
episode: 1253   reward: 98
episode: 1254   reward: 98
episode: 1255   reward: 97
episode: 1256   reward: 97
episode: 1257   reward: 96
episode: 1258   reward: 94
episode: 1259   reward: 91
episode: 1

episode: 1521   reward: 126
episode: 1522   reward: 126
episode: 1523   reward: 127
episode: 1524   reward: 126
episode: 1525   reward: 125
episode: 1526   reward: 124
episode: 1527   reward: 125
episode: 1528   reward: 125
episode: 1529   reward: 124
episode: 1530   reward: 125
episode: 1531   reward: 128
episode: 1532   reward: 129
episode: 1533   reward: 128
episode: 1534   reward: 128
episode: 1535   reward: 129
episode: 1536   reward: 131
episode: 1537   reward: 132
episode: 1538   reward: 133
episode: 1539   reward: 137
episode: 1540   reward: 140
episode: 1541   reward: 143
episode: 1542   reward: 142
episode: 1543   reward: 143
episode: 1544   reward: 142
episode: 1545   reward: 140
episode: 1546   reward: 139
episode: 1547   reward: 137
episode: 1548   reward: 136
episode: 1549   reward: 135
episode: 1550   reward: 132
episode: 1551   reward: 131
episode: 1552   reward: 129
episode: 1553   reward: 127
episode: 1554   reward: 124
episode: 1555   reward: 122
episode: 1556   rewa

episode: 1817   reward: 84
episode: 1818   reward: 82
episode: 1819   reward: 81
episode: 1820   reward: 82
episode: 1821   reward: 79
episode: 1822   reward: 78
episode: 1823   reward: 77
episode: 1824   reward: 74
episode: 1825   reward: 74
episode: 1826   reward: 75
episode: 1827   reward: 76
episode: 1828   reward: 80
episode: 1829   reward: 84
episode: 1830   reward: 95
episode: 1831   reward: 119
episode: 1832   reward: 120
episode: 1833   reward: 121
episode: 1834   reward: 119
episode: 1835   reward: 117
episode: 1836   reward: 113
episode: 1837   reward: 111
episode: 1838   reward: 110
episode: 1839   reward: 109
episode: 1840   reward: 108
episode: 1841   reward: 105
episode: 1842   reward: 104
episode: 1843   reward: 102
episode: 1844   reward: 99
episode: 1845   reward: 98
episode: 1846   reward: 95
episode: 1847   reward: 92
episode: 1848   reward: 90
episode: 1849   reward: 88
episode: 1850   reward: 87
episode: 1851   reward: 88
episode: 1852   reward: 89
episode: 1853  

episode: 2113   reward: 102
episode: 2114   reward: 100
episode: 2115   reward: 100
episode: 2116   reward: 101
episode: 2117   reward: 104
episode: 2118   reward: 110
episode: 2119   reward: 115
episode: 2120   reward: 109
episode: 2121   reward: 117
episode: 2122   reward: 116
episode: 2123   reward: 114
episode: 2124   reward: 112
episode: 2125   reward: 111
episode: 2126   reward: 107
episode: 2127   reward: 106
episode: 2128   reward: 105
episode: 2129   reward: 104
episode: 2130   reward: 104
episode: 2131   reward: 103
episode: 2132   reward: 101
episode: 2133   reward: 99
episode: 2134   reward: 98
episode: 2135   reward: 97
episode: 2136   reward: 97
episode: 2137   reward: 99
episode: 2138   reward: 102
episode: 2139   reward: 108
episode: 2140   reward: 122
episode: 2141   reward: 135
episode: 2142   reward: 144
episode: 2143   reward: 145
episode: 2144   reward: 144
episode: 2145   reward: 144
episode: 2146   reward: 144
episode: 2147   reward: 143
episode: 2148   reward: 1

episode: 2413   reward: 48
episode: 2414   reward: 50
episode: 2415   reward: 52
episode: 2416   reward: 54
episode: 2417   reward: 56
episode: 2418   reward: 58
episode: 2419   reward: 63
episode: 2420   reward: 70
episode: 2421   reward: 78
episode: 2422   reward: 81
episode: 2423   reward: 85
episode: 2424   reward: 90
episode: 2425   reward: 93
episode: 2426   reward: 95
episode: 2427   reward: 97
episode: 2428   reward: 97
episode: 2429   reward: 96
episode: 2430   reward: 93
episode: 2431   reward: 91
episode: 2432   reward: 91
episode: 2433   reward: 89
episode: 2434   reward: 89
episode: 2435   reward: 90
episode: 2436   reward: 90
episode: 2437   reward: 90
episode: 2438   reward: 91
episode: 2439   reward: 92
episode: 2440   reward: 95
episode: 2441   reward: 109
episode: 2442   reward: 122
episode: 2443   reward: 124
episode: 2444   reward: 123
episode: 2445   reward: 125
episode: 2446   reward: 129
episode: 2447   reward: 132
episode: 2448   reward: 143
episode: 2449   rewa

episode: 2709   reward: 194
episode: 2710   reward: 184
episode: 2711   reward: 176
episode: 2712   reward: 167
episode: 2713   reward: 159
episode: 2714   reward: 151
episode: 2715   reward: 143
episode: 2716   reward: 135
episode: 2717   reward: 128
episode: 2718   reward: 121
episode: 2719   reward: 115
episode: 2720   reward: 108
episode: 2721   reward: 103
episode: 2722   reward: 97
episode: 2723   reward: 92
episode: 2724   reward: 87
episode: 2725   reward: 83
episode: 2726   reward: 79
episode: 2727   reward: 75
episode: 2728   reward: 71
episode: 2729   reward: 68
episode: 2730   reward: 64
episode: 2731   reward: 62
episode: 2732   reward: 59
episode: 2733   reward: 56
episode: 2734   reward: 54
episode: 2735   reward: 51
episode: 2736   reward: 49
episode: 2737   reward: 47
episode: 2738   reward: 45
episode: 2739   reward: 43
episode: 2740   reward: 41
episode: 2741   reward: 39
episode: 2742   reward: 38
episode: 2743   reward: 36
episode: 2744   reward: 34
episode: 2745  