In [1]:
'''
author: Thyrix Yang
github: https://github.com/ThyrixYang
'''

import tensorflow as tf 
import numpy as np 
import tensorflow.contrib.slim as slim
import gym
import random
from collections import deque
from numpy.random import normal
import time
# Wall-clock reference taken at cell start; the __main__ guard prints the elapsed time from it.
start_time = time.time()

# Accumulated reward of every evaluation episode, appended by train()'s test() helper.
scores = []

# Multiplier applied to the bootstrapped Q-value when forming the critic target.
discount_factor = 0.9

# Width of the state placeholder (observation vector length).
ob_len = 3
# Width of the action placeholder and number of actor output units.
action_len = 1
# Number of units in every hidden fully-connected layer of the actor and the critic.
hidden_size = 16

# Interpolation weight of the soft target update: target <- tau*online + (1-tau)*target.
tau = 0.05
# Learning rate passed to both Adam optimizers (critic and actor).
learn_rate = 1e-3

# Bounded FIFO buffer of [observation, action, reward, new_observation] transitions.
replay_memory = deque(maxlen=1000000)

def sample_from_memory(batch_size):
    """Draw `batch_size` distinct transitions uniformly at random from `replay_memory`.

    Raises ValueError (from `random.sample`) when the buffer holds fewer than
    `batch_size` transitions.
    """
    minibatch = random.sample(replay_memory, batch_size)
    return minibatch

def build_actor(state_input):
    """Build the actor network that maps a batch of states to a batch of actions.

    The network is four hidden layers of `hidden_size` units (ReLU, ReLU, ReLU,
    tanh) followed by a tanh output layer of `action_len` units.  The tanh
    output is multiplied by 2, so each action component lies in [-2, 2].

    Args:
        state_input: tensor holding the batch of states.

    Returns:
        Tensor with one row of `action_len` action values per input state.
    """
    hidden_activations = (tf.nn.relu, tf.nn.relu, tf.nn.relu, tf.nn.tanh)
    net = state_input
    # Layers are created one after another, in the order listed above.
    for activation in hidden_activations:
        net = slim.fully_connected(net, hidden_size, activation_fn=activation)
    squashed = slim.fully_connected(net, action_len, activation_fn=tf.nn.tanh)
    return squashed * 2

def build_critic(state_input, action_input):
    """Build the critic network that scores (state, action) pairs.

    State and action are concatenated along axis 1, flattened, passed through
    four hidden layers of `hidden_size` units (ReLU, ReLU, tanh, tanh) and
    reduced to a single linear output unit.

    Args:
        state_input: tensor holding the batch of states.
        action_input: tensor holding the batch of actions.

    Returns:
        Tensor with one unbounded scalar value per (state, action) pair.
    """
    joined = tf.concat([state_input, action_input], axis=1)
    net = slim.flatten(joined)
    # Layers are created one after another, in the order listed here.
    for activation in (tf.nn.relu, tf.nn.relu, tf.nn.tanh, tf.nn.tanh):
        net = slim.fully_connected(net, hidden_size, activation_fn=activation)
    return slim.fully_connected(net, 1, activation_fn=None)

# Placeholders fed at run time: a batch of states, a batch of actions, and the
# regression targets for the critic.
state_input_ph = tf.placeholder(tf.float32, shape=(None, ob_len))
action_input_ph = tf.placeholder(tf.float32, shape=(None, action_len))
target_q_ph = tf.placeholder(tf.float32, shape=(None, 1))

# Online networks.
with tf.variable_scope("actor"):
    actor_output = build_actor(state_input_ph)

with tf.variable_scope("critic"):
    critic_output = build_critic(state_input_ph, action_input_ph)

# Slowly-tracking copies of the online networks.
# NOTE: target_actor_output is not referenced by any other code visible in this notebook.
with tf.variable_scope("target_actor"):
    target_actor_output = build_actor(state_input_ph)

# The target critic is evaluated on the online actor's action, which connects
# it to the actor weights; the actor loss below is built on this output.
with tf.variable_scope("target_critic"):
    target_critic_output = build_critic(state_input_ph, actor_output)

# Variables of each network, looked up by scope name.
# NOTE(review): these lists are gathered before the optimizers are created;
# keep that order in case the optimizers add variables under the same scopes.
actor_weights = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='actor')
critic_weights = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='critic')
target_actor_weights = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_actor')
target_critic_weights = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_critic')

# Soft update ops, actor pairs first and critic pairs second:
#   target <- tau * online + (1 - tau) * target
update_target_ops = [
    target_var.assign(tau*online_var + (1-tau)*target_var)
    for online_vars, target_vars in ((actor_weights, target_actor_weights),
                                     (critic_weights, target_critic_weights))
    for online_var, target_var in zip(online_vars, target_vars)
]

# Critic: mean squared error against the fed targets, updating critic weights only.
critic_lose = tf.reduce_mean(tf.square(target_q_ph-critic_output))
critic_optimizer = tf.train.AdamOptimizer(learn_rate).minimize(critic_lose, var_list=critic_weights)

# Actor: maximise the target critic's value of the actor's own action by
# minimising its negated mean, updating actor weights only.
actor_lose = tf.reduce_mean(-target_critic_output)
actor_optimizer = tf.train.AdamOptimizer(learn_rate).minimize(actor_lose, var_list=actor_weights)



def train():
    """Train the actor and critic on Pendulum-v0, evaluating every 10 episodes.

    Creates a TensorFlow session and a Monitor-wrapped Gym environment, then
    for each episode:
      * collects transitions with Gaussian exploration noise added to the
        actor's action and stores them in the module-level `replay_memory`;
      * runs up to 100 mini-batch updates (critic step, actor step, soft
        target update);
      * on every 10th episode runs one noise-free evaluation episode and
        appends its accumulated reward to the module-level `scores` list.

    The session and the environment are closed when training finishes or when
    an exception propagates, so the Monitor wrapper can finalize its recordings
    and the session's resources are released.
    """
    sess = tf.Session()
    env = None  # created inside the try block so cleanup covers partial setup

    def select_action(state):
        """Return the actor's output for `state`, one row of actions per state."""
        state = np.reshape(np.array(state), (-1, ob_len))
        return sess.run(actor_output, {state_input_ph: state})

    def batch_update(batch_size):
        """Apply one critic step, one actor step and one soft target update."""
        batch = sample_from_memory(batch_size)
        state_0 = np.reshape(np.vstack([b[0] for b in batch]), (-1, ob_len))
        action_0 = np.reshape(np.vstack([b[1] for b in batch]), (-1, action_len))
        reward_0 = np.reshape(np.vstack([b[2] for b in batch]), (-1, 1))
        state_1 = np.reshape(np.vstack([b[3] for b in batch]), (-1, ob_len))

        # Bootstrapped target computed with the online actor and critic.
        # Terminal flags are not stored in the replay buffer, so no terminal
        # masking is applied.
        action_1 = sess.run(actor_output, {state_input_ph: state_1})
        q = sess.run(critic_output, {state_input_ph: state_1, action_input_ph: action_1})
        target_q = reward_0 + discount_factor*q

        sess.run(critic_optimizer,
                 {state_input_ph: state_0,
                  action_input_ph: action_0,
                  target_q_ph: target_q})
        sess.run(actor_optimizer, {state_input_ph: state_0})
        sess.run(update_target_ops)

    def test():
        """Run one rendered episode without exploration noise and record its return."""
        observation = env.reset()
        accreward = 0
        while True:
            env.render()
            action = select_action(observation)
            observation, reward, done, info = env.step(action)
            accreward += reward
            if done:
                print("test end with reward: {}".format(accreward))
                scores.append(accreward)
                break

    try:
        sess.run(tf.global_variables_initializer())

        env = gym.make("Pendulum-v0")
        env = gym.wrappers.Monitor(env, '/tmp/experiment-4', force=True)
        epoch = 2000

        # Standard deviation of the exploration noise; decays geometrically
        # by `noise_rate` at the start of every episode.
        noise_std = 4
        noise_rate = 0.995
        for ep in range(epoch):
            observation = env.reset()
            print("at ep: {}".format(ep))
            noise_std *= noise_rate
            while True:
                action = select_action(observation) + normal(0, noise_std)
                new_observation, reward, done, info = env.step(action)
                new_observation = np.reshape(new_observation, (-1, ob_len))
                replay_memory.append([observation, action, reward, new_observation])
                observation = new_observation
                if done:
                    break

            # No updates happen until the buffer holds at least 256
            # transitions, which also guarantees a batch of 128 can be sampled.
            for _ in range(min(100, len(replay_memory) // 256)):
                batch_update(128)

            if (ep % 10 == 0):
                print("start test at ep: {}".format(ep))
                test()
    finally:
        # Close the environment first so the Monitor wrapper can finalize its
        # output, then release the session even if closing the env fails.
        try:
            if env is not None:
                env.close()
        finally:
            sess.close()



if __name__ == '__main__':
    # Run the whole training procedure, then report the wall-clock time
    # elapsed since `start_time` was recorded.
    train()
    elapsed_seconds = time.time() - start_time
    print("--- %s seconds ---" % elapsed_seconds)

at ep: 0
start test at ep: 0
test end with reward: [-1603.6501]
at ep: 1
at ep: 2
at ep: 3
at ep: 4
at ep: 5
at ep: 6
at ep: 7
at ep: 8
at ep: 9
at ep: 10
start test at ep: 10
test end with reward: [-1626.6912]
at ep: 11
at ep: 12
at ep: 13
at ep: 14
at ep: 15
at ep: 16
at ep: 17
at ep: 18
at ep: 19
at ep: 20
start test at ep: 20
test end with reward: [-1074.3419]
at ep: 21
at ep: 22
at ep: 23
at ep: 24
at ep: 25
at ep: 26
at ep: 27
at ep: 28
at ep: 29
at ep: 30
start test at ep: 30
test end with reward: [-1737.4834]
at ep: 31
at ep: 32
at ep: 33
at ep: 34
at ep: 35
at ep: 36
at ep: 37
at ep: 38
at ep: 39
at ep: 40
start test at ep: 40
test end with reward: [-1799.306]
at ep: 41
at ep: 42
at ep: 43
at ep: 44
at ep: 45
at ep: 46
at ep: 47
at ep: 48
at ep: 49
at ep: 50
start test at ep: 50
test end with reward: [-1814.3507]
at ep: 51
at ep: 52
at ep: 53
at ep: 54
at ep: 55
at ep: 56
at ep: 57
at ep: 58
at ep: 59
at ep: 60
start test at ep: 60
test end with reward: [-1010.56354]
at ep: 61

at ep: 498
at ep: 499
at ep: 500
start test at ep: 500
test end with reward: [-134.24998]
at ep: 501
at ep: 502
at ep: 503
at ep: 504
at ep: 505
at ep: 506
at ep: 507
at ep: 508
at ep: 509
at ep: 510
start test at ep: 510
test end with reward: [-130.61238]
at ep: 511
at ep: 512
at ep: 513
at ep: 514
at ep: 515
at ep: 516
at ep: 517
at ep: 518
at ep: 519
at ep: 520
start test at ep: 520
test end with reward: [-127.99491]
at ep: 521
at ep: 522
at ep: 523
at ep: 524
at ep: 525
at ep: 526
at ep: 527
at ep: 528
at ep: 529
at ep: 530
start test at ep: 530
test end with reward: [-137.79411]
at ep: 531
at ep: 532
at ep: 533
at ep: 534
at ep: 535
at ep: 536
at ep: 537
at ep: 538
at ep: 539
at ep: 540
start test at ep: 540
test end with reward: [-133.7983]
at ep: 541
at ep: 542
at ep: 543
at ep: 544
at ep: 545
at ep: 546
at ep: 547
at ep: 548
at ep: 549
at ep: 550
start test at ep: 550
test end with reward: [-387.88812]
at ep: 551
at ep: 552
at ep: 553
at ep: 554
at ep: 555
at ep: 556
at ep: 557

at ep: 990
start test at ep: 990
test end with reward: [-1.6276273]
at ep: 991
at ep: 992
at ep: 993
at ep: 994
at ep: 995
at ep: 996
at ep: 997
at ep: 998
at ep: 999
at ep: 1000
start test at ep: 1000
test end with reward: [-129.47838]
at ep: 1001
at ep: 1002
at ep: 1003
at ep: 1004
at ep: 1005
at ep: 1006
at ep: 1007
at ep: 1008
at ep: 1009
at ep: 1010
start test at ep: 1010
test end with reward: [-130.20341]
at ep: 1011
at ep: 1012
at ep: 1013
at ep: 1014
at ep: 1015
at ep: 1016
at ep: 1017
at ep: 1018
at ep: 1019
at ep: 1020
start test at ep: 1020
test end with reward: [-132.02235]
at ep: 1021
at ep: 1022
at ep: 1023
at ep: 1024
at ep: 1025
at ep: 1026
at ep: 1027
at ep: 1028
at ep: 1029
at ep: 1030
start test at ep: 1030
test end with reward: [-127.108635]
at ep: 1031
at ep: 1032
at ep: 1033
at ep: 1034
at ep: 1035
at ep: 1036
at ep: 1037
at ep: 1038
at ep: 1039
at ep: 1040
start test at ep: 1040
test end with reward: [-134.08766]
at ep: 1041
at ep: 1042
at ep: 1043
at ep: 1044
at

test end with reward: [-131.69568]
at ep: 1451
at ep: 1452
at ep: 1453
at ep: 1454
at ep: 1455
at ep: 1456
at ep: 1457
at ep: 1458
at ep: 1459
at ep: 1460
start test at ep: 1460
test end with reward: [-253.54489]
at ep: 1461
at ep: 1462
at ep: 1463
at ep: 1464
at ep: 1465
at ep: 1466
at ep: 1467
at ep: 1468
at ep: 1469
at ep: 1470
start test at ep: 1470
test end with reward: [-130.01683]
at ep: 1471
at ep: 1472
at ep: 1473
at ep: 1474
at ep: 1475
at ep: 1476
at ep: 1477
at ep: 1478
at ep: 1479
at ep: 1480
start test at ep: 1480
test end with reward: [-132.6241]
at ep: 1481
at ep: 1482
at ep: 1483
at ep: 1484
at ep: 1485
at ep: 1486
at ep: 1487
at ep: 1488
at ep: 1489
at ep: 1490
start test at ep: 1490
test end with reward: [-1.126941]
at ep: 1491
at ep: 1492
at ep: 1493
at ep: 1494
at ep: 1495
at ep: 1496
at ep: 1497
at ep: 1498
at ep: 1499
at ep: 1500
start test at ep: 1500
test end with reward: [-129.85045]
at ep: 1501
at ep: 1502
at ep: 1503
at ep: 1504
at ep: 1505
at ep: 1506
at ep

at ep: 1912
at ep: 1913
at ep: 1914
at ep: 1915
at ep: 1916
at ep: 1917
at ep: 1918
at ep: 1919
at ep: 1920
start test at ep: 1920
test end with reward: [-131.13582]
at ep: 1921
at ep: 1922
at ep: 1923
at ep: 1924
at ep: 1925
at ep: 1926
at ep: 1927
at ep: 1928
at ep: 1929
at ep: 1930
start test at ep: 1930
test end with reward: [-130.4346]
at ep: 1931
at ep: 1932
at ep: 1933
at ep: 1934
at ep: 1935
at ep: 1936
at ep: 1937
at ep: 1938
at ep: 1939
at ep: 1940
start test at ep: 1940
test end with reward: [-257.50928]
at ep: 1941
at ep: 1942
at ep: 1943
at ep: 1944
at ep: 1945
at ep: 1946
at ep: 1947
at ep: 1948
at ep: 1949
at ep: 1950
start test at ep: 1950
test end with reward: [-393.5961]
at ep: 1951
at ep: 1952
at ep: 1953
at ep: 1954
at ep: 1955
at ep: 1956
at ep: 1957
at ep: 1958
at ep: 1959
at ep: 1960
start test at ep: 1960
test end with reward: [-130.80406]
at ep: 1961
at ep: 1962
at ep: 1963
at ep: 1964
at ep: 1965
at ep: 1966
at ep: 1967
at ep: 1968
at ep: 1969
at ep: 1970
star

In [2]:
# Convert every recorded evaluation reward to a plain Python float and number
# the evaluations 0..N-1.
cleaned = list(map(float, scores))
index = list(range(len(cleaned)))

In [3]:
import pandas as pd
import seaborn as sns

In [4]:
# Tabulate the evaluation rewards: one row per evaluation episode.
reward_columns = {'Index': index, 'Reward': cleaned}
dataframe = pd.DataFrame(reward_columns)

print(dataframe)

     Index       Reward
0        0 -1603.650146
1        1 -1626.691162
2        2 -1074.341919
3        3 -1737.483398
4        4 -1799.306030
5        5 -1814.350708
6        6 -1010.563538
7        7 -1493.694580
8        8 -1501.800781
9        9 -1491.698608
10      10  -133.336166
11      11  -380.229309
12      12  -127.590668
13      13  -132.965073
14      14  -393.823700
15      15  -128.657196
16      16    -7.654939
17      17  -123.364868
18      18  -127.021362
19      19  -127.131432
20      20  -134.114670
21      21    -8.705371
22      22   -12.082965
23      23  -378.169342
24      24  -132.289688
25      25  -383.888397
26      26    -8.035441
27      27  -130.832855
28      28  -276.631744
29      29  -130.901962
..     ...          ...
170    170  -129.775177
171    171  -134.323166
172    172  -129.119293
173    173  -394.339111
174    174  -130.018616
175    175  -133.332443
176    176  -135.592636
177    177  -140.571396
178    178  -128.473312
179    179  -248

In [5]:
from bokeh.io import output_file, output_notebook, show
from bokeh.plotting import figure

# Send the plot to a standalone file named "DDPG";
# uncomment output_notebook() to render inside the Jupyter notebook instead.
#output_notebook()
output_file("DDPG")

# Canvas for the reward curve. Each index is one evaluation episode, and an
# evaluation runs every 10 training episodes, hence "in tens".
p = figure(title='Actor-Critic DDPG Method',
           x_axis_label='Epoch Number (in tens)',
           y_axis_label='Reward',
           plot_width=900, plot_height=600)

# Single line glyph: evaluation reward against evaluation index.
reward_curve = dict(x=dataframe['Index'], y=dataframe['Reward'])
p.line(color='black', legend='Reward', line_width=5, **reward_curve)

# Display the plot.
show(p)

In [6]:
# Summary statistics (count, mean, std, quartiles, extremes) of the evaluation rewards.
s = pd.Series(data=cleaned)
s.describe()

count     200.000000
mean     -238.834711
std       316.182180
min     -1814.350708
25%      -262.508598
50%      -133.207481
75%      -129.447685
max        -0.158077
dtype: float64