# ===========================
# Critic DNN
# ===========================
# Uses the TensorFlow 1.x graph API (tf.placeholder, tf.train.AdamOptimizer).
import tensorflow as tf

# Network parameters - hidden layer sizes
n_hidden_1 = 400
n_hidden_2 = 300


def weight_variable(shape):
    """Weight initialized from a truncated normal with small stddev."""
    initial = tf.truncated_normal(shape, stddev=0.01)
    return tf.Variable(initial)


def bias_variable(shape):
    """Bias initialized to a small positive constant."""
    initial = tf.constant(0.03, shape=shape)
    return tf.Variable(initial)
class CriticNetwork(object):
    """
    Input to the network is the state and action, output is Q(s,a).
    The action must be obtained from the output of the Actor network.
    """

    def __init__(self, sess, state_dim, action_dim, learning_rate, tau, num_actor_vars):
        self.sess = sess
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.learning_rate = learning_rate
        self.tau = tau

        # Create the critic network
        self.inputs, self.action, self.out = self.create_critic_network()
        self.network_params = tf.trainable_variables()[num_actor_vars:]

        # Target network (same architecture, separate weights)
        self.target_inputs, self.target_action, self.target_out = self.create_critic_network()
        self.target_network_params = tf.trainable_variables()[(len(self.network_params) + num_actor_vars):]

        # Op for periodically updating the target network with a soft (Polyak)
        # blend of the online weights: theta_target <- tau*theta + (1 - tau)*theta_target
        self.update_target_network_params = \
            [self.target_network_params[i].assign(
                tf.multiply(self.network_params[i], self.tau)
                + tf.multiply(self.target_network_params[i], 1. - self.tau))
             for i in range(len(self.target_network_params))]

        # Network target (y_i)
        self.predicted_q_value = tf.placeholder(tf.float32, [None, 1])

        # Define loss (root-mean-squared error between y_i and Q(s,a)) and optimization op
        self.loss = tf.sqrt(tf.reduce_mean(tf.square(tf.subtract(self.predicted_q_value, self.out))))
        self.optimize = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

        # Gradient of the critic output w.r.t. the action, used to train the actor
        self.action_grads = tf.gradients(self.out, self.action)
    def create_critic_network(self):
        inputs = tf.placeholder(tf.float32, [None, self.s_dim])
        action = tf.placeholder(tf.float32, [None, self.a_dim])

        # Input -> hidden layer
        w1 = weight_variable([self.s_dim, n_hidden_1])
        b1 = bias_variable([n_hidden_1])
        # Hidden layer -> hidden layer + action
        w2 = weight_variable([n_hidden_1, n_hidden_2])
        w2a = weight_variable([self.a_dim, n_hidden_2])
        b2 = bias_variable([n_hidden_2])
        # Hidden layer -> output (Q)
        w3 = weight_variable([n_hidden_2, 1])
        b3 = bias_variable([1])

        # 1st hidden layer; activation options: softmax, relu, tanh or sigmoid
        h1 = tf.nn.relu(tf.matmul(inputs, w1) + b1)
        # 2nd hidden layer; the action is inserted here
        h2 = tf.nn.relu(tf.matmul(h1, w2) + tf.matmul(action, w2a) + b2)

        out = tf.matmul(h2, w3) + b3
        return inputs, action, out
    def train(self, inputs, action, predicted_q_value):
        """Run one optimization step against the targets y_i and return Q(s,a)."""
        return self.sess.run([self.out, self.optimize], feed_dict={
            self.inputs: inputs,
            self.action: action,
            self.predicted_q_value: predicted_q_value
        })

    def predict(self, inputs, action):
        """Q(s,a) from the online network."""
        return self.sess.run(self.out, feed_dict={
            self.inputs: inputs,
            self.action: action
        })

    def predict_target(self, inputs, action):
        """Q(s,a) from the target network."""
        return self.sess.run(self.target_out, feed_dict={
            self.target_inputs: inputs,
            self.target_action: action
        })

    def action_gradients(self, inputs, actions):
        """Gradient of Q(s,a) w.r.t. the action, used for the actor update."""
        return self.sess.run(self.action_grads, feed_dict={
            self.inputs: inputs,
            self.action: actions
        })

    def update_target_network(self):
        """Soft-update the target network towards the online network."""
        self.sess.run(self.update_target_network_params)
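

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original file): a minimal, hypothetical wiring
# of CriticNetwork into a TF1 session. The dimensions, learning rate, tau and
# num_actor_vars below are illustrative assumptions, and the random batch
# stands in for replay-buffer samples.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import numpy as np

    with tf.Session() as sess:
        # Assumed example sizes; in DDPG, num_actor_vars would be
        # len(tf.trainable_variables()) measured after building the actor.
        critic = CriticNetwork(sess, state_dim=3, action_dim=1,
                               learning_rate=0.001, tau=0.01, num_actor_vars=0)
        sess.run(tf.global_variables_initializer())

        # Fake batch in place of replay-buffer samples
        s = np.random.randn(32, 3).astype(np.float32)
        a = np.random.randn(32, 1).astype(np.float32)
        y = np.random.randn(32, 1).astype(np.float32)  # stand-in for r + gamma * Q'(s2, mu'(s2))

        q, _ = critic.train(s, a, y)            # one gradient step on the critic
        grads = critic.action_gradients(s, a)   # dQ/da, fed to the actor update
        critic.update_target_network()          # soft (Polyak) target update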