In [1]:
import tensorflow as tf
import numpy as np

### Build_MLP

In [2]:
def build_mlp(input_placeholder, output_size, scope='nn_baseline', n_layers = 2, size = 64, activation=tf.tanh, output_activation=None):
    """
        Builds a feedforward neural network
        
        arguments:
            input_placeholder: placeholder variable for the state (batch_size, input_size)
            output_size: size of the output layer
            scope: variable scope of the network
            n_layers: number of hidden layers
            size: dimension of the hidden layer
            activation: activation of the hidden layers
            output_activation: activation of the ouput layers

        returns:
            output placeholder of the network (the result of a forward pass) 

        Hint: use tf.layers.dense    
    """
    # YOUR CODE HERE
    with tf.variable_scope(scope):
        x = input_placeholder
        for l in range(n_layers):
            x = tf.layers.dense(inputs=x, units=size, activation = activation)

        output_placeholder = tf.layers.dense(inputs = x, units = output_size, activation = output_activation)    
        
    return output_placeholder

In [3]:
tf.reset_default_graph()

input_placeholder = tf.placeholder(tf.float64, [128, 10])
build_mlp(input_placeholder, 5)

<tf.Tensor 'nn_baseline/dense_2/BiasAdd:0' shape=(128, 5) dtype=float64>

### Computation Graph

In [143]:
tf.reset_default_graph()

ob_dim = 1
ac_dim = 4
discrete = True

batch_size = 64

In [144]:
sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32)
if discrete == True:
    sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) 
else:
    sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) 
sy_adv_n = tf.placeholder(shape=[None], name="adv",dtype=tf.float32)

print(sy_ob_no)
print(sy_ac_na)
print(sy_adv_n)

Tensor("ob:0", shape=(?, 1), dtype=float32)
Tensor("ac:0", shape=(?,), dtype=int32)
Tensor("adv:0", shape=(?,), dtype=float32)


In [145]:
## Forward Pass

if discrete:
    sy_logits_na = build_mlp(sy_ob_no, ac_dim, scope = "policy")
    print(sy_logits_na)
    
else:
    sy_mean = build_mlp(sy_ob_no, ac_dim, scope = "policy")
    sy_logstd = tf.Variable(tf.zeros([ac_dim]), name = "std")
    print(sy_mean)
    print(sy_logstd)

Tensor("policy/dense_2/BiasAdd:0", shape=(?, 4), dtype=float32)


In [146]:
if discrete:
    sy_sampled_ac = tf.multinomial(sy_logits_na, num_samples = 1)

else:
    epsilon = tf.random_normal([ac_dim])
    sy_sampled_ac = sy_mean + epsilon * tf.exp(sy_logstd)
print(sy_sampled_ac)

Tensor("multinomial/Multinomial:0", shape=(?, 1), dtype=int64)


In [147]:
# Logits: probability of policy in forward pass, labels = action that has taken by policy
if discrete:
    sy_logprob_n = tf.nn.sparse_softmax_cross_entropy_with_logits(logits = sy_logits_na, labels = sy_ac_na)
else:
    sy_std = tf.exp(sy_logstd)
    # Z ~ N(0,1)
    sy_z  = (sy_ac_na - sy_mean) / sy_std
    sy_logprob_n = -0.5 * tf.reduce_sum(tf.square(sy_z), axis=1)
print(sy_logprob_n)

Tensor("SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits:0", shape=(?,), dtype=float32)


In [148]:
observations = np.array([0,1,0,1,0,1]).reshape(6,1)
if discrete:
    actions = np.array([1,3,1,2,0,1])
else:
    actions = np.array([[0,1,2,2],[1,1,2,2],[1,4,2,2],[1,0,5,3],[4,4,2,2],[1,1,6,5]]) 

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    prob = sess.run([sy_sampled_ac], feed_dict= {sy_ob_no: observations,
                                                sy_ac_na: actions})
print('log probability of taken action with given observation:')
print(prob)

log probability of taken action with given observation:
[array([[3],
       [1],
       [2],
       [2],
       [1],
       [2]])]


In [151]:
prob[0].shape

(6, 1)