## Policy Gradient

#### Imports

In [1]:
import numpy as np
import tensorflow as tf
# Seed NumPy's global RNG so the randomly drawn transition probabilities are reproducible
np.random.seed(1337)

#### Parameters

In [2]:
# Size of the problem
n_states = 10    # Number of states
n_actions = 4    # Number of actions

# Discount factor
gamma = 0.9

# Stopping controls
tolerance = 1e-5        # Convergence criteria
max_iterations = 100    # Maximum number of iterations

# Model training settings
learning_rate = 0.8     # Learning rate
n_epoch = 500           # Number of model training epochs

#### Set rewards R(s,a)

In [3]:
# R(s, a) table: zero everywhere, then the last two state rows are filled
# with the same value for every action.
rewards = np.zeros((n_states, n_actions))
rewards[-1, :] = 1   # Goal state
rewards[-2, :] = -1  # Penalty state

#### Define transition probabilities

In [4]:
# Draw a random non-negative weight for every (state, action, next_state) triple.
transition_prob = np.random.random([n_states, n_actions, n_states])
# Normalization: divide by the sum over next states so that each
# (state, action) row is a probability distribution. The trailing new axis
# lets the per-row sums broadcast across the next-state dimension.
s = transition_prob.sum(axis=-1)
transition_prob = transition_prob / s[:, :, np.newaxis]
# Terminal states: every action loops back to the same state with probability 1.
transition_prob[-1] = 0 # Make goal state terminal
transition_prob[-1,:,-1] = 1 # Goal state transitions only to itself
transition_prob[-2] = 0 # Make penalty state terminal
transition_prob[-2,:,-2] = 1 # Penalty state transitions only to itself

#### Define Policy Network

In [7]:
# Linear network: a batch of state vectors is multiplied by a single
# zero-initialised weight matrix to give one output per action.
inputs = tf.placeholder(dtype=tf.float32, shape=[None, n_states])
weights = tf.Variable(initial_value=tf.zeros(shape=[n_states, n_actions]))
outputs = tf.matmul(inputs, weights)

# Regression targets with the same per-action layout as `outputs`.
targets = tf.placeholder(dtype=tf.float32, shape=[None, n_actions])

# Greedy action for each input row: index of its largest output.
selected_actions = tf.argmax(outputs, axis=1)

#### Define loss and optimizer

In [8]:
# Mean squared error between the fed targets and the network outputs.
loss = tf.losses.mean_squared_error(labels=targets, predictions=outputs)

# Training op: each run applies one Adam update that reduces `loss`.
optimizer = tf.train.AdamOptimizer(
    learning_rate=learning_rate,
).minimize(loss)

#### Policy gradient through Bellman updates until convergence

In [11]:
init = tf.global_variables_initializer()
state_one_hot = np.eye(n_states)  # Row i is the one-hot encoding of state i
state_idx = np.arange(n_states)   # Hoisted: used to pick one action per state

with tf.Session() as sess:
    sess.run(init)
    I = np.eye(n_states, n_states)
    for itr in range(1, max_iterations + 1):
        # Greedy policy of the current network: one action index per state.
        policy = sess.run(selected_actions, feed_dict={inputs: state_one_hot})

        # Reward vector and state-to-state transition matrix under that policy.
        rewards_p = rewards[state_idx, policy]
        transition_prob_p = transition_prob[state_idx, policy, :]

        # Policy Evaluation: solve (I - gamma * P_pi) V = R_pi directly
        # instead of forming the explicit inverse.
        state_values = np.linalg.solve(I - gamma*transition_prob_p, rewards_p)

        # One-step Bellman backup gives the regression target for every (s, a).
        action_values = rewards + gamma*np.dot(transition_prob, state_values)

        _, mse = sess.run([optimizer, loss], feed_dict={inputs: state_one_hot, targets: action_values})
        print('Iteration %d, Error %f' % (itr, mse))

        # Converged: the network outputs already match the Bellman targets
        # of their own greedy policy to within `tolerance`.
        if mse < tolerance:
            break

    # Re-read the greedy policy after the last weight update so the reported
    # policy reflects the trained weights rather than the pre-update ones.
    policy = sess.run(selected_actions, feed_dict={inputs: state_one_hot})
print('Learned Policy', policy)

Iteration 1, Error 20.790823
Iteration 2, Error 20.227566
Iteration 3, Error 16.313326
Iteration 4, Error 12.657439
Iteration 5, Error 9.819757
Iteration 6, Error 7.448901
Iteration 7, Error 5.753360
Iteration 8, Error 4.474958
Iteration 9, Error 3.525198
Iteration 10, Error 2.596204
Iteration 11, Error 1.863461
Iteration 12, Error 1.196033
Iteration 13, Error 0.674459
Iteration 14, Error 0.309178
Iteration 15, Error 0.128507
Iteration 16, Error 0.103010
Iteration 17, Error 0.217050
Iteration 18, Error 0.418176
Iteration 19, Error 0.551833
Iteration 20, Error 0.754273
Iteration 21, Error 0.924122
Iteration 22, Error 1.183784
Iteration 23, Error 1.218796
Iteration 24, Error 1.089127
Iteration 25, Error 1.076974
Iteration 26, Error 1.016996
Iteration 27, Error 0.937115
Iteration 28, Error 0.841047
Iteration 29, Error 0.715186
Iteration 30, Error 0.596040
Iteration 31, Error 0.462986
Iteration 32, Error 0.352280
Iteration 33, Error 0.254923
Iteration 34, Error 0.171440
Iteration 35, Error