## Fitted Value Iteration - Function Approximation

#### Imports

In [12]:
import numpy as np
import tensorflow as tf
np.random.seed(1337)

#### Parameters

In [13]:
# --- MDP definition ---
n_states, n_actions = 100, 4  # size of the state space / action space
gamma = 0.9                   # discount factor

# --- Fitting hyper-parameters ---
learning_rate = 0.8           # step size handed to the optimizer
n_epoch = 500                 # number of model training epochs
iterations = 100              # number of iterations
tolerance = 1e-3              # convergence criterion

#### Set state rewards

In [14]:
# Reward table indexed as rewards[state, action]; every entry starts at zero.
rewards = np.zeros((n_states, n_actions))
rewards[-1, :] = 1   # goal state: +1 for every action
rewards[-2, :] = -1  # penalty state: -1 for every action

#### Define transition probabilities

In [15]:
# Draw a random non-negative weight for every (state, action, next_state)
# triple, then normalise along the last axis so that each (state, action)
# row is a probability distribution over next states.
transition_prob = np.random.random([n_states, n_actions, n_states])
row_sums = transition_prob.sum(axis=-1, keepdims=True)  # shape (n_states, n_actions, 1)
transition_prob = transition_prob / row_sums  # broadcasts over the next-state axis

# Terminal states: whatever the action, the only possible transition is a self-loop.
transition_prob[-1] = 0          # Goal state: clear all outgoing transitions
transition_prob[-1, :, -1] = 1   # Goal state: self-loop with probability 1
transition_prob[-2] = 0          # Penalty state: clear all outgoing transitions
transition_prob[-2, :, -2] = 1   # Penalty state: self-loop with probability 1

#### Initialize value network

In [16]:
# Linear value model: V(x) = x . weights, with no bias and no non-linearity.
# Each input row has n_states features; the batch dimension is left open.
inputs = tf.placeholder(dtype=tf.float32, shape=[None, n_states])
# One weight per input feature, all initialised to zero.
weights = tf.Variable(initial_value=tf.zeros(shape=[n_states, 1]))
# Predicted value for each input row, shape [batch, 1].
outputs = tf.matmul(a=inputs, b=weights)
# Regression targets, one scalar per input row.
targets = tf.placeholder(dtype=tf.float32, shape=[None, 1])

#### Define loss and optimizer 

In [17]:
# Mean squared error between the regression targets and the network outputs.
loss = tf.losses.mean_squared_error(labels=targets, predictions=outputs)
# `optimizer` is the op returned by minimize(): running it applies one Adam
# update to the trainable variables.
adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
optimizer = adam.minimize(loss)

#### Fitted Value Iteration through Bellman updates until convergence

In [18]:
init = tf.global_variables_initializer()
# Row i of the identity matrix is the one-hot feature vector of state i, so
# feeding the whole matrix evaluates the network on every state at once.
state_one_hot = np.eye(n_states)

with tf.Session() as sess:
    sess.run(init)
    # Current value estimate for every state, flattened to shape (n_states,).
    v_theta = sess.run(outputs, feed_dict={inputs: state_one_hot})[:, 0]
    for itr in range(iterations):
        # Bellman update for all states in one shot:
        #   target(s) = max_a [ r(s, a) + gamma * sum_s' P(s' | s, a) * V_theta(s') ]
        # np.dot contracts the last axis of transition_prob with v_theta,
        # giving an (n_states, n_actions) table of action values.
        q_values = rewards + gamma * np.dot(transition_prob, v_theta)
        # Keep a trailing axis so the targets match the [None, 1] placeholder.
        v_estimated = q_values.max(axis=1, keepdims=True)

        # Regress the network onto the Bellman targets.
        for epoch in range(n_epoch):
            _, mse = sess.run([optimizer, loss], feed_dict={inputs: state_one_hot, targets: v_estimated})
        print('Iteration %d, Error %f' % (itr, mse))

        # Re-evaluate the network after fitting so that v_theta always holds
        # the values of the *current* weights (also what is printed at the end).
        v_previous = v_theta
        v_theta = sess.run(outputs, feed_dict={inputs: state_one_hot})[:, 0]

        # Stop once a full update/fit round no longer moves any state value
        # by more than `tolerance`.
        if np.max(np.abs(v_theta - v_previous)) < tolerance:
            print('Converged after %d iterations' % (itr + 1))
            break

print(v_theta)

Iteration 0, Error 0.000000
Iteration 1, Error 0.000000
Iteration 2, Error 0.000000
Iteration 3, Error 0.000000
Iteration 4, Error 0.000000
Iteration 5, Error 0.000000
Iteration 6, Error 0.000000
Iteration 7, Error 0.000000
Iteration 8, Error 0.000000
Iteration 9, Error 0.000034
Iteration 10, Error 0.000429
Iteration 11, Error 0.000005
Iteration 12, Error 0.000005
Iteration 13, Error 0.000060
Iteration 14, Error 0.000234
Iteration 15, Error 0.000510
Iteration 16, Error 0.000398
Iteration 17, Error 0.000234
Iteration 18, Error 0.000346
Iteration 19, Error 0.000581
Iteration 20, Error 0.000110
Iteration 21, Error 0.000670
Iteration 22, Error 0.000389
Iteration 23, Error 0.000137
Iteration 24, Error 0.000131
Iteration 25, Error 0.001193
Iteration 26, Error 0.000405
Iteration 27, Error 0.000505
Iteration 28, Error 0.000344
Iteration 29, Error 0.000598
Iteration 30, Error 0.000487
Iteration 31, Error 0.000519
Iteration 32, Error 0.000300
Iteration 33, Error 0.000417
Iteration 34, Error 0.00