## Value Iteration - Table Representation

#### Imports

In [15]:
import numpy as np
np.random.seed(1337)

#### Parameters

In [23]:
n_states = 10 # Number of states
n_actions = 4 # Number of actions
gamma = 0.9 # Discount Factor
tolerance = 0.00001 # Convergence criteria
max_iterations = 100 # Maximum number of iterations

#### Set rewards R(s,a)

In [24]:
rewards = np.zeros([n_states, n_actions])
rewards[-1] = 1 # Goal state
rewards[-2] = -1 # Penalty state

#### Define transition probabilities

In [25]:
transition_prob = np.random.random([n_states,n_actions,n_states])
s = transition_prob.sum(axis=-1)
transition_prob = transition_prob/np.repeat(s, n_states).reshape([n_states, n_actions, n_states]) # Normalization
transition_prob[-1] = 0 # Make goal state terminal
transition_prob[-2] = 0 # Make penalty state terminal

#### Initialize random policy

In [41]:
policy = np.random.randint(n_actions, size=n_states)
state_values = np.zeros(n_states)

#### Policy Iteration through Bellman updates until convergence

In [42]:
print('Initial Random Policy', policy)
itr = 0
while itr < max_iterations:
    itr += 1
    for s in range(n_states):
        state_values[s] = rewards[s, policy[s]] + gamma*np.dot(transition_prob[s, policy[s]], state_values) # Bellman Update
    new_policy = np.zeros(n_states)
    for s in range(n_states):
        action_values = np.zeros([n_actions])
        for a in range(n_actions):
            action_values[a] = rewards[s, a] + gamma*np.dot(transition_prob[s, a], state_values)
        new_policy[s] = np.argmax(action_values)
    new_policy = new_policy.astype(np.int32)
    if np.array_equal(new_policy, policy):
        break
    policy = new_policy.copy()

print('Learned Policy', policy)

('Initial Random Policy', array([1, 3, 1, 1, 1, 0, 2, 3, 1, 3]))
('Learned Policy', array([3, 1, 3, 3, 1, 0, 1, 2, 0, 0], dtype=int32))
