In [1]:
# Import libraries
import numpy as np

# Custom modules
import sys 
sys.path.append('../environments/')
import gridworlds
from dp import policy_evaluation, policy_iteration

# Dynamic Programming

In [2]:
def print_value_function(V, env):
    """Print a policy-evaluation result as a grid.

    Args:
        V: Indexable pair; ``V[0]`` holds the state values (anything
            ``np.reshape`` accepts) and ``V[1]`` the iteration count that
            is echoed in the header line.
        env: Object exposing ``size``, the shape the values are reshaped to.

    Returns:
        None. Two things are written to stdout: a header line with the
        iteration count, then the value grid rounded to two decimals.
    """
    header_template = "Policy evaluated after {} iterations:"
    sweep_count = V[1]
    value_grid = np.reshape(V[0], env.size)
    print(header_template.format(sweep_count))
    # Rounding is for display only; the caller's array is not modified.
    print(np.round(value_grid, 2))
    
def print_policy(pi, env):
    """Print the highest-probability action of each state as a letter grid.

    Args:
        pi: 2-D array-like with one row per state; the column holding the
            row maximum is taken as that state's action index.
        env: Object exposing ``size``, the shape the action indices are
            reshaped to.

    Returns:
        None. The grid is written to stdout. Action indices 0, 1, 2, 3 are
        shown as "U", "R", "D", "L" (presumably up/right/down/left; confirm
        against the environment's action ordering). Any other index is
        printed as its digit string.
    """
    best_actions = np.argmax(pi, axis=1)
    letter_grid = np.reshape(best_actions, env.size).astype(str)
    # Replace each stringified action index with its one-letter label.
    for index_text, letter in (("0", "U"), ("1", "R"), ("2", "D"), ("3", "L")):
        letter_grid[letter_grid == index_text] = letter
    print(letter_grid)

## Policy Evaluation

In [3]:
# Evaluate the equiprobable random policy on the 4x4 grid
# (Sutton & Barto, 2018, page 76) with gamma=1.
env = gridworlds.Grid_4x4_Sutton()
# One row per state, one column per action; every entry is 1 / (number of actions).
policy_shape = (len(env.state_space()), len(env.action_space()))
pi = np.ones(policy_shape) / len(env.action_space())
# V is passed whole to print_value_function, which reads V[0] and V[1].
V = policy_evaluation(pi, env, gamma=1)
print_value_function(V, env)


Policy evaluated after 425 iterations:
[[  0. -14. -20. -22.]
 [-14. -18. -20. -20.]
 [-20. -20. -18. -14.]
 [-22. -20. -14.   0.]]


In [4]:
# Evaluate the equiprobable random policy on the 5x5 grid
# (Sutton & Barto, 2018, page 60) with gamma=0.9.
env = gridworlds.Grid_5x5_Sutton()
# One row per state, one column per action; every entry is 1 / (number of actions).
policy_shape = (len(env.state_space()), len(env.action_space()))
pi = np.ones(policy_shape) / len(env.action_space())
# V is passed whole to print_value_function, which reads V[0] and V[1].
V = policy_evaluation(pi, env, gamma=0.9)
print_value_function(V, env)

Policy evaluated after 176 iterations:
[[ 3.31  8.79  4.43  5.32  1.49]
 [ 1.52  2.99  2.25  1.91  0.55]
 [ 0.05  0.74  0.67  0.36 -0.4 ]
 [-0.97 -0.44 -0.35 -0.59 -1.18]
 [-1.86 -1.35 -1.23 -1.42 -1.98]]


In [5]:
# Evaluate the equiprobable random policy on the 3x4 grid
# (Russell & Norvig, 2020, page 842).
env = gridworlds.Grid_3x4_RNG()
# One row per state, one column per action; every entry is 1 / (number of actions).
policy_shape = (len(env.state_space()), len(env.action_space()))
pi = np.ones(policy_shape) / len(env.action_space())
# gamma is not passed here, so policy_evaluation's own default applies.
V = policy_evaluation(pi, env)
print_value_function(V, env)

Policy evaluated after 526 iterations:
[[-1.23 -0.83 -0.28  0.  ]
 [-1.47  0.   -0.87  0.  ]
 [-1.55 -1.47 -1.22 -1.17]]


## Policy Iteration

In [6]:
# 4x4 Grid (Sutton & Barto, 2018, page 76)
# Run policy iteration on the 4x4 grid and print the resulting policy.
env = gridworlds.Grid_4x4_Sutton()
# gamma is not passed, so policy_iteration's own default applies.
pi = policy_iteration(env)
# Only element 0 of the result is printed; print_policy takes an argmax over
# axis 1 of it, so it is presumably a (states x actions) policy table.
# NOTE(review): confirm against dp.policy_iteration what the other elements hold.
print_policy(pi[0], env)

[['U' 'L' 'L' 'D']
 ['U' 'U' 'U' 'D']
 ['U' 'U' 'R' 'D']
 ['U' 'R' 'R' 'U']]


In [7]:
# 5x5 Grid (Sutton & Barto, 2018, page 60)
# Run policy iteration on the 5x5 grid with gamma=0.9 and print the resulting policy.
env = gridworlds.Grid_5x5_Sutton()
pi = policy_iteration(env, gamma=0.9)
# Only element 0 of the result is printed; print_policy takes an argmax over
# axis 1 of it, so it is presumably a (states x actions) policy table.
# NOTE(review): confirm against dp.policy_iteration what the other elements hold.
print_policy(pi[0], env)

[['R' 'U' 'L' 'U' 'L']
 ['U' 'U' 'U' 'L' 'L']
 ['U' 'U' 'U' 'U' 'U']
 ['U' 'U' 'U' 'U' 'U']
 ['U' 'U' 'U' 'U' 'U']]


In [8]:
# 3x4 Grid (Russell & Norvig, 2020, page 842)
env = gridworlds.Grid_3x4_RNG()
pi = policy_iteration(env)
print_policy(pi[0], env)

[['R' 'R' 'R' 'U']
 ['U' 'U' 'U' 'U']
 ['U' 'L' 'L' 'L']]
