In [1]:
# Import libraries
import numpy as np

# Custom modules
import sys 
sys.path.append('../environments/')
import gridworlds
from dp import policy_evaluation, policy_iteration, value_iteration

# Dynamic Programming

In [2]:
def print_value_function(V, env):
    """Print an iteration count followed by the state values shown as a grid.

    Args:
        V: Indexable pair. ``V[0]`` holds the state values (reshaped to
            ``env.size`` for display) and ``V[1]`` the iteration count
            that is reported in the header line.
        env: Object whose ``size`` attribute gives the grid shape.
    """
    n_iterations, state_values = V[1], V[0]
    grid = np.reshape(state_values, env.size)
    header = "Policy evaluated after {} iterations:".format(n_iterations)
    # One decimal place keeps the printed grid compact.
    print(header, np.round(grid, 1), sep="\n")
    
def print_policy(pi, env):
    """Print the highest-scoring action of every state as a grid of letters.

    Args:
        pi: Array-like with one row per state and one column per action;
            the column index of each row's maximum is what gets displayed.
        env: Object whose ``size`` attribute gives the grid shape.
    """
    letter_for_index = {"0": "U", "1": "R", "2": "D", "3": "L"}
    best_actions = np.argmax(pi, axis=1)
    grid = np.reshape(best_actions, env.size).astype(str)
    # Indices without an entry in the table are left as their digit string.
    for index_text, letter in letter_for_index.items():
        grid[grid == index_text] = letter
    print(grid)

## Policy evaluation

In [3]:
# Sutton & Barto (2018, p. 76): 4x4 grid under the equiprobable random policy
env = gridworlds.Grid_4x4_Sutton()
# One row per state, one column per action; each row is normalised so that
# all actions share the same probability.
pi = np.ones((len(env.state_space()), len(env.action_space())))
pi /= len(env.action_space())
V = policy_evaluation(pi, env, gamma=1)
print_value_function(V, env)


Policy evaluated after 425 iterations:
[[  0. -14. -20. -22.]
 [-14. -18. -20. -20.]
 [-20. -20. -18. -14.]
 [-22. -20. -14.   0.]]


In [4]:
# Sutton & Barto (2018, p. 60): 5x5 grid under the equiprobable random policy
env = gridworlds.Grid_5x5_Sutton()
# One row per state, one column per action; each row is normalised so that
# all actions share the same probability.
pi = np.ones((len(env.state_space()), len(env.action_space())))
pi /= len(env.action_space())
V = policy_evaluation(pi, env, gamma=0.9)
print_value_function(V, env)

Policy evaluated after 176 iterations:
[[ 3.3  8.8  4.4  5.3  1.5]
 [ 1.5  3.   2.3  1.9  0.5]
 [ 0.1  0.7  0.7  0.4 -0.4]
 [-1.  -0.4 -0.4 -0.6 -1.2]
 [-1.9 -1.3 -1.2 -1.4 -2. ]]


In [5]:
# Russell & Norvig (2020, p. 842): 3x4 grid under the equiprobable random policy
env = gridworlds.Grid_3x4_RNG()
# One row per state, one column per action; each row is normalised so that
# all actions share the same probability.
pi = np.ones((len(env.state_space()), len(env.action_space())))
pi /= len(env.action_space())
# No gamma is passed here, so policy_evaluation's own default applies.
V = policy_evaluation(pi, env)
print_value_function(V, env)

Policy evaluated after 526 iterations:
[[-1.2 -0.8 -0.3  0. ]
 [-1.5  0.  -0.9  0. ]
 [-1.5 -1.5 -1.2 -1.2]]


## Policy iteration

In [6]:
# Sutton & Barto (2018, p. 76): 4x4 grid, policy iteration
env = gridworlds.Grid_4x4_Sutton()
pi, V = policy_iteration(env)
# Arrange the state values on the grid and round to one decimal for display.
V = np.reshape(V, env.size)
V = np.round(V, 1)
# The empty string yields the blank line separating values from the policy.
print(V, "", sep="\n")
print_policy(pi, env)

[[ 0. -1. -2. -3.]
 [-1. -2. -3. -2.]
 [-2. -3. -2. -1.]
 [-3. -2. -1.  0.]]

[['U' 'L' 'L' 'D']
 ['U' 'U' 'U' 'D']
 ['U' 'U' 'R' 'D']
 ['U' 'R' 'R' 'U']]


In [7]:
# Sutton & Barto (2018, p. 60): 5x5 grid, policy iteration
env = gridworlds.Grid_5x5_Sutton()
pi, V = policy_iteration(env, gamma=0.9)
# Arrange the state values on the grid and round to one decimal for display.
V = np.reshape(V, env.size)
V = np.round(V, 1)
# The empty string yields the blank line separating values from the policy.
print(V, "", sep="\n")
print_policy(pi, env)

[[22.  24.4 22.  19.4 17.5]
 [19.8 22.  19.8 17.8 16. ]
 [17.8 19.8 17.8 16.  14.4]
 [16.  17.8 16.  14.4 13. ]
 [14.4 16.  14.4 13.  11.7]]

[['R' 'U' 'L' 'U' 'L']
 ['U' 'U' 'U' 'L' 'L']
 ['U' 'U' 'U' 'U' 'U']
 ['U' 'U' 'U' 'U' 'U']
 ['U' 'U' 'U' 'U' 'U']]


In [8]:
# Russell & Norvig (2020, p. 842): 3x4 grid, policy iteration
env = gridworlds.Grid_3x4_RNG()
pi, V = policy_iteration(env)
# Arrange the state values on the grid and round to one decimal for display.
V = np.reshape(V, env.size)
V = np.round(V, 1)
# The empty string yields the blank line separating values from the policy.
print(V, "", sep="\n")
print_policy(pi, env)

[[0.9 0.9 1.  0. ]
 [0.8 0.  0.7 0. ]
 [0.7 0.7 0.7 0.4]]

[['R' 'R' 'R' 'U']
 ['U' 'U' 'U' 'U']
 ['U' 'L' 'L' 'L']]


## Value iteration

In [9]:
# Sutton & Barto (2018, p. 76): 4x4 grid, value iteration
env = gridworlds.Grid_4x4_Sutton()
pi, V = value_iteration(env)
# Arrange the state values on the grid and round to one decimal for display.
V = np.reshape(V, env.size)
V = np.round(V, 1)
# The empty string yields the blank line separating values from the policy.
print(V, "", sep="\n")
print_policy(pi, env)

[[ 0. -1. -2. -3.]
 [-1. -2. -3. -2.]
 [-2. -3. -2. -1.]
 [-3. -2. -1.  0.]]

[['U' 'L' 'L' 'D']
 ['U' 'U' 'U' 'D']
 ['U' 'U' 'R' 'D']
 ['U' 'R' 'R' 'U']]


In [10]:
# Sutton & Barto (2018, p. 60): 5x5 grid, value iteration
env = gridworlds.Grid_5x5_Sutton()
pi, V = value_iteration(env, gamma=0.9)
# Arrange the state values on the grid and round to one decimal for display.
V = np.reshape(V, env.size)
V = np.round(V, 1)
# The empty string yields the blank line separating values from the policy.
print(V, "", sep="\n")
print_policy(pi, env)

[[22.  24.4 22.  19.4 17.5]
 [19.8 22.  19.8 17.8 16. ]
 [17.8 19.8 17.8 16.  14.4]
 [16.  17.8 16.  14.4 13. ]
 [14.4 16.  14.4 13.  11.7]]

[['R' 'U' 'L' 'U' 'L']
 ['U' 'U' 'U' 'L' 'L']
 ['U' 'U' 'U' 'U' 'U']
 ['U' 'U' 'U' 'U' 'U']
 ['U' 'U' 'U' 'U' 'U']]


In [11]:
# Russell & Norvig (2020, p. 842): 3x4 grid, value iteration
env = gridworlds.Grid_3x4_RNG()
pi, V = value_iteration(env)
# Arrange the state values on the grid and round to one decimal for display.
V = np.reshape(V, env.size)
V = np.round(V, 1)
# The empty string yields the blank line separating values from the policy.
print(V, "", sep="\n")
print_policy(pi, env)

[[0.9 0.9 1.  0. ]
 [0.8 0.  0.7 0. ]
 [0.7 0.7 0.7 0.4]]

[['R' 'R' 'R' 'U']
 ['U' 'U' 'U' 'U']
 ['U' 'L' 'L' 'L']]
