In [1]:
# Notebook setup: imports plus construction of the environment that the
# cells below evaluate a policy on.
# NOTE(review): set_trace is not referenced by any cell visible in this
# notebook; it looks like a leftover debugging helper.
from IPython.core.debugger import set_trace
import numpy as np
#import pprint
import sys
# Add the parent directory to the import path (only once) before importing
# from `lib` below. Presumably `lib` lives one level above this notebook;
# verify against the repository layout.
if "../" not in sys.path:
  sys.path.append("../") 

# Environment under evaluation. The later cells read env.nS, env.nA, env.P
# and env.shape from it.
from lib.envs.gridworld import GridworldEnv
env = GridworldEnv()

# Alternative environment: uncomment both lines below (and comment out the
# GridworldEnv lines above) to run the notebook on cliff walking instead.
#from lib.envs.cliff_walking import CliffWalkingEnv
#env = CliffWalkingEnv()

In [2]:
def policy_eval(policy, env, discount_factor=1.0, theta=0.00001, verbose=True):
    """
    Iteratively evaluate a policy given a full description of the environment's dynamics.

    Repeatedly sweeps over all states, replacing V[s] in place with the
    expected one-step return under `policy` (Ref: Sutton Barto eq. 4.5),
    until the largest change seen during a sweep is below `theta`.

    Args:
        policy: [S, A] shaped matrix representing the policy; policy[s][a] is
            the probability of taking action a in state s.
        env: OpenAI env. env.P represents the transition probabilities of the environment.
            env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).
            env.nS is the number of states in the environment.
        discount_factor: Gamma discount factor.
        theta: We stop evaluation once our value function change is less than
            theta for all states. Must be strictly positive.
        verbose: If True (the default), print the intermediate value function
            on a thinning schedule: every sweep at first, then every 10th,
            every 100th, ... (the interval grows tenfold after each 10 printouts).
            Pass False to evaluate silently.

    Returns:
        Vector of length env.nS representing the value function.

    Raises:
        ValueError: If theta is not strictly positive. The stopping test is
            `delta < theta` with delta >= 0, so such a theta could never be
            satisfied and the loop would not terminate.
    """
    if not theta > 0:
        raise ValueError("theta must be strictly positive, got {!r}".format(theta))

    # V = Estimate of the value function.
    # Initialize V arbitrarily, except that V(terminal) = 0.
    # In this implementation we initialize V to zeros.
    V = np.zeros(env.nS)
    sweep = 0          # number of full passes over the state space so far
    printouts = 0      # number of debug printouts emitted so far
    print_factor = 1   # print only every `print_factor`-th sweep
    while True:
        sweep += 1
        delta = 0
        # Loop over all states and perform an update
        for s in range(env.nS):
            v = 0
            # Look at the possible next actions
            for a, action_prob in enumerate(policy[s]):
                # For each action available in this state
                # look at the possible next states...
                for prob, next_state, reward, done in env.P[s][a]:
                    # Accumulate the expected value, summed over all actions
                    # and all transitions of each action.
                    #
                    # To think about: `done` is not used here. The update
                    # always bootstraps from V[next_state], even when the
                    # transition is flagged as done. An alternative would be
                    # to add only `action_prob * prob * reward` for done
                    # transitions; whether that matters (for example in the
                    # cliff walking environment) is an open question.
                    v += action_prob * prob * (reward + discount_factor * V[next_state])
            # Track how much the value function changed (across any states)
            delta = max(delta, abs(v - V[s]))
            # Update the value function in place, so later states in the same
            # sweep already see the new value.
            V[s] = v

        # Some printing for debugging
        if verbose and 0 == ((sweep - 1) % print_factor):
            printouts += 1
            print("Value Function:", sweep, printouts, print_factor)
            print(V)
            if 0 == (printouts % 10):
                print_factor *= 10

        # Stop evaluating once our value function change is below a threshold
        if delta < theta:
            break
    return np.array(V)

In [3]:
# Uniform random policy: one row per state, one column per action, with
# every action in a state given the same probability.
equal_dist_policy = np.true_divide(np.ones((env.nS, env.nA)), env.nA)
print(equal_dist_policy)
# An all-zeros matrix, np.zeros([env.nS, env.nA]), will not work as a policy
# here: every action probability would be zero.

[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]


In [4]:
# Settings for evaluating the uniform random policy.
df = 1.0       # discount factor (gamma)
theta = 0.001  # stop once the largest value change in a sweep is below this
v = policy_eval(policy=equal_dist_policy, env=env, discount_factor=df, theta=theta)

Value Function: 1 1 1
[ 0.        -1.        -1.25      -1.3125    -1.        -1.5
 -1.6875    -1.75      -1.25      -1.6875    -1.84375   -1.8984375
 -1.3125    -1.75      -1.8984375  0.       ]
Value Function: 2 2 1
[ 0.         -1.9375     -2.546875   -2.73046875 -1.9375     -2.8125
 -3.23828125 -3.40429688 -2.546875   -3.23828125 -3.56835938 -3.21777344
 -2.73046875 -3.40429688 -3.21777344  0.        ]
Value Function: 3 3 1
[ 0.         -2.82421875 -3.83496094 -4.17504883 -2.82421875 -4.03125
 -4.7097168  -4.87670898 -3.83496094 -4.7097168  -4.96374512 -4.26455688
 -4.17504883 -4.87670898 -4.26455688  0.        ]
Value Function: 4 4 1
[ 0.         -3.67260742 -5.0980835  -5.58122253 -3.67260742 -5.19116211
 -6.03242493 -6.18872833 -5.0980835  -6.03242493 -6.14849091 -5.15044403
 -5.58122253 -6.18872833 -5.15044403  0.        ]
Value Function: 5 5 1
[ 0.         -4.49046326 -6.30054855 -6.91293049 -4.49046326 -6.26144409
 -7.22480297 -7.36922646 -6.30054855 -7.22480297 -7.1876235  -

In [5]:
# Flat value function, one entry per state, followed by a blank line.
print("Value Function:", v, "", sep="\n")

# The same values laid out in the environment's grid shape, followed by a
# blank line.
print("Reshaped Grid Value Function:")
print(v.reshape(env.shape), end="\n\n")

Value Function:
[  0.         -13.99330608 -19.99037659 -21.98940765 -13.99330608
 -17.99178568 -19.99108113 -19.99118312 -19.99037659 -19.99108113
 -17.99247411 -13.99438108 -21.98940765 -19.99118312 -13.99438108
   0.        ]

Reshaped Grid Value Function:
[[  0.         -13.99330608 -19.99037659 -21.98940765]
 [-13.99330608 -17.99178568 -19.99108113 -19.99118312]
 [-19.99037659 -19.99108113 -17.99247411 -13.99438108]
 [-21.98940765 -19.99118312 -13.99438108   0.        ]]



In [6]:
# Test: make sure the computed value function matches the expected values
# Gridworld
# expected_v = np.array([0, -14, -20, -22, -14, -18, -20, -20, -20, -20, -18, -14, -22, -20, -14, 0])
# np.testing.assert_array_almost_equal(v, expected_v, decimal=2)