In [1]:
import numpy as np
import math

In [2]:
def compute_next_state(current, action, dim):
    '''
    Return the grid cell reached by taking `action` from `current`.

    Coordinates are (x, y) pairs where x is changed by 'up'/'down' and y by
    'left'/'right'. A move that would leave the square grid of side `dim`
    leaves the agent where it is.

    Parameters:
        current: (x, y) pair identifying the current cell.
        action:  one of 'up', 'down', 'left', 'right'.
        dim:     side length of the grid; valid indices are 0 .. dim - 1.

    Returns:
        The (x, y) tuple of the next cell.

    Raises:
        ValueError: if `action` is not one of the four known moves.
    '''
    x, y = current

    # Only the bound in the direction of travel is checked, so each branch
    # either steps one cell or stays put.
    if action == 'up':
        return (x - 1, y) if x - 1 >= 0 else (x, y)
    if action == 'down':
        return (x + 1, y) if x + 1 < dim else (x, y)
    if action == 'left':
        return (x, y - 1) if y - 1 >= 0 else (x, y)
    if action == 'right':
        return (x, y + 1) if y + 1 < dim else (x, y)

    # Fail loudly here instead of handing back None, which callers would
    # only trip over later when unpacking the result.
    raise ValueError(
        "unknown action {!r}; expected 'up', 'down', 'left' or 'right'".format(action)
    )

In [3]:
def eval_policy(policy, theta, dim=4, terminal_states=None, gamma=1.0):
    '''
    Iterative Policy Evaluation (Sutton & Barto, Chapter 4.1) on a square
    gridworld.

    State values start at zero and are updated in place, sweeping the grid
    in row-major order, until the largest change seen in a full sweep is
    below `theta`. Every move is deterministic and earns a reward of -1;
    terminal states are never updated and therefore keep the value 0.

    Parameters:
        policy:          mapping from action name ('up', 'down', 'left',
                         'right') to the probability of taking it; the same
                         distribution is used in every state.
        theta:           convergence threshold, must be > 0.
        dim:             side length of the grid (default 4).
        terminal_states: iterable of (x, y) terminal cells; defaults to the
                         two opposite corners (0, 0) and (dim - 1, dim - 1).
        gamma:           discount factor applied to the successor's value
                         (default 1.0, i.e. undiscounted).

    Returns:
        The (dim, dim) numpy array of state values. The array is also
        printed, as before.

    Raises:
        ValueError: if `theta` is not strictly positive, since the stopping
                    test `delta < theta` could then never succeed.

    Note:
        The loop exits only when a sweep changes no value by `theta` or
        more, so with gamma == 1 a policy that can never reach a terminal
        state from some cell may keep this function running indefinitely.
    '''
    # `not theta > 0` also rejects NaN, which would otherwise loop forever.
    if not theta > 0:
        raise ValueError('theta must be positive, got {!r}'.format(theta))

    if terminal_states is None:
        terminal_states = [(0, 0), (dim - 1, dim - 1)]
    terminals = {tuple(state) for state in terminal_states}

    # 1. init the values
    grid = np.zeros((dim, dim))

    # 2. sweep until the value function stops changing
    while True:
        delta = 0.0

        # np.ndindex walks the indices in row-major order and yields plain
        # ints, so old_value below is a scalar snapshot, not a view of grid.
        for x, y in np.ndindex(grid.shape):
            if (x, y) in terminals:
                continue

            old_value = grid[x, y]

            # Expected one-step return under the policy; successor values
            # are read from the partially updated grid (in-place sweep).
            value = 0.0
            for action, prob in policy.items():
                next_x, next_y = compute_next_state((x, y), action, dim)
                value += prob * (-1 + gamma * grid[next_x, next_y])

            delta = max(delta, abs(value - old_value))
            grid[x, y] = value

        if delta < theta:
            break

    print(grid)
    return grid

In [4]:
# Convergence threshold: evaluation stops after the first sweep in which
# every state value changes by less than this amount.
theta = 0.05

# Equiprobable random policy: each of the four moves has probability 0.25.
policy = {action: 0.25 for action in ('up', 'down', 'left', 'right')}

# Corresponds to the last diagram of Figure 4.1. eval_policy prints the value
# table itself; the trailing ';' keeps the notebook from echoing the call's
# result as a separate Out[] entry.
eval_policy(policy, theta);

[[  0.         -13.65620118 -19.5057432  -21.45597877]
 [-13.65620118 -17.57811374 -19.54192847 -19.54716624]
 [-19.5057432  -19.54192847 -17.6134715  -13.71141307]
 [-21.45597877 -19.54716624 -13.71141307   0.        ]]
