# A simple policy evaluation algorithm.
For the better version that also implements policy improvement, see the notebook *policy_iteration.ipynb*.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.table import Table

In [2]:
def is_outside(i):
    """Return whether coordinate ``i`` lies outside the grid.

    Args:
        i: A single grid coordinate (row or column index).

    Returns:
        ``True`` if ``i`` is negative or not smaller than ``GRID_SIZE``,
        ``False`` otherwise.
    """
    # Always hand back a real bool so callers never receive an implicit None
    # for the in-bounds case.
    return bool(i < 0 or i >= GRID_SIZE)

def is_terminal(state):
    """Tell whether ``state`` is one of the two terminal corner cells.

    The terminal cells are ``(0, 0)`` and
    ``(GRID_SIZE - 1, GRID_SIZE - 1)``.
    """
    first, second = state
    # Top-left corner.
    if first == 0 and second == 0:
        return True
    # Otherwise only the opposite corner is terminal.
    last = GRID_SIZE - 1
    return first == last and second == last

def take_action(state, action):
    """Apply ``action`` to ``state`` and return ``(next_state, reward)``.

    Terminal states are absorbing: the state is returned unchanged together
    with a reward of 0.  For any other state the offset stored in
    ``ACTIONS[action]`` is applied; a coordinate that would leave the grid
    keeps its old value.  Every such step yields ``REWARD``.
    """
    first, second = state
    if is_terminal(state):
        return (first, second), 0

    # The offset pair is stored as (change of second coord, change of first).
    second_step, first_step = ACTIONS[action]
    first_candidate = first + first_step
    second_candidate = second + second_step

    # Each coordinate is checked independently against the grid bounds.
    first = first if is_outside(first_candidate) else first_candidate
    second = second if is_outside(second_candidate) else second_candidate
    return (first, second), REWARD

def draw_image(image, decimals=2):
    """Render a 2-D array as a matplotlib table and return the figure.

    Args:
        image: Array-like whose rounded form has a two-element ``shape``
            (rows, columns).
        decimals: Number of decimals the values are rounded to before they
            are written into the cells.

    Returns:
        The newly created matplotlib figure containing the table.
    """
    image = np.round(image, decimals)
    fig, ax = plt.subplots()
    ax.set_axis_off()
    tb = Table(ax, bbox=[0, 0, 1, 1])

    nrows, ncols = image.shape
    width, height = 1.0 / ncols, 1.0 / nrows

    # Add cells
    for (i, j), val in np.ndenumerate(image):
        tb.add_cell(i, j, width, height, text=val,
                    loc='center', facecolor='white')

    # Row labels (1-based), one per row, at column index -1.
    for i in range(nrows):
        tb.add_cell(i, -1, width, height, text=i+1, loc='right',
                    edgecolor='none', facecolor='none')

    # Column labels (1-based), one per column, at row index -1.  Iterating
    # over ncols (not nrows) keeps the header correct for non-square arrays.
    for j in range(ncols):
        tb.add_cell(-1, j, width, height/2, text=j+1, loc='center',
                    edgecolor='none', facecolor='none')
    ax.add_table(tb)
    return fig

In [3]:
# Side length of the square grid and the value table, initialised to zero.
GRID_SIZE = 4
GRID = np.zeros(shape=(GRID_SIZE,) * 2)

# Offset pair for every action; take_action unpacks each pair as (dy, dx).
ACTIONS = {"←": [-1, 0], "↑": [0, -1], "→": [1, 0], "↓": [0, 1]}

# Weight of each action in the evaluation sweep: all four are equally likely.
ACTION_PROBS = dict.fromkeys(ACTIONS, 0.25)

# Reward for every step from a non-terminal state, and the factor applied to
# the value of the successor state.
REWARD = -1
GAMMA = 1

In [4]:
# Sweep the grid repeatedly, updating GRID in place, until the largest change
# of any cell within one sweep drops below 1e-6.
iter_count = 0
converged = False
while not converged:
    iter_count += 1
    old_value = GRID.copy()  # values as they were before this sweep
    delta = 0
    # np.ndindex visits (0, 0), (0, 1), ... in the same row-major order as
    # two nested range loops.
    for current_state in np.ndindex(GRID_SIZE, GRID_SIZE):
        value = 0
        for action in ACTIONS:
            new_state, reward = take_action(current_state, action)
            backup = reward + GAMMA*GRID[new_state]
            value += ACTION_PROBS[action]*backup
        GRID[current_state] = value
        change = abs(old_value[current_state] - GRID[current_state])
        delta = max(delta, change)
    converged = delta < 1e-6

In [5]:
# Show the state values rounded to two decimals.
GRID.round(2)

array([[  0., -14., -20., -22.],
       [-14., -18., -20., -20.],
       [-20., -20., -18., -14.],
       [-22., -20., -14.,   0.]])