In [23]:
import numpy as np

def PolicyPrediction(eps, pie, Size=4):
    """Iteratively evaluate the policy pie on a square grid world.

    The grid has Size**2 states numbered row by row from 0 (NW corner).
    State 0 and state Size**2 - 1 (SE corner) are terminal and keep value 0.
    Every step from a non-terminal state contributes -1, and successor
    states are obtained from GWJR (reflecting boundaries).

    Parameters
    ----------
    eps : float
        Convergence threshold; must be positive. Sweeps stop once the
        largest change of any state value within a sweep is below eps.
    pie : array indexed as pie[a, s]
        Probability of taking action a in state s, with
        a in {0 = up, 1 = right, 2 = down, 3 = left}. Only the columns of
        the non-terminal states 1 .. Size**2 - 2 are read.
    Size : int, optional
        Side length of the grid. Defaults to 4 (16 states).

    Returns
    -------
    numpy.ndarray
        Vector V of length Size**2 with the estimated state values.

    Raises
    ------
    ValueError
        If eps is not a positive number (including NaN); the stopping test
        could never succeed and the loop would run forever.

    Notes
    -----
    Values are updated in place, so later states in a sweep already use the
    values refreshed earlier in that sweep. The loop only terminates if the
    sweeps converge.
    """
    # `not eps > 0` also rejects NaN, for which `Delta < eps` is always False.
    if not eps > 0:
        raise ValueError('eps must be a positive number')

    NumStates = Size ** 2
    V = np.zeros(NumStates)  # terminal states 0 and NumStates - 1 stay at 0

    while True:
        Delta = 0
        for s in range(1, NumStates - 1):
            v = V[s]

            # Expected one-step backup over the four actions, accumulated in
            # action order 0, 1, 2, 3.
            backup = 0.0
            for a in range(4):
                backup += pie[a, s] * (-1 + V[GWJR(s, a, Size)])
            V[s] = backup

            Delta = max(Delta, abs(v - V[s]))

        if Delta < eps:
            break

    return V





In [10]:
def GWJR(OldState, Jump, Size):
    """Return the state reached by one move on a Size x Size grid world.

    States are numbered row by row, starting with 0 in the NW corner. The
    boundaries are reflecting: a move that would leave the grid returns
    OldState unchanged.

    Parameters
    ----------
    OldState : int
        Index of the current state.
    Jump : int
        Move to make: 0 = up, 1 = right, 2 = down, 3 = left.
    Size : int
        Side length of the square grid.

    Returns
    -------
    int
        Index of the state after the move.

    Raises
    ------
    ValueError
        If Jump is not one of 0, 1, 2, 3.
    """
    if Jump == 0:  # up: the top row cannot move further north
        if OldState <= Size - 1:
            return OldState
        return OldState - Size

    if Jump == 1:  # right: the last column cannot move further east
        if (OldState + 1) % Size == 0:
            return OldState
        return OldState + 1

    if Jump == 2:  # down: the bottom row cannot move further south
        if OldState >= Size**2 - Size:
            return OldState
        return OldState + Size

    if Jump == 3:  # left: the first column cannot move further west
        if OldState % Size == 0:
            return OldState
        return OldState - 1

    raise ValueError('Bad Jump Value')




In [25]:
# Convergence tolerance for the policy evaluation sweeps.
eps = 2e-5
# Equiprobable random policy: pie[a, s] = 1/4 for each of the 4 actions,
# stored as a 4 x 15 array.
pie = np.full((4, 15), 0.25)

In [26]:
# Evaluate the policy stored in `pie` to tolerance `eps` and print the
# resulting state-value vector.
V = PolicyPrediction(eps=eps, pie=pie)
print(V)

[  0.         -13.99986967 -19.99981263 -21.99979376 -13.99986967
 -17.99984006 -19.99982635 -19.99982833 -19.99981263 -19.99982635
 -17.99985347 -13.9998906  -21.99979376 -19.99982833 -13.9998906
   0.        ]
