In [2]:
import numpy as np

In [153]:
class Gridworld(object):
    """Square gridworld with deterministic moves and a constant step reward.

    States are numbered row-major from 0 to ``grid_size**2 - 1``. A move that
    would leave the grid keeps the agent in its current cell. ``policy_eval``
    pins the value of the first and the last state to 0, which makes those two
    corners behave as terminal states.
    """

    # (row delta, column delta) produced by each action name.
    _MOVES = {
        "up": (-1, 0),
        "down": (1, 0),
        "right": (0, 1),
        "left": (0, -1),
    }

    def __init__(self, discount_factor=1.0, grid_size=4):
        """Build the state space, the uniform policy and the transition tensor.

        Args:
            discount_factor: Multiplier applied to successor-state values in
                ``policy_eval``.
            grid_size: Number of rows (and of columns) of the square grid.
                The default of 4 gives the original 16-state world.

        Raises:
            ValueError: If ``grid_size`` is smaller than 1.
        """
        if grid_size < 1:
            raise ValueError("grid_size must be a positive integer")

        # The side length is stored explicitly instead of being re-derived
        # from the number of states with a hard-coded divisor.
        self.grid_size = int(grid_size)
        self.states = range(self.grid_size ** 2)
        self.num_states = len(self.states)
        self.actions = ["up", "down", "right", "left"]
        # Float array so that fractional state values are never truncated.
        self.value_state = np.zeros(self.num_states)
        self.discount_factor = discount_factor

        # policy[s, a, 0] is the probability of choosing action a in state s;
        # the trailing axis of length 1 broadcasts over successor states.
        self.policy = np.full(
            [self.num_states, len(self.actions), 1], 1.0 / len(self.actions)
        )
        # Every transition yields the same reward.
        self.reward = -1
        # probabilities[s, a, s'] is the probability of landing in s' after
        # taking action a in state s.
        self.probabilities = np.zeros(
            [self.num_states, len(self.actions), self.num_states]
        )

        for from_state in self.states:
            for action_index, action in enumerate(self.actions):
                for to_state in self.states:
                    self.probabilities[from_state, action_index, to_state] = (
                        self.probability(from_state, action, to_state)
                    )

    def probability(self, from_position, action, to_position):
        """Return the probability of moving between two states under an action.

        Args:
            from_position: Index of the state the agent starts in.
            action: One of ``"up"``, ``"down"``, ``"right"``, ``"left"``.
            to_position: Index of the candidate successor state.

        Returns:
            1 if ``action`` taken in ``from_position`` leads to ``to_position``
            (including staying in place when the move would leave the grid),
            otherwise 0. Unknown actions and state indices outside the grid
            yield 0.
        """
        move = self._MOVES.get(action)
        if move is None:
            return 0

        size = self.grid_size
        num_cells = size * size
        if not (0 <= from_position < num_cells and 0 <= to_position < num_cells):
            return 0

        row, col = divmod(from_position, size)
        next_row, next_col = row + move[0], col + move[1]
        if not (0 <= next_row < size and 0 <= next_col < size):
            # The move would cross the border: the agent stays where it is.
            next_row, next_col = row, col

        return 1 if to_position == next_row * size + next_col else 0

    def policy_eval(self):
        """Return the state values after one synchronous evaluation sweep.

        For every state the expected value of ``reward + discount_factor *
        value_state[s']`` is taken over the policy's actions and the
        transition probabilities. ``self.value_state`` itself is not
        modified; the caller decides whether to store the result.

        Returns:
            A new 1-D float array with one value per state, in which the first
            and the last entry are forced to 0.
        """
        # value_state broadcasts along the last (successor-state) axis.
        new_v_action_state = (
            (self.value_state * self.discount_factor + self.reward)
            * self.probabilities
            * self.policy
        )
        new_v = new_v_action_state.sum(axis=2).sum(axis=1)
        # Pin the two corner states to zero.
        new_v.put([0, -1], [0, 0])
        return new_v
        

In [154]:
# Convergence threshold on the largest per-state change within one sweep.
theta = 0.001
grid = Gridworld()

# Iterative policy evaluation: keep sweeping until no state value moves by
# theta or more between two consecutive sweeps.
while True:
    v = grid.value_state
    grid.value_state = grid.policy_eval()
    # Compare magnitudes: with a signed difference, a sweep in which values
    # increase would yield a non-positive delta and end the loop too early.
    delta = np.abs(v - grid.value_state).max()
    if delta < theta:
        break

grid.value_state.reshape(4, 4)

array([[  0.        , -13.98945772, -19.98437823, -21.98251832],
       [-13.98945772, -17.98623815, -19.98448273, -19.98437823],
       [-19.98437823, -19.98448273, -17.98623815, -13.98945772],
       [-21.98251832, -19.98437823, -13.98945772,   0.        ]])