In [1]:
import numpy as np

In [2]:
# Equiprobable random policy: each compass move ('n', 's', 'e', 'w') is chosen
# with probability 0.25. Insertion order is the order in which the sweeps
# below visit the actions.
actions = dict.fromkeys(('n', 's', 'e', 'w'), 0.25)

# Weight applied to the value of the successor state in every backup.
discount_factor = 0.9

# Number of full sweeps over the grid performed by each solver loop.
epochs = 1000

In [3]:
# Row/column offsets for each compass action; 'n' decreases the row index.
_MOVES = {'n': (-1, 0), 's': (1, 0), 'e': (0, 1), 'w': (0, -1)}

# Special states A -> A' and B -> B': every action taken from the key state
# lands on (row, col) and earns the reward given in the value.
_TELEPORTS = {
    (0, 1): (4, 1, 10),
    (0, 3): (2, 3, 5),
}

# Largest valid row/column index of the 5x5 grid.
_LAST_INDEX = 4


def calculate_reward(state_coord, action):
    """Return the successor state and reward for one move on the 5x5 grid.

    Args:
        state_coord: Current ``(row, col)`` position. Any two-element
            sequence is accepted (tuple, list, 1-D array); it is normalised
            to a tuple so the special-state lookup works for all of them.
        action: One of ``'n'``, ``'s'``, ``'e'``, ``'w'``. Any other value
            leaves the position unchanged with reward 0.

    Returns:
        A tuple ``(next_row, next_col, reward)``:

        * from ``(0, 1)`` every action leads to ``(4, 1)`` with reward 10;
        * from ``(0, 3)`` every action leads to ``(2, 3)`` with reward 5;
        * a move that would leave the grid keeps the agent on the edge cell
          and yields reward -1;
        * every other move yields reward 0.
    """
    # Normalise first: comparing a list or array against a tuple literal would
    # silently miss (or raise on) the special states.
    state = tuple(state_coord)
    row, col = state

    # Special states override the outcome regardless of the chosen action.
    if state in _TELEPORTS:
        return _TELEPORTS[state]

    d_row, d_col = _MOVES.get(action, (0, 0))
    moved_row, moved_col = row + d_row, col + d_col

    # Clamp back onto the grid; any clamping means the move hit a wall.
    next_row = min(max(moved_row, 0), _LAST_INDEX)
    next_col = min(max(moved_col, 0), _LAST_INDEX)
    hit_wall = (next_row, next_col) != (moved_row, moved_col)

    return (next_row, next_col, -1 if hit_wall else 0)

In [4]:
# Iterative policy evaluation for the equiprobable random policy.
# Values are updated in place, so cells later in a sweep already see the
# refreshed estimates of cells visited earlier in the same sweep.
gridworld = np.zeros((5, 5))
for epoch in range(epochs):
    # np.ndindex walks the cells in row-major order.
    for i, j in np.ndindex(gridworld.shape):
        expected_return = 0.0
        for action, action_prob in actions.items():
            x, y, r_next = calculate_reward((i, j), action)
            # Bellman expectation backup (Sutton & Barto, Eq. 3.14).
            # Transitions are deterministic: given (s, a) exactly one (s', r)
            # has probability 1, so the sum over s' and r reduces to one term.
            expected_return += action_prob * (r_next + discount_factor * gridworld[x, y])
        gridworld[i, j] = expected_return

In [5]:
# Display the 5x5 table of state values estimated for the equiprobable policy.
gridworld

array([[ 3.30899634,  8.78929186,  4.42761918,  5.32236759,  1.49217876],
       [ 1.52158807,  2.99231786,  2.25013995,  1.9075717 ,  0.54740271],
       [ 0.05082249,  0.73817059,  0.67311326,  0.35818621, -0.40314114],
       [-0.9735923 , -0.43549543, -0.35488227, -0.58560509, -1.18307508],
       [-1.85770055, -1.34523126, -1.22926726, -1.42291815, -1.97517905]])

In [19]:
# Value iteration: each cell is repeatedly replaced by the best one-step
# backup over all actions (Bellman optimality equation). Updates are made in
# place, so later cells in a sweep see values refreshed earlier in that sweep.
optimal_gridworld = np.zeros((5, 5))
for epoch in range(epochs):
    # np.ndindex walks the cells in row-major order.
    for i, j in np.ndindex(optimal_gridworld.shape):
        # Taking max() directly over the action backups avoids a hand-picked
        # sentinel such as -1000, which would be wrong if values fell below it.
        # Action probabilities are irrelevant here: the greedy choice is used.
        optimal_gridworld[i, j] = max(
            r_next + discount_factor * optimal_gridworld[x, y]
            for x, y, r_next in (
                calculate_reward((i, j), action) for action in actions
            )
        )

In [20]:
# Display the 5x5 table of state values produced by the max-over-actions sweep.
optimal_gridworld

array([[21.97748529, 24.4194281 , 21.97748529, 19.4194281 , 17.47748529],
       [19.77973676, 21.97748529, 19.77973676, 17.80176308, 16.02158677],
       [17.80176308, 19.77973676, 17.80176308, 16.02158677, 14.4194281 ],
       [16.02158677, 17.80176308, 16.02158677, 14.4194281 , 12.97748529],
       [14.4194281 , 16.02158677, 14.4194281 , 12.97748529, 11.67973676]])