In [15]:
import numpy as np

In [16]:
# Every action label the agent can choose: U(p), D(own), L(eft), R(ight).
# The table-building and policy-evaluation cells iterate over this tuple.
ACTION_SPACE = ('U', 'D', 'L', 'R')

In [17]:
class WindyGrid:
    """Grid world whose transitions may be stochastic ("windy").

    The agent's current position is stored as a (row, column) pair in
    ``self.i`` / ``self.j``. Rewards, legal actions and the transition
    model are attached afterwards with ``set_rewards_actions_probs``.
    """

    def __init__(self, rows, columns, start):
        """Create a ``rows`` x ``columns`` grid with the agent at ``start`` = (i, j)."""
        self.rows = rows
        self.columns = columns
        self.i = start[0]
        self.j = start[1]

    def set_rewards_actions_probs(self, rewards, actions, probs):
        """Attach the environment model.

        rewards: {s: r} reward collected on arriving in state s.
        actions: {s: (a, ...)} actions available in each non-terminal state.
        probs:   {(s, a): {s_next: prob}} transition distribution per state/action.
        """
        self.rewards = rewards
        self.actions = actions
        self.probs = probs

    def get_state(self):
        """Return the agent's current position as an (i, j) tuple."""
        return (self.i, self.j)

    def set_state(self, s):
        """Place the agent at state ``s`` = (i, j)."""
        self.i = s[0]
        self.j = s[1]

    # There is deliberately no get_next_state(s, a): the successor state is
    # random, so it cannot be known without sampling.

    def move(self, a):
        """Sample the outcome of action ``a`` from the current state.

        Updates the agent's position to the sampled successor and returns
        the reward for landing there (0 when the state has no reward entry).
        Raises KeyError if ``probs`` has no entry for (current state, a).
        """
        next_state_probs = self.probs[((self.i, self.j), a)]
        next_states = list(next_state_probs.keys())
        next_probs = list(next_state_probs.values())

        # np.random.choice only accepts a 1-D array or an int; a list of
        # (i, j) tuples would be coerced to a 2-D array and rejected.
        # Sampling an index keeps the state a plain tuple of Python ints.
        chosen = np.random.choice(len(next_states), p=next_probs)
        next_state = next_states[chosen]

        self.i, self.j = next_state

        return self.rewards.get(next_state, 0)

    def is_terminal(self, s):
        """Return True when ``s`` has no entry in ``actions``."""
        return s not in self.actions

    def game_over(self):
        """Return True when the agent's current state has no entry in ``actions``."""
        return (self.i, self.j) not in self.actions

    def all_states(self):
        """Return the set of states that have actions and/or a reward."""
        return set(self.actions.keys() | self.rewards.keys())

In [18]:
def grid_windy():
    """Build the 3x4 windy grid world with the agent starting at (2, 0).

    Every transition is deterministic except action 'U' from (1, 2),
    which lands on (0, 2) or (1, 3) with probability 0.5 each.
    """
    def certain(cell):
        # Distribution that puts all probability mass on a single cell.
        return {cell: 1.0}

    terminal_rewards = {(0, 3): 1, (1, 3): -1}

    legal_moves = {
        (0, 0): ('D', 'R'),
        (0, 1): ('L', 'R'),
        (0, 2): ('L', 'D', 'R'),
        (1, 0): ('U', 'D'),
        (1, 2): ('U', 'D', 'R'),
        (2, 0): ('U', 'R'),
        (2, 1): ('L', 'R'),
        (2, 2): ('U', 'L', 'R'),
        (2, 3): ('U', 'L'),
    }

    # One row per state; outcome distributions are listed in the order U, D, L, R.
    outcomes_by_state = (
        ((2, 0), (certain((1, 0)), certain((2, 0)), certain((2, 0)), certain((2, 1)))),
        ((1, 0), (certain((0, 0)), certain((2, 0)), certain((1, 0)), certain((1, 0)))),
        ((0, 0), (certain((0, 0)), certain((1, 0)), certain((0, 0)), certain((0, 1)))),
        ((0, 1), (certain((0, 1)), certain((0, 1)), certain((0, 0)), certain((0, 2)))),
        ((0, 2), (certain((0, 2)), certain((1, 2)), certain((0, 1)), certain((0, 3)))),
        ((2, 1), (certain((2, 1)), certain((2, 1)), certain((2, 0)), certain((2, 2)))),
        ((2, 2), (certain((1, 2)), certain((2, 2)), certain((2, 1)), certain((2, 3)))),
        ((2, 3), (certain((1, 3)), certain((2, 3)), certain((2, 2)), certain((2, 3)))),
        # The only windy move: going up from (1, 2) is a coin flip.
        ((1, 2), ({(0, 2): 0.5, (1, 3): 0.5}, certain((2, 2)), certain((1, 2)), certain((1, 3)))),
    )

    dynamics = {}
    for state, outcomes in outcomes_by_state:
        for action, distribution in zip(('U', 'D', 'L', 'R'), outcomes):
            dynamics[(state, action)] = distribution

    world = WindyGrid(3, 4, (2, 0))
    world.set_rewards_actions_probs(terminal_rewards, legal_moves, dynamics)
    return world

In [19]:
# Convergence threshold: the policy-evaluation loop stops once the largest
# value change seen in a full sweep drops below this.
delta = 1e-3

def print_values(V, g):
    """Print the value table ``V`` laid out as the grid ``g``.

    V: {(i, j): value}; cells missing from V are shown as 0.
    g: object exposing ``rows`` and ``columns``.
    """
    divider = "-------------------------"
    for row in range(g.rows):
        print(divider)
        for col in range(g.columns):
            value = V.get((row, col), 0)
            # Non-negative numbers get a leading space so they line up
            # with the minus sign of negative ones.
            template = " %.2f|" if value >= 0 else "%.2f|"
            print(template % value, end="")
        print("")

def print_policy(P, g):
    """Print the policy ``P`` laid out as the grid ``g``.

    P: {(i, j): action}; each entry is rendered with ``%s``, and cells
       missing from P are shown as a blank.
    g: object exposing ``rows`` and ``columns``.
    """
    divider = "-------------------------"
    for row in range(g.rows):
        print(divider)
        for col in range(g.columns):
            choice = P.get((row, col), ' ')
            print(" %s |" % choice, end="")
        print("")

In [20]:
g = grid_windy()

# Flattened environment model, both keyed by (s, a, s_next):
transition_probs = {}  # probability of reaching s_next from s under action a
rewards = {}           # reward for arriving in s_next (0 when none is defined)

for i in range(g.rows):
    for j in range(g.columns):
        s = (i, j)
        if g.is_terminal(s):
            # Cells without actions have no outgoing transitions.
            continue
        for a in ACTION_SPACE:
            next_states = g.probs[(s, a)]
            for s_next, prob in next_states.items():
                key = (s, a, s_next)
                transition_probs[key] = prob
                rewards[key] = g.rewards.get(s_next, 0)

In [21]:
# Fixed policy to evaluate: maps each state (i, j) to a distribution
# {action: probability}. The evaluation loop reads it with
# policy[s].get(a, 0), so actions not listed here have probability 0.
# Only (2, 0) is stochastic: it splits evenly between 'U' and 'R'.
policy = {
    (0, 0): {'R': 1.0},
    (0, 1): {'R': 1.0},
    (0, 2): {'R': 1.0},
    (1, 0): {'U': 1.0},
    (1, 2): {'U': 1.0},
    (2, 0): {'U': 0.5, 'R': 0.5},
    (2, 1): {'R': 1.0},
    (2, 2): {'U': 1.0},
    (2, 3): {'L': 1.0}
}

In [22]:
# Show the policy on the grid; each cell prints that state's
# action-probability dict as-is, so the columns do not line up.
print_policy(policy, g)

-------------------------
 {'R': 1.0} | {'R': 1.0} | {'R': 1.0} |   |
-------------------------
 {'U': 1.0} |   | {'U': 1.0} |   |
-------------------------
 {'U': 0.5, 'R': 0.5} | {'R': 1.0} | {'U': 1.0} | {'L': 1.0} |


In [25]:
# Iterative policy evaluation: sweep over the states, updating V in place,
# until the largest change in a sweep falls below `delta`.
gamma = 0.9  # discount factor
V = dict.fromkeys(g.all_states(), 0)
iterations = 0

while True:
    biggest_change = 0
    for s in g.all_states():
        if g.is_terminal(s):
            # Terminal states keep their initial value of 0.
            continue

        old_v = V[s]
        new_v = 0
        for a in ACTION_SPACE:
            # Probability that the policy picks `a` in `s`; the same for every successor.
            action_prob = policy[s].get(a, 0)
            for s_next in g.all_states():
                key = (s, a, s_next)
                r = rewards.get(key, 0)
                new_v += action_prob * transition_probs.get(key, 0) * (r + gamma * V[s_next])

        V[s] = new_v
        biggest_change = max(biggest_change, np.abs(old_v - V[s]))

    print("iterations: ", iterations, "; biggest_change: ", biggest_change)
    print_values(V, g)

    iterations += 1
    if biggest_change < delta:
        break

print("\n\n")
print(V)

iterations:  0 ; biggest_change:  1.0
-------------------------
 0.00| 0.00| 1.00| 0.00|
-------------------------
 0.00| 0.00|-0.50| 0.00|
-------------------------
 0.00| 0.00|-0.45| 0.00|
iterations:  1 ; biggest_change:  0.9
-------------------------
 0.81| 0.90| 1.00| 0.00|
-------------------------
 0.73| 0.00|-0.05| 0.00|
-------------------------
-0.18|-0.41|-0.04|-0.41|
iterations:  2 ; biggest_change:  0.4920750000000001
-------------------------
 0.81| 0.90| 1.00| 0.00|
-------------------------
 0.73| 0.00|-0.05| 0.00|
-------------------------
 0.31|-0.04|-0.04|-0.04|
iterations:  3 ; biggest_change:  0
-------------------------
 0.81| 0.90| 1.00| 0.00|
-------------------------
 0.73| 0.00|-0.05| 0.00|
-------------------------
 0.31|-0.04|-0.04|-0.04|



{(0, 1): 0.9, (1, 2): -0.04999999999999999, (0, 0): 0.81, (1, 3): 0, (2, 1): -0.040499999999999994, (2, 0): 0.3098250000000001, (2, 3): -0.040499999999999994, (2, 2): -0.04499999999999999, (1, 0): 0.7290000000000001, (0,