# 2. Markov Decision Processes
## 2.2 Introducing Randomness

- Random transition probabilities $P(s'|s,a)$
- Non-deterministic policies $\pi(a|s)$

## Deterministic Transitions

In [267]:
import numpy as np

gamma = 0.9

grid = np.array([
    ['S', '#', '-', '-', '-', '-'],
    ['-', '#', '-', '-', '-', '-'],
    ['-', '-', '-', '-', '#', '-'],
    ['-', '-', '-', '#', '-', 'F']
])

rows, cols = grid.shape
terminal_state = (3, 5)

actions = ['up', 'down', 'left', 'right']

rewards = { terminal_state: 10}

Q_values = { (i, j): {a: 0.0 for a in actions} for i in range(rows) for j in range(cols) }

def is_valid(state):
    i, j = state
    return 0 <= i < rows and 0 <= j < cols and grid[i, j] != '#'

def get_next_state(state, action):
    i, j = state
    if action == 'up': next_state = (i-1, j)
    elif action == 'down': next_state = (i+1, j)
    elif action == 'left': next_state = (i, j-1)
    elif action == 'right': next_state = (i, j+1)
    
    if is_valid(next_state): return True, next_state
    else: return False, state

def update():
    for state in Q_values.keys():
        if state == terminal_state or grid[state] == '#':
            continue
        for action in actions:
            is_valid, next_state = get_next_state(state, action) 
            if is_valid:
                reward = rewards.get(next_state, 0) 
                Q_values[state][action] = reward + gamma * max(Q_values[next_state].values())
            else: 
                Q_values[state][action] = -5

action_symbols = {
    "up": "↑",
    "down": "↓",
    "left": "←",
    "right": "→"
}

def get_best_action(state):
    return max(Q_values[state], key=Q_values[state].get)

def visualise_path():
    grid_viz = grid.copy()

    state = (0, 0)
    path = []
    max_steps = 1000

    for _ in range(max_steps):
        path.append(state)
        if state == terminal_state:
            break

        best_action = get_best_action(state)
        _, next_state = get_next_state(state, best_action)

        if next_state == state:
            pass

        state = next_state
    
    for (i, j) in path:
        if grid_viz[i, j] not in ['S', 'F']:
            grid_viz[i, j] = action_symbols[get_best_action((i, j))]

    for row in grid_viz:
        print(" ".join(row))
        
for i in range(50):
    update()

visualise_path()

S # - - - -
↓ # → → → ↓
→ → ↑ - # ↓
- - - # - F


In [268]:
Q_values = {
    state: {action: round(value, 2) for action, value in actions.items()}
    for state, actions in Q_values.items()
}
Q_values

{(0, 0): {'up': -5, 'down': 3.87, 'left': -5, 'right': -5},
 (0, 1): {'up': 0.0, 'down': 0.0, 'left': 0.0, 'right': 0.0},
 (0, 2): {'up': -5, 'down': 5.9, 'left': -5, 'right': 5.9},
 (0, 3): {'up': -5, 'down': 6.56, 'left': 5.31, 'right': 6.56},
 (0, 4): {'up': -5, 'down': 7.29, 'left': 5.9, 'right': 7.29},
 (0, 5): {'up': -5, 'down': 8.1, 'left': 6.56, 'right': -5},
 (1, 0): {'up': 3.49, 'down': 4.3, 'left': -5, 'right': -5},
 (1, 1): {'up': 0.0, 'down': 0.0, 'left': 0.0, 'right': 0.0},
 (1, 2): {'up': 5.31, 'down': 5.31, 'left': -5, 'right': 6.56},
 (1, 3): {'up': 5.9, 'down': 5.9, 'left': 5.9, 'right': 7.29},
 (1, 4): {'up': 6.56, 'down': -5, 'left': 6.56, 'right': 8.1},
 (1, 5): {'up': 7.29, 'down': 9.0, 'left': 7.29, 'right': -5},
 (2, 0): {'up': 3.87, 'down': 3.87, 'left': -5, 'right': 4.78},
 (2, 1): {'up': -5, 'down': 4.3, 'left': 4.3, 'right': 5.31},
 (2, 2): {'up': 5.9, 'down': 4.78, 'left': 4.78, 'right': 5.9},
 (2, 3): {'up': 6.56, 'down': -5, 'left': 5.31, 'right': -5},
 (

## Non-Deterministic Transitions
Recall the formulation of a Markov decision process:
$$(\mathcal{S}, \mathcal{A}, P, \mathcal{R}, \gamma)$$

Where: 
- $\mathcal{S}$ is the **state space**.
- $\mathcal{A}$ is the **action space**.
- $P(s'|s,a)$ is the **transition function** - the probability of reaching state $s'$ by taking action $a$ in state $s$. 
- $R(s,a,s')$ is the reward received immediately after taking action $a$ in state $s$. 
- $\gamma$ is a discount factor.

Previously, in our gridworld, we worked by the assumption that state transitions were **completely deterministic**, i.e.:
$$P(s'|s,a) = 1 \quad \forall s,a \in \mathcal{S}, \mathcal{A}$$

This means that every time we took a given action in a given state, we would move to the next state with 100% certainty every single time. For example, if we took action '$\text{right}$' in state $(0,0)$, our agent will *always* move to state $(0,1)$. 

In the real world, state transitions are usually not determinstic, but instead are **probabilistic (stochastic)**. 

Let's go back to our gridworld. This time, the grid is slippery - instead of moving in our intended direction with 100% certainty, there is now a chance of slipping and moving in an unintended direction. 

Instead, when an action is chosen, we will: 
- Move in the intended direction with $p=0.8$.
- Move left of the intended direction with $p=0.1$.
- Move right of the intended direction with $p=0.1$.
- Remain in the same state if the action results in an invalid move.

Now our **state transition function $P(s'|s,a)$** becomes: 
$$P(s'|s,a)=
\begin{cases}
0.8, & \text{if } s' \text{ is the intended direction from } s \text{ given } a \\
0.1, & \text{if } s' \text{ is to the left of the the intended direction from } s  \\
0.1, & \text{if } s' \text{ is to the right of the intended direction from } s  \\
0, & \text{otherwise}
\end{cases}$$

In [269]:
def is_valid(state):
    i, j = state
    return 0 <= i < rows and 0 <= j < cols and grid[i, j] != '#'

In [270]:
def update():
    gamma = 0.9
    for state in Q_values.keys():

        if state == terminal_state or grid[state] == '#':
            continue

        for action in actions:
            is_valid, next_state = get_next_state(state, action)
            if is_valid:
                reward = rewards.get(next_state, 0)
                Q_values[state][action] = reward + gamma * max(Q_values[next_state].values())
            else: 
                Q_values[state][action] = -5

To introduce this stochasticity into our gridworld, we introduce a new function: $\text{get\_possible\_moves}$. This function will return a list of three tuples, with the $0$-th tuple of the list representing the intended action, and the other two tuples representing a perpendicular move to the intended direction.

In [271]:
def get_possible_moves(state, action):
    i, j = state
    if action == 'up':
        return [(i-1, j), (i, j-1), (i, j+1)] # up, left, right
    elif action == 'down':
        return [(i+1, j), (i, j-1), (i, j+1)] # down, left, right
    elif action == 'left':
        return [(i, j-1), (i-1, j), (i+1, j)] # left, up, down
    elif action == 'right':
        return [(i, j+1), (i-1, j), (i+1, j)] # right, up, down 

We then update our $\text{get\_next\_state}$ function to first sample the actual move from our set of possible moves. 

In [272]:
np.random.seed(0)
probabilities = [0.8, 0.1, 0.1] # p=0.8 of moving in the intended direction, (1-p)/2 for moving perpendicular

def get_next_state(state, action):
    possible_moves = get_possible_moves(state, action)

    next_state = possible_moves[np.random.choice(3, p=probabilities)]

    if is_valid(next_state):
        return True, next_state
    else: 
        return False, state

We perform our updates as before, using the equation: 
$$Q(s,a)=R + \gamma \cdot \max_{a'}Q(s',a')$$

For now, we just perform one update.

In [273]:
update()

We can how visualise how an agent following the *optimal policy* $\pi^*$ (that just means selecting the **best** action in each state every time) navigates the grid after just one update.

Recall: 
$$\pi^*= \arg\max_{a}Q(s,a)$$

We can use this equation to plot our agent's course across the grid. Look closely. We see that in state $(2,3)$ the agent has learned the optimal action '$\text{left}$'.

In [274]:
visualise_path()

S # - - - -
↓ # - → → ↓
→ → → ← # ↓
- - - # - F


Recall from the previous notebook that the update function iterates over all states and **retrieves the value of the next state corresponding to each action**. It will then use the value of this next state to update its belief about the value of its *own* state. 

Specifically, we look at this part of the $\text{update}$ function: 
```python
for action in actions:
            is_valid, next_state = get_next_state(state, action)
            reward = rewards.get(next_state, 0)
                Q_values[state][action] = reward + gamma * max(Q_values[next_state].values())
         
```
Recall how we laid out our state transition probabilities earlier: we said that with probability $0.8$, the action would transition state as intended, but with probability $0.2$, the action would transition state perpendicular to the intended. 

This means that during our update, the agent tested action '$\text{left}$' in state $(2,3)$ and, with a probability of 10%, it moved upwards instead (i.e., ended up in state $(1,3)$). It then used the *value of this state* to update its belief about the value of $Q((2,3), \text{left})$. 

In [275]:
Q_values[(2,3)]

{'up': -5, 'down': -5, 'left': 5.3136, 'right': -5}

Let's do another iteration of our update rule.

In [276]:
update()

And visualise the path that it has learned now. Already, it is failing to learn an optimal path to reach the terminal state.

In [277]:
visualise_path()

S # - ↓ ↓ ↓
↓ # → → → ↑
→ ↑ ↑ - # -
- - - # - F


We can see the agent gets stuck in state $(1,5)$. Let's look at the Q-values for this state and see what the agent has learned.

In [278]:
Q_values[(1,5)]

{'up': 7.29, 'down': 7.29, 'left': 7.29, 'right': -5}

The Q-values for actions '$\text{up}$', '$\text{down}$', and '$\text{left}$' have been assigned equal value. With each value equal, the agent will simply pick the **first occurrence** of the maximum value, and so it selects action '$\text{up}$'.

We can also visualise what happens when we update our Q-values a few more times (remember, we want our Q-values to converge so we can't take the first update at face value):

In [279]:
for _ in range(50):
    update()
visualise_path()

S # - - - -
↑ # - - - -
- - - - # -
- - - # - F


The agent gets stuck almost immediately as the Q-values converge to their 'true' values. This is the problem with taking a deterministic approach when dealing with probabilistic transitions: the Q-values fail to update in a manner that can be useful to us. 

In [280]:
Q_values = {
    state: {action: round(value, 2) for action, value in actions.items()}
    for state, actions in Q_values.items()
}
Q_values

{(0, 0): {'up': -5, 'down': 3.87, 'left': -5, 'right': -5},
 (0, 1): {'up': 0.0, 'down': 0.0, 'left': 0.0, 'right': 0.0},
 (0, 2): {'up': -5, 'down': 3.87, 'left': -5, 'right': -5},
 (0, 3): {'up': -5, 'down': 6.56, 'left': -5, 'right': 6.56},
 (0, 4): {'up': -5, 'down': 7.29, 'left': 7.29, 'right': 7.29},
 (0, 5): {'up': -5, 'down': 6.56, 'left': -5, 'right': -5},
 (1, 0): {'up': 3.49, 'down': 3.49, 'left': 3.49, 'right': -5},
 (1, 1): {'up': 0.0, 'down': 0.0, 'left': 0.0, 'right': 0.0},
 (1, 2): {'up': 3.49, 'down': 6.56, 'left': -5, 'right': 6.56},
 (1, 3): {'up': 5.9, 'down': 3.87, 'left': 5.9, 'right': 7.29},
 (1, 4): {'up': 6.56, 'down': -5, 'left': 6.56, 'right': 6.56},
 (1, 5): {'up': 5.9, 'down': 9.0, 'left': 5.9, 'right': 5.9},
 (2, 0): {'up': 3.14, 'down': 3.14, 'left': -5, 'right': 3.87},
 (2, 1): {'up': -5, 'down': 4.3, 'left': 3.49, 'right': 4.3},
 (2, 2): {'up': 5.9, 'down': 3.87, 'left': 3.87, 'right': 3.87},
 (2, 3): {'up': 6.56, 'down': -5, 'left': 5.31, 'right': -5},

To address this, let's consider our update rule: 
$$Q(s,a)=R + \gamma_ \cdot \max_{a'} Q(s',a')$$

When we update our Q-value using this rule, we assume that the **the action always transitions to the best next state**. This means that the agent is not learning the **true expected value of the next state**. 

Consider an action which has a 90% chance of a reward of $10$, but a 10% chance of a penalty of $100$. Can we assign a value of $10$ to the action just because it is the most probable outcome? No - **we must weigh all outcomes by their probability** to get an accurate representation of the expected value: 
$$(0.9 * 10) + (0.1 * -100) = -1$$

This is **exactly** what we need to consider when we're navigating our grid. Each action has a probability of transitioning to three separate states: 
- The one intended, with probability $0.8$.
- To the left of the direction intended with probability $0.1$. 
- To the right of the direction intended with probability $0.1$. 

For each state that we could possibly land in, we need to consider both the **immediate reward** and the **value of that state**.

Let's go through an example to see how that works.

We'll work backwards from state $(3,5)$. We saw last time that the value of any action from the terminal state is $0$ because there are no future states and no more rewards to be gained., i.e.: 
$$Q((3,5),a) = 0$$

We step backwards to find $Q((2,5), \text{down})$. We know that the immediate reward for taking this action is $10$, but we **must also consider** that when the agent takes this action, there is a 10% probability it will move **left**, into an obstacle, and a 10% probability that it will move **right**, out of bounds. Both of these transitions carry a **$-5$ penalty**, and so they must be factored into the value of this action. 

So we know: 
- There is a **$0.8$ probability of reward $10$** for moving down.
- A **$0.1$ probability of reward $-5$** for moving left.
- A **$0.1$ probability of reward $-5$** for moving right.

We can sum these to find the immediate expected reward: 
$$\mathbb{E}[R]=(0.8 \times 10) + (0.1 \times -5) + (0.1 \times -5) = 7$$

$$Q((2,5), \text{down}) = 7 + 0.9 \cdot 0 \cdot 0 = 7$$

Instead of working with Q-values (action-values), we're going switch to working with **State Values $V(s)$** to keep things a little bit cleaner. 

Recall from the previous notebook that the **value of the state** is equal to the **value of the best action in that state**, i.e.: 
$$V(s) = \max_a Q(s,a)$$

Think of the value of a state as being the 'best opportunity' that state offers.

Let's now look at $V((1,5))$.
$$V((1,5)) := Q((1,5), \text{down})$$ 

We also want to consider the **value** of the next state $V(s')$ in our calculations, so we can propagate distant rewards back through our grid.

We know: 
- There is a **$0.8$ probability of moving to a state with value $7$** but  **no immediate reward** by moving down.
- A **$0.1$ probability of immediate reward $-5$** by moving left to a state with **no value**.
- A **$0.1$ probability of immediate reward $-5$** by moving right to a state with **no value**.

Remembering that we need to discount future values using discount factor $\gamma=0.9$, our expected *value* is: 
$$(0.8 \times [0 + (0.9 \cdot 7)]) + (0.1 \times (0+ -5)) + (0.1 \times (0+-5)) = 4.04$$

Each time, we're weighting both the **immediate reward and the discounted value of the next state** by the probability that taking a given action reaches that state, and then **summing across all possible states that may result from that action**.

We represent this process mathematically as follows: 
$$V(s)=\sum_{s'}P(s'|s,a)[R(s,a,s')+ \gamma \cdot V(s')]$$

Equivalently: 
$$Q(s,a)=\sum_{s'}P(s'|s,a)[R(s,a,s')+ \gamma \cdot \max_{a'} Q(s',a')]$$

Let's change our $\text{update}$ function to reflect this new update rule. 

In [281]:
def update():
    gamma = 0.9
    for state in Q_values.keys():

        if state == terminal_state or grid[state] == '#':
            continue

        for action in actions:
            possible_moves = get_possible_moves(state, action)

            # We initialise the expected value at zero.
            expected_value = 0

            # We now sum over each possible move
            for move, prob in zip(possible_moves, probabilities):
                if is_valid(move):
                    reward = rewards.get(move, 0)
                    next_value = max(Q_values[move].values())

                # A penalty of 5 and a zero-value for entering invalid states (obstacle/out of bounds)
                else: reward = -5; next_value = 0

                # We add P(s'|s, a) * [R + gamma * max Q(s', a')] to our expected value
                expected_value += prob * (reward + gamma * next_value)

            # Finally, we define our Q-value as the sum of the expected values of each possible move
            Q_values[state][action] = expected_value


Let's iterate over our update rule to give our values a chance to converge.

In [282]:
np.random.seed(0)
for _ in range(100):
    update()

And have a look at our new Q-values.

In [283]:
Q_values = {
    state: {action: round(value, 2) for action, value in actions.items()}
    for state, actions in Q_values.items()
}
Q_values

{(0, 0): {'up': -5.0, 'down': -1.42, 'left': -4.55, 'right': -4.55},
 (0, 1): {'up': 0.0, 'down': 0.0, 'left': 0.0, 'right': 0.0},
 (0, 2): {'up': -4.29, 'down': 1.3, 'left': -4.3, 'right': 1.36},
 (0, 3): {'up': -3.62, 'down': 2.3, 'left': 0.72, 'right': 1.78},
 (0, 4): {'up': -3.5, 'down': 2.83, 'left': 1.45, 'right': 2.12},
 (0, 5): {'up': -4.25, 'down': 3.23, 'left': 1.97, 'right': -4.07},
 (1, 0): {'up': -2.02, 'down': -0.58, 'left': -4.08, 'right': -4.08},
 (1, 1): {'up': 0.0, 'down': 0.0, 'left': 0.0, 'right': 0.0},
 (1, 2): {'up': 0.72, 'down': 1.05, 'left': -3.71, 'right': 2.21},
 (1, 3): {'up': 2.15, 'down': 1.64, 'left': 1.94, 'right': 2.68},
 (1, 4): {'up': 2.71, 'down': -3.32, 'left': 1.68, 'right': 3.23},
 (1, 5): {'up': 2.12, 'down': 4.83, 'left': 3.25, 'right': -3.08},
 (2, 0): {'up': -0.84, 'down': -0.38, 'left': -4.05, 'right': 0.58},
 (2, 1): {'up': -3.78, 'down': 0.73, 'left': -0.02, 'right': 0.87},
 (2, 2): {'up': 1.82, 'down': 0.85, 'left': 0.91, 'right': 1.42},
 

We can now see how our agent navigates our grid. We can see some slipping, but the agent manages to get itself back on the right track.

In [284]:
visualise_path()

S # - - ↓ ↓
↓ # → → → ↓
→ → ↑ - # ↓
- - - # - F


## Policies
We have previously considered only **deterministically selecting the best action** when calculating our state values and action values: 
$$V^{\pi^*}(s)=\sum_{s'}P(s'|s,a)[R(s,a,s')+ \gamma \cdot V^{\pi^*}(s')]$$

Equivalently: 
$$Q^{\pi^*}(s,a)=\sum_{s'}P(s'|s,a)[R(s,a,s')+ \gamma \cdot \max_{a'} Q(s',a')]$$
where we introduce new notation $V^{\pi^*}(s)$ and $Q^{\pi^*}(s,a)$ to denote the state values and action values **calculated under the assumption of following the optimal policy $\pi^*$**. 

Computing the optimal state-values and action-values using these equations is known as **Value Iteration**. 

Since we have previously defined the optimal policy as: 
$$\pi^*(s) = \arg\max_{a}Q(s',a')$$

We can build a **policy table** similar to our Q-table by extracting the best action for each state $\arg\max_{a}Q(s',a')$ from our Q-table, and assigning it a 100% probability. 

In [285]:
# Build a policy table with the same structure as our Q-table
optimal_policy = { (i, j): {a: 0.0 for a in actions}
                  for i in range(rows) for j in range(cols)}

# Iterate over every state in the Q-table
for state in Q_values:
    
    # Extract the best action from the state
    best_action = max(Q_values[state], key=Q_values[state].get)

    # Iterate over each action in the policy table and assign a 
    # probability of 1.0 if it had the highest Q-value
    for action in actions:
        optimal_policy[state][action] = 1.0 if action == best_action else 0.0
    
optimal_policy

{(0, 0): {'up': 0.0, 'down': 1.0, 'left': 0.0, 'right': 0.0},
 (0, 1): {'up': 1.0, 'down': 0.0, 'left': 0.0, 'right': 0.0},
 (0, 2): {'up': 0.0, 'down': 0.0, 'left': 0.0, 'right': 1.0},
 (0, 3): {'up': 0.0, 'down': 1.0, 'left': 0.0, 'right': 0.0},
 (0, 4): {'up': 0.0, 'down': 1.0, 'left': 0.0, 'right': 0.0},
 (0, 5): {'up': 0.0, 'down': 1.0, 'left': 0.0, 'right': 0.0},
 (1, 0): {'up': 0.0, 'down': 1.0, 'left': 0.0, 'right': 0.0},
 (1, 1): {'up': 1.0, 'down': 0.0, 'left': 0.0, 'right': 0.0},
 (1, 2): {'up': 0.0, 'down': 0.0, 'left': 0.0, 'right': 1.0},
 (1, 3): {'up': 0.0, 'down': 0.0, 'left': 0.0, 'right': 1.0},
 (1, 4): {'up': 0.0, 'down': 0.0, 'left': 0.0, 'right': 1.0},
 (1, 5): {'up': 0.0, 'down': 1.0, 'left': 0.0, 'right': 0.0},
 (2, 0): {'up': 0.0, 'down': 0.0, 'left': 0.0, 'right': 1.0},
 (2, 1): {'up': 0.0, 'down': 0.0, 'left': 0.0, 'right': 1.0},
 (2, 2): {'up': 1.0, 'down': 0.0, 'left': 0.0, 'right': 0.0},
 (2, 3): {'up': 1.0, 'down': 0.0, 'left': 0.0, 'right': 0.0},
 (2, 4):

We see that we have **extracted the optimal policy $\pi^*(s)$**. Our agent can use this to traverse the grid by selecting the action with $p=1.0$ in each state.

## Stochastic Policies

Previously, we have seen how the agent navigates the grid using a **deterministic policy** (specifically the optimal policy), where it simply picks the action with the highest Q-value in each state: 
$$\pi^*(a|s)=\arg \max_{a}Q(s,a)$$

We will now consider situations in which the agent will navigate the grid using a **Stochastic Policy**, where the **next action is sampled from a probability distribution across the action space**: 
$$\pi(a|s)=P(a|s)$$
where the policy for taking action $a$ in state $s$ is given by the probability for taking action $a$ in state $s$.

One way to define a stochastic policy is by converting Q-values into probabilities using a softmax distribution (which we introduced in the previous notebook):
$$P(a)=\frac{e^{Q(a)}}{\sum_{a}e^{Q(a)}}$$

We can introduce a new temperature parameter, $\tau$, to control the shape of the distribution:
$$P(a)=\frac{e^{\frac{Q(a)}{\tau}}}{\sum_{a}e^{\frac{Q(a)}{\tau}}}$$
- A high $\tau$ results in a more uniform probability. 
- A low $\tau$ results in a more determinstic policy, with a preference towards optimal values. 
This is useful for controlling **exploration vs. exploitation**. 

Let's see how that works by looking at the Q-values for state $(2,0)$. 

In [286]:
Q_values[(2,0)]

{'up': -0.84, 'down': -0.38, 'left': -4.05, 'right': 0.58}

Let's map those to probabilities in the range $[0,1]$ using our softmax function, and experiment with different values of our temperature parameter tau, $\tau$. 

In [287]:
def softmax(q_values, tau=1.0):
    values = np.array(list(q_values.values()))
    exp = np.exp(values/tau)
    probabilities = (exp / np.sum(exp)).tolist()

    return ({action: round(prob,3) for action, prob in zip(q_values.keys(), probabilities)})

print(f"Tau = 1: {softmax(Q_values[(2,0)], 1)}")
print(f"Tau = 10: {softmax(Q_values[(2,0)], 10)}")
print(f"Tau = 0.1: {softmax(Q_values[(2,0)], 0.1)}")

Tau = 1: {'up': 0.148, 'down': 0.234, 'left': 0.006, 'right': 0.612}
Tau = 10: {'up': 0.255, 'down': 0.267, 'left': 0.185, 'right': 0.294}
Tau = 0.1: {'up': 0.0, 'down': 0.0, 'left': 0.0, 'right': 1.0}


If we apply this technique to the Q-values across our Q-table, we can extract a stochastic policy $\pi(a|s)$ defined by our Q-values from our Q-table. 

In [288]:
policy = { (i, j): {a: 0.0 for a in actions}
                  for i in range(rows) for j in range(cols)}

tau = 1.0

# Iterate over each state
for state in policy:

    # Softmax our Q-values to calculate action probabilities
    values = np.array(list(Q_values[state].values()))
    exp = np.exp(values/tau)
    probabilities = (exp / np.sum(exp)).tolist()

    # Update our action probabilities 
    policy[state] = {a: round(prob, 3) for a, prob in zip(actions, probabilities)}
    
policy

{(0, 0): {'up': 0.025, 'down': 0.897, 'left': 0.039, 'right': 0.039},
 (0, 1): {'up': 0.25, 'down': 0.25, 'left': 0.25, 'right': 0.25},
 (0, 2): {'up': 0.002, 'down': 0.483, 'left': 0.002, 'right': 0.513},
 (0, 3): {'up': 0.001, 'down': 0.555, 'left': 0.114, 'right': 0.33},
 (0, 4): {'up': 0.001, 'down': 0.573, 'left': 0.144, 'right': 0.282},
 (0, 5): {'up': 0.0, 'down': 0.778, 'left': 0.221, 'right': 0.001},
 (1, 0): {'up': 0.183, 'down': 0.771, 'left': 0.023, 'right': 0.023},
 (1, 1): {'up': 0.25, 'down': 0.25, 'left': 0.25, 'right': 0.25},
 (1, 2): {'up': 0.146, 'down': 0.203, 'left': 0.002, 'right': 0.649},
 (1, 3): {'up': 0.243, 'down': 0.146, 'left': 0.197, 'right': 0.413},
 (1, 4): {'up': 0.329, 'down': 0.001, 'left': 0.117, 'right': 0.553},
 (1, 5): {'up': 0.052, 'down': 0.786, 'left': 0.162, 'right': 0.0},
 (2, 0): {'up': 0.148, 'down': 0.234, 'left': 0.006, 'right': 0.612},
 (2, 1): {'up': 0.004, 'down': 0.38, 'left': 0.179, 'right': 0.437},
 (2, 2): {'up': 0.408, 'down': 0.1

#### Bellman Equations
Because we're not deterministically selecting the same trajectory every time we navigate the grid, the expected return for each state-action pair is going to change, which needs to be reflected in our state values $V(s)$ and action values $Q(s,a)$. 

In similar fashion to accounting for our stochastic state transitions, we **need to weigh the probability of each action in each state** when determining the value. 

Instead of following the **optimal policy update rule**: 
$$Q^{\pi^*}(s,a)=\sum_{s'}P(s'|s,a)[R(s,a,s')+ \gamma \cdot \max_{a'} Q(s',a')]$$

Under stochastic policy $\pi$, we'd want to set the value of the action $Q(s,a)$ equal to:
$$Q^\pi(s,a)=\sum_{s'}P(s'|s,a)[R(s,a,s')+ \gamma \cdot \text{???}]$$

Instead of considering only $\max_{a'}Q(s',a')$ (as we did when following the *optimal* policy), we must now factor in **the probability of each action being selected**, i.e., we must consider $\pi(a'|s')$, and weigh the value of $Q^\pi(s',a')$ accordingly. 

To do this, we simply **multiply the action value of each action by the probability of it happening and sum them together**:
$$Q^\pi(s,a)=\sum_{s'} P(s'|s,a)[R(s,a,s') + \gamma \cdot \sum_{a'} \pi(a'|s') \cdot Q^\pi(s',a')]$$ 

This is the **Bellman expectation equation** for the action-value function $Q^\pi(s,a)$, which defines the expected return when starting in state $s$, taking action $a$, and then following policy $\pi$ thereafter. Instead of selecting the best action as in the optimal case $\max_{a'}Q(s',a')$, we take the expected value over all actions weighted by their probabilities under the policy $\pi(a'|s')$. 

Similarly, we can also define the Bellman expectation equation for the state-value function $V^\pi(s)$: 
$$V^\pi(s)=\sum_a\pi(a|s)\sum_{s'}P(s'|s,a)[R(s,a,s')+\gamma \cdot V^\pi(s')]$$

This equation computes the expected value of a state under policy $\pi$ by averaging over all possible actions and state transitions.

Since $V^\pi(s)$ represents the expected return when following $\pi$, it is an average over actions weighted by $\pi(a|s)$, rather than selecting the best action. As a result, we do not define the value of a state as the value of the best action, meaning $V^\pi(s) \neq \max_a Q(s', a')$. 

Computing the value function $V^\pi(s)$ or the action-value function $Q^\pi(s,a)$ for a given policy $\pi$ is known as **Policy Evaluation**. 

#### Policy Iteration
So far, we have seen how we can use **Value Iteration** to compute state- and action-values, which we then use to extract the optimal policy $\pi^*(s)$ or a stochastic policy $\pi(a|s)$. 

Instead of deriving a policy after computing Q-values, we can now construct the optimal policy from scratch using **Policy Iteration**. 

Policy iteration consists of two alternating steps:
1. **Policy Evaluation**: Compute $V^\pi(s)$ or $Q^\pi(s,a)$ for the current policy $\pi$ using the equations derived above. 
2. **Policy Improvement**: Update $\pi(s)$ by selecting actions that lead to higher returns. 

By iterating over these two steps, we can refine our policy until it converges to an optimal policy $\pi^*$. 

##### Policy Evaluation 
$$Q^\pi(s,a)=\sum_{s'} P(s'|s,a)[R(s,a,s') + \gamma \cdot \sum_{a'} \pi(a'|s') \cdot Q^\pi(s',a')]$$ 

$$V^\pi(s)=\sum_a\pi(a|s)\sum_{s'}P(s'|s,a)[R(s,a,s')+\gamma \cdot V^\pi(s')]$$

##### Policy Improvement

$$\pi'(s)=\arg\max_a\sum_{s'}P(s'|s,a)[R(s,a,s')+\gamma\cdot V^\pi(s')]$$

In [289]:
np.random.seed(0)

policy = { (i, j): {a: 0.25 for a in actions} for i in range(rows) for j in range(cols)}
state_values = { (i, j): 0.0 for i in range(rows) for j in range(cols )}

In [290]:
policy

{(0, 0): {'up': 0.25, 'down': 0.25, 'left': 0.25, 'right': 0.25},
 (0, 1): {'up': 0.25, 'down': 0.25, 'left': 0.25, 'right': 0.25},
 (0, 2): {'up': 0.25, 'down': 0.25, 'left': 0.25, 'right': 0.25},
 (0, 3): {'up': 0.25, 'down': 0.25, 'left': 0.25, 'right': 0.25},
 (0, 4): {'up': 0.25, 'down': 0.25, 'left': 0.25, 'right': 0.25},
 (0, 5): {'up': 0.25, 'down': 0.25, 'left': 0.25, 'right': 0.25},
 (1, 0): {'up': 0.25, 'down': 0.25, 'left': 0.25, 'right': 0.25},
 (1, 1): {'up': 0.25, 'down': 0.25, 'left': 0.25, 'right': 0.25},
 (1, 2): {'up': 0.25, 'down': 0.25, 'left': 0.25, 'right': 0.25},
 (1, 3): {'up': 0.25, 'down': 0.25, 'left': 0.25, 'right': 0.25},
 (1, 4): {'up': 0.25, 'down': 0.25, 'left': 0.25, 'right': 0.25},
 (1, 5): {'up': 0.25, 'down': 0.25, 'left': 0.25, 'right': 0.25},
 (2, 0): {'up': 0.25, 'down': 0.25, 'left': 0.25, 'right': 0.25},
 (2, 1): {'up': 0.25, 'down': 0.25, 'left': 0.25, 'right': 0.25},
 (2, 2): {'up': 0.25, 'down': 0.25, 'left': 0.25, 'right': 0.25},
 (2, 3): {

In [291]:
state_values

{(0, 0): 0.0,
 (0, 1): 0.0,
 (0, 2): 0.0,
 (0, 3): 0.0,
 (0, 4): 0.0,
 (0, 5): 0.0,
 (1, 0): 0.0,
 (1, 1): 0.0,
 (1, 2): 0.0,
 (1, 3): 0.0,
 (1, 4): 0.0,
 (1, 5): 0.0,
 (2, 0): 0.0,
 (2, 1): 0.0,
 (2, 2): 0.0,
 (2, 3): 0.0,
 (2, 4): 0.0,
 (2, 5): 0.0,
 (3, 0): 0.0,
 (3, 1): 0.0,
 (3, 2): 0.0,
 (3, 3): 0.0,
 (3, 4): 0.0,
 (3, 5): 0.0}

In [292]:
probabilities = [0.8, 0.1, 0.1]

actions = ['up', 'down', 'left', 'right']

def get_possible_moves(state, action):
    i, j = state
    if action == 'up':
        return [(i-1, j), (i, j-1), (i, j+1)] # up, left, right
    elif action == 'down':
        return [(i+1, j), (i, j-1), (i, j+1)] # down, left, right
    elif action == 'left':
        return [(i, j-1), (i-1, j), (i+1, j)] # left, up, down
    elif action == 'right':
        return [(i, j+1), (i-1, j), (i+1, j)] # right, up, down 

#### Policy Evaluation

$$V^\pi(s)=\sum_a\pi(a|s)\sum_{s'}P(s'|s,a)[R(s,a,s')+\gamma \cdot V^\pi(s')]$$

We want to do three things:
1. Calculate the value of taking an action in a given state, summing over each outcome weighted by the probability of it happening.
2. Multiply this value by the probability of it being selected under the current policy.
3. Sum over all possible actions in a given state.

Policy evaluation will update the **state values** only.

In [293]:
def policy_evaluation():
    # We want to update every possible state
    for state in state_values:
        # Ignore terminal states and obstacles (they don't have a value)
        if state == terminal_state or grid[state] == '#':
            continue

        v = 0
        # Σ π(a|s): Sum over all actions weighted by their probability
        for action in actions:
            # π(a|s): The probability of taking this action under the current policy
            action_probability = policy[state][action]
            
            # Get the set of possible moves by taking this action
            # Remember we can go intended direction or perpendicular
            possible_moves = get_possible_moves(state, action)

            # Set a placeholder for the value of the action
            expected_value = 0

            # Σ P(s'|s,a): Sum over each state and weigh by its probability
            for move, prob in zip(possible_moves, probabilities):
                if is_valid(move):
                    reward = rewards.get(move, 0)

                    # P(s'|s,a) * [R + γ * V(s)]
                    expected_value += prob * (reward + gamma * state_values[move])
                
                else: expected_value += prob * (-5)
            
            # π(a|s) * Σ P(s'|s,a) * [R + γ * V(s)]
            # Update the value of the state
            v += action_probability * expected_value
        
        # Update V(s)
        state_values[state] = v

#### Policy Improvement

$$\pi'(s)=\arg\max_a\sum_{s'}P(s'|s,a)[R(s,a,s')+\gamma\cdot V^\pi(s')]$$

We want to modify the policy to select the action that leads to the highest next-state value. 

In [294]:
def policy_improvement():
    for state in state_values:

        # Initialise a dictionary to store the value of each action
        action_values = {}

        for action in actions:
            possible_moves = get_possible_moves(state, action)
            expected_value = 0

            for move, prob in zip(possible_moves, probabilities):
                if is_valid(move):
                    reward = rewards.get(move, 0)
                    expected_value += prob * (reward * gamma * state_values[move])
                else:
                    expected_value += prob * (-5)
            
            action_values[action] = expected_value
        
        # arg max_a: Select the best action from the state's action set
        best_action = max(action_values, key=action_values.get)

        for action in actions:
            policy[state][action] = 1.0 if action == best_action else 0.0

In [295]:
def policy_iteration(iterations: int):
    for i in range(iterations):
        policy_evaluation()
        policy_improvement()

In [296]:
policy_evaluation()

In [298]:
state_values

{(0, 0): -3.75,
 (0, 1): 0.0,
 (0, 2): -2.5,
 (0, 3): -1.8124999999999998,
 (0, 4): -1.6578125,
 (0, 5): -2.8730078125,
 (1, 0): -3.3437500000000004,
 (1, 1): 0.0,
 (1, 2): -1.8124999999999998,
 (1, 3): -0.8156249999999999,
 (1, 4): -1.8065234374999997,
 (1, 5): -2.3028945312499998,
 (2, 0): -2.00234375,
 (2, 1): -1.7005273437500001,
 (2, 2): -0.79043115234375,
 (2, 3): -2.8613626342773437,
 (2, 4): 0.0,
 (2, 5): -0.5181512695312502,
 (3, 0): -2.95052734375,
 (3, 1): -2.2964873046875005,
 (3, 2): -3.1945566528320315,
 (3, 3): 0.0,
 (3, 4): -1.25,
 (3, 5): 0.0}

In [299]:
policy_improvement()

In [300]:
policy

{(0, 0): {'up': 0.0, 'down': 1.0, 'left': 0.0, 'right': 0.0},
 (0, 1): {'up': 0.0, 'down': 0.0, 'left': 1.0, 'right': 0.0},
 (0, 2): {'up': 0.0, 'down': 1.0, 'left': 0.0, 'right': 0.0},
 (0, 3): {'up': 0.0, 'down': 1.0, 'left': 0.0, 'right': 0.0},
 (0, 4): {'up': 0.0, 'down': 1.0, 'left': 0.0, 'right': 0.0},
 (0, 5): {'up': 0.0, 'down': 1.0, 'left': 0.0, 'right': 0.0},
 (1, 0): {'up': 1.0, 'down': 0.0, 'left': 0.0, 'right': 0.0},
 (1, 1): {'up': 0.0, 'down': 1.0, 'left': 0.0, 'right': 0.0},
 (1, 2): {'up': 0.0, 'down': 0.0, 'left': 0.0, 'right': 1.0},
 (1, 3): {'up': 1.0, 'down': 0.0, 'left': 0.0, 'right': 0.0},
 (1, 4): {'up': 1.0, 'down': 0.0, 'left': 0.0, 'right': 0.0},
 (1, 5): {'up': 0.0, 'down': 0.0, 'left': 1.0, 'right': 0.0},
 (2, 0): {'up': 0.0, 'down': 0.0, 'left': 0.0, 'right': 1.0},
 (2, 1): {'up': 0.0, 'down': 1.0, 'left': 0.0, 'right': 0.0},
 (2, 2): {'up': 1.0, 'down': 0.0, 'left': 0.0, 'right': 0.0},
 (2, 3): {'up': 1.0, 'down': 0.0, 'left': 0.0, 'right': 0.0},
 (2, 4):

In [301]:
visualise_path()

S # - - ↓ ↓
↓ # → → → ↓
→ → ↑ - # ↓
- - - # - F


In [302]:
policy_iteration(10)
visualise_path()

S # - - - -
↓ # → → → ↓
→ → ↑ - # ↓
- ↑ - # - F


## Review


## Next ...