# 2. Markov Decision Processes
## 2.2 Introducing Randomness

- Random transition probabilities $P(s'|s,a)$
- Non-deterministic policies $\pi(a|s)$

## Deterministic Transitions

In [1100]:
import numpy as np

# Discount factor applied to the value of the next state.
gamma = 0.9

# Gridworld layout. '#' cells are rejected by is_valid(); the walk drawn by
# visualise_path() starts at (0, 0) ('S') and stops at terminal_state ('F').
grid = np.array([
    ['S', '#', '-', '-', '-', '-'],
    ['-', '#', '-', '-', '-', '-'],
    ['-', '-', '-', '-', '#', '-'],
    ['-', '-', '-', '#', '-', 'F'],
])

rows, cols = grid.shape
terminal_state = (3, 5)

actions = ['up', 'down', 'left', 'right']

# Reward for entering a state; states not listed here are looked up with a default of 0.
rewards = {terminal_state: 10}

# One zero-initialised table of action values per grid cell, keyed by (row, col).
Q_values = {
    (r, c): dict.fromkeys(actions, 0.0)
    for r in range(rows)
    for c in range(cols)
}

def is_valid(state):
    """Return whether `state` (row, col) lies inside the grid and is not a '#' cell."""
    row, col = state
    if not (0 <= row < rows):
        return False
    if not (0 <= col < cols):
        return False
    return grid[row, col] != '#'

def get_next_state(state, action):
    """Apply `action` to `state` deterministically.

    Returns:
        (True, next_state) when the target cell passes is_valid(),
        otherwise (False, state) so the caller can see the move was rejected.
    """
    row, col = state
    if action == 'up':
        d_row, d_col = -1, 0
    elif action == 'down':
        d_row, d_col = 1, 0
    elif action == 'left':
        d_row, d_col = 0, -1
    elif action == 'right':
        d_row, d_col = 0, 1

    candidate = (row + d_row, col + d_col)
    if not is_valid(candidate):
        return False, state
    return True, candidate

def update():
    """Sweep once over every state and overwrite each Q(s, a) in place.

    A move accepted by get_next_state() is scored as
    reward + gamma * max_a' Q(s', a'); a rejected move is scored -5.
    The terminal state and '#' cells are skipped.
    """
    for state, action_values in Q_values.items():
        if state == terminal_state or grid[state] == '#':
            continue
        for action in actions:
            moved, next_state = get_next_state(state, action)
            if not moved:
                action_values[action] = -5
                continue
            reward = rewards.get(next_state, 0)
            action_values[action] = reward + gamma * max(Q_values[next_state].values())

# Arrow glyph used to draw each action when printing the grid.
action_symbols = dict(up="↑", down="↓", left="←", right="→")

def get_best_action(state):
    """Return the action with the highest Q-value in `state` (the first one wins ties)."""
    action_values = Q_values[state]
    return max(action_values, key=action_values.get)

def visualise_path():
    """Print the grid with the greedy walk from (0, 0) drawn as arrows.

    Starting at (0, 0), repeatedly takes get_best_action() and moves with
    get_next_state() until terminal_state is reached or 1000 steps have
    been taken. Every visited cell other than 'S' and 'F' is printed as
    the arrow of its best action.
    """
    position = (0, 0)
    visited = []

    # Step cap so a walk that never reaches the terminal state still ends.
    for _ in range(1000):
        visited.append(position)
        if position == terminal_state:
            break
        # A rejected move leaves the position unchanged.
        _, position = get_next_state(position, get_best_action(position))

    canvas = grid.copy()
    for row, col in visited:
        if canvas[row, col] in ['S', 'F']:
            continue
        canvas[row, col] = action_symbols[get_best_action((row, col))]

    for line in canvas:
        print(" ".join(line))
        
# Run 50 sweeps of update(), then print the greedy path from the start cell.
for i in range(50):
    update()

visualise_path()

S # - - - -
↓ # → → → ↓
→ → ↑ - # ↓
- - - # - F


In [1101]:
# Round every stored action value to 2 decimal places for display.
# NOTE: this rebinds Q_values to the rounded copy, so later sweeps start from rounded values.
Q_values = {
    cell: {name: round(q, 2) for name, q in action_values.items()}
    for cell, action_values in Q_values.items()
}
Q_values

{(0, 0): {'up': -5, 'down': 3.87, 'left': -5, 'right': -5},
 (0, 1): {'up': 0.0, 'down': 0.0, 'left': 0.0, 'right': 0.0},
 (0, 2): {'up': -5, 'down': 5.9, 'left': -5, 'right': 5.9},
 (0, 3): {'up': -5, 'down': 6.56, 'left': 5.31, 'right': 6.56},
 (0, 4): {'up': -5, 'down': 7.29, 'left': 5.9, 'right': 7.29},
 (0, 5): {'up': -5, 'down': 8.1, 'left': 6.56, 'right': -5},
 (1, 0): {'up': 3.49, 'down': 4.3, 'left': -5, 'right': -5},
 (1, 1): {'up': 0.0, 'down': 0.0, 'left': 0.0, 'right': 0.0},
 (1, 2): {'up': 5.31, 'down': 5.31, 'left': -5, 'right': 6.56},
 (1, 3): {'up': 5.9, 'down': 5.9, 'left': 5.9, 'right': 7.29},
 (1, 4): {'up': 6.56, 'down': -5, 'left': 6.56, 'right': 8.1},
 (1, 5): {'up': 7.29, 'down': 9.0, 'left': 7.29, 'right': -5},
 (2, 0): {'up': 3.87, 'down': 3.87, 'left': -5, 'right': 4.78},
 (2, 1): {'up': -5, 'down': 4.3, 'left': 4.3, 'right': 5.31},
 (2, 2): {'up': 5.9, 'down': 4.78, 'left': 4.78, 'right': 5.9},
 (2, 3): {'up': 6.56, 'down': -5, 'left': 5.31, 'right': -5},
 (

## Non-Deterministic Transitions
Recall the formulation of a Markov decision process:
$$(\mathcal{S}, \mathcal{A}, P, \mathcal{R}, \gamma)$$

Where: 
- $\mathcal{S}$ is the **state space**.
- $\mathcal{A}$ is the **action space**.
- $P(s'|s,a)$ is the **transition function** - the probability of reaching state $s'$ by taking action $a$ in state $s$.
- $R(s,a,s')$ is the reward received immediately after taking action $a$ in state $s$. 
- $\gamma$ is a discount factor.

Previously, in our gridworld, we worked by the assumption that state transitions were **completely deterministic**, i.e.:
$$P(s'|s,a) \in \{0, 1\} \quad \forall\, s, s' \in \mathcal{S},\ a \in \mathcal{A}$$

This means that whenever we took a given action in a given state, we would move to the same next state with 100% certainty. For example, if we take action '$\text{down}$' in state $(0,0)$, our agent will *always* move to state $(1,0)$. 

In the real world, state transitions are usually not deterministic, but instead are **probabilistic (stochastic)**. 

Let's go back to our gridworld. This time, the grid is slippery - instead of moving in our intended direction with 100% certainty, there is now a chance of slipping and moving in an unintended direction. 

Instead, when an action is chosen, we will: 
- Move in the intended direction with $p=0.8$.
- Move left of the intended direction with $p=0.1$.
- Move right of the intended direction with $p=0.1$.
- Remain in the same state if the action results in an invalid move.

Now our **state transition function $P(s'|s,a)$** becomes: 
$$P(s'|s,a)=
\begin{cases}
0.8, & \text{if } s' \text{ is the intended direction from } s \text{ given } a \\
0.1, & \text{if } s' \text{ is to the left of the intended direction from } s  \\
0.1, & \text{if } s' \text{ is to the right of the intended direction from } s  \\
0, & \text{otherwise}
\end{cases}$$

In [1102]:
def is_valid(state):
    """Return whether `state` (row, col) lies inside the grid and is not a '#' cell."""
    row, col = state
    inside_grid = 0 <= row < rows and 0 <= col < cols
    return inside_grid and grid[row, col] != '#'

In [1103]:
def update():
    """Sweep once over every state and overwrite each Q(s, a) in place.

    Each action is scored from the single next state that get_next_state()
    returns: reward + gamma * max_a' Q(s', a') when the move is accepted,
    -5 when it is rejected. The terminal state and '#' cells are skipped.
    """
    gamma = 0.9
    for state, action_values in Q_values.items():
        if state == terminal_state or grid[state] == '#':
            continue

        for action in actions:
            moved, next_state = get_next_state(state, action)
            if not moved:
                action_values[action] = -5
                continue
            reward = rewards.get(next_state, 0)
            action_values[action] = reward + gamma * max(Q_values[next_state].values())

To introduce this stochasticity into our gridworld, we introduce a new function: $\text{get\_possible\_moves}$. This function will return a list of three tuples, with the $0$-th tuple of the list representing the intended action, and the other two tuples representing a perpendicular move to the intended direction.

In [1104]:
def get_possible_moves(state, action):
    """List the three cells `action` can lead to from `state`.

    The first entry is the intended move; the other two are the moves
    perpendicular to it. Cells are not checked for validity here.
    Returns None when `action` is not one of the four known actions.
    """
    row, col = state
    up_cell, down_cell = (row - 1, col), (row + 1, col)
    left_cell, right_cell = (row, col - 1), (row, col + 1)

    outcomes = {
        'up': [up_cell, left_cell, right_cell],
        'down': [down_cell, left_cell, right_cell],
        'left': [left_cell, up_cell, down_cell],
        'right': [right_cell, up_cell, down_cell],
    }
    return outcomes.get(action)

We then update our $\text{get\_next\_state}$ function to first sample the actual move from our set of possible moves. 

In [1105]:
# Seed NumPy's global RNG so the sampled transitions are repeatable.
np.random.seed(0)
# Ordered to match get_possible_moves(): intended move first, then the two perpendicular moves.
probabilities = [0.8, 0.1, 0.1] # p=0.8 of moving in the intended direction, (1-p)/2 for moving perpendicular

def get_next_state(state, action):
    """Sample where `action` actually leads from `state`.

    One of the three cells from get_possible_moves() is drawn using
    `probabilities`. Returns (True, cell) when the drawn cell passes
    is_valid(), otherwise (False, state).
    """
    candidates = get_possible_moves(state, action)
    drawn = np.random.choice(3, p=probabilities)
    landing = candidates[drawn]

    if not is_valid(landing):
        return False, state
    return True, landing

We perform our updates as before, using the equation: 
$$Q(s,a)=R + \gamma \cdot \max_{a'}Q(s',a')$$

For now, we just perform one update.

In [1106]:
# One sweep of update() over every state.
update()

We can now visualise how an agent following the *optimal policy* $\pi^*$ (that just means selecting the **best** action in each state every time) navigates the grid after just one update.

Recall: 
$$\pi^*(s)= \arg\max_{a}Q(s,a)$$

We can use this equation to plot our agent's course across the grid. Look closely. We see that in state $(2,3)$ the agent now believes the best action is '$\text{left}$' - even though, with deterministic transitions, the best action in that state was '$\text{up}$'.

In [1107]:
# Print the grid with the greedy path marked by arrows.
visualise_path()

S # - - - -
↓ # - → → ↓
→ → → ← # ↓
- - - # - F


Recall from the previous notebook that the update function iterates over all states and **retrieves the value of the next state corresponding to each action**. It will then use the value of this next state to update its belief about the value of its *own* state. 

Specifically, we look at this part of the $\text{update}$ function: 
```python
for action in actions:
    is_valid, next_state = get_next_state(state, action)
    if is_valid:
        reward = rewards.get(next_state, 0)
        Q_values[state][action] = reward + gamma * max(Q_values[next_state].values())
```
Recall how we laid out our state transition probabilities earlier: we said that with probability $0.8$, the action would transition state as intended, but with probability $0.2$, the action would transition state perpendicular to the intended. 

This means that during our update, the agent tested action '$\text{left}$' in state $(2,3)$ and, with a probability of 10%, it moved upwards instead (i.e., ended up in state $(1,3)$). It then used the *value of this state* to update its belief about the value of $Q((2,3), \text{left})$. 

Let's do another iteration of our update rule.

In [1108]:
# A second sweep of update() over every state.
update()

And visualise the path that it has learned now. Already, it is failing to learn an optimal path to reach the terminal state.

In [1109]:
# Print the greedy path again after the second sweep.
visualise_path()

S # - ↓ ↓ ↓
↓ # → → → ↑
→ ↑ ↑ - # -
- - - # - F


We can see the agent gets stuck in state $(1,5)$. Let's look at the Q-values for this state and see what the agent has learned.

In [1110]:
# Show the action values currently stored for state (1, 5).
Q_values[(1,5)]

{'up': 7.29, 'down': 7.29, 'left': 7.29, 'right': -5}

The Q-values for actions '$\text{up}$', '$\text{down}$', and '$\text{left}$' have been assigned equal value. With each value equal, the agent will simply pick the **first occurrence** of the maximum value, and so it selects action '$\text{up}$'.

We can also visualise what happens when we update our Q-values a few more times:

In [1111]:
# Ten more sweeps of update(), then print the greedy path.
for _ in range(10):
    update()
visualise_path()

S # - - - -
↑ # - - - -
- - - - # -
- - - # - F


The agent gets stuck almost immediately.

To address this, let's consider our update rule: 
$$Q(s,a)=R + \gamma \cdot \max_{a'} Q(s',a')$$

When we update our Q-value using this rule, we assume that **the action always transitions to the best next state**. This means that the agent is not learning the **true expected value of the next state**. 

Consider an action which has a 90% chance of a reward of $10$, but a 10% chance of a penalty of $100$. Can we assign a value of $10$ to the action just because it is the most probable outcome? No - **we must weight all outcomes** by their chances of happening to get an accurate representation of their value: 
$$(0.9 * 10) + (0.1 * -100) = -1$$

This is **exactly** what we need to consider when we're navigating our grid. Each action has a probability of transitioning to three separate states: 
- The one intended, with probability $0.8$.
- To the left of the direction intended with probability $0.1$. 
- To the right of the direction intended with probability $0.1$. 

For each state that we could possibly land in, we need to consider both the **immediate reward** and the **value of that state**.

Let's go through an example to see how that works.

We'll work backwards from state $(3,5)$. We saw last time that the value of any action from the terminal state is $0$ because there are no future states and no more rewards to be gained, i.e.: 
$$Q((3,5),a) = 0$$

We step backwards to find $Q((2,5), \text{down})$. We know that the immediate reward for taking this action is $10$, but we **must also consider** that when the agent takes this action, there is a 10% probability it will move **left**, into an obstacle, and a 10% probability that it will move **right**, out of bounds. Both of these transitions carry a **$-5$ penalty**, and so they must be factored into the value of this action. 

So we know: 
- There is a **$0.8$ probability of reward $10$** by moving down.
- A **$0.1$ probability of reward $-5$** by moving left.
- A **$0.1$ probability of reward $-5$** by moving right.

We can sum these to find the immediate expected reward: 
$$(0.8 \times 10) + (0.1 \times -5) + (0.1 \times -5) = 7$$

$$Q((2,5), \text{down}) = 7 + 0.9 \cdot 0 = 7$$

Instead of working with Q-values (action-values), we're going to switch to working with **State Values $V(s)$** to keep things a little bit cleaner. 

Recall from the previous notebook that the **value of the state** is equal to the **value of the best action in that state**, i.e.: 
$$V(s) = \max_a Q(s,a)$$

Think of the value of a state as being the 'best opportunity' that state offers.

Let's now look at $V((1,5))$.
$$V((1,5)) := Q((1,5), \text{down})$$ 

We also want to consider the **value** of the next state $V(s')$ in our calculations, so we can propagate distant rewards back through our grid.

We know: 
- There is a **$0.8$ probability of moving to a state with value $7$** but **no immediate reward** by moving down.
- A **$0.1$ probability of moving left to state $(1,4)$** - a valid, open cell - with **no immediate reward**. We haven't calculated its value yet, so for now we take $V((1,4)) = 0$; the estimate below will rise a little once that state picks up a positive value.
- A **$0.1$ probability of immediate reward $-5$** by moving right, out of bounds, to a state with **no value**.

Remembering that we need to discount future values using discount factor $\gamma=0.9$, our expected *value* is: 
$$(0.8 \times [0 + (0.9 \cdot 7)]) + (0.1 \times [0 + (0.9 \cdot 0)]) + (0.1 \times [-5 + 0]) = 4.54$$

Each time, we're weighting both the **immediate reward and the value of the next state** by the probability that taking a given action reaches that state, and then summing across all possible states that may result from that action.

We represent this process mathematically as follows: 
$$V(s)=\max_a \sum_{s'}P(s'|s,a)[R(s,a,s')+ \gamma \cdot V(s')]$$

Equivalently: 
$$Q(s,a)=\sum_{s'}P(s'|s,a)[R(s,a,s')+ \gamma \cdot \max_{a'} Q(s',a')]$$

Let's change our $\text{update}$ function to reflect this new update rule. 

In [1112]:
def update():
    """Sweep once over every state, setting each Q(s, a) to its expected value.

    For every action, the three outcomes from get_possible_moves() are
    weighted by `probabilities` and summed:
        sum over s' of P(s'|s, a) * [R + gamma * max_a' Q(s', a')]
    An outcome that fails is_valid() contributes reward -5 and no future
    value. The terminal state and '#' cells are skipped.
    """
    gamma = 0.9
    for state, action_values in Q_values.items():
        if state == terminal_state or grid[state] == '#':
            continue

        for action in actions:
            expected_value = 0
            outcomes = zip(get_possible_moves(state, action), probabilities)

            for move, prob in outcomes:
                if not is_valid(move):
                    reward, next_value = -5, 0
                else:
                    reward = rewards.get(move, 0)
                    next_value = max(Q_values[move].values())
                # P(s'|s, a) * [R + gamma * max Q(s', a')]
                expected_value += prob * (reward + gamma * next_value)

            action_values[action] = expected_value


Let's iterate over our update rule to give our values a chance to converge.

In [1113]:
# 100 sweeps of the expected-value update().
for _ in range(100):
    update()

And have a look at our new Q-values.

In [1114]:
# Round every stored action value to 2 decimal places for display.
# NOTE: this rebinds Q_values to the rounded copy.
Q_values = {
    cell: {name: round(q, 2) for name, q in action_values.items()}
    for cell, action_values in Q_values.items()
}
Q_values

{(0, 0): {'up': -5.0, 'down': -1.42, 'left': -4.55, 'right': -4.55},
 (0, 1): {'up': 0.0, 'down': 0.0, 'left': 0.0, 'right': 0.0},
 (0, 2): {'up': -4.29, 'down': 1.3, 'left': -4.3, 'right': 1.36},
 (0, 3): {'up': -3.62, 'down': 2.3, 'left': 0.72, 'right': 1.78},
 (0, 4): {'up': -3.5, 'down': 2.83, 'left': 1.45, 'right': 2.12},
 (0, 5): {'up': -4.25, 'down': 3.23, 'left': 1.97, 'right': -4.07},
 (1, 0): {'up': -2.02, 'down': -0.58, 'left': -4.08, 'right': -4.08},
 (1, 1): {'up': 0.0, 'down': 0.0, 'left': 0.0, 'right': 0.0},
 (1, 2): {'up': 0.72, 'down': 1.05, 'left': -3.71, 'right': 2.21},
 (1, 3): {'up': 2.15, 'down': 1.64, 'left': 1.94, 'right': 2.68},
 (1, 4): {'up': 2.71, 'down': -3.32, 'left': 1.68, 'right': 3.23},
 (1, 5): {'up': 2.12, 'down': 4.83, 'left': 3.25, 'right': -3.08},
 (2, 0): {'up': -0.84, 'down': -0.38, 'left': -4.05, 'right': 0.58},
 (2, 1): {'up': -3.78, 'down': 0.73, 'left': -0.02, 'right': 0.87},
 (2, 2): {'up': 1.82, 'down': 0.85, 'left': 0.91, 'right': 1.42},
 

In [1115]:
# Print the greedy path under the expected-value Q-values.
visualise_path()

S # - - - -
↓ # → → → ↓
→ → ↑ ↑ # ↓
- ↑ - # - F
