---

<div class="alert alert-primary alert-info">

# Frozen Lake $4\times4$ と $8\times8$

## Reinforcement Learning

</div>

<div class="alert alert-block alert-success">

- ### Q-Learning
    
</div>

---

<img src='frozenlake.jpg' width=1000 height=50/>

---

In [1]:
%config IPCompleter.greedy=True
%matplotlib inline

In [2]:
import numpy as np
import matplotlib.pyplot as plt

import gym

import math
import typing

In [3]:
np.random.seed(1)

---

<div class="alert alert-danger" role="alert">

# $\lambda = 0$
    
</div>

---

$
\begin{align}
q_{t+1}(s, a) &:= q_{t}(s, a) + \eta (r^{a}_{t+1} + \gamma \max_{\forall a'} \{ q_{t+1}(s_{t+1}, a') \} - q_{t}(s, a) ), \eta \in [0, 1], \gamma \in (0, 1] \\
\end{align}
$

$\pi_{*} (a \mid s) := \underset{a\in A}{\operatorname{argmax}} q_{\pi}(s, a)$

---

<div class="alert alert-primary alert-info">
    
## Non-slippery version
    
</div>

---

In [4]:
LEFT, DOWN, RIGHT, UP, TERMINAL = 0, 1, 2, 3, -1


def print_policy(policy: np.ndarray, dim: int) -> None:
    for state, action in enumerate(policy):
        if state == env.nS - 1:
            print('G', end=' ')
        elif action == LEFT:
            print('<', end=' ')
        elif action == DOWN:
            print('v', end=' ')
        elif action == RIGHT:
            print('>', end=' ')
        elif action == UP:
            print('^', end=' ')
        else:
            print('H', end=' ')
        if (state + 1) % dim == 0:
            print('\n')

---

In [5]:
def q_learning_non_slippery(env: typing.Type[gym.Wrapper],
                            gamma: float,
                            epsilon: float,
                            eta: float,
                            max_num_episodes: int,
                            max_moves_per_episodes: int,
                            max_exploring_actions: int) -> None:
    
    policy = np.random.randint(0, env.nA, size=(env.nS)).astype(np.int8)
    action_values = np.zeros((env.nS, env.nA))
    
    env.reset()
    print('Start:')
    env.render()
    
    dim = np.sqrt(env.nS)
    
    print('\nInitial random policy:\n')
    print_policy(policy, dim)
    
    for iteration in range(max_num_episodes):
        
        curr_state = env.reset()
        
        for _ in range(max_moves_per_episodes):
            
            if iteration < max_exploring_actions:
                action = np.random.randint(0, env.nA)  # Explore action space
            else:
                action = policy[curr_state]
            
            prev_action_values = action_values.copy()
            
            next_state, reward, finished, _ = env.step(action)
            
            delta = reward + gamma * np.max(action_values[next_state])
            action_values[curr_state, action] += eta * ( delta - action_values[curr_state, action] )
            
            policy[curr_state] = np.argmax(action_values[curr_state])
            
            curr_state = next_state
            
            if finished:
                break
                
        delta = np.fabs(action_values - prev_action_values).max()
        if delta <= epsilon * (1 - gamma) / gamma and not math.isclose(delta, 0.0):
            break

    print(f'Number of iterations: {iteration + 1}')
    print(f'Delta: {delta}')
    # print('Action values:\n', action_values)
    print('\nFinal policy:\n')
    print_policy(policy, dim)

---

### $4\times4$

---

In [6]:
env = gym.make('FrozenLake-v0', is_slippery=False)

gamma = 0.999
epsilon = 0.0001
eta = 0.001
max_num_episodes = 100000
max_moves_per_episodes = 1000
max_exploring_actions = 1000

q_learning_non_slippery(env, gamma, epsilon, eta, max_num_episodes, max_moves_per_episodes, max_exploring_actions)

Start:

[41mS[0mFFF
FHFH
FFFH
HFFG

Initial random policy:

v ^ < < 

^ v ^ v 

^ < < v 

< ^ v G 

Number of iterations: 10191
Delta: 1.0007350714769103e-07

Final policy:

> > v < 

v v v v 

> > v v 

< > > G 



---

### $8\times8$

---

In [7]:
env = gym.make('FrozenLake8x8-v0', is_slippery=False)

gamma = 0.999
epsilon = 0.0001
eta = 0.001
max_num_episodes = 500000
max_moves_per_episodes = 1000
max_exploring_actions = 10000

q_learning_non_slippery(env, gamma, epsilon, eta, max_num_episodes, max_moves_per_episodes, max_exploring_actions=10000)

Start:

[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG

Initial random policy:

< v ^ < < > > ^ 

> > v > ^ ^ > ^ 

^ > ^ ^ ^ > ^ < 

< > v v > v < < 

^ < > ^ v < < < 

> v < v v ^ < ^ 

^ ^ v < v ^ < > 

> ^ > < < > ^ G 

Number of iterations: 4889
Delta: 4.4353789258588883e-14

Final policy:

> v v v > > > v 

> > > > > > > v 

^ ^ ^ ^ > > > v 

> > > > v v > v 

^ ^ ^ ^ > > > v 

^ v < > ^ ^ < v 

^ ^ > ^ v < < v 

^ < ^ < < < > G 



---

<div class="alert alert-primary alert-info">

## Slippery when Wet

<img src='Slippery_when_wet.jpg' width=250 height=5/>

</div>

---

In [8]:
def q_learning_slippery(env: typing.Type[gym.Wrapper],
                        gamma: float,
                        epsilon: float,
                        eta: float,
                        max_num_episodes: int,
                        max_moves_per_episodes: int,
                        max_exploring_actions: int) -> None:
    
    policy = np.random.randint(0, env.nA, size=(env.nS)).astype(np.int8)
    action_values = np.zeros((env.nS, env.nA))
    
    env.reset()
    print('Start:')
    env.render()
    
    dim = np.sqrt(env.nS)
    
    print('\nInitial random policy:\n')
    print_policy(policy, dim)
    
    for iteration in range(max_num_episodes):
        
        curr_state = env.reset()
        
        for _ in range(max_moves_per_episodes):
            
            if iteration < max_exploring_actions:
                action = np.random.randint(0, env.nA) # Explore action space
            else:
                action = policy[curr_state]
            
            prev_action_values = action_values.copy()
            
            next_state, reward, finished, _ = env.step(action)
            
            # Enhanced policy improvements using intended action lookbacks.
            # Keep track of previous states to estimate true current state.
            if curr_state - 1 == next_state:
                action = LEFT
            elif curr_state + 1 == next_state:
                action = RIGHT
            elif curr_state + dim == next_state:
                action = DOWN
            elif curr_state - dim == next_state:
                action = UP
            
            delta = reward + gamma * np.max(action_values[next_state])
            action_values[curr_state, action] += eta * ( delta - action_values[curr_state, action] )
            
            policy[curr_state] = np.argmax(action_values[curr_state])
            
            curr_state = next_state
            
            if finished:
                break
                
        delta = np.fabs(action_values - prev_action_values).max()
        if delta <= epsilon * (1 - gamma) / gamma and not math.isclose(delta, 0.0):
            break
    
    print(f'Number of iterations: {iteration + 1}')
    print(f'Delta: {delta}')
    # print('Action values:\n', action_values)
    print('\nFinal policy:\n')
    print_policy(policy, dim)

---

### $4\times4$

---

In [9]:
env = gym.make('FrozenLake-v0', is_slippery=True)

gamma = 0.999
epsilon = 0.0001
eta = 0.001
max_num_episodes = 500000
max_moves_per_episodes = 1000
max_exploring_actions = 1000

q_learning_slippery(env, gamma, epsilon, eta, max_num_episodes, max_moves_per_episodes, max_exploring_actions)

Start:

[41mS[0mFFF
FHFH
FFFH
HFFG

Initial random policy:

> v ^ < 

v > ^ > 

< > > < 

< v ^ G 

Number of iterations: 158
Delta: 1.3903306411762718e-17

Final policy:

> > v < 

v > v > 

> > v < 

< < > G 



---

### $8\times8$

---

In [10]:
env = gym.make('FrozenLake8x8-v0', is_slippery=True)

gamma = 0.999
epsilon = 0.0001
eta = 0.001
max_num_episodes = 500000
max_moves_per_episodes = 1000
max_exploring_actions = 1000

q_learning_slippery(env, gamma, epsilon, eta, max_num_episodes, max_moves_per_episodes, max_exploring_actions=1000)

Start:

[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG

Initial random policy:

v > < ^ ^ > > ^ 

v ^ < ^ ^ < v v 

< > v > > ^ > ^ 

^ < < ^ ^ > v v 

v v ^ < ^ v ^ > 

^ v v v < > < > 

> v < < < ^ < > 

^ v < > v < < G 

Number of iterations: 500000
Delta: 0.0

Final policy:

> v v v v v v < 

> > > > > v v < 

> ^ ^ > > > v v 

> ^ ^ > v > v v 

> ^ ^ < > > > v 

^ v v > ^ ^ < v 

^ v > ^ < < < v 

^ < ^ > v < < G 



---