# Q-Learning

Originally from https://skettee.github.io/post/q_learning/ (in Korean)

## Load Libraries and Extensions

In [1]:
# Reload edited modules automatically before each cell execution.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook.
%matplotlib inline

In [2]:
from IPython.display import display, clear_output, Pretty
import numpy as np
from pprint import pprint
from time import sleep
from tqdm import tqdm_notebook as tqdm

import gym

## Probabilistic Frozen Lake Environment

In [3]:
# Gym environment id passed to gym.make().
ENV_NAME = 'FrozenLake8x8-v0'
# Upper bound on the number of steps taken in the random-walk demo.
N_STEP = 100

In [4]:
# Create the environment and place the agent on the start tile.
env = gym.make(ENV_NAME)
state = env.reset()

# Draw the initial board as ANSI text, then pause briefly so it stays visible
# before a later cell overwrites the output.
world = env.render(mode='ansi')
display(Pretty(world))
sleep(.5)


[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG


In [5]:
# Random-policy rollout: take uniformly random actions for at most N_STEP
# steps and redraw the board after every move.

# Start a fresh episode so this cell does not depend on whatever state a
# previous run left the environment in (re-running it after an episode has
# ended would otherwise step an already-terminated environment).
state = env.reset()

for step in range(N_STEP):
    action = env.action_space.sample()
    next_state, reward, done, info = env.step(action)
    state = next_state

    # Replace the previous frame with the updated board.
    world = env.render(mode='ansi')
    clear_output(wait=True)
    display(Pretty(world))
    sleep(0.5)

    if done:  # the episode finished
        print("Episode finished after {} timesteps".format(step+1))
        break

  (Right)
SFFFFFFF
FFFFFFFF
FFF[41mH[0mFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG


Episode finished after 50 timesteps


In [6]:
# Transition table for state 55, the tile directly above the goal (state 63).
# Each action maps to a list of (probability, next_state, reward, done) tuples.
pprint(env.P[55])

{0: [(0.3333333333333333, 47, 0.0, False),
     (0.3333333333333333, 54, 0.0, True),
     (0.3333333333333333, 63, 1.0, True)],
 1: [(0.3333333333333333, 54, 0.0, True),
     (0.3333333333333333, 63, 1.0, True),
     (0.3333333333333333, 55, 0.0, False)],
 2: [(0.3333333333333333, 63, 1.0, True),
     (0.3333333333333333, 55, 0.0, False),
     (0.3333333333333333, 47, 0.0, False)],
 3: [(0.3333333333333333, 55, 0.0, False),
     (0.3333333333333333, 47, 0.0, False),
     (0.3333333333333333, 54, 0.0, True)]}


The agent moves in the intended direction with probability only 1/3; with the remaining 2/3 it slips to one of the two perpendicular directions.

## Q-Learning

$Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha \left( R_{t+1} + \gamma \max_{a'} Q(S_{t+1}, a') - Q(S_t, A_t) \right)$

In [7]:
# Q-table dimensions, taken from the environment's discrete spaces.
n_state, n_action = env.observation_space.n, env.action_space.n

n_episode = 5000  # number of training episodes
GAMMA = 0.9       # discount factor applied to the bootstrapped next-state value
EPSILON = 0.3     # probability of taking a random (exploratory) action
ALPHA = 0.1       # learning rate scaling each TD update

In [8]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.

# Terminal tiles are the holes ('H') and the goal ('G'). They are read from the
# environment's own map rather than listed by hand: the hand-written list
# missed the hole at state 46, so that row kept non-zero initial values and
# was bootstrapped from as if it were a non-terminal state.
tiles = env.unwrapped.desc.ravel()
terminal_states = np.flatnonzero((tiles == b'H') | (tiles == b'G')).tolist()

# Near-zero random initial values (presumably to break argmax ties between
# untrained actions); terminal rows are forced to exactly 0 because no action
# is ever taken from them and their value is used in the TD target below.
Q_table = np.random.uniform(low=0.0, high=0.00000001, size=(n_state, n_action))
Q_table[terminal_states] = 0

for episode in tqdm(range(n_episode)):
    state = env.reset()
    done = False

    while not done:
        # Explore with probability EPSILON, otherwise act greedily.
        if np.random.uniform() < EPSILON:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q_table[state])
        next_state, reward, done, info = env.step(action)

        # TD target: immediate reward plus the discounted best next-state value.
        target = reward + GAMMA * np.max(Q_table[next_state])
        delta = target - Q_table[state, action]
        Q_table[state, action] += ALPHA * delta
        state = next_state

HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))




## Solution

In [9]:
# Roll out one episode with the greedy policy read from the learned Q-table,
# redrawing the board after every step.
state = env.reset()
done = False

world = env.render(mode='ansi')
display(Pretty(world))
sleep(.5)

while not done:
    action = np.argmax(Q_table[state])
    state, reward, done, info = env.step(action)

    world = env.render(mode='ansi')
    clear_output(wait=True)
    display(Pretty(world))
    sleep(.5)

    if done:
        # Only the transition into the goal yields a positive reward, so the
        # final reward identifies success without hard-coding the goal's state
        # index. The environment is slippery, so a greedy rollout can still
        # fail; report that outcome instead of ending silently.
        if reward > 0:
            print('\nSuccess!')
        else:
            print('\nFailed: the episode ended in state {} without reaching the goal.'.format(state))

  (Right)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFF[41mG[0m



Success!
