# FrozenLake - Q Learning
---
> 가장 기초적이고 기본적인 강화학습 연습 환경이다. 시작점 S 에서 시작해서 목표 G 에 도착해야 한다. G 에 도착하거나 H 에 빠지면 episode 가 종료된다.

* state : 4x4 맵, 0 - 15 one-hot encoding 
* action : 좌-0 하-1 우-2 상-3
* reward : G 에 닿으면 +1, 나머지는 0
* 종료조건 : H 에 닿으면 종료
---

In [1]:
import gym
import numpy as np
import random
from gym.envs.registration import register

In [2]:
'''
환경셋팅 한 후에 환경을 추가등록한다.
'''

register(
    id='FrozenLake-v1',
    entry_point="gym.envs.toy_text:FrozenLakeEnv",
    kwargs={'map_name':'4x4','is_slippery':False})

In [115]:
'''
환경 생성
'''
env = gym.make('FrozenLake-v1')

In [116]:
print(env.action_space.n)
print(env.observation_space.n)

4
16


In [117]:
q_table = np.zeros([env.action_space.n, env.observation_space.n])
gamma = .95
epsilon = 1
alpha = .1
episode_total = 3000
episode = 0

In [118]:
state = env.reset()

while(episode < episode_total):
    if(random.random() < epsilon):
        action = env.action_space.sample()
    else:
        action = np.argmax(q_table[ : , state])

    state_next, reward, done, _ = env.step(action)
    q_table[action, state] += alpha * (reward + np.max(gamma * q_table[ : , state_next]) - q_table[action, state])
    state = state_next

    if(done):
        if(reward > 0):
            #print(reward)
            if(epsilon < .1):
                epsilon = .1
            else:
                epsilon = 1 / (1 + episode / 10)
        episode += 1
        env.reset()
env.close()

In [119]:
print(q_table)
s = env.reset()

while(True):
    a = np.argmax(q_table[ : , s])
    s,r,d,_ = env.step(a)
    env.render()
    if(d):
        env.render()
        break

[[2.05884558 2.75021404 0.6569266  0.         2.8937239  0.24218779
  2.70035986 0.         2.98672843 2.97315969 3.16265807 2.70369235
  2.77473455 2.62685051 2.21369453 2.62916627]
 [2.92093598 0.         0.         0.         3.07467184 2.92083759
  0.34068244 0.         2.60090216 2.34177757 3.58614635 0.
  0.29197918 0.43025326 3.48329762 2.92093808]
 [1.09706882 0.         0.         0.         2.63784195 0.52330113
  0.         0.         3.2364968  3.40683889 2.32502054 0.
  0.24631126 0.3578965  3.77489104 2.38188083]
 [2.00416966 0.538916   0.         0.         2.70901777 1.29452592
  0.06290209 0.         2.89327639 2.71985834 2.36022795 0.50197185
  1.51921888 0.54453224 3.35836749 2.6927944 ]]
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Right)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m


# FrozenLake-v2
---

In [92]:
'''
환경셋팅 한 후에 환경을 추가등록한다.
'''

register(
    id='FrozenLake-v2',
    entry_point="gym.envs.toy_text:FrozenLakeEnv",
    kwargs={'map_name':'8x8','is_slippery':False})

In [93]:
'''
환경 생성
'''
env = gym.make('FrozenLake-v2')

In [94]:
print(env.action_space.n)
print(env.observation_space.n)

4
64


In [109]:
q_table = np.zeros([env.action_space.n, env.observation_space.n])
gamma = .95
epsilon = .9
alpha = .1
episode_total = 5000
episode = 0

In [110]:
state = env.reset()

while(episode < episode_total):
    if(random.random() < epsilon):
        action = env.action_space.sample()
    else:
        action = np.argmax(q_table[ : , state])

    state_next, reward, done, _ = env.step(action)
    q_table[action, state] += alpha * (reward + np.max(gamma * q_table[ : , state_next]) - q_table[action, state])
    state_old = state
    state = state_next

    if(done):
        if(reward > 0):
            print(episode)
        else:
            q_table[action, state_next] += alpha * (-1 + np.max(gamma * q_table[ : , state_next]) - q_table[action, state_old])

        if(epsilon < .1):
            epsilon = .1
        else:
            #epsilon = 1 / (1 + episode / 20)
            epsilon = 1
        episode += 1
        env.reset()
env.close()

5
45
75
135
798
1520
1813
3663
4355


In [112]:
print(q_table)
s = env.reset()

while(True):
    a = np.argmax(q_table[ : , s])
    s,r,d,_ = env.step(a)
    env.render()
    if(d):
        env.render()
        break

[[ 1.67200275e-01  1.67127999e-01  1.76124066e-01  1.85059308e-01
   1.94591896e-01  2.05676572e-01  2.16355189e-01  2.29522784e-01
   1.75407712e-01  1.75456857e-01  1.84354707e-01  1.93838438e-01
   2.02205642e-01  2.15663432e-01  2.29205948e-01  2.55262496e-01
   1.65656201e-01  1.64874034e-01  1.73169457e-01 -7.32781410e-02
  -1.26253459e-01  2.15134287e-01  2.42108433e-01  2.67424038e-01
   1.54260123e-01  1.54002072e-01  1.59433171e-01  1.58520797e-01
   1.32736618e-01 -3.04211520e-01 -1.31665073e-01  2.47372956e-01
   1.42534615e-01  1.40494972e-01  1.43578003e-01 -1.29673458e-02
  -3.85916717e-03  4.57026254e-02  1.10486290e-01  2.16486749e-01
   1.29310156e-01  1.59350662e-01  8.61239229e-02  1.26427554e-02
   4.12983734e-03  7.99881703e-03 -1.06592669e+00  4.50290296e-02
   8.67017202e-02 -3.32410494e-03  7.03131833e-03  0.00000000e+00
  -2.15004052e-01  3.76652480e-06 -6.37514691e-01  4.37440740e-03
   4.78615765e-02  2.56633281e-02  4.74860950e-03 -6.73593705e-02
   8.26703