# FrozenLake - Q Learning
---
> 가장 기초적이고 기본적인 강화학습 연습 환경이다. 시작점 S 에서 시작해서 목표 G 에 도착해야 한다. G 에 도착하거나 H 에 빠지면 episode 가 종료된다.

* state : 4x4 맵, 0 - 15 one-hot encoding 
* action : 좌-0 하-1 우-2 상-3
* reward : G 에 닿으면 +1, 나머지는 0
* 종료조건 : H 에 닿으면 종료
---

In [34]:
import gym
import numpy as np
import random
from gym.envs.registration import register

In [27]:
'''
환경셋팅 한 후에 환경을 추가등록한다.
'''

register(
    id='FrozenLake-v1',
    entry_point="gym.envs.toy_text:FrozenLakeEnv",
    kwargs={'map_name':'4x4','is_slippery':False})

Error: Cannot re-register id: FrozenLake-v1

In [28]:
'''
환경 생성
'''
env = gym.make('FrozenLake-v1')

In [29]:
print(env.action_space.n)
print(env.observation_space.n)

4
16


In [89]:
q_table = np.zeros([env.action_space.n, env.observation_space.n])
gamma = .95
epsilon = 1
alpha = .1
episode_total = 3000
episode = 0

In [90]:
state = env.reset()

while(episode < episode_total):
    if(random.random() < epsilon):
        action = env.action_space.sample()
    else:
        action = np.argmax(q_table[ : , state])

    state_next, reward, done, _ = env.step(action)
    q_table[action, state] += alpha * (reward + np.max(gamma * q_table[ : , state_next]) - q_table[action, state])
    state = state_next

    if(done):
        if(reward > 0):
            #print(reward)
            if(epsilon < .1):
                epsilon = .1
            else:
                epsilon = 1 / (1 + episode / 10)
        episode += 1
        env.reset()
env.close()

In [91]:
print(q_table)
s = env.reset()

while(True):
    a = np.argmax(q_table[ : , s])
    s,r,d,_ = env.step(a)
    #env.render()
    if(d):
        env.render()
        break

[[2.42135407e+00 2.43540251e+00 0.00000000e+00 0.00000000e+00
  2.50364322e+00 2.43597497e+00 5.58430191e-01 0.00000000e+00
  2.62022069e+00 2.64371453e+00 2.83743354e+00 4.52324821e-01
  2.43310558e+00 2.20105272e+00 2.92596503e+00 2.43679854e+00]
 [2.56505121e+00 4.35746167e-01 4.21838093e-02 0.00000000e+00
  2.70005400e+00 6.86798755e-01 2.26145917e-01 0.00000000e+00
  2.22071681e+00 2.99174983e+00 2.36432084e-01 0.00000000e+00
  6.92355076e-01 2.95345333e+00 3.01022481e+00 2.32001086e+00]
 [2.25587035e+00 7.93644417e-04 0.00000000e+00 0.00000000e+00
  2.27845174e+00 2.29666776e-01 0.00000000e+00 0.00000000e+00
  2.84216222e+00 2.58018787e+00 2.20263309e-02 0.00000000e+00
  6.22221183e-01 3.14921047e+00 3.31495850e+00 2.12236968e+00]
 [2.39726875e+00 7.85762280e-01 0.00000000e+00 0.00000000e+00
  2.40355330e+00 9.45213045e-01 0.00000000e+00 0.00000000e+00
  2.44896187e+00 2.20955813e+00 5.37727936e-02 0.00000000e+00
  6.59513633e-01 2.79415540e+00 2.44010795e+00 2.35690540e+00]]
  (

# FrozenLake-v2
---

In [92]:
'''
환경셋팅 한 후에 환경을 추가등록한다.
'''

register(
    id='FrozenLake-v2',
    entry_point="gym.envs.toy_text:FrozenLakeEnv",
    kwargs={'map_name':'8x8','is_slippery':False})

In [93]:
'''
환경 생성
'''
env = gym.make('FrozenLake-v2')

In [94]:
print(env.action_space.n)
print(env.observation_space.n)

4
64


In [100]:
q_table = np.zeros([env.action_space.n, env.observation_space.n])
gamma = .95
epsilon = .9
alpha = .1
episode_total = 5000
episode = 0

In [None]:
state = env.reset()

while(episode < episode_total):
    if(random.random() < epsilon):
        action = env.action_space.sample()
    else:
        action = np.argmax(q_table[ : , state])

    state_next, reward, done, _ = env.step(action)
    q_table[action, state] += alpha * (reward + np.max(gamma * q_table[ : , state_next]) - q_table[action, state])
    state_old = state
    state = state_next

    if(done):
        if(reward > 0):
            print(episode)
        else:
            q_table[action, state_next] += alpha * (-1 + np.max(gamma * q_table[ : , state_next]) - q_table[action, state_old])

        if(epsilon < .1):
            epsilon = .1
        else:
            epsilon = 1 / (1 + episode / 20)
        episode += 1
        env.reset()
env.close()

1183


In [98]:
print(q_table)
s = env.reset()

while(True):
    a = np.argmax(q_table[ : , s])
    s,r,d,_ = env.step(a)
    #env.render()
    if(d):
        env.render()
        break

[[  0.           0.           0.           0.           0.
    0.           0.           0.           0.           0.
    0.           0.           0.           0.           0.
    0.           0.           0.           0.           0.
    0.           0.           0.           0.           0.
    0.           0.           0.           0.           0.
    0.           0.           0.           0.           0.
    0.           0.           0.           0.           0.
    0.           0.           0.           0.           0.
    0.           0.           0.           0.          -0.1
    0.           0.           0.           0.           0.
    0.           0.           0.           0.           0.
    0.           0.           0.           0.        ]
 [  0.           0.           0.           0.           0.
    0.           0.           0.           0.           0.
    0.           0.           0.           0.           0.
    0.           0.           0.           0.           0.


KeyboardInterrupt: 