In [None]:
import gym
import numpy as np
from gym.envs.registration import register
import random as pr

In [None]:
# Number of training episodes per Q-learning run; also the divisor used when
# the experiment cells turn summed rewards into a success rate.
NUM_EPISODES = 2000
# NOTE(review): not referenced by any cell shown here. Presumably [row, col]
# coordinates of the hole cells on the 4x4 map — confirm before relying on it.
POINT_WALL = [[1, 1], [1, 3], [2, 3], [3, 0]]
# NOTE(review): also unreferenced here; presumably the [row, col] goal cell of
# the 4x4 map — confirm.
POINT_GOAL = [[3, 3]]


In [None]:
def rargmax(vector):
    """Return the index of a maximum of ``vector``, breaking ties at random.

    Every position that holds the maximum value is a candidate, and one of
    them is drawn with ``random.choice``.

    Args:
        vector: NumPy array of scores; callers in this notebook pass one row
            of the Q-table (1-D).

    Returns:
        One index at which ``vector`` attains its maximum.
    """
    is_peak = vector == np.max(vector)
    # Positions (along the first axis) of every entry tied for the maximum.
    tied_positions = np.where(is_peak)[0]
    return pr.choice(tied_positions)
    

In [None]:
def run_episodes_ori(env, q_value):
    """Play one episode greedily and update ``q_value`` in place.

    At each step the action is chosen with ``rargmax`` on the current state's
    Q-row (random tie-break), and the visited entry is overwritten with
    ``reward + max(Q[new_state])`` — no discount factor, no learning rate.

    Args:
        env: environment whose ``reset()`` returns the initial state and whose
            ``step(action)`` returns ``(new_state, reward, done, info)``.
        q_value: 2-D array indexed as ``[state, action]``; mutated in place.

    Returns:
        Tuple ``(buf_q, rAll, buf_act)``: the updated Q-row of the state left
        at each step (plus the final state's row once the episode ends), the
        summed reward, and the list of actions taken.
    """
    q_rows = []
    actions = []
    total_reward = 0
    # Start from the environment's initial state.
    state = env.reset()

    while True:
        action = rargmax(q_value[state, :])

        # `finished` becomes true when the environment ends the episode.
        next_state, reward, finished, _ = env.step(action)

        q_value[state, action] = reward + np.max(q_value[next_state, :])

        actions.append(action)
        total_reward += reward
        q_rows.append(list(q_value[state]))
        if finished:
            # Also record the Q-row of the state the episode ended in.
            q_rows.append(list(q_value[next_state]))
            return q_rows, total_reward, actions
        state = next_state
    

In [None]:
def run_episodes_explo(env, q_value, e, i):
    """Play one episode with decaying-noise exploration; update ``q_value`` in place.

    The action is the argmax of the current Q-row plus Gaussian noise scaled
    by ``1 / (i + 1)``, so the noise shrinks as the episode index grows. The
    visited entry is overwritten with ``reward + 0.9 * max(Q[new_state])``.

    Args:
        env: environment whose ``reset()`` returns the initial state, whose
            ``step(action)`` returns ``(new_state, reward, done, info)``, and
            which exposes ``action_space.n``.
        q_value: 2-D array indexed as ``[state, action]``; mutated in place.
        e: epsilon for an epsilon-greedy policy. Currently unused — the
            epsilon-greedy selection is disabled in favour of the noise-based
            one — but kept so the call signature stays stable.
        i: zero-based episode index, used to scale the exploration noise.

    Returns:
        Tuple ``(buf_q, rAll, buf_act)``: the updated Q-row of the state left
        at each step (plus the final state's row once the episode ends), the
        summed reward, and the list of actions taken.
    """
    discount = .9

    q_rows = []
    actions = []
    total_reward = 0
    # Start from the environment's initial state.
    state = env.reset()

    while True:
        # Noisy greedy choice: perturb the Q-row, then take the best action.
        noise = np.random.randn(1, env.action_space.n) / (i + 1)
        action = np.argmax(q_value[state, :] + noise)

        # `finished` becomes true when the environment ends the episode.
        next_state, reward, finished, _ = env.step(action)

        best_next = np.max(q_value[next_state, :])
        q_value[state, action] = reward + discount * best_next

        actions.append(action)
        total_reward += reward
        q_rows.append(list(q_value[state]))
        if finished:
            # Also record the Q-row of the state the episode ended in.
            q_rows.append(list(q_value[next_state]))
            return q_rows, total_reward, actions
        state = next_state
    

In [None]:
def run_q_learning(env, num_episodes, explo = False):
    """Train a tabular Q-function on ``env`` for ``num_episodes`` episodes.

    Args:
        env: environment exposing ``observation_space.n`` and
            ``action_space.n`` (used to size the Q-table) plus whatever the
            episode runners call on it.
        num_episodes: number of episodes to play.
        explo: if true, each episode is played by ``run_episodes_explo``;
            otherwise by ``run_episodes_ori``.

    Returns:
        Tuple ``(q_value, log_reword, log_q_map, log_action, log_q_by_step)``:
        the final Q-table, followed by four lists with one entry per episode —
        the episode's total reward, a copy of the Q-table after the episode,
        the actions taken, and the per-step Q-rows.
    """
    # Q-table: one row per state, one column per action, all zeros initially.
    q_value = np.zeros([env.observation_space.n, env.action_space.n])

    log_reword = []      # total reward of each episode
    log_q_by_step = []   # per-step Q-rows of each episode
    log_action = []      # actions taken in each episode
    log_q_map = []       # snapshot of the whole Q-table after each episode
    for i in range(num_episodes):
        # Step-wise decaying epsilon (1, 1/2, 1/3, ... every 1000 episodes),
        # forwarded to the exploration runner.
        e = 1. / ((i // 1000) + 1)

        if explo:
            buf_q, rAll, buf_act = run_episodes_explo(env, q_value, e, i)
        else:
            buf_q, rAll, buf_act = run_episodes_ori(env, q_value)

        log_action.append(buf_act)
        log_q_by_step.append(buf_q)
        log_reword.append(rAll)
        # Copy, because the runners keep mutating `q_value` in place.
        log_q_map.append(q_value.copy())
    return q_value, log_reword, log_q_map, log_action, log_q_by_step


# 4x4 환경

In [None]:
# Register a FrozenLake variant under the custom id 'LakeEnv-v1'.
# `entry_point` names the environment class gym should load; the kwargs select
# the 4x4 map with slipping disabled.
# NOTE(review): re-running this cell in the same kernel may fail if gym rejects
# duplicate ids — confirm against the installed gym version.
register(
    id='LakeEnv-v1',
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    kwargs={'map_name': '4x4', 'is_slippery': False}
)
# Binds the global `env` that the following experiment cells train on.
env = gym.make('LakeEnv-v1')


## q_learning 기본

In [None]:
# Train from scratch `cnt` times with the basic runner. Each run is scored by
# its mean episode reward over all NUM_EPISODES training episodes.
cnt = 100
result_ = []
for _ in range(cnt):
    (q_value, log_reword, log_q_map,
     log_action, log_q_by_step) = run_q_learning(env, NUM_EPISODES)
    res = sum(log_reword) / NUM_EPISODES
    result_.append(res)

print(f'Max success rate : {max(result_)}')
print(f'Avg success rate : {sum(result_)/cnt}')


Max success rate : 0.995
Avg success rate : 0.9379599999999999


## q_learning exploration (noise, discounted reward)

In [None]:
# Train from scratch `cnt` times with the exploration runner (noisy action
# selection, discounted update). Each run is scored by its mean episode reward
# over all NUM_EPISODES training episodes.
cnt = 100
result_ = []
for _ in range(cnt):
    (q_value, log_reword, log_q_map,
     log_action, log_q_by_step) = run_q_learning(env, NUM_EPISODES, explo=True)
    res = sum(log_reword) / NUM_EPISODES
    result_.append(res)

print(f'Max success rate : {max(result_)}')
print(f'Avg success rate : {sum(result_)/cnt}')


Max success rate : 0.989
Avg success rate : 0.9339700000000002


- 기본 Q-learning과 exploration(noise, discounted reward) 적용 결과 사이에 특별한 차이를 보이지 않음
- 최단경로를 찾는 알고리즘이므로, 최단경로의 수가 적은 환경에서의 테스트와 최단경로 추출에 대한 성공률 측정이 필요함

# 8x8 환경

In [None]:
# Register a FrozenLake variant under the custom id 'LakeEnv-v2'.
# `entry_point` names the environment class gym should load; the kwargs select
# the 8x8 map with slipping disabled.
# NOTE(review): re-running this cell in the same kernel may fail if gym rejects
# duplicate ids — confirm against the installed gym version.
register(
    id='LakeEnv-v2',
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    kwargs={'map_name': '8x8', 'is_slippery': False}
)
# Rebinds the global `env`; the experiment cells below now train on the 8x8 map.
env = gym.make('LakeEnv-v2')


## q_learning 기본

In [None]:
# Train from scratch `cnt` times with the basic runner on the current `env`.
# Each run is scored by its mean episode reward over all NUM_EPISODES training
# episodes.
cnt = 100
result_ = []
for _ in range(cnt):
    (q_value, log_reword, log_q_map,
     log_action, log_q_by_step) = run_q_learning(env, NUM_EPISODES)
    res = sum(log_reword) / NUM_EPISODES
    result_.append(res)

print(f'Max success rate : {max(result_)}')
print(f'Avg success rate : {sum(result_)/cnt}')


Max success rate : 0.9185
Avg success rate : 0.5808650000000002


## q_learning exploration (noise, discounted reward)

In [None]:
# Train from scratch `cnt` times with the exploration runner (noisy action
# selection, discounted update) on the current `env`. Each run is scored by its
# mean episode reward over all NUM_EPISODES training episodes.
cnt = 100
result_ = []
for _ in range(cnt):
    (q_value, log_reword, log_q_map,
     log_action, log_q_by_step) = run_q_learning(env, NUM_EPISODES, explo=True)
    res = sum(log_reword) / NUM_EPISODES
    result_.append(res)

print(f'Max success rate : {max(result_)}')
print(f'Avg success rate : {sum(result_)/cnt}')


Max success rate : 0.9445
Avg success rate : 0.5575750000000003


- 8x8 환경에서도 기본 Q-learning과 exploration(noise, discounted reward) 적용 결과 사이에 특별한 차이를 보이지 않음
- 최단경로 추출에 대한 성공률 측정이 필요함