In [None]:
import gym
import numpy as np
from gym.envs.registration import register
import random as pr

In [None]:
# Default number of training episodes passed to each Q-learning run.
NUM_EPISODES = 2000

# Grid coordinates labelled as walls and as the goal.
# NOTE(review): these look like [row, col] positions of the holes and the goal
# on a 4x4 FrozenLake map, and nothing in the visible cells reads them -
# confirm before relying on either assumption.
POINT_WALL = [
    [1, 1],
    [1, 3],
    [2, 3],
    [3, 0],
]
POINT_GOAL = [
    [3, 3],
]


In [None]:
def rargmax(vector):
    """Return the index of a maximum entry of ``vector``, breaking ties at random.

    Unlike ``np.argmax``, which always reports the first maximum, every
    position holding the maximum value is a candidate and one of them is
    drawn with ``random.choice``.
    """
    peak = np.max(vector)
    # Positions (along the first axis) whose value equals the maximum.
    tied_positions = np.nonzero(vector == peak)[0]
    return pr.choice(tied_positions)
    

In [None]:
def run_episodes(env, q_value, idx=0, greedy=False, noise=False, learning_rate=0, discount=1):
    """Play a single episode in ``env``, updating ``q_value`` in place each step.

    Both Gym calling conventions are accepted: the classic one
    (``reset() -> obs`` and ``step() -> (obs, reward, done, info)``) and the
    newer one (``reset() -> (obs, info)`` and
    ``step() -> (obs, reward, terminated, truncated, info)``).

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` and an
            ``action_space`` with ``n`` and ``sample()``.
        q_value: Table indexed as ``q_value[state, action]``; mutated in place.
        idx: Index of the current episode; controls how fast exploration decays.
        greedy: If true, use e-greedy exploration with
            ``e = 1 / (idx // 1000 + 1)`` (1.0 for the first 1000 episodes,
            0.5 for the next 1000, and so on).
        noise: If true (and ``greedy`` is false), pick the argmax of the
            Q-values plus Gaussian noise scaled by ``1 / (idx + 1)``.
        learning_rate: Blending factor for the update. Any value ``<= 0``
            means "no learning rate": the old estimate is overwritten by the
            new target.
        discount: Discount factor applied to the best next-state Q-value.

    Returns:
        Tuple ``(buf_q, rAll, buf_act)``: the updated Q-row of the visited
        state after every step (plus the final state's row once the episode
        ends), the summed reward, and the list of actions taken.
    """
    # Start from the initial state. Newer Gym versions return (obs, info).
    state = env.reset()
    if isinstance(state, tuple) and len(state) == 2 and isinstance(state[1], dict):
        state = state[0]

    # Exploration probability for the e-greedy policy; constant within an episode.
    epsilon = 1. / ((idx // 1000) + 1)

    buf_q = []
    buf_act = []
    rAll = 0
    done = False
    while not done:
        if greedy:
            # e-greedy: explore with probability epsilon, otherwise exploit.
            if np.random.rand(1) < epsilon:
                action = env.action_space.sample()
            else:
                action = rargmax(q_value[state, :])
        elif noise:
            # Noisy argmax: the noise shrinks as the episode index grows.
            action = np.argmax(q_value[state, :] + np.random.randn(1, env.action_space.n) / (idx + 1))
        else:
            # Pure exploitation with random tie-breaking.
            action = rargmax(q_value[state, :])

        outcome = env.step(action)
        if len(outcome) == 5:
            # Newer API: the episode ends on termination or truncation.
            new_state, reward, terminated, truncated, _ = outcome
            done = bool(terminated or truncated)
        else:
            # Classic API: a single `done` flag.
            new_state, reward, done, _ = outcome

        # One-step target: immediate reward plus discounted best next value.
        target = reward + discount * np.max(q_value[new_state, :])
        if learning_rate > 0:
            q_value[state, action] = (1 - learning_rate) * q_value[state, action] + learning_rate * target
        else:
            q_value[state, action] = target

        buf_q.append(list(q_value[state]))
        if done:
            # Also record the row of the state the episode ended in.
            buf_q.append(list(q_value[new_state]))
        buf_act.append(action)
        rAll += reward
        state = new_state
    return buf_q, rAll, buf_act

In [None]:
def run_q_learning(env, num_episodes, greedy=False, noise=False, learning_rate=0, discount=1):
    """Train a Q-table on ``env`` for ``num_episodes`` episodes.

    A zero-initialised table of shape
    ``(env.observation_space.n, env.action_space.n)`` is created and handed to
    ``run_episodes`` once per episode, which updates it in place.

    Args:
        env: Environment with discrete ``observation_space`` and ``action_space``.
        num_episodes: Number of episodes to run.
        greedy: Forwarded to ``run_episodes`` (e-greedy exploration).
        noise: Forwarded to ``run_episodes`` (noisy-argmax exploration).
        learning_rate: Forwarded to ``run_episodes``.
        discount: Forwarded to ``run_episodes``.

    Returns:
        Tuple ``(q_value, log_reward, log_q_map, log_action, log_q_by_step)``:
        the final Q-table, the total reward of each episode, a copy of the
        Q-table after each episode, the actions taken in each episode, and the
        per-step Q-rows recorded in each episode.
    """
    # Create the Q-value table.
    q_value = np.zeros([env.observation_space.n, env.action_space.n])

    log_reward = []     # total reward per episode
    log_q_by_step = []  # per-step Q-rows, one list per episode
    log_action = []     # actions taken, one list per episode
    log_q_map = []      # snapshot of the whole table after each episode
    for i in range(num_episodes):
        buf_q, rAll, buf_act = run_episodes(
            env,
            q_value,
            idx=i,
            greedy=greedy,
            noise=noise,
            learning_rate=learning_rate,
            discount=discount,
        )

        log_action.append(buf_act)
        log_q_by_step.append(buf_q)
        log_reward.append(rAll)
        # Copy, because q_value keeps being mutated by later episodes.
        log_q_map.append(q_value.copy())
    return q_value, log_reward, log_q_map, log_action, log_q_by_step


## Stochastic (non-deterministic) 환경

In [None]:
# Register a FrozenLake variant under a custom id. `entry_point` names the
# environment class Gym should load; the kwargs request the 4x4 map with
# slippery movement switched on (the stochastic setting this section studies).
register(
    id='LakeEnv-v1',
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    kwargs=dict(map_name='4x4', is_slippery=True),
)
env = gym.make('LakeEnv-v1')


## Q-learning 기본


In [None]:
# Baseline: Q-learning with the default settings (no learning rate, discount
# of 1), trained from scratch `cnt` times. Each run is scored by its mean
# reward per episode, reported below as the success rate.
cnt = 100
result_ = []
for _ in range(cnt):
    q_value, log_reword, log_q_map, log_action, log_q_by_step = run_q_learning(
        env, NUM_EPISODES
    )
    res = sum(log_reword) / NUM_EPISODES
    result_.append(res)

print('Max success rate : ', max(result_), sep='')
print('Avg success rate : ', sum(result_) / cnt, sep='')
print('\n')


Max success rate : 0.0335
Avg success rate : 0.02069499999999999




## Q-learning, Learning rate 적용

In [None]:
# Sweep the learning rate over 0.1, 0.2, ..., 1.0. For every value, train
# from scratch `cnt` times and report the best and the average mean reward
# per episode.
cnt = 100
for num in range(1, 11):
    lr = num / 10
    result_ = []
    for _ in range(cnt):
        q_value, log_reword, log_q_map, log_action, log_q_by_step = run_q_learning(
            env, NUM_EPISODES, learning_rate=lr
        )
        res = sum(log_reword) / NUM_EPISODES
        result_.append(res)

    print('Learning rate : ', lr, sep='')
    print('Max success rate : ', max(result_), sep='')
    print('Avg success rate : ', sum(result_) / cnt, sep='')
    print('\n')


Learning rate : 0.1
Max success rate : 0.4325
Avg success rate : 0.06584000000000005


Learning rate : 0.2
Max success rate : 0.1985
Avg success rate : 0.05355000000000001


Learning rate : 0.3
Max success rate : 0.263
Avg success rate : 0.06394500000000003


Learning rate : 0.4
Max success rate : 0.5625
Avg success rate : 0.06387


Learning rate : 0.5
Max success rate : 0.3095
Avg success rate : 0.051740000000000015


Learning rate : 0.6
Max success rate : 0.204
Avg success rate : 0.05086999999999999


Learning rate : 0.7
Max success rate : 0.352
Avg success rate : 0.05635499999999999


Learning rate : 0.8
Max success rate : 0.1905
Avg success rate : 0.05337499999999999


Learning rate : 0.9
Max success rate : 0.2535
Avg success rate : 0.0622


Learning rate : 1.0
Max success rate : 0.036
Avg success rate : 0.021775




- 현재 state의 기존 q_value를 반영하여 현재 state의 q_value를 업데이트
- reward와 다음 state의 q_value만 고려하는 기본 Q-learning에 비해 높은 성공률을 보여줌
- 학습 시도 중 최대 성공률이 상대적으로 매우 높은 경우가 있음

- learning rate가 클수록 현재 state의 q_value보다 다음 state의 q_value를 더 많이 고려
- learning rate에 따라 다른 성공률을 보임
- 학습 대상에 따라 적절한 learning rate를 선정할 필요가 있음


## Q-learning, Learning rate + discounted reward 적용

In [None]:
# Same learning-rate sweep as before (0.1 ... 1.0, `cnt` fresh runs each),
# but with the discount factor fixed at 0.9.
cnt = 100
for num in range(1, 11):
    lr = num / 10
    result_ = []
    for _ in range(cnt):
        q_value, log_reword, log_q_map, log_action, log_q_by_step = run_q_learning(
            env, NUM_EPISODES, learning_rate=lr, discount=0.9
        )
        res = sum(log_reword) / NUM_EPISODES
        result_.append(res)

    print('Learning rate : ', lr, sep='')
    print('Max success rate : ', max(result_), sep='')
    print('Avg success rate : ', sum(result_) / cnt, sep='')
    print('\n')


Learning rate : 0.1
Max success rate : 0.2555
Avg success rate : 0.05218


Learning rate : 0.2
Max success rate : 0.2035
Avg success rate : 0.05248500000000003


Learning rate : 0.3
Max success rate : 0.222
Avg success rate : 0.05266


Learning rate : 0.4
Max success rate : 0.2795
Avg success rate : 0.05464500000000001


Learning rate : 0.5
Max success rate : 0.593
Avg success rate : 0.07009000000000001


Learning rate : 0.6
Max success rate : 0.243
Avg success rate : 0.05355500000000001


Learning rate : 0.7
Max success rate : 0.1995
Avg success rate : 0.05374999999999999


Learning rate : 0.8
Max success rate : 0.3115
Avg success rate : 0.05796499999999999


Learning rate : 0.9
Max success rate : 0.24
Avg success rate : 0.045284999999999985


Learning rate : 1.0
Max success rate : 0.0345
Avg success rate : 0.020894999999999993




- Q-learning에 learning rate와 discounted reward를 적용
- learning rate만 적용한 경우와 비교해 성공률에 별다른 차이가 없음


In [None]:
# One long training run (20000 episodes) with a fixed learning rate.
# NOTE(review): this rebinds the module-level NUM_EPISODES, so every cell
# executed afterwards sees the new value.
NUM_EPISODES = 20000
lr = 0.85

q_value, log_reword, log_q_map, log_action, log_q_by_step = run_q_learning(
    env, NUM_EPISODES, learning_rate=lr
)
res = sum(log_reword) / NUM_EPISODES
# NOTE(review): `result_` is whatever list an earlier cell left behind; the
# value appended here is not read anywhere in the visible cells.
result_.append(res)

print('Learning rate : ', lr, sep='')
print('success rate : ', res, sep='')
print('\n')


Learning rate : 0.85
success rate : 0.0174




In [None]:
# One very long training run (200000 episodes) with a fixed learning rate
# and a discount factor of 0.9.
# NOTE(review): this rebinds the module-level NUM_EPISODES, so every cell
# executed afterwards sees the new value.
NUM_EPISODES = 200000
lr = 0.85

q_value, log_reword, log_q_map, log_action, log_q_by_step = run_q_learning(
    env, NUM_EPISODES, learning_rate=lr, discount=0.9
)
res = sum(log_reword) / NUM_EPISODES
# NOTE(review): `result_` is whatever list an earlier cell left behind; the
# value appended here is not read anywhere in the visible cells.
result_.append(res)

print('Learning rate : ', lr, sep='')
print('success rate : ', res, sep='')
print('\n')


Learning rate : 0.85
success rate : 0.026155




- Episode 횟수를 늘려도 성공률이 낮음
- Learning rate 또는 discounted reward를 적용하였을 때 성공률이 높아지기는 했지만 일정 수준 이상으로는 높아지지 않음


