# Q-learning 최적 경로 출력
- q_learning_02 버전의 최적 경로 출력 문제 보완

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import random
import gym
from gym.envs.registration import register

In [2]:
# Environment settings.
# Action indices follow Gym's FrozenLake convention
# (0 = LEFT, 1 = DOWN, 2 = RIGHT, 3 = UP) so they can be used directly as
# column indices of the Q-table produced by training.
IDX_ACTION_LEFT = 0
IDX_ACTION_DOWN = 1
IDX_ACTION_RIGHT = 2
IDX_ACTION_UP = 3

# One-letter labels used when printing a route.
STR_ACTION_UP = 'U'
STR_ACTION_DOWN = 'D'
STR_ACTION_RIGHT = 'R'
STR_ACTION_LEFT = 'L'

# Id under which the environment is registered, and the training length.
ENV_ID = 'FrozenLake-v3'
NUM_EPISODES = 2000

# State index that counts as reaching the goal.
GOAL_STATE = 15

# Environment registration
def get_env_register(env_id):
    """Re-register ``env_id`` as a non-slippery 4x4 FrozenLake and build it.

    Every registry entry whose id contains ``env_id`` as a substring is
    removed first (a message is printed for each), which lets the cell be
    re-run without the registration call failing on an already-known id.

    Args:
        env_id: id to register the environment under.

    Returns:
        The environment created by ``gym.make(env_id)``.
    """
    specs = gym.envs.registry.env_specs

    # Collect the matching ids up front so the registry is never mutated
    # while it is being iterated.
    stale_ids = [spec_id for spec_id in specs if env_id in spec_id]
    for spec_id in stale_ids:
        print('Remove {} from registry'.format(spec_id))
        del specs[spec_id]

    register(
        id=env_id,
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name': '4x4', 'is_slippery': False},
    )

    return gym.make(env_id)

# Route visualisation
def print_str_direct(q_value):
    """Print one move letter per state, four states per output line.

    Each row of ``q_value`` is rounded element-wise to integers (so float
    rows are accepted) and the resulting digit string is matched against the
    one-hot patterns. The position of the ``1`` is the action index, which
    follows Gym's FrozenLake order: 0 = LEFT, 1 = DOWN, 2 = RIGHT, 3 = UP.
    Rows that are not one-hot are printed as a blank cell.

    Args:
        q_value: sliceable sequence (list or array) of per-state rows of
            numbers, e.g. the value returned by ``get_optimal_path``.
    """
    letter_by_pattern = {
        '1000': STR_ACTION_LEFT,
        '0100': STR_ACTION_DOWN,
        '0010': STR_ACTION_RIGHT,
        '0001': STR_ACTION_UP,
    }

    # Slicing (instead of indexing a running counter) keeps the last line
    # safe when the number of states is not a multiple of four.
    for row_start in range(0, len(q_value), 4):
        cells = []
        for values in q_value[row_start:row_start + 4]:
            pattern = ''.join(str(int(round(e, 0))) for e in values)
            cells.append(letter_by_pattern.get(pattern, ' ') + ' | ')
        print(''.join(cells))

def print_q_table(q_value):
    """Print the Q-table below a banner and a column legend.

    The legend names the columns in Gym FrozenLake action order
    (0 = LEFT, 1 = DOWN, 2 = RIGHT, 3 = UP), which is the order in which
    the table's columns are indexed during training.

    Args:
        q_value: the table to print; it is passed to ``print`` unchanged.
    """
    banner = '-' * 50
    print(banner)
    print("Final Q-Table Values")
    print(banner)
    print('state | L   D   R   U')
    print(q_value)


In [3]:
# Q-learning algorithm with decaying epsilon-greedy exploration
def do_qlearning_decay_egreedy(env, num_episodes, goal_state=None):
    """Train a tabular Q-function and keep a snapshot from the shortest run.

    Exploration probability starts at 1 and is divided by ``(i // 100) + 1``
    for episode ``i``. The update is the learning-rate-free rule
    ``Q[s, a] = r + 0.99 * max(Q[s', :])``.

    Args:
        env: environment exposing ``observation_space.n``,
            ``action_space.n``, ``action_space.sample()``, ``reset()``
            returning the start state and ``step(action)`` returning
            ``(new_state, reward, done, info)``.
        num_episodes: number of episodes to run.
        goal_state: state index that counts as success. Defaults to the
            module-level ``GOAL_STATE``.

    Returns:
        Tuple ``(q_value, rList, cntList, optimal_q_value, min_act)``:
        the final Q-table, per-episode reward totals, per-episode action
        counts, a copy of the Q-table taken after the most recent successful
        episode that used the fewest actions (an empty list if no episode
        reached the goal), and that fewest action count (or
        ``n_states * n_actions`` if no episode reached the goal).
    """
    if goal_state is None:
        goal_state = GOAL_STATE

    q_value = np.zeros([env.observation_space.n, env.action_space.n])

    rList = []
    cntList = []
    dis = 0.99  # discount factor

    # Upper bound used as the initial "best so far" action count.
    min_act = env.observation_space.n * env.action_space.n
    optimal_q_value = []

    for i in range(num_episodes):
        state = env.reset()
        done = False
        action_cnt = 0
        rAll = 0

        # Exploration rate shrinks every 100 episodes.
        e = 1. / ((i // 100) + 1)

        b_success = False

        while not done:
            if np.random.uniform(0, 1) < e:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_value[state, :])

            new_state, reward, done, _ = env.step(action)

            q_value[state, action] = reward + dis * np.max(q_value[new_state, :])

            rAll += reward
            action_cnt += 1

            if new_state == goal_state:
                b_success = True

            state = new_state

        rList.append(rAll)
        cntList.append(action_cnt)

        # Snapshot the table whenever an episode reaches the goal in the
        # fewest actions seen so far. A copy is required because q_value is
        # updated in place by later episodes. Ties also refresh the snapshot
        # so it comes from the latest (most trained) shortest run instead of
        # an early, barely-trained one.
        if b_success and action_cnt <= min_act:
            min_act = action_cnt
            optimal_q_value = q_value.copy()

    return q_value, rList, cntList, optimal_q_value, min_act

In [10]:
# Register the environment, train for NUM_EPISODES episodes and print summary statistics.
env = get_env_register(ENV_ID)
q_value, rList, cntList, optimal_q_value, min_act = do_qlearning_decay_egreedy(env, NUM_EPISODES)
# sum(rList) / NUM_EPISODES is the mean per-episode reward total.
print('**** Success rate:{} min_act:{} '.format((sum(rList) / NUM_EPISODES), min_act))
print("Success average action : " + str(sum(cntList) / NUM_EPISODES)) # average number of actions per episode (taken over all episodes)

Remove FrozenLake-v3 from registry
**** Success rate:0.818 min_act:6 
Success average action : 6.361


In [11]:
# Notebook cell: evaluating the bare name displays the optimal_q_value table returned by training.
optimal_q_value

array([[0.94148015, 0.95099005, 0.95099005, 0.94148015],
       [0.94148015, 0.        , 0.96059601, 0.95099005],
       [0.95099005, 0.970299  , 0.95099005, 0.96059601],
       [0.96059601, 0.        , 0.95099005, 0.95099005],
       [0.95099005, 0.96059601, 0.        , 0.94148015],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.9801    , 0.        , 0.96059601],
       [0.        , 0.        , 0.        , 0.        ],
       [0.96059601, 0.        , 0.970299  , 0.95099005],
       [0.96059601, 0.9801    , 0.9801    , 0.        ],
       [0.970299  , 0.99      , 0.        , 0.970299  ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.9801    , 0.99      , 0.970299  ],
       [0.9801    , 0.99      , 1.        , 0.9801    ],
       [0.        , 0.        , 0.        , 0.        ]])

In [12]:
def get_optimal_path(optimal_q_value):
    """Trace the greedy route from state 0 and return it as one-hot rows.

    Starting at state 0, the action with the largest Q-value (first one on
    ties) is followed from state to state until ``GOAL_STATE`` is reached,
    the route revisits a state, or a move would leave the grid. Action
    indices are interpreted through the ``IDX_ACTION_*`` constants, and
    states are assumed to be laid out row-major on a grid four columns wide.

    If the goal is not reached, "Agent can't find optimal path." is printed;
    the rows marked so far are still returned.

    Args:
        optimal_q_value: 2-D array-like of shape (n_states, n_actions).

    Returns:
        A list with one list of ints per state. States on the route (the
        goal excluded) have a 1 at the chosen action index; every other row
        is all zeros.
    """
    q_table = np.asarray(optimal_q_value)
    n_states, n_actions = q_table.shape
    grid_width = 4  # row-major grid: moving one row is a step of 4 states

    # (row delta, column delta) for each known action index.
    moves = {
        IDX_ACTION_UP: (-1, 0),
        IDX_ACTION_DOWN: (1, 0),
        IDX_ACTION_RIGHT: (0, 1),
        IDX_ACTION_LEFT: (0, -1),
    }

    # argmax returns the first maximal column, i.e. the greedy action.
    greedy_actions = q_table.argmax(axis=1)
    optimal_path = [[0] * n_actions for _ in range(n_states)]

    state = 0
    visited = set()
    # Walk the route in travel order so that UP/LEFT moves (which lead to a
    # lower state index) are followed too; `visited` stops endless loops.
    while state != GOAL_STATE and state < n_states and state not in visited:
        visited.add(state)
        action = int(greedy_actions[state])
        optimal_path[state][action] = 1

        if action not in moves:
            break  # unknown action index: nowhere to go

        d_row, d_col = moves[action]
        row, col = divmod(state, grid_width)
        next_row, next_col = row + d_row, col + d_col
        next_state = next_row * grid_width + next_col
        if next_row < 0 or not 0 <= next_col < grid_width or next_state >= n_states:
            # Moving into the border would not change the state, so the
            # greedy route is stuck here (and must not wrap to another row).
            break
        state = next_state

    if state != GOAL_STATE:
        print("Agent can't find optimal path.")
    return optimal_path

In [13]:
# Print the final route: build the one-hot route table, then render it as move letters.
optimal_weight = get_optimal_path(optimal_q_value)
print_str_direct(optimal_weight)

D |   |   |   | 
D |   |   |   | 
R | D |   |   | 
  | R | R |   | 
