In [1]:
import copy
import time
import numpy as np
import tkinter as tk

from tkinter import Button
from PIL import ImageTk, Image

In [2]:
# <Environment class: module-level configuration>
PhotoImage = ImageTk.PhotoImage  # short alias for the PIL Tk image wrapper
UNIT = 100  # number of pixels
WIDTH, HEIGHT = 5, 5  # grid-world extent: horizontal, vertical
TRANSITION_PROB = 1
# Each action is an (dx, dy) offset added to a state [x, y];
# in index order: left, right, up, down.
ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1)]
# Action indices, one per entry of ACTIONS.
POSSIBLE_ACTIONS = list(range(len(ACTIONS)))
REWARDS = list()


class Env:
    """Grid-world model with deterministic moves and a fixed reward layout.

    A state is a two-element list ``[x, y]`` with ``0 <= x < WIDTH`` and
    ``0 <= y < HEIGHT``. An action is an index into the module-level
    ``ACTIONS`` list of ``(dx, dy)`` offsets. Moves that would leave the grid
    are clamped back onto its border.
    """

    def __init__(self):
        """Build the reward grid and enumerate every state of the grid."""
        self.transition_probability = TRANSITION_PROB
        self.width = WIDTH
        self.height = HEIGHT
        # The grid is looked up as reward[x][y] (see get_reward), so the outer
        # dimension must be WIDTH and the inner one HEIGHT. The previous
        # HEIGHT x WIDTH layout only worked because the grid is square.
        self.reward = [[0] * HEIGHT for _ in range(WIDTH)]
        self.possible_actions = POSSIBLE_ACTIONS
        self.reward[2][2] = 1  # reward +1 at the circle, coordinate (2, 2)
        self.reward[1][2] = -1  # reward -1 at the triangle, coordinate (1, 2)
        self.reward[2][1] = -1  # reward -1 at the triangle, coordinate (2, 1)
        # All states in x-major order: [0, 0], [0, 1], ..., [WIDTH-1, HEIGHT-1].
        self.all_state = [[x, y] for x in range(WIDTH) for y in range(HEIGHT)]

    def get_reward(self, state, action):
        """Return the reward stored at the state reached from ``state`` via ``action``.

        Args:
            state: current ``[x, y]`` position.
            action: index into ``ACTIONS``.
        """
        next_x, next_y = self.state_after_action(state, action)
        return self.reward[next_x][next_y]

    def state_after_action(self, state, action_index):
        """Return a new ``[x, y]`` list for the move, clamped to the grid.

        ``state`` itself is not modified; a fresh list is built and clamped.
        """
        dx, dy = ACTIONS[action_index]
        return self.check_boundary([state[0] + dx, state[1] + dy])

    @staticmethod
    def check_boundary(state):
        """Clamp ``state`` onto the grid in place and return the same list.

        ``state[0]`` is limited to ``[0, WIDTH - 1]`` and ``state[1]`` to
        ``[0, HEIGHT - 1]``.
        """
        state[0] = min(max(state[0], 0), WIDTH - 1)
        state[1] = min(max(state[1], 0), HEIGHT - 1)
        return state

    def get_transition_prob(self, state, action):
        """Return the transition probability; it is the same constant for every input."""
        return self.transition_probability

    def get_all_states(self):
        """Return the list of every ``[x, y]`` state (the stored list, not a copy)."""
        return self.all_state

In [3]:
# <Environment setup>
env = Env()

# <Value function stored as a 2-D list>
# value_table[x][y] holds the current value estimate of state [x, y]; all zeros to start.
value_table = np.zeros((env.width, env.height)).tolist()

# next_value_table receives the results of a policy-evaluation sweep and is
# then copied back into value_table. It starts as an independent copy.
next_value_table = [list(column) for column in value_table]

In [4]:
# <Initialise the policy so that every action is equally likely>
# policy_table[x][y] is a list holding one probability per action index.
# The table is laid out (width, height, actions) so that it matches the
# [x][y] indexing used for value_table; the previous (height, width, ...)
# layout was only correct because the grid is square.
num_actions = len(env.possible_actions)
policy_table = np.full(
    (env.width, env.height, num_actions), 1 / num_actions
).tolist()

# next_policy_table receives the result of policy improvement; it starts as
# an independent deep copy of the initial uniform policy.
next_policy_table = copy.deepcopy(policy_table)

In [5]:
# <Mark the end (terminal) state>
# No action is taken from [2, 2], so its policy is an empty list.
# next_policy_table was copied before this point, so it is marked as well
# to keep the two tables consistent.
policy_table[2][2] = []
next_policy_table[2][2] = []

# <Discount factor>
discount_factor = 0.9

### 정책 평가를 여러번 한 후, 정책 발전 -> 이 과정을 반복해서 진행하면 최적 정책이 찾아짐
```
- state  : [x,y]
- action : [0,1,2,3] -> [좌,우,상,하]
```

In [65]:
# <[Policy evaluation] Compute the next value function with the Bellman expectation equation>
# One sweep: every state's new value is written to next_value_table while all
# look-ups read the old value_table, which is replaced only after the sweep.

for state in env.get_all_states():
    state_x, state_y = state
    print(f'[ state : {state} ]')

    # The end state [2, 2] keeps value 0; every other state is backed up.
    value = 0
    if not state == [2, 2]:
        for action in env.possible_actions:
            next_state = env.state_after_action(state, action)
            reward = env.get_reward(state, action)
            next_value = value_table[next_state[0]][next_state[1]]
            action_prob = policy_table[state_x][state_y][action]
            # Bellman expectation backup term for this action (reference: page 70)
            value = value + action_prob * (reward + discount_factor * next_value)
            print(f'action : {action} / policy : {action_prob:.2f} / reward : {reward} / next_state : {next_state} ')

    next_value_table[state_x][state_y] = value

    tmp_value = format(value_table[state_x][state_y], '.2f')
    tmp_next_value = format(next_value_table[state_x][state_y], '.2f')
    print(f'>> value : {tmp_value} / next value : {tmp_next_value}')

# Replace value_table with the freshly computed next_value_table.
value_table = copy.deepcopy(next_value_table)

[ state : [0, 0] ]
action : 0 / policy : 0.25 / reward : 0 / next_state : [0, 0] 
action : 1 / policy : 0.25 / reward : 0 / next_state : [1, 0] 
action : 2 / policy : 0.25 / reward : 0 / next_state : [0, 0] 
action : 3 / policy : 0.25 / reward : 0 / next_state : [0, 1] 
>> value : 0.54 / next value : 0.54
[ state : [0, 1] ]
action : 0 / policy : 0.00 / reward : 0 / next_state : [0, 1] 
action : 1 / policy : 0.00 / reward : 0 / next_state : [1, 1] 
action : 2 / policy : 0.00 / reward : 0 / next_state : [0, 0] 
action : 3 / policy : 1.00 / reward : 0 / next_state : [0, 2] 
>> value : 0.66 / next value : 0.66
[ state : [0, 2] ]
action : 0 / policy : 0.00 / reward : 0 / next_state : [0, 2] 
action : 1 / policy : 0.00 / reward : -1 / next_state : [1, 2] 
action : 2 / policy : 0.00 / reward : 0 / next_state : [0, 1] 
action : 3 / policy : 1.00 / reward : 0 / next_state : [0, 3] 
>> value : 0.73 / next value : 0.73
[ state : [0, 3] ]
action : 0 / policy : 0.00 / reward : 0 / next_state : [0, 

In [69]:
# <[Policy improvement] Greedy policy improvement w.r.t. the current value function (value_table)>
# For every non-terminal state, probability mass is split evenly among the
# actions with the highest Q-value; all other actions get probability 0.

for state in env.get_all_states():
    state_x, state_y = state

    if state == [2, 2]:
        # End state: no action is taken here, so its policy stays an empty
        # list (as set when the end state was configured) instead of being
        # overwritten with a greedy distribution.
        result = []
    else:
        # Q-value of each action from the current state
        value_list = []
        for action in env.possible_actions:
            next_state = env.state_after_action(state, action)
            reward = env.get_reward(state, action)
            next_value = value_table[next_state[0]][next_state[1]]
            value = reward + discount_factor * next_value  # Q-function (reference: page 74)
            value_list.append(value)

        # Indices of every action that attains the maximum Q-value.
        # The list is converted explicitly rather than relying on NumPy
        # coercing it during the comparison.
        q_values = np.asarray(value_list)
        max_idx_list = np.argwhere(q_values == np.amax(q_values))
        max_idx_list = max_idx_list.flatten().tolist()

        # A single best action gets probability 1; ties share it equally.
        prob = 1 / len(max_idx_list)

        # Best actions receive prob, every other action stays at 0.
        result = [0] * len(value_list)
        for idx in max_idx_list:
            result[idx] = prob

    # Store the improved policy for this state.
    next_policy_table[state_x][state_y] = result

    tmp_po_tab      = [f'{s :.2f}' for s in policy_table[state_x][state_y]]
    tmp_next_po_tab = [f'{s :.2f}' for s in next_policy_table[state_x][state_y]]
    print(f'state : {state} / policy : {tmp_po_tab} / next policy : {tmp_next_po_tab}')

# Replace policy_table with the improved next_policy_table.
policy_table = copy.deepcopy(next_policy_table)

state : [0, 0] / policy : ['0.00', '0.50', '0.00', '0.50'] / next policy : ['0.00', '0.50', '0.00', '0.50']
state : [0, 1] / policy : ['0.00', '0.00', '0.00', '1.00'] / next policy : ['0.00', '0.00', '0.00', '1.00']
state : [0, 2] / policy : ['0.00', '0.00', '0.00', '1.00'] / next policy : ['0.00', '0.00', '0.00', '1.00']
state : [0, 3] / policy : ['0.00', '1.00', '0.00', '0.00'] / next policy : ['0.00', '1.00', '0.00', '0.00']
state : [0, 4] / policy : ['0.00', '0.50', '0.50', '0.00'] / next policy : ['0.00', '0.50', '0.50', '0.00']
state : [1, 0] / policy : ['0.00', '1.00', '0.00', '0.00'] / next policy : ['0.00', '1.00', '0.00', '0.00']
state : [1, 1] / policy : ['0.50', '0.00', '0.50', '0.00'] / next policy : ['0.50', '0.00', '0.50', '0.00']
state : [1, 2] / policy : ['0.00', '1.00', '0.00', '0.00'] / next policy : ['0.00', '1.00', '0.00', '0.00']
state : [1, 3] / policy : ['0.00', '1.00', '0.00', '0.00'] / next policy : ['0.00', '1.00', '0.00', '0.00']
state : [1, 4] / policy : ['