In [65]:
import random
import numpy as np   # q(s,a)를 numpy array 형태로 관리

In [66]:
# Arrow glyph for each action index, used when printing the greedy policy:
# 0 = left, 1 = up, 2 = right, 3 = down.
ACTION_SYMBOLS = ['←', '↑', '→', '↓']

In [67]:
# Grid world whose wall cells are listed explicitly; a move that would enter a
# wall cell or leave the grid is cancelled and the agent stays where it is.
class GridWorld():
    """Grid-world environment with walls, a fixed start and a fixed goal.

    The agent position is stored as ``self.x`` (row) and ``self.y`` (column).
    ``self.walls`` is the single source of truth for blocked cells: every
    move is validated against it and against the grid bounds, so editing the
    set changes the maze without touching the movement methods.

    Actions: 0 = left, 1 = up, 2 = right, 3 = down.
    """

    ROWS = 5          # number of rows (valid x values are 0..ROWS-1)
    COLS = 7          # number of columns (valid y values are 0..COLS-1)
    START = (0, 0)    # position set by reset()
    GOAL = (4, 6)     # reaching this cell ends the episode

    def __init__(self):
        # Wall cells expressed as (row, col) tuples.
        self.walls = {
            (0, 2), (1, 2), (2, 2),
            (2, 4), (3, 4), (4, 4),
        }
        self.reset()

    def step(self, a):
        """Apply action ``a`` and return ``((x, y), reward, done)``.

        An action outside 0..3 leaves the position unchanged. The reward is
        always -1, including for cancelled moves.
        """
        if a == 0:
            self.move_left()
        elif a == 1:
            self.move_up()
        elif a == 2:
            self.move_right()
        elif a == 3:
            self.move_down()

        reward = -1
        done = self.is_done()
        return (self.x, self.y), reward, done

    def _try_move(self, d_row, d_col):
        """Move by the given offset unless the target is off-grid or a wall."""
        row, col = self.x + d_row, self.y + d_col
        if not (0 <= row < self.ROWS and 0 <= col < self.COLS):
            return
        if (row, col) in self.walls:
            return
        self.x, self.y = row, col

    def move_left(self):
        """Move one column to the left if that cell is free."""
        self._try_move(0, -1)

    def move_right(self):
        """Move one column to the right if that cell is free."""
        self._try_move(0, 1)

    def move_up(self):
        """Move one row up (towards row 0) if that cell is free."""
        self._try_move(-1, 0)

    def move_down(self):
        """Move one row down if that cell is free."""
        self._try_move(1, 0)

    def is_done(self):
        """Return True when the agent stands on the goal cell."""
        return (self.x, self.y) == self.GOAL

    def reset(self):
        """Put the agent back on the start cell and return that position."""
        self.x, self.y = self.START
        return (self.x, self.y)


In [68]:
class QAgent():
    """Tabular Q-learning agent with epsilon-greedy exploration.

    ``q_table[x, y, a]`` holds the action-value estimate for taking action
    ``a`` in grid cell ``(x, y)``; the table is sized for a 5x7 grid with
    4 actions.
    """

    def __init__(self, alpha=0.1, gamma=1.0):
        """Create an agent with a zero-initialised Q table.

        Args:
            alpha: learning rate used in ``update_table``.
            gamma: discount factor used in ``update_table``.
        """
        self.q_table = np.zeros((5, 7, 4))  # all action values start at 0
        self.eps = 0.9                      # initial exploration probability
        self.alpha = alpha
        self.gamma = gamma

    def select_action(self, s):
        """Choose an action for state ``s`` with the epsilon-greedy rule.

        With probability ``eps`` a uniformly random action (0..3) is returned
        (exploration); otherwise the action with the largest Q value in ``s``
        is returned (exploitation).
        """
        x, y = s
        coin = random.random()
        if coin < self.eps:
            action = random.randint(0, 3)
        else:
            action_val = self.q_table[x, y, :]
            action = np.argmax(action_val)
        return action

    def update_table(self, transition):
        """Apply one Q-learning update for ``transition = (s, a, r, s_prime)``.

        Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))

        The target uses the maximum over next-state actions, so no next
        action is sampled here and the random number generator is left
        untouched.
        """
        s, a, r, s_prime = transition
        x, y = s
        next_x, next_y = s_prime

        best_next = np.amax(self.q_table[next_x, next_y, :])
        td_error = r + self.gamma * best_next - self.q_table[x, y, a]
        self.q_table[x, y, a] = self.q_table[x, y, a] + self.alpha * td_error

    def anneal_eps(self):
        """Lower ``eps`` by 0.01, never letting it drop below 0.2."""
        self.eps -= 0.01
        self.eps = max(self.eps, 0.2)

    def show_table(self):
        """Print, for every cell, the index of its greedy action as a float grid."""
        # argmax over the action axis; cast to float so the printed output
        # has the same format as a float array filled cell by cell.
        data = np.argmax(self.q_table, axis=2).astype(float)
        print(data)

      

In [69]:
def print_policy(agent, env):
    """Print the agent's greedy policy as a 5x7 grid of symbols.

    Wall cells (members of ``env.walls``) are shown as a blank, the goal
    cell (4, 6) as 'G', and every other cell as the arrow of the action with
    the highest Q value. A blank line is printed after the grid.
    """
    def glyph(cell):
        # Walls take precedence over the goal marker, then the greedy arrow.
        if cell in env.walls:
            return ' '
        if cell == (4, 6):
            return 'G'
        r, c = cell
        best = np.argmax(agent.q_table[r, c, :])
        return ACTION_SYMBOLS[best]

    for r in range(5):
        print(' '.join(glyph((r, c)) for c in range(7)))
    print()

In [70]:
def main():
    """Train a QAgent on GridWorld for 1000 episodes and report progress.

    After every 100th episode (starting with episode 0) the current greedy
    policy is printed; once training ends the greedy-action table is shown.
    """
    env = GridWorld()
    agent = QAgent()

    for n_epi in range(1000):
        s = env.reset()

        # Play one episode, updating the Q table after every single step.
        while True:
            a = agent.select_action(s)
            s_prime, r, done = env.step(a)
            agent.update_table((s, a, r, s_prime))
            s = s_prime
            if done:
                break

        # Exploration is reduced once per finished episode.
        agent.anneal_eps()

        # Periodic progress report.
        hundreds, remainder = divmod(n_epi, 100)
        if remainder == 0 and hundreds <= 10:
            print(f"--- Episode {n_epi} ---")
            print_policy(agent, env)

    agent.show_table()


# Start training only when this file is executed directly, not when imported.
if __name__ == '__main__':
    main()
  

--- Episode 0 ---
← ←   ↑ → ↑ ←
→ ↓   → ← ↑ ↑
↑ →   →   ← ←
↓ ← ↑ ←   ← ←
↓ → ← ←   ← G

--- Episode 100 ---
↑ ↓   ↑ ↑ ↓ ↑
↓ ←   → → ↓ ↓
→ →   ↑   ↓ ↓
↓ → → ↑   ↓ ↓
→ → → →   ← G

--- Episode 200 ---
↓ ←   → → ↓ →
↓ ↓   → → ↓ ↓
↓ ↓   ↑   ↓ ↓
→ → → ↑   ↓ ↓
↑ ↑ ↓ ↑   → G

--- Episode 300 ---
→ ↓   ↓ → ↓ ↓
↓ ↓   → → ↓ ↓
↓ ↓   ↑   → ↓
→ → → ↑   → ↓
← ↓ ↓ ↓   → G

--- Episode 400 ---
→ ↓   → ↓ → ↓
↓ ↓   → → ↓ ↓
↓ ↓   ↑   → ↓
→ → → ↑   → ↓
↓ → ↑ ↑   → G

--- Episode 500 ---
→ ↓   ← ↓ → ↓
→ ↓   → → → ↓
→ ↓   ↑   ↓ ↓
→ → → ↑   ↓ ↓
↓ → ↑ ↑   → G

--- Episode 600 ---
→ ↓   → → → ↓
→ ↓   → → ↓ ↓
→ ↓   ↑   ↓ ↓
→ → → ↑   → ↓
→ ← ↑ ↑   → G

--- Episode 700 ---
→ ↓   ← → ↓ ↓
↓ ↓   → → ↓ ↓
↓ ↓   ↑   → ↓
→ → → ↑   → ↓
← ↓ ↑ ↑   → G

--- Episode 800 ---
↓ ↓   ↓ → → ↓
↓ ↓   → → ↓ ↓
↓ ↓   ↑   → ↓
→ → → ↑   → ↓
← → ↑ ↑   → G

--- Episode 900 ---
↓ ↓   → → → ↓
↓ ↓   → → ↓ ↓
↓ ↓   ↑   ↓ ↓
→ → → ↑   → ↓
→ → → ↑   → G

[[3. 3. 0. 2. 2. 2. 3.]
 [3. 3. 0. 2. 2. 2. 3.]
 [2. 3. 0. 1. 0. 3. 3.]
 [2. 2. 2. 1. 0. 3. 

In [None]:
class GridWorld():
    """Grid world with hard-coded walls, start (0, 0) and goal (4, 6).

    ``self.x`` and ``self.y`` hold the agent position. The ``move_*``
    methods silently ignore any move that the hard-coded edge and wall
    checks forbid.
    """

    def __init__(self):
        self.x, self.y = 0, 0

    def step(self, a):
        """Apply action ``a`` and return ``((x, y), reward, done)``.

        Action codes: 0 = left, 1 = up, 2 = right, 3 = down. Any other value
        leaves the position unchanged. The reward is always -1.
        """
        handlers = (
            (0, self.move_left),
            (1, self.move_up),
            (2, self.move_right),
            (3, self.move_down),
        )
        for code, mover in handlers:
            if a == code:
                mover()
                break

        return (self.x, self.y), -1, self.is_done()

    def move_left(self):
        """Decrease ``y`` by one unless an edge or wall blocks the move."""
        at_edge = self.y == 0
        wall_ahead = (
            (self.y == 3 and self.x in (0, 1, 2))
            or (self.y == 5 and self.x in (2, 3, 4))
        )
        if at_edge or wall_ahead:
            return
        self.y -= 1

    def move_right(self):
        """Increase ``y`` by one unless an edge or wall blocks the move."""
        wall_ahead = (
            (self.y == 1 and self.x in (0, 1, 2))
            or (self.y == 3 and self.x in (2, 3, 4))
        )
        at_edge = self.y == 6
        if wall_ahead or at_edge:
            return
        self.y += 1

    def move_up(self):
        """Decrease ``x`` by one unless an edge or wall blocks the move."""
        if self.x == 0 or (self.x == 3 and self.y == 2):
            return
        self.x -= 1

    def move_down(self):
        """Increase ``x`` by one unless an edge or wall blocks the move."""
        if self.x == 4 or (self.x == 1 and self.y == 4):
            return
        self.x += 1

    def is_done(self):
        """Return True once the agent is on the goal cell (4, 6)."""
        return (self.x, self.y) == (4, 6)

    def reset(self):
        """Move the agent back to (0, 0) and return that position."""
        self.x, self.y = 0, 0
        return (self.x, self.y)
