In [4]:
import numpy as np

# Temporal-Difference Prediction

In [2]:
class GridWorld():
    """Open rectangular grid whose episode starts in the top-left cell.

    The agent position is kept in ``x`` (row index) and ``y`` (column index).
    An episode is finished when the agent stands on the bottom-right cell
    ``(end_row, end_col)``.
    """

    def __init__(self, n_row=4, n_col=4):
        # Largest valid row / column index; together they identify the goal.
        self.end_row = n_row - 1
        self.end_col = n_col - 1
        self.x = 0
        self.y = 0

    def step(self, a):
        """Apply action ``a`` and return ``((x, y), reward, done)``.

        Action codes: 0 = north, 1 = south, 2 = west, 3 = east. Any other
        value leaves the position untouched. The reward is -1 on every call.
        """
        moves = ((0, self.north), (1, self.south), (2, self.west), (3, self.east))
        for code, move in moves:
            if a == code:
                move()
                break

        at_goal = self.x == self.end_row and self.y == self.end_col
        return (self.x, self.y), -1, at_goal

    def north(self):
        """Move one row up, never going above row 0."""
        row = self.x - 1
        self.x = row if row > 0 else 0

    def south(self):
        """Move one row down, never going below ``end_row``."""
        row = self.x + 1
        self.x = row if row < self.end_row else self.end_row

    def west(self):
        """Move one column left, never going past column 0."""
        col = self.y - 1
        self.y = col if col > 0 else 0

    def east(self):
        """Move one column right, never going past ``end_col``."""
        col = self.y + 1
        self.y = col if col < self.end_col else self.end_col

    def reset(self):
        """Put the agent back on ``(0, 0)`` and return that position."""
        self.x, self.y = 0, 0
        return (self.x, self.y)

In [25]:
class TDAgent():
    """Tabular one-step TD state-value estimator under a uniform random policy.

    Args:
        n_row: Number of rows in the state-value table.
        n_col: Number of columns in the state-value table.
        alpha: Step size that blends the old estimate with the TD target.
    """

    def __init__(self, n_row=4, n_col=4, alpha=0.0001):
        self.alpha = alpha  # update rate
        self.v_table = np.zeros((n_row, n_col))  # state values indexed [row, col]
        self.pi = [0.25, 0.25, 0.25, 0.25]  # selection probability of each action

    def select_action(self, s):
        """Sample an action index according to ``self.pi``; ``s`` is not used."""
        return np.random.choice(len(self.pi), p=self.pi)

    def update_table(self, transition, gamma):
        """Apply one TD update for a single observed transition.

        Args:
            transition: Tuple ``(s, action, reward, s_prime)`` where ``s`` and
                ``s_prime`` are ``(row, col)`` pairs. The action is unpacked
                but does not influence a state-value update.
            gamma: Discount factor applied to the value of ``s_prime``.
        """
        s, _action, reward, s_prime = transition
        x, y = s
        x_prime, y_prime = s_prime
        # TD update: v(s) <- (1 - alpha) * v(s) + alpha * (R + gamma * v(s'))
        td_target = reward + gamma * self.v_table[x_prime, y_prime]
        self.v_table[x, y] = (1 - self.alpha) * self.v_table[x, y] \
                             + self.alpha * td_target

In [26]:
# Seed NumPy's global RNG so the sampled episodes repeat when the notebook
# is re-run from this cell onward.
np.random.seed(0)
env = GridWorld()  # environment
agent = TDAgent()  # agent
gamma = 1.0  # discount factor

In [27]:
for k in range(50000):
    # Run one episode from the environment's start state.
    s = env.reset()
    while True:
        action = agent.select_action(s)
        s_prime, reward, done = env.step(action)
        # Update the table right after every single step; unlike Monte Carlo,
        # there is no waiting for the episode to end.
        agent.update_table((s, action, reward, s_prime), gamma)
        # Advance to the next state.
        s = s_prime
        if done:
            break
    env.reset()

In [28]:
# Learned state values, rounded to two decimals for display.
agent.v_table.round(2)

array([[-20.65, -19.49, -17.83, -16.57],
       [-19.47, -18.16, -16.08, -14.3 ],
       [-17.83, -16.11, -12.93,  -9.3 ],
       [-16.57, -14.3 ,  -9.32,   0.  ]])

# SARSA

In [1]:
class GridWorld():
    """Grid maze with wall cells; the episode starts in the top-left cell.

    The agent position is kept in ``x`` (row index) and ``y`` (column index).
    A move whose destination is a wall cell is ignored. An episode is
    finished when the agent stands on ``(end_row, end_col)``.
    """

    def __init__(self, n_row=7, n_col=7):
        # Largest valid row / column index; together they identify the goal.
        self.end_row = n_row - 1
        self.end_col = n_col - 1
        self.x = 0
        self.y = 0
        # Blocked cells as (row, col): column 2 is open only at row 5 and
        # column 4 is open only at row 1.
        self.wall = {(0, 2), (1, 2), (2, 2), (3, 2), (4, 2), (6, 2),
                     (0, 4), (2, 4), (3, 4), (4, 4), (5, 4), (6, 4)}

    def step(self, a):
        """Apply action ``a`` and return ``((x, y), reward, done)``.

        Action codes: 0 = north, 1 = south, 2 = west, 3 = east. Any other
        value leaves the position untouched. The reward is -1 on every call.
        """
        moves = ((0, self.north), (1, self.south), (2, self.west), (3, self.east))
        for code, move in moves:
            if a == code:
                move()
                break

        at_goal = self.x == self.end_row and self.y == self.end_col
        return (self.x, self.y), -1, at_goal

    def north(self):
        """Move one row up unless the grid edge or a wall prevents it."""
        row = max(self.x - 1, 0)
        if (row, self.y) in self.wall:
            return
        self.x = row

    def south(self):
        """Move one row down unless the grid edge or a wall prevents it."""
        row = min(self.x + 1, self.end_row)
        if (row, self.y) in self.wall:
            return
        self.x = row

    def west(self):
        """Move one column left unless the grid edge or a wall prevents it."""
        col = max(self.y - 1, 0)
        if (self.x, col) in self.wall:
            return
        self.y = col

    def east(self):
        """Move one column right unless the grid edge or a wall prevents it."""
        col = min(self.y + 1, self.end_col)
        if (self.x, col) in self.wall:
            return
        self.y = col

    def reset(self):
        """Put the agent back on ``(0, 0)`` and return that position."""
        self.x, self.y = 0, 0
        return (self.x, self.y)

In [24]:
class SARSAAgent():
    def __init__(self):
        self.q_table = np.zeros((7, 7, 4)) # q-table 
        self.alpha = 0.01  

    def select_action(self, s, epsilon=0.0): # E-greedy
        x, y = s
        dice = np.random.random()
        if dice < epsilon:
            action = np.random.randint(0, 4)
        else:
            action_val = self.q_table[x,y,:]
            #print(action_val, " is ", np.argmax(action_val))
            action = np.argmax(action_val)
        return action
        
    def update_table(self, transition, epsilon):
        s, a, r, s_prime = transition
        x,y = s
        next_x, next_y = s_prime
        next_a = self.select_action(s_prime, epsilon) # S'에서 선택할 액션 (실제로 취한 액션이 아님), E-greedy
        # SARSA 업데이트.
        self.q_table[x,y,a] = (1 - self.alpha) * self.q_table[x,y,a] \
                              + self.alpha * (r + self.q_table[next_x,next_y,next_a])

    def show_table(self):
        q_lst = self.q_table.tolist()
        data = np.zeros((7, 7))
        for row_idx in range(len(q_lst)):
            row = q_lst[row_idx]
            for col_idx in range(len(row)):
                col = row[col_idx]
                action = np.argmax(col) if min(col) != 0 else -1
                data[row_idx, col_idx] = action
        return data

In [18]:
# Seed NumPy's global RNG so the epsilon-greedy exploration repeats when the
# notebook is re-run from this cell onward.
np.random.seed(0)
env = GridWorld()  # environment
agent = SARSAAgent()  # agent
epsilon = 0.9  # exploration rate

In [19]:
for k in range(50000):
    # Reset the environment, then play one full episode.
    s = env.reset()
    while True:
        a = agent.select_action(s, epsilon)
        s_prime, r, done = env.step(a)
        agent.update_table((s, a, r, s_prime), epsilon)  # update on every step
        s = s_prime
        if done:
            break
    # Decay epsilon by 0.001 per episode, never going below 0.1.
    epsilon = epsilon - 0.001
    if epsilon < 0.1:
        epsilon = 0.1

In [7]:
# Display the greedy action index per cell (GridWorld.step codes: 0=north,
# 1=south, 2=west, 3=east); -1 marks cells whose smallest Q-value is exactly 0.
agent.show_table()

array([[ 1.,  1., -1.,  1., -1.,  1.,  1.],
       [ 1.,  1., -1.,  3.,  3.,  1.,  1.],
       [ 1.,  1., -1.,  0., -1.,  1.,  1.],
       [ 3.,  1., -1.,  0., -1.,  1.,  1.],
       [ 1.,  1., -1.,  0., -1.,  1.,  1.],
       [ 3.,  3.,  3.,  0., -1.,  1.,  1.],
       [ 3.,  0., -1.,  0., -1.,  3., -1.]])

# Q-Learning

In [21]:
class GridWorld():
    """Walled grid maze; every episode begins on the top-left cell.

    ``x`` holds the agent's row index and ``y`` its column index. Moves into
    a wall cell are ignored, and the episode is over once the agent reaches
    ``(end_row, end_col)``.
    """

    def __init__(self, n_row=7, n_col=7):
        self.end_row = n_row - 1  # last row index
        self.end_col = n_col - 1  # last column index
        self.x = 0
        self.y = 0
        # (row, col) cells that cannot be entered.
        self.wall = {(0, 2), (1, 2), (2, 2), (3, 2), (4, 2), (6, 2),
                     (0, 4), (2, 4), (3, 4), (4, 4), (5, 4), (6, 4)}

    def _enter(self, row, col):
        """Move the agent to ``(row, col)`` unless that cell is a wall."""
        if (row, col) not in self.wall:
            self.x, self.y = row, col

    def step(self, a):
        """Execute action ``a`` and return ``((x, y), reward, done)``.

        Action codes: 0 = north, 1 = south, 2 = west, 3 = east; other values
        do not move the agent. Every call yields a reward of -1.
        """
        for code, move in enumerate((self.north, self.south, self.west, self.east)):
            if a == code:
                move()
                break

        finished = self.x == self.end_row and self.y == self.end_col
        return (self.x, self.y), -1, finished

    def north(self):
        """Try to move one row up (clamped at row 0)."""
        self._enter(max(self.x - 1, 0), self.y)

    def south(self):
        """Try to move one row down (clamped at ``end_row``)."""
        self._enter(min(self.x + 1, self.end_row), self.y)

    def west(self):
        """Try to move one column left (clamped at column 0)."""
        self._enter(self.x, max(self.y - 1, 0))

    def east(self):
        """Try to move one column right (clamped at ``end_col``)."""
        self._enter(self.x, min(self.y + 1, self.end_col))

    def reset(self):
        """Return the agent to ``(0, 0)`` and report that position."""
        self.x = self.y = 0
        return (self.x, self.y)

In [25]:
class QAgent():
    def __init__(self):
        self.q_table = np.zeros((7, 7, 4)) # q-table 
        self.alpha = 0.01  

    def select_action(self, s, epsilon=0.0): # E-greedy
        x, y = s
        dice = np.random.random()
        if dice < epsilon:
            action = np.random.randint(0, 4)
        else:
            action_val = self.q_table[x, y, :]
            action = np.argmax(action_val)
        return action
        
    def update_table(self, transition):
        s, a, r, s_prime = transition  # behavior 행동 결과
        x, y = s
        x_prime, y_prime = s_prime
        a_prime = np.argmax(self.q_table[x_prime, y_prime, :])  # target 정책, Greedy
        # Q-Learning 업데이트.
        self.q_table[x, y, a] = (1 - self.alpha) * self.q_table[x, y, a] \
                                + self.alpha * (r + self.q_table[x_prime, y_prime, a_prime])

    def show_table(self):
        q_lst = self.q_table.tolist()
        data = np.zeros((7, 7))
        for row_idx in range(len(q_lst)):
            row = q_lst[row_idx]
            for col_idx in range(len(row)):
                col = row[col_idx]
                action = np.argmax(col) if min(col) != 0 else -1
                data[row_idx, col_idx] = action
        return data

In [26]:
# Seed NumPy's global RNG so the epsilon-greedy exploration repeats when the
# notebook is re-run from this cell onward.
np.random.seed(0)
env = GridWorld()  # environment
agent = QAgent()  # agent
epsilon = 0.9  # exploration rate

In [27]:
for k in range(50000):
    # Reset the environment, then play one full episode.
    s = env.reset()
    while True:
        a = agent.select_action(s, epsilon)
        s_prime, r, done = env.step(a)
        agent.update_table((s, a, r, s_prime))  # update on every step
        s = s_prime
        if done:
            break
    # Decay epsilon by 0.001 per episode, never going below 0.1.
    epsilon = epsilon - 0.001
    if epsilon < 0.1:
        epsilon = 0.1

In [28]:
# Display the greedy action index per cell (GridWorld.step codes: 0=north,
# 1=south, 2=west, 3=east); -1 marks cells whose smallest Q-value is exactly 0.
agent.show_table()

array([[ 1.,  1., -1.,  1., -1.,  1.,  1.],
       [ 1.,  1., -1.,  3.,  3.,  1.,  1.],
       [ 1.,  1., -1.,  0., -1.,  1.,  1.],
       [ 1.,  1., -1.,  0., -1.,  1.,  1.],
       [ 1.,  1., -1.,  0., -1.,  1.,  1.],
       [ 3.,  3.,  3.,  0., -1.,  1.,  1.],
       [ 0.,  0., -1.,  0., -1.,  3., -1.]])