# Monte Carlo Control 구현

In [1]:
import random
import numpy as np

In [2]:
class GridWorld:
    """Small grid maze with x in 0..4 and y in 0..6.

    The agent starts at (2, 0) and the episode is finished once it stands on
    (4, 6). The movement checks keep the agent inside the grid and prevent it
    from entering the cells (0..2, 2) and (2..4, 4), which therefore act as
    two wall segments. Every step yields a reward of -1.
    """

    def __init__(self):
        # Same starting cell that reset() restores.
        self.x = 2
        self.y = 0

    def step(self, a):
        """Apply action `a` and return ((x, y), reward, done).

        Action codes: 0 = left, 1 = right, 2 = up, 3 = down. Any other value
        leaves the position untouched but still costs the step reward.
        """
        movers = (self.move_left, self.move_right, self.move_up, self.move_down)
        for code, mover in enumerate(movers):
            if a == code:
                mover()
                break
        return (self.x, self.y), -1, self.is_done()

    def move_left(self):
        """Decrease x by one unless the grid edge or a wall rule forbids it."""
        at_edge = self.x == 0
        lower_wall = self.y == 2 and self.x in (1, 2, 3)
        upper_wall = self.y == 4 and self.x in (2, 3, 4)
        if not (at_edge or lower_wall or upper_wall):
            self.x -= 1

    def move_right(self):
        """Increase x by one unless the grid edge or a wall rule forbids it."""
        at_edge = self.x == 4
        lower_wall = self.y == 2 and self.x in (0, 1, 2)
        upper_wall = self.y == 4 and self.x in (1, 2, 3, 4)
        if not (at_edge or lower_wall or upper_wall):
            self.x += 1

    def move_up(self):
        """Increase y by one unless the grid edge or a wall rule forbids it."""
        at_edge = self.y == 6
        lower_wall = self.y == 1 and self.x in (0, 1, 2)
        upper_wall = self.y == 3 and self.x in (2, 3, 4)
        if not (at_edge or lower_wall or upper_wall):
            self.y += 1

    def move_down(self):
        """Decrease y by one unless the grid edge or a wall rule forbids it."""
        at_edge = self.y == 0
        lower_wall = self.y == 3 and self.x in (0, 1, 2)
        upper_wall = self.y == 5 and self.x in (2, 3, 4)
        if not (at_edge or lower_wall or upper_wall):
            self.y -= 1

    def is_done(self):
        """Return True exactly when the agent is on the goal cell (4, 6)."""
        return (self.x, self.y) == (4, 6)

    def reset(self):
        """Put the agent back on (2, 0) and return that position."""
        self.x, self.y = 2, 0
        return (self.x, self.y)

In [4]:
class QAgent():
    """Tabular agent trained with every-visit Monte Carlo control.

    Action values are stored in ``q_table[x, y, action]`` and actions are
    chosen with an epsilon-greedy policy whose epsilon is annealed between
    episodes.
    """

    def __init__(self):
        self.q_table = np.zeros((5, 7, 4))  # indexed as [x, y, action]
        self.eps = 0.9     # probability of taking a random action
        self.alpha = 0.01  # step size of the incremental update
        self.gamma = 0.1   # discount factor applied to future rewards

    def select_action(self, s):
        """Return an action in 0..3 for state ``s = (x, y)`` (epsilon-greedy).

        With probability ``eps`` a uniformly random action is returned,
        otherwise the action with the highest current value (the lowest index
        wins ties).
        """
        x, y = s
        # Both branches must return: a missing return on the exploration path
        # would hand None to the caller, and None used as an index means
        # np.newaxis, i.e. it would address all four action values at once.
        if random.random() < self.eps:
            return random.randint(0, 3)
        return int(np.argmax(self.q_table[x, y, :]))

    def update_table(self, history):
        """Update the table from one finished episode.

        ``history`` is a sequence of ``(s, a, r, s_prime)`` transitions in the
        order they occurred. The episode is walked backwards so the discounted
        return can be accumulated in a single pass.
        """
        cum_reward = 0
        for s, a, r, _ in reversed(history):
            x, y = s
            # Fold in this step's reward first so the target is the return
            # that follows taking `a` in `s` (G_t = r + gamma * G_{t+1}).
            cum_reward = r + self.gamma * cum_reward
            self.q_table[x, y, a] += self.alpha * (cum_reward - self.q_table[x, y, a])

    def anneal_eps(self):
        """Lower epsilon by 0.03, never going below 0.1."""
        self.eps = max(self.eps - 0.03, 0.1)

    def show_table(self):
        """Print, for every (x, y), the index of the highest-valued action."""
        # astype(float) keeps the printed format of a float array.
        print(np.argmax(self.q_table, axis=2).astype(float))
    


In [5]:
def main():
    """Run 100 Monte Carlo training episodes and print the greedy actions.

    Each episode is played until the environment reports it is done; the
    collected transitions are then handed to the agent in one batch, after
    which the exploration rate is annealed.
    """
    env = GridWorld()
    agent = QAgent()

    for _ in range(100):
        state = env.reset()
        episode = []
        finished = False
        while not finished:
            action = agent.select_action(state)
            next_state, reward, finished = env.step(action)
            episode.append((state, action, reward, next_state))
            state = next_state
        # The table is updated only once the whole episode is available.
        agent.update_table(episode)
        agent.anneal_eps()

    agent.show_table()
    

In [6]:
# Run the Monte Carlo control training loop defined above and print the
# resulting greedy-action table.
main()

# SARSA 구현

In [None]:
class QAgent():
    """Tabular agent trained with SARSA (one-step, on-policy TD control).

    Action values are stored in ``q_table[x, y, action]`` and actions are
    chosen with an epsilon-greedy policy whose epsilon is annealed between
    episodes.
    """

    def __init__(self):
        self.q_table = np.zeros((5, 7, 4))  # indexed as [x, y, action]
        self.eps = 0.9     # probability of taking a random action
        self.alpha = 0.01  # step size of the incremental update
        self.gamma = 0.1   # discount factor applied to the next state's value

    def select_action(self, s):
        """Return an action in 0..3 for state ``s = (x, y)`` (epsilon-greedy).

        With probability ``eps`` a uniformly random action is returned,
        otherwise the action with the highest current value (the lowest index
        wins ties).
        """
        x, y = s
        # Both branches must return: a missing return on the exploration path
        # would hand None to the caller, and None used as an index means
        # np.newaxis rather than a single action.
        if random.random() < self.eps:
            return random.randint(0, 3)
        return int(np.argmax(self.q_table[x, y, :]))

    def update_table(self, transition):
        """Apply one SARSA update for ``transition = (s, a, r, s_prime)``.

        Unlike the Monte Carlo version, the table is updated after every
        single step instead of once per finished episode.
        """
        s, a, r, s_prime = transition
        x, y = s
        next_x, next_y = s_prime
        # The next action for the TD target is sampled here from the same
        # epsilon-greedy policy that select_action implements.
        a_prime = self.select_action(s_prime)
        td_target = r + self.gamma * self.q_table[next_x, next_y, a_prime]
        self.q_table[x, y, a] += self.alpha * (td_target - self.q_table[x, y, a])

    def anneal_eps(self):
        """Lower epsilon by 0.03, never going below 0.1."""
        self.eps = max(self.eps - 0.03, 0.1)

    def show_table(self):
        """Print, for every (x, y), the index of the highest-valued action."""
        # astype(float) keeps the printed format of a float array.
        print(np.argmax(self.q_table, axis=2).astype(float))
    


In [None]:
def main():
    """Run 100 SARSA training episodes and print the greedy actions.

    The agent's table is updated after every environment step, and the
    exploration rate is annealed once per episode.
    """
    env = GridWorld()
    agent = QAgent()

    for n_epi in range(100):
        done = False

        s = env.reset()
        while not done:
            a = agent.select_action(s)
            s_prime, r, done = env.step(a)
            # update_table takes ONE transition tuple; passing the four values
            # as separate positional arguments raises a TypeError.
            agent.update_table((s, a, r, s_prime))
            s = s_prime
        agent.anneal_eps()

    agent.show_table()
    

# Q Learning 구현   

In [None]:
class QAgent():
    """Tabular agent trained with Q-learning (one-step, off-policy TD control).

    Action values are stored in ``q_table[x, y, action]`` and actions are
    chosen with an epsilon-greedy policy whose epsilon is annealed between
    episodes.
    """

    def __init__(self):
        self.q_table = np.zeros((5, 7, 4))  # indexed as [x, y, action]
        self.eps = 0.9     # probability of taking a random action
        self.alpha = 0.01  # step size of the incremental update
        self.gamma = 0.9   # discount factor applied to the next state's value

    def select_action(self, s):
        """Return an action in 0..3 for state ``s = (x, y)`` (epsilon-greedy).

        With probability ``eps`` a uniformly random action is returned,
        otherwise the action with the highest current value (the lowest index
        wins ties).
        """
        x, y = s
        # Both branches must return: a missing return on the exploration path
        # would hand None to the caller, and None used as an index means
        # np.newaxis, i.e. it would address all four action values at once.
        if random.random() < self.eps:
            return random.randint(0, 3)
        return int(np.argmax(self.q_table[x, y, :]))

    def update_table(self, transition):
        """Apply one Q-learning update for ``transition = (s, a, r, s_prime)``."""
        s, a, r, s_prime = transition
        x, y = s
        next_x, next_y = s_prime
        # This is where Q-learning differs from SARSA: the target uses the
        # best value available in the next state, not the value of a sampled
        # next action.
        td_target = r + self.gamma * np.amax(self.q_table[next_x, next_y, :])
        self.q_table[x, y, a] += self.alpha * (td_target - self.q_table[x, y, a])

    def anneal_eps(self):
        """Lower epsilon by 0.03, never going below 0.1."""
        self.eps = max(self.eps - 0.03, 0.1)

    def show_table(self):
        """Print, for every (x, y), the index of the highest-valued action."""
        # astype(float) keeps the printed format of a float array.
        print(np.argmax(self.q_table, axis=2).astype(float))
    


In [None]:
def main():
    """Run 100 Q-learning training episodes and print the greedy actions.

    The agent's table is updated after every environment step, and the
    exploration rate is annealed once per episode.
    """
    env = GridWorld()
    agent = QAgent()

    for n_epi in range(100):
        done = False

        s = env.reset()
        while not done:
            a = agent.select_action(s)
            s_prime, r, done = env.step(a)
            # update_table takes ONE transition tuple; passing the four values
            # as separate positional arguments raises a TypeError.
            agent.update_table((s, a, r, s_prime))
            s = s_prime
        agent.anneal_eps()

    agent.show_table()
    