몬테카를로(MC) 및 TD 학습 구현

In [2]:
import random

* Grid World 클래스

In [3]:
class GridWorld():
    """A 4x4 grid environment whose agent starts at position (0, 0).

    The position is held in ``self.x`` and ``self.y``; the move methods keep
    both coordinates inside the range 0..3. The episode is finished once the
    agent stands on (3, 3).
    """

    def __init__(self):
        self.x = 0
        self.y = 0

    def step(self, a):
        """Apply action ``a`` and return ``((x, y), reward, done)``.

        Action codes: 0 = right, 1 = left, 2 = up, 3 = down. Any other value
        leaves the position untouched. The reward is -1 for every step.
        """
        # The tuple index of each mover is its action code.
        movers = (self.move_right, self.move_left, self.move_up, self.move_down)
        for code, mover in enumerate(movers):
            if a == code:
                mover()
                break

        return (self.x, self.y), -1, self.is_done()

    def move_right(self):
        """Increase ``y`` by one, stopping at the upper bound 3."""
        self.y = min(self.y + 1, 3)

    def move_left(self):
        """Decrease ``y`` by one, stopping at the lower bound 0."""
        self.y = max(self.y - 1, 0)

    def move_up(self):
        """Decrease ``x`` by one, stopping at the lower bound 0."""
        self.x = max(self.x - 1, 0)

    def move_down(self):
        """Increase ``x`` by one, stopping at the upper bound 3."""
        self.x = min(self.x + 1, 3)

    def is_done(self):
        """Return True exactly when the agent is at (3, 3)."""
        return self.x == 3 and self.y == 3

    def get_state(self):
        """Return the current position as an ``(x, y)`` tuple."""
        return (self.x, self.y)

    def reset(self):
        """Move the agent back to (0, 0) and return that position."""
        self.x = self.y = 0
        return (self.x, self.y)

* Agent 클래스

In [4]:
class Agent():
    """An agent that picks one of four actions (0..3) at random."""

    def __init__(self):
        """The agent keeps no state."""

    def select_action(self):
        """Draw one number from ``random.random()`` and map it to an action.

        Draws below 0.25 give 0, below 0.5 give 1, below 0.75 give 2, and
        everything else gives 3.
        """
        coin = random.random()
        # Return the index of the first upper bound that the draw falls under.
        for action, upper in enumerate((0.25, 0.5, 0.75)):
            if coin < upper:
                return action
        return 3

In [5]:
# Reinforcement learning with the Monte Carlo (MC) method
def main():
    """Run 50,000 episodes and print a 4x4 table learned with MC updates.

    Each episode steps a GridWorld with actions from Agent.select_action()
    until the environment reports done, recording the state reached and the
    reward of every step. Once the episode is over, the recording is walked
    backwards to accumulate returns and update the table. Nothing is
    returned; the table rows are printed.
    """
    env = GridWorld()
    agent = Agent()
    data = [[0] * 4 for _ in range(4)]  # value table, one entry per grid cell
    gamma = 1.0  # discount factor
    alpha = 0.0001  # step size of the incremental update

    for _ in range(50000):  # 50,000 episodes in total
        history = []
        done = False
        while not done:
            (x, y), reward, done = env.step(agent.select_action())
            history.append((x, y, reward))
        env.reset()

        # Update the table right after each episode, using only its own data.
        cum_reward = 0
        for x, y, reward in reversed(history):
            # The recorded state is the one reached by the step, so its entry
            # is moved toward the return gathered from the later steps before
            # this step's own reward is folded into the return.
            data[x][y] += alpha * (cum_reward - data[x][y])
            cum_reward = reward + gamma * cum_reward

    for row in data:
        print(row)


In [8]:
# Reinforcement learning with the temporal-difference (TD) method
def main():
    """Run 50,000 episodes and print a 4x4 table learned with TD updates.

    Each step reads the current state from a GridWorld, takes an action from
    Agent.select_action(), and immediately moves the table entry of the state
    it left toward ``reward + gamma * data[next state]``. Nothing is
    returned; the table rows are printed.
    """
    env = GridWorld()
    agent = Agent()
    data = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]  # value table
    gamma = 1.0  # discount factor
    alpha = 0.01  # larger step size than the MC version uses

    for _ in range(50000):
        done = False
        while not done:
            x, y = env.get_state()
            action = agent.select_action()
            # step() already returns the state it moved to, so there is no
            # need to query the environment a second time.
            (x_prime, y_prime), reward, done = env.step(action)

            # Update the table right after every single step.
            td_target = reward + gamma * data[x_prime][y_prime]
            data[x][y] = data[x][y] + alpha * (td_target - data[x][y])
        env.reset()

    for row in data:
        print(row)

In [9]:
# Run the experiment. The name `main` resolves to whichever definition was
# executed last; going by the order in this file that is the TD version.
main()

[-58.350289411586324, -56.327058645640655, -52.767981434430666, -50.56962588070092]
[-56.287609261528, -53.37444186150564, -48.63188761156009, -44.39537390117887]
[-52.57231111090669, -47.53283983780146, -39.5690361748086, -29.465625735303313]
[-50.10379889639041, -43.396645431975095, -28.392118447389983, 0]
