In [1]:
import numpy as np

# On-Policy Monte-Carlo Control

In [2]:
class Env:
    def __init__(self):
        self.grid_width = 5
        self.grid_height = self.grid_width
        self.action_grid = [(-1, 0), (1, 0), (0, -1), (0, 1)]     # U, D, L, R
        self.gtriangle1 = [1, 2]
        self.gtriangle2 = [2, 1]
        self.goal = [2, 2]
        
    def step(self, state, action):
        x, y = state
        
        # get next state by action
        x+= action[0]
        y+= action[1]
        
        if x < 0 :
            x = 0
        elif x > (self.grid_width-1) :
            x = (self.grid_width-1)

        if y < 0 :
            y = 0
        elif y > (self.grid_width-1) :
            y = (self.grid_width-1)
        
        next_state = [x, y]
        
        # reward 
        if next_state == self.gtriangle1 or next_state == self.gtriangle2:
            reward = -1
            done = True
        elif next_state == self.goal:
            reward = 1
            done = True
        else:
            reward = 0
            done = False
        
        return next_state, reward, done
    
    def reset(self):
        return [0, 0]

In [3]:
class MC_agent:
    def __init__(self):
        self.action_grid = [(-1, 0), (1, 0), (0, -1), (0, 1)]
        self.action_text= ['U', 'D', 'L', 'R']
        self.grid_width = 5
        self.grid_height = self.grid_width
        self.value_table = np.zeros((self.grid_width, self.grid_height))
        self.e = .1
        self.learning_rate = .01
        self.discount_factor = .95
        self.memory=[]
    
    def get_action(self, state):
        # with prob.ε take random action
        if np.random.randn() <  self.e :
            idx = np.random.choice(len(self.action_grid),1)[0]
        else :
            next_values = np.array([])
            for s in self.next_states(state):
                next_values= np.append(next_values, self.value_table[tuple(s)])
            max_value = np.amax(next_values)
            tie_Qchecker = np.where(next_values==max_value)[0]
            
            # if tie max value, get random
            if len(tie_Qchecker) > 1:
                idx = np.random.choice(tie_Qchecker, 1)[0]
            else :
                idx = np.argmax(next_values)
        action = self.action_grid[idx]
        return action
    
    def next_states(self, state):
        x, y = state
        next_S = []
        for action in self.action_grid:
            x, y = state
            # calculate x_coordinate
            x+=action[0]
            if x < 0:
                x = 0
            elif x > 4:
                x = 4               
            # calculate x_coordinate
            y+=action[1]
            if y < 0:
                y = 0
            elif y > 4:
                y = 4
            next_S.append([x, y])        
        return next_S    
        
    # using First visit MC    
    def update(self):
        G_t = 0
        visit_states=[]
        for sample in reversed(self.memory):
            state = sample[0]
            reward = sample[1]
            if state not in visit_states:
                visit_states.append(state)
                G_t = reward + self.discount_factor*G_t
                V_t = self.value_table[tuple(state)]
                # update Value
                self.value_table[tuple(state)] = V_t + self.learning_rate*(G_t - V_t)
        
    def memorizer(self, state, reward, done):
        self.memory.append([state, reward, done])
        
    def save_actionseq(self, action_sequence, action):
        idx = self.action_grid.index(action)
        action_sequence.append(self.action_text[idx])

In [4]:
if __name__ == "__main__":
    env = Env()
    agent = MC_agent()
    total_episode = 10000
    sr=0
    
    for episode in range(total_episode):
        action_sequence=[]
        total_reward = 0
        state = env.reset()
        action = agent.get_action(state)
        done = False
        walk = 0
        
        while True:
            next_state, reward, done = env.step(state, action)
            agent.memorizer(state, reward, done)
            agent.save_actionseq(action_sequence, action)
            walk += 1
            
            # next state and action 
            state = next_state
            action = agent.get_action(state)
            total_reward+=reward
            
            if done:
                if episode % 100 == 0 :
                    print('finished at', state)
                    print('episode :{}, The number of step:{}\n The sequence of action is:\
                          {}\nThe total reward is: {}\n'.format(episode, walk, action_sequence, total_reward))
                if state == env.goal:
                    sr+=1
                agent.update()
                agent.memory.clear()
                break
                
print('The accuracy :', sr/total_episode*100, '%')

finished at [2, 1]
episode :0, The number of step:3
 The sequence of action is:                          ['D', 'R', 'D']
The total reward is: -1

finished at [1, 2]
episode :100, The number of step:3
 The sequence of action is:                          ['R', 'R', 'D']
The total reward is: -1

finished at [1, 2]
episode :200, The number of step:44
 The sequence of action is:                          ['R', 'R', 'U', 'R', 'R', 'R', 'D', 'D', 'R', 'L', 'D', 'U', 'D', 'U', 'D', 'U', 'D', 'U', 'D', 'U', 'D', 'D', 'U', 'R', 'L', 'U', 'D', 'U', 'D', 'L', 'L', 'R', 'D', 'U', 'R', 'L', 'R', 'U', 'D', 'L', 'R', 'U', 'U', 'L']
The total reward is: -1

finished at [2, 2]
episode :300, The number of step:27
 The sequence of action is:                          ['R', 'R', 'R', 'R', 'D', 'D', 'U', 'D', 'U', 'U', 'L', 'R', 'L', 'U', 'D', 'R', 'U', 'D', 'D', 'L', 'U', 'D', 'R', 'L', 'D', 'L', 'U']
The total reward is: 1

finished at [2, 1]
episode :400, The number of step:3
 The sequence of action is:   

In [5]:
agent.value_table

array([[-0.01778219,  0.00173448,  0.07027257,  0.31807991,  0.45327218],
       [-0.03679505, -0.53769129,  0.        ,  0.27727623,  0.50177026],
       [ 0.11331332,  0.        ,  0.        ,  0.68630107,  0.61163694],
       [ 0.37011571,  0.34679193,  0.59913546,  0.65439169,  0.61544138],
       [ 0.42668923,  0.47771908,  0.5621389 ,  0.57223967,  0.59911758]])