# 예제 3.5번(임의 탐색)

<img src="img/문제상황.png" style="width: 500px;" align="left"/>


왼쪽과 같은 문제상황.    
5X5 Grid안에서    
Action은 상,하,좌,우가 가능함 
       
이때 5X5 "State Value Table"을 오른쪽 그림과 같이 만들어 내는것이 문제   

-> Continous Task가 맞으나, 10000번중 한번은 위치 초기화 해줬음

### Action Value Table vs State Value Table
- Action Value 방식은 Action Value에 대한Table과, Reward 파악을 위한 각 State별 Reward Table 두가지를 별도로 만들어줘야함.     
- 반면 State Value는 하나만 있으면 됨-> State Table에 그대로 Reward를 기입해주면 되기 때문

# 풀이시작

** 환경에서 정의되어야 하는 것
- Reward에 대한 정의
- 행동, 상태에대한 정의 -> 에이전트의 각 행동선택에대한 State의 변화 정의

In [152]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm

In [153]:
class gridworld:
    def __init__(self, width = 5 , height = 5):
        self.width = width
        self.height = height
        self.grid = np.zeros((self.width,self.height))
        self.current_location = (4,np.random.randint(0,5))
        # self.current_location = (2,2)
        
        self.grid[0,1] = 10
        self.grid[0,3] = 5
        
        self.actions = np.array(['UP', 'DOWN', 'LEFT', 'RIGHT'])
        
    # For Debugging
    def agent_on_map(self):
        grid = np.zeros(( self.height, self.width))
        grid[ self.current_location[0], self.current_location[1]] = 1
        
        grid[0,1] = 10
        grid[0,3] = 5
        return grid
        
        
    def get_reward(self , location):
        # print('location : ',location)
        # print('reward : ',self.grid[location[0],location[1]])
        return self.grid[location[0],location[1]]
    
    def make_step(self,action):
        reward=0
        before_location = self.current_location
        x = before_location[0]
        y = before_location[1]

        
        
        
        if action == 'UP':
            if self.current_location[0]>0:
                self.current_location = (self.current_location[0]-1,self.current_location[1])
                reward = self.get_reward(self.current_location)
                if self.current_location == (0,1):
                    self.current_location = (4,1)
                elif self.current_location == (0,3):
                    self.current_location = (3,3)

            
        elif action == 'DOWN':
            if self.current_location[0]<4:
                self.current_location = (self.current_location[0]+1,self.current_location[1])
                reward = self.get_reward(self.current_location)
                if self.current_location == (0,1):
                    self.current_location = (4,1)
                elif self.current_location == (0,3):
                    self.current_location = (3,3)

            
        elif action == 'LEFT':
            if self.current_location[1]>0:
                self.current_location = (self.current_location[0],self.current_location[1]-1)
                reward = self.get_reward(self.current_location)
                if self.current_location == (0,1):
                    self.current_location = (4,1)
                elif self.current_location == (0,3):
                    self.current_location = (3,3)

            
        elif action == 'RIGHT':
            if self.current_location[1]<4:
                self.current_location = (self.current_location[0],self.current_location[1]+1)
                reward = self.get_reward(self.current_location)
                if self.current_location == (0,1):
                    self.current_location = (4,1)
                elif self.current_location == (0,3):
                    self.current_location = (3,3)

        # print(action)
        return reward,action
    
    
    


** 에이전트에서 정의되어야 하는것
- 행동을 선택하는 방법에대한 정의
- 학습 과정에 대한 정의

In [154]:
class StateValue_Agent():
    def __init__(self, environment, epsilon=0.05,gamma=0.9):
        self.environment = environment
        self.value_table = np.zeros( (environment.width , environment.height) )
        self.value_table = environment.grid
        self.epsilon = epsilon
        self.gamma = gamma
        
    
    def action_choosing(self,available_actions):
        reward = 0
        if np.random.uniform(0,1) < self.epsilon:
            action = available_actions[np.random.randint(0,len(available_actions))] # 4개중 하나 랜덤하게 택 1 
        else:
            action_list = []
            reward_list = []
            for select_action in available_actions:
                # print(select_action)
                reward, action = self.for_one_step_search(select_action)
                action_list.append(action)
                reward_list.append(reward)
                reward = max(reward_list)
                action = action_list[np.argmax(reward_list)] # one Step Ahead Search중 Max값
            
        return action,reward
    
    def for_one_step_search(self,action):
        reward=0
        location_search = environment.current_location
        x = location_search[0]
        y = location_search[1]
        
        
        if action == 'UP':
            if location_search[0]>0:
                location_search = (location_search[0]-1,location_search[1])
                reward = self.value_table[location_search]
            
        elif action == 'DOWN':
            if location_search[0]<4:
                location_search = (location_search[0]+1,location_search[1])
                reward = self.value_table[location_search]
            
        elif action == 'LEFT':
            if location_search[1]>0:
                location_search = (location_search[0],location_search[1]-1)
                reward = self.value_table[location_search]

        elif action == 'RIGHT':
            if location_search[1]<4:
                location_search = (location_search[0],location_search[1]+1)
                reward = self.value_table[location_search]
        
        return reward,action
    
    
    
    def learning(self,old_state,new_state,reward):
        new_state_value = self.value_table[new_state[0],new_state[1]]
        
        self.value_table[old_state[0],old_state[1]] = reward + self.gamma*new_state_value
        

이걸 실행시킬때 유의사항들
- 환경과 Agent 사이 상호작용을 적절히 분배해주어야함
- Action에 대한 선택은 Agent에 정의되어있고, 그에대한 결과는 환경에정의되어있음
- 그리고 Value Table자체는 Agent에 정의되어있음
- Episodic Task라면, Episode를 정의, Continous Task라면 얼만큼의 Step을 실행시킬 건지 정의

In [155]:
def play(environment , agent, trials = 10000, learn = False):
    reward_list = []
    reward = 0
    for trial in range(trials):
        
        # 한번의 Try시에 거치는 단계들
        old_state = environment.current_location
        action,_ = agent.action_choosing(environment.actions)
        reward,_ = environment.make_step(action)
        new_state = environment.current_location
        
        grid_state_agent = environment.agent_on_map()
        
        # print('old_state : ',old_state)
        # print('Choosed Action : ',action)
        # print('그에대한 reward : ',reward)
        # print('new_state s` : ',new_state)
        # print(grid_state_agent)
        # print('----------')
        
        if learn == True:
            agent.learning(old_state,new_state,reward)

In [156]:
environment = gridworld()
agent = StateValue_Agent(environment)

10000번 움직이면 초기 위치 재정의 하고 다시시작

In [161]:
for i in range(50):
    environment.current_location = (4,np.random.randint(0,5))
    play(environment,agent,trials=10000,learn=True)

결과

In [162]:
np.round(agent.value_table,1)

array([[15.4, 10. , 14.7,  5. ,  8.2],
       [13.8,  1. ,  8.6,  0. ,  4.9],
       [ 8.7,  7.5,  8.2,  1.7,  0.5],
       [ 7.3,  1.4,  1.5,  1.6,  1.4],
       [ 6. ,  5.8,  1.5,  1.7,  1.2]])