# Example 4.1: $4 \times 4$ grid world

<img src="figures/chap.04.01.example1.png" width="70%">

* nonterminal states: $\mathcal{S} = \{1, 2, \cdots, 14 \}$
* possible actions: $\mathcal{A} = \{ \textrm{up}, \textrm{down}, \textrm{right}, \textrm{left} \}$
* action은 deterministic
  * $p(6, -1 | 5, \textrm{right}) = 1$
  * $p(7, -1 | 7, \textrm{right}) = 1$
  * $p(10, r | 5, \textrm{right}) = 0$ for all $r \in \mathcal{R}$
* undiscounted ($\gamma = 1$)
* terminal state에 도달하기 전까지 모든 transition의 reward는 `-1`

In [1]:
import numpy as np
# Print NumPy arrays with one digit after the decimal point.
np.set_printoptions(precision=1)

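### Grid world state index

Each cell below is indexed as `(row, column)`. State $k$ of the figure above sits at `(k // 4, k % 4)`, and the two terminal states are `(0, 0)` and `(3, 3)`.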

|   |   |   |   |
|----|----|----|----|
| 0,0  | 0,1 | 0,2 | 0,3 |
| 1,0  | 1,1 | 1,2 | 1,3 |
| 2,0  | 2,1 | 2,2 | 2,3 |
| 3,0  | 3,1 | 3,2 | 3,3 |

In [2]:
class GridWorld():
  """Square grid world of Example 4.1 with iterative policy evaluation.

  States are (row, column) tuples: row 0 is the top row and column 0 is the
  left-most column. Every transition out of a non-terminal state yields a
  reward of -1, and a move that would leave the grid keeps the state unchanged.
  """

  # (row offset, column offset) applied by each action.
  _MOVES = {
    'up': (-1, 0),
    'down': (1, 0),
    'right': (0, 1),
    'left': (0, -1),
  }

  def __init__(self, size=4, terminal_states=None):
    """
    Args:
      size: int, number of rows and of columns of the grid
      terminal_states: iterable of (row, column) pairs; when omitted the
        terminal states are [(0, 0), (3, 3)]
    """
    # A list literal in the signature would be one object shared by every
    # instance, so the default is built per call instead.
    if terminal_states is None:
      terminal_states = [(0, 0), (3, 3)]
    self.actions = ['up', 'down', 'right', 'left']
    # Stored as a fresh list of tuples: the caller's container is not aliased
    # and membership tests against (row, column) tuples work even when the
    # caller passed the coordinates as lists.
    self.terminal_states = [tuple(s) for s in terminal_states]
    self.values = np.zeros((size, size))  # state-value estimates, start at 0
    self.gamma = 1.0  # undiscounted
    self.size = size
    self.theta = 0.0001  # convergence precision

  def Step(self, state, action):
    """Apply `action` in `state` and return the deterministic outcome.

    Args:
      state: tuple (row, column) coordinate
      action: string, one of 'up', 'down', 'right', 'left'

    Returns:
      next_state: tuple (row, column) coordinate
      reward: int, 0 in a terminal state and -1 otherwise

    Raises:
      ValueError: if `state` is not terminal and `action` is not a known action
    """
    # In a terminal state every action gives next_state=state and reward=0.
    if state in self.terminal_states:
      return state, 0

    try:
      d_row, d_col = self._MOVES[action]
    except KeyError:
      raise ValueError(
          "unknown action {!r}; expected one of {}".format(action, self.actions)
      ) from None

    row, col = state[0] + d_row, state[1] + d_col
    if 0 <= row < self.size and 0 <= col < self.size:
      return (row, col), -1
    # Bumping into a wall leaves the state unchanged but still costs -1.
    return state, -1

  def IterativePolicyEvaluation(self, policy):
    """Evaluate `policy`, updating `self.values` in place.

    States are swept row by row and each new value is written back
    immediately, so later updates in the same sweep already see it. Sweeps
    repeat until the largest change in one sweep is below `self.theta`.

    Args:
      policy: object whose `get_policy_at_state(state=(row, column))` returns
        a mapping from action name to probability
    """
    while True:
      delta = 0.
      for i in range(self.size):
        for j in range(self.size):
          state = (i, j)
          if state in self.terminal_states:
            continue
          old_value = self.values[i, j]
          new_value = 0.
          # Expected one-step return under the policy's action probabilities.
          for action, prob in policy.get_policy_at_state(state=state).items():
            next_state, reward = self.Step(state=state, action=action)
            new_value += prob * (
                reward + self.gamma * self.values[next_state[0], next_state[1]])
          self.values[i, j] = new_value
          delta = max(delta, abs(old_value - new_value))
      if delta < self.theta:
        break

In [3]:
class Policy():
  """Tabular stochastic policy over a size x size grid.

  `self.policy` is a (size, size) object array; each cell holds a dict that
  maps an action name to the probability of taking it in that state.
  """

  def __init__(self, size=4):
    """
    Args:
      size: int, number of rows and of columns of the grid
    """
    # Equiprobable random policy used to initialise every state.
    self.init_actions = {'up': 0.25,
                         'down': 0.25,
                         'right': 0.25,
                         'left': 0.25}
    # Every state gets its own copy of the distribution. Repeating the same
    # dict object would make a change at one state show up in all states.
    cells = [dict(self.init_actions) for _ in range(size * size)]
    self.policy = np.asarray(cells, dtype=object).reshape((size, size))

  def get_policy_at_state(self, state):
    """Return the action-probability dict stored for `state`.

    Args:
      state: tuple (row, column) coordinate

    Returns:
      dict mapping action name to probability
    """
    return self.policy[state[0], state[1]]

In [4]:
# Equiprobable random policy and a 4x4 grid world, both with default settings.
p = Policy()
g = GridWorld()

In [5]:
# Evaluate policy `p`; the state values in `g.values` are updated in place.
g.IterativePolicyEvaluation(p)

In [6]:
# Show the state-value estimates after policy evaluation.
g.values

array([[  0., -14., -20., -22.],
       [-14., -18., -20., -20.],
       [-20., -20., -18., -14.],
       [-22., -20., -14.,   0.]])

### Results

<img src="figures/chap.04.01.example2.png" width="70%">