# 017_DP_frozenlake_value_iteration

# Value Iteration
```
SFFF       (S: starting point, safe)
FHFH       (F: frozen surface, safe)
FFFH       (H: hole, fall to your doom)
HFFG       (G: goal, where the frisbee is located)

LEFT = 0
DOWN = 1
RIGHT = 2
UP = 3

nA = 4
nS = 4*4 = 16
P = {s: {a: [] for a in range(nA)} for s in range(nS)}
env.P[0][0] 
{0: {0: [(0.3333333333333333, 0, 0.0, False), --> (P[s'], s', r, done)
         (0.3333333333333333, 0, 0.0, False),
         (0.3333333333333333, 4, 0.0, False)],
```

In [40]:
import gym
import numpy as np

env = gym.make('FrozenLake-v1', is_slippery=False)
env.P

{0: {0: [(1.0, 0, 0.0, False)],
  1: [(1.0, 4, 0.0, False)],
  2: [(1.0, 1, 0.0, False)],
  3: [(1.0, 0, 0.0, False)]},
 1: {0: [(1.0, 0, 0.0, False)],
  1: [(1.0, 5, 0.0, True)],
  2: [(1.0, 2, 0.0, False)],
  3: [(1.0, 1, 0.0, False)]},
 2: {0: [(1.0, 1, 0.0, False)],
  1: [(1.0, 6, 0.0, False)],
  2: [(1.0, 3, 0.0, False)],
  3: [(1.0, 2, 0.0, False)]},
 3: {0: [(1.0, 2, 0.0, False)],
  1: [(1.0, 7, 0.0, True)],
  2: [(1.0, 3, 0.0, False)],
  3: [(1.0, 3, 0.0, False)]},
 4: {0: [(1.0, 4, 0.0, False)],
  1: [(1.0, 8, 0.0, False)],
  2: [(1.0, 5, 0.0, True)],
  3: [(1.0, 0, 0.0, False)]},
 5: {0: [(1.0, 5, 0, True)],
  1: [(1.0, 5, 0, True)],
  2: [(1.0, 5, 0, True)],
  3: [(1.0, 5, 0, True)]},
 6: {0: [(1.0, 5, 0.0, True)],
  1: [(1.0, 10, 0.0, False)],
  2: [(1.0, 7, 0.0, True)],
  3: [(1.0, 2, 0.0, False)]},
 7: {0: [(1.0, 7, 0, True)],
  1: [(1.0, 7, 0, True)],
  2: [(1.0, 7, 0, True)],
  3: [(1.0, 7, 0, True)]},
 8: {0: [(1.0, 8, 0.0, False)],
  1: [(1.0, 12, 0.0, True)],
  2: [(

In [41]:
num_states = len(env.P)
num_actions = len(env.P[0])
transitions = env.P 

<img src="https://jaydottechdotblog.files.wordpress.com/2016/12/rl-value-iteration-algorithm.png" width=600 />

In [42]:
GAMMA = 1.0
THETA = 1e-5

# 1. initialize array V(s) = 0 for all s in S+
V = np.zeros(num_states)

#Loop
while True:
    #delta <- 0
    delta = 0
    #Loop for each s
    for s in range(num_states):
        old_value = V[s]
        new_action_values = np.zeros(num_actions)
        
        #V(s) = max_a(sum(p(s,a)*[r + gamma*v(s')]))
        for a in range(num_actions):
            # sum over s', r
            for prob, s_, r, _ in transitions[s][a]:
                new_action_values[a] += prob * (r + GAMMA * V[s_]) / num_actions
        V[s] = max(new_action_values)
        
        #delta <-max(delta|v - V(s)|)
        delta = max(delta, np.abs(old_value - V[s]))
    #until delta < theta
    if delta < THETA:
        break

In [43]:
V

array([2.44140625e-04, 9.76562500e-04, 3.90625000e-03, 9.76562500e-04,
       9.76562500e-04, 0.00000000e+00, 1.56250000e-02, 0.00000000e+00,
       3.90625000e-03, 1.56250000e-02, 6.25000000e-02, 0.00000000e+00,
       0.00000000e+00, 6.25000000e-02, 2.50000000e-01, 0.00000000e+00])

action 값을 이용하여 결정론적 최적 정책 추출

In [44]:
# extract deterministic optimal policy using action value
pi = np.zeros((num_states, num_actions))

for s in range(num_states):
    action_values = np.zeros(num_actions)
    
    for a in range(num_actions):
        # sum over s', r
        for prob, s_, r, _ in transitions[s][a]:
            action_values[a] += prob * (r + GAMMA * V[s_])
            #pi(s) <- argmax_a(action_values)
            new_action = np.argmax(action_values)
            pi[s] = np.eye(num_actions)[new_action]

In [45]:
print("Optimal Policy = \n", pi)
print(
    """
SFFF       (S: starting point, safe)
FHFH       (F: frozen surface, safe)
FFFH       (H: hole, fall to your doom)
HFFG       (G: goal, where the frisbee is located)

LEFT = 0
DOWN = 1
RIGHT = 2
UP = 3
    """)
print("Optimal Action = \n", np.argmax(pi, axis=1).reshape(4, 4))

Optimal Policy = 
 [[0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]]

SFFF       (S: starting point, safe)
FHFH       (F: frozen surface, safe)
FFFH       (H: hole, fall to your doom)
HFFG       (G: goal, where the frisbee is located)

LEFT = 0
DOWN = 1
RIGHT = 2
UP = 3
    
Optimal Action = 
 [[1 2 1 0]
 [1 0 1 0]
 [2 1 1 0]
 [0 2 2 0]]
