# 015_DP_frozenlake_policy_evaluation
# Iterative Policy Evaluation
## One-Array version
```
SFFF       (S: starting point, safe)
FHFH       (F: frozen surface, safe)
FFFH       (H: hole, fall to your doom)
HFFG       (G: goal, where the frisbee is located)

LEFT = 0
DOWN = 1
RIGHT = 2
UP = 3

nA = 4
nS = 4*4 = 16
P = {s: {a: [] for a in range(nA)} for s in range(nS)}
env.P[0][0]   (slippery version, is_slippery=True: three equally likely outcomes)
[(0.3333333333333333, 0, 0.0, False),   --> (prob, s_, reward, done)
 (0.3333333333333333, 0, 0.0, False),
 (0.3333333333333333, 4, 0.0, False)]
```

In [1]:
import gym
import numpy as np

# Create the 4x4 FrozenLake environment with slipping turned off, so every
# (state, action) pair leads to a single outcome with probability 1.0.
env = gym.make('FrozenLake-v1', is_slippery=False)
# Show the transition table: env.P[s][a] is a list of
# (prob, s_, reward, done) tuples.
env.P

{0: {0: [(1.0, 0, 0.0, False)],
  1: [(1.0, 4, 0.0, False)],
  2: [(1.0, 1, 0.0, False)],
  3: [(1.0, 0, 0.0, False)]},
 1: {0: [(1.0, 0, 0.0, False)],
  1: [(1.0, 5, 0.0, True)],
  2: [(1.0, 2, 0.0, False)],
  3: [(1.0, 1, 0.0, False)]},
 2: {0: [(1.0, 1, 0.0, False)],
  1: [(1.0, 6, 0.0, False)],
  2: [(1.0, 3, 0.0, False)],
  3: [(1.0, 2, 0.0, False)]},
 3: {0: [(1.0, 2, 0.0, False)],
  1: [(1.0, 7, 0.0, True)],
  2: [(1.0, 3, 0.0, False)],
  3: [(1.0, 3, 0.0, False)]},
 4: {0: [(1.0, 4, 0.0, False)],
  1: [(1.0, 8, 0.0, False)],
  2: [(1.0, 5, 0.0, True)],
  3: [(1.0, 0, 0.0, False)]},
 5: {0: [(1.0, 5, 0, True)],
  1: [(1.0, 5, 0, True)],
  2: [(1.0, 5, 0, True)],
  3: [(1.0, 5, 0, True)]},
 6: {0: [(1.0, 5, 0.0, True)],
  1: [(1.0, 10, 0.0, False)],
  2: [(1.0, 7, 0.0, True)],
  3: [(1.0, 2, 0.0, False)]},
 7: {0: [(1.0, 7, 0, True)],
  1: [(1.0, 7, 0, True)],
  2: [(1.0, 7, 0, True)],
  3: [(1.0, 7, 0, True)]},
 8: {0: [(1.0, 8, 0.0, False)],
  1: [(1.0, 12, 0.0, True)],
  2: [(

In [3]:
# Keep a short handle on the environment's transition table and derive the
# sizes of the state and action spaces from it:
#   transitions[s][a] -> list of (prob, s_, reward, done)
transitions = env.P
num_states = len(transitions)
num_actions = len(transitions[0])

<img src="https://miro.medium.com/max/1400/1*G1mg6UU6wsLlJApebP3BMg.png" width=600/>

In [4]:
# Discount factor gamma.
GAMMA = 1.0

# Algorithm parameter theta > 0: sweeping stops once the largest per-state
# change in a sweep drops below this threshold.
THETA = 1e-5

# Input pi, the policy to be evaluated: every action has probability 0.25
# in every state.
policy = np.ones([num_states, num_actions]) * 0.25

# V(s) starts at 0 for every state. A single array is updated in place, so
# states visited later in a sweep already see the refreshed values of the
# states visited earlier in that same sweep.
V = np.zeros(num_states)

converged = False
while not converged:
    # Largest absolute change |v - V(s)| observed during this sweep.
    delta = 0
    for state in range(num_states):
        # v <- V(s)
        previous = V[state]
        # Expected one-step backup under the policy:
        #   V(s) <- sum_a pi(a|s) * sum_{s', r} p(s', r | s, a) * [r + gamma * V(s')]
        V[state] = sum(
            pi_a * p * (r + GAMMA * V[nxt])
            for action, pi_a in enumerate(policy[state])
            for p, nxt, r, _ in transitions[state][action]
        )
        # delta <- max(delta, |v - V(s)|)
        delta = max(delta, np.abs(previous - V[state]))
    # Repeat until delta < theta.
    converged = delta < THETA

In [5]:
# V has converged to v_pi, the state-value function of the evaluated policy.
# View it as a 4x4 grid so it lines up with the map layout before printing.
# NOTE: the printed label says "Optimal Value", but these are the values of
# the evaluated policy, not of an optimal one.
V = np.reshape(V, (4, 4))
print("수렴한 Optimal Value = \n", V)

수렴한 Optimal Value = 
 [[0.013911   0.01161424 0.02094062 0.01046758]
 [0.01623478 0.         0.04074774 0.        ]
 [0.03479961 0.08816698 0.14205099 0.        ]
 [0.         0.17581855 0.4392897  0.        ]]


In [6]:
# Print the 4x4 value grid one row per line, each entry rounded to four
# decimals and followed by ", " (including the last entry of the row).
for row_idx in range(4):
    cells = ''.join(f"{V[row_idx, col_idx]:.4f}, " for col_idx in range(4))
    print(f"[{cells}]")

[0.0139, 0.0116, 0.0209, 0.0105, ]
[0.0162, 0.0000, 0.0407, 0.0000, ]
[0.0348, 0.0882, 0.1421, 0.0000, ]
[0.0000, 0.1758, 0.4393, 0.0000, ]
