In [10]:
from grid_world import negative_grid 
from iterative_policy_evaluation import print_values, print_policy 
import numpy as np 
from collections import defaultdict 

In [23]:
grid = negative_grid() 

print("rewards: \n")
print_values(grid.rewards, grid)

rewards: 

---------------------------
-0.10|-0.10|-0.10| 1.00|
---------------------------
-0.10| 0.00|-0.10|-1.00|
---------------------------
-0.10|-0.10|-0.10|-0.10|


In [45]:
# initialize random policy 

policy = {}
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

for state in grid.actions: 
    policy[state] = np.random.choice(ALL_POSSIBLE_ACTIONS)

    
# print policy 
print_policy(policy, grid)

---------------------------
  U  |  R  |  D  |     |
---------------------------
  R  |     |  D  |     |
---------------------------
  R  |  R  |  D  |  L  |


In [59]:
# iterative policy evaluation & policy improvement 
 
gamma = 0.9 
threshold = 1e-6 
V = {}

policy = {}
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

for state in grid.actions: 
    policy[state] = np.random.choice(ALL_POSSIBLE_ACTIONS)

    
# print policy 
print_policy(policy, grid)

for state in grid.all_states(): 
    if state in policy: 
        V[state] = np.random.random()
    else: 
        V[state] = 0 
        
while True: 
    # iterative policy evaluation 
    while True: 
        delta = 0
        for state in grid.all_states(): 
            old_v = V[state]
            
            if state in policy: 
                grid.set_state(state)
                action = policy[state]
                r = grid.move(action)
                V[state] = r + gamma * V[grid.current_state()]
                delta = max(delta, np.abs(old_v - V[state]))
            
        if delta < threshold: 
            break 
        
    # policy improvement 
    policy_converged = True  
    for state in grid.actions: 
        old_action = policy[state]
    
        new_a = None 
        best_value = float('-inf')
        for action in ALL_POSSIBLE_ACTIONS: 
            grid.set_state(state)
            r = grid.move(action)
            v = r + gamma * V[grid.current_state()]
            if v > best_value: 
                best_value = v 
                new_action = action 
        
        print(old_action, new_action)
        # update policy 
        policy[state] = new_action
        if old_action != new_action: 
            policy_converged = False 
    
    if policy_converged: 
        break 
    
    

---------------------------
  D  |  D  |  D  |     |
---------------------------
  L  |     |  U  |     |
---------------------------
  R  |  R  |  L  |  L  |
D R
D U
D R
L U
U L
R U
R U
L U
L D
R R
U R
R R
U U
L U
U U
U L
U U
D D
R R
R R
R R
U U
U U
U U
L R
U U
D L
R R
R R
R R
U U
U U
U U
R R
U U
L L


In [60]:
print("values:")
print_values(V, grid)
print("policy:")
print_policy(policy, grid)

values:
---------------------------
 0.62| 0.80| 1.00| 0.00|
---------------------------
 0.46| 0.00| 0.80| 0.00|
---------------------------
 0.31| 0.46| 0.62| 0.46|
policy:
---------------------------
  R  |  R  |  R  |     |
---------------------------
  U  |     |  U  |     |
---------------------------
  U  |  R  |  U  |  L  |
