In [1]:
from grid_world import negative_grid 
from iterative_policy_evaluation import print_values, print_policy 
import numpy as np 
from collections import defaultdict

In [2]:
# Build the grid-world environment with the negative_grid() helper from
# grid_world, then print its reward table (grid.rewards) so the per-state
# rewards are visible before any learning happens.

grid = negative_grid()

# print_values lays the per-state numbers out in the shape of the grid
print("rewards: \n")
print_values(grid.rewards, grid)

rewards: 

---------------------------
-0.10|-0.10|-0.10| 1.00|
---------------------------
-0.10| 0.00|-0.10|-1.00|
---------------------------
-0.10|-0.10|-0.10|-0.10|


In [4]:
# Start from an arbitrary policy: every state that has an entry in
# grid.actions gets one action drawn at random from ALL_POSSIBLE_ACTIONS.
# The draw ignores which moves are actually listed for the state.

ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

policy = {
    s: np.random.choice(ALL_POSSIBLE_ACTIONS)
    for s in grid.actions.keys()
}

print_policy(policy, grid)

---------------------------
  D  |  L  |  D  |     |
---------------------------
  D  |     |  D  |     |
---------------------------
  D  |  D  |  R  |  U  |


In [6]:
# Seed the value table: states covered by the policy start at a random
# number in [0, 1); every other state returned by grid.all_states() is
# pinned to 0.

V = {
    s: (np.random.random() if s in policy else 0)
    for s in grid.all_states()
}

print_values(V, grid)

---------------------------
 0.57| 0.80| 0.65| 0.00|
---------------------------
 0.47| 0.00| 0.04| 0.00|
---------------------------
 0.82| 0.27| 0.13| 0.53|


In [9]:
# Value iteration: sweep the states, replacing each V[state] with the best
# one-step backup r + gamma * V[next_state] over the actions listed for that
# state, and stop once the largest change seen in a full sweep drops below
# `threshold`. The greedy policy is extracted from V afterwards.

threshold = 1e-3  # same value as the former literal 10e-4 (which is 1e-3, not 1e-4)
gamma = 0.9  # discount factor applied to the successor state's value

while True:
    delta = 0  # largest |old - new| value change in this sweep
    for state in grid.all_states():
        # states without a policy entry are never updated
        if state not in policy:
            continue

        # remember the previous value so convergence can be measured
        old_v = V[state]

        # Evaluate every backup against the value table as it stood before
        # this state's update. V[state] is written only once, after the
        # maximum is known, so an action whose successor is `state` itself
        # can never read a half-updated value.
        best_v = float('-inf')
        for action in grid.actions[state]:
            # move() starts from whatever state the grid currently holds,
            # so the grid is put back on `state` before trying each action
            grid.set_state(state)
            r = grid.move(action)

            # one successor per action (deterministic transitions assumed),
            # so the expectation over next states collapses to a single term
            best_v = max(best_v, r + gamma * V[grid.current_state()])

        # a state with an empty action list keeps its old value
        if best_v > float('-inf'):
            V[state] = best_v

        delta = max(delta, abs(old_v - V[state]))

    if delta < threshold:
        break
                

In [10]:
# show the state values held in V once the value-iteration loop has stopped
print_values(V, grid)

---------------------------
 0.62| 0.80| 1.00| 0.00|
---------------------------
 0.46| 0.00| 0.80| 0.00|
---------------------------
 0.31| 0.46| 0.62| 0.46|


In [13]:
# Greedy policy extraction: for each state in grid.actions, try every listed
# action from that state, score it as reward + gamma * V[successor], and keep
# the highest-scoring action (the first one wins on ties).

for s, available_actions in grid.actions.items():
    greedy_action, greedy_value = None, float('-inf')

    for a in available_actions:
        # reposition the grid on `s` before simulating each action
        grid.set_state(s)
        reward = grid.move(a)
        q_value = reward + gamma * V[grid.current_state()]

        if q_value > greedy_value:
            greedy_action, greedy_value = a, q_value

    policy[s] = greedy_action

print_policy(policy, grid)
    

---------------------------
  R  |  R  |  R  |     |
---------------------------
  U  |     |  U  |     |
---------------------------
  U  |  R  |  U  |  L  |
