In [5]:
from grid_world import negative_grid, standard_grid
from iterative_policy_evaluation import print_values, print_policy 
import numpy as np 

In [66]:
# Logic for playing one episode of the grid world game.
gamma = 0.9  # default discount factor


def play_game(policy, grid, gamma=gamma):
    '''
    Play one episode of the game under a fixed policy and compute returns.

    The episode starts from a state chosen uniformly at random among the keys
    of ``grid.actions`` and follows ``policy`` until ``grid.game_over()`` is
    true. Returns are then accumulated backwards with
    ``G = reward + gamma * G``.

    policy: mapping from state to the action taken in that state.
    grid:   environment object; this function uses ``actions``,
            ``set_state``, ``current_state``, ``game_over`` and ``move``
            (``move(action)`` is expected to return the reward).
    gamma:  discount factor applied to future rewards (default 0.9).

    returns: a list of ``(state, G)`` tuples in the order the states were
             visited. The final state of the episode is not included,
             because no return is recorded for it.

    raises: ValueError if ``grid.actions`` is empty (no state to start from).

    Note: the episode loop has no step limit, so a policy that cycles
    between states without reaching a game-over state never terminates.
    '''
    # Random start state, so every state with available actions gets sampled.
    start_states = list(grid.actions.keys())
    if not start_states:
        raise ValueError("grid.actions is empty: no state to start an episode from")
    start_idx = np.random.choice(len(start_states))

    grid.set_state(start_states[start_idx])
    state = grid.current_state()

    # Forward pass: record each visited state with the reward received on
    # arriving there. The start state is entered "for free", hence reward 0.
    states_and_rewards = [(state, 0)]
    while not grid.game_over():
        reward = grid.move(policy[state])
        state = grid.current_state()
        states_and_rewards.append((state, reward))

    # Backward pass: walk the trajectory from the end, accumulating the
    # discounted return. At each step G is the return *following* the
    # current state, so it is stored before the state's own reward is added.
    G = 0
    states_and_returns = []
    for step, (state, reward) in enumerate(reversed(states_and_rewards)):
        if step > 0:  # step 0 is the final state: nothing follows it
            states_and_returns.append((state, G))
        G = reward + gamma * G

    # Restore chronological order for the caller.
    states_and_returns.reverse()
    return states_and_returns
        

In [67]:
# Monte Carlo policy evaluation: given a fixed policy pi, estimate its value
# function V by averaging the returns sampled from complete episodes.
# This is the FIRST-VISIT variant: within one episode only the first
# occurrence of a state contributes a return sample.

SEED = 0          # fixed seed so a fresh "Restart & Run All" reproduces the run
N_EPISODES = 100  # number of sampled episodes

# play_game and the random initialisation of V below both draw from NumPy's
# global random state, so seeding it here makes the whole cell repeatable.
np.random.seed(SEED)

grid = standard_grid()
policy = {
    (2, 0): 'U',
    (1, 0): 'U',
    (0, 0): 'R',
    (0, 1): 'R',
    (0, 2): 'R',
    (1, 2): 'R',
    (2, 1): 'R',
    (2, 2): 'R',
    (2, 3): 'U',
}

# V holds the current value estimate per state; returns holds every return
# sample observed so far for each state the policy acts in.
V = {}
returns = {}
for state in grid.all_states():
    if state in policy:
        # Arbitrary starting estimate; overwritten by the sample mean as soon
        # as the state is first visited.
        V[state] = np.random.random()
        returns[state] = []
    else:
        # States the policy has no action for keep a value of 0.
        V[state] = 0

# Sample episodes and update the estimates.
for _ in range(N_EPISODES):
    states_and_returns = play_game(policy, grid)
    seen_states = set()  # states already counted in this episode

    for state, ret in states_and_returns:
        # First-visit rule: ignore repeat occurrences within the same episode.
        if state not in seen_states:
            returns[state].append(ret)
            V[state] = np.mean(returns[state])
            seen_states.add(state)

print("rewards:")
print_values(grid.rewards, grid)
print("values:")
print_values(V, grid)
print("policy:")
print_policy(policy, grid)

rewards:
---------------------------
 0.00| 0.00| 0.00| 1.00|
---------------------------
 0.00| 0.00| 0.00|-1.00|
---------------------------
 0.00| 0.00| 0.00| 0.00|
values:
---------------------------
 0.81| 0.90| 1.00| 0.00|
---------------------------
 0.73| 0.00|-1.00| 0.00|
---------------------------
 0.66|-0.81|-0.90|-1.00|
policy:
---------------------------
  R  |  R  |  R  |     |
---------------------------
  U  |     |  R  |     |
---------------------------
  U  |  R  |  R  |  U  |
