In [1]:
from grid_world import negative_grid, standard_grid
from iterative_policy_evaluation import print_values, print_policy 
import numpy as np 

In [39]:
# declare constans 

alpha = 0.1 # learning rate 
gamma = 0.9 # discount factor 
possible_actions = ('U', 'D', 'L', 'R') 
grid = standard_grid()

# define epsilon-greedy policy function 
def random_action(a, epsilon = 0.1): 
    p = np.random.random()
    return a if p < (1 - epsilon) else np.random.choice(possible_actions)

# test the function 
print(random_action('U'))

U


In [46]:
# define an episode 

def play_game(policy, grid): 
    # pick a starting state 
    state = (2, 0)
    
    # set the starting state 
    grid.set_state(state)
    
    # initialize list of states and rewards for the td-algorithm 
    states_and_rewards = [(state, 0)]
    
    # we are in state: state. pick action according to epsilon-greedy policy and make a move. 
    # cache the resulting state and immideate reward 
    while not grid.game_over(): 
        action = policy[state]
        action = random_action(action) 
        reward = grid.move(action) 
        state = grid.current_state()
        states_and_rewards.append((state, reward))
        
        
    return states_and_rewards 

# define policy and test play an episode 
policy = {
    (2, 0): 'U',
    (1, 0): 'U',
    (0, 0): 'R',
    (0, 1): 'R',
    (0, 2): 'R',
    (1, 2): 'R',
    (2, 1): 'R',
    (2, 2): 'R',
    (2, 3): 'U',
  }
    
play_game(policy, grid)

[((2, 0), 0), ((1, 0), 0), ((0, 0), 0), ((0, 1), 0), ((0, 2), 0), ((0, 3), 1)]

In [52]:
# tabular td(0) for estimating the value function given a policy (precition problem)

# initialize V 

V = {}

states = grid.all_states() 

for state in states: 
    if state in grid.actions: 
        V[state] = np.random.random()
    else: 
        V[state] = 0 
        
        
# loop for 10 000 episodes 
for _ in range(1000): 
    states_and_rewards = play_game(policy, grid)
    
    for timestep in range(len(states_and_rewards) - 1): 
        curr_state, _ = states_and_rewards[timestep]
        successor_state, reward = states_and_rewards[timestep + 1]
        V[curr_state] += alpha * (reward + gamma * (V[successor_state] - V[curr_state]))

In [53]:
print("values:")
print_values(V, grid)
print("policy:")
print_policy(policy, grid)

values:
---------------------------
 1.09| 1.10| 1.11| 0.00|
---------------------------
 1.07| 0.00|-0.51| 0.00|
---------------------------
 0.98|-0.19|-0.59|-0.96|
policy:
---------------------------
  R  |  R  |  R  |     |
---------------------------
  U  |     |  R  |     |
---------------------------
  U  |  R  |  R  |  U  |
