In [1]:
from pprint import pprint
import numpy as np
import gym
from gym.envs.toy_text import frozen_lake

In [2]:
# Custom 8x8 layout that replaces gym's built-in '8x8' map before the
# environment is created: 'S' is the start tile, 'H' tiles are holes and 'G'
# is the goal.
# NOTE(review): '-' is not one of gym's documented tile letters; presumably it
# behaves like an ordinary frozen tile -- confirm against the installed gym.
MAP = ['S-------',
       '--------',
       '--------',
       '--------',
       '----H---',
       '--------',
       '-H------',
       '---H---G']
frozen_lake.MAPS['8x8'] = MAP

# Only one environment is needed. The earlier `gym.make('FrozenLake-v0')` call
# was overwritten immediately by this assignment and has been dropped.
ENV = gym.make('FrozenLake8x8-v0', is_slippery=False)

# Arrow glyph used to display each discrete action index.
ACTION_MAPPING = {0: '←', 1: '↓', 2: '→', 3: '↑'}

In [3]:
def print_state_value_func(V, shape=(8, 8)):
    """Print a state-value function as a 2-D grid rounded to two decimals.

    Args:
        V: Flat array-like of state values, one entry per state.
        shape: Grid shape used to lay the values out. Defaults to the 8x8
            board used by this notebook.
    """
    print(' V(s):')
    # np.round is the supported spelling; the np.round_ alias was deprecated
    # and later removed from NumPy.
    print(np.round(V, 2).reshape(shape), '\n')

def print_policy(policy: np.ndarray):
    """Print the greedy action of every state as an 8x8 grid of arrows.

    For each row of ``policy`` the index of the largest entry is taken as the
    action to show. Cells whose tile in ``MAP`` is 'H' are drawn as '□'
    instead of an arrow.
    """
    print(' POLICY: ')
    greedy_actions = np.argmax(policy, axis = 1)
    # Flatten the map rows so tile characters share the state indexing.
    flat_tiles = ''.join(MAP)
    symbols = [
        '□' if flat_tiles[cell] == 'H' else ACTION_MAPPING[chosen]
        for cell, chosen in enumerate(greedy_actions)
    ]
    grid = np.array(symbols).reshape((8, 8))
    print(grid, '\n')

In [15]:
def evaluate_policy_temporal_diccerence(gym_env,
                                        alpha = 0.5,
                                        discount_factor = 1.0,
                                        max_iter = 9999,
                                        seed = None):
    """Estimate V(s) of the uniform-random policy with one-step TD updates.

    The misspelled name ("diccerence") is kept so existing callers keep
    working.

    Args:
        gym_env: Environment exposing ``nS``, ``nA``, ``reset()`` returning a
            state index and ``step(action)`` returning
            ``(next_state, reward, done, info)``.
        alpha: Step size applied to the TD error.
        discount_factor: Discount applied to the value of the next state.
        max_iter: Number of episodes to run.
        seed: Optional seed for action sampling. When ``None`` (the default)
            actions are drawn from NumPy's global random state, as before;
            otherwise a private ``RandomState(seed)`` is used so repeated
            calls give identical results on a deterministic environment.

    Returns:
        np.ndarray of length ``gym_env.nS`` holding the estimated values.
    """
    print('     POLICY EVALUATION: TEMPORAL DIFFERENCE')
    n_states, n_actions = gym_env.nS, gym_env.nA

    # Init policy with equal prob for all actions
    policy = np.ones([n_states, n_actions]) / n_actions

    V = np.zeros(n_states)

    # Both objects expose the same ``choice`` API; the private generator keeps
    # a seeded run from disturbing (or depending on) the global random state.
    sampler = np.random if seed is None else np.random.RandomState(seed)

    for _ in range(max_iter):
        state = gym_env.reset()
        terminated = False

        while not terminated:
            action = sampler.choice(n_actions, p=policy[state])

            next_state, reward, terminated, _ = gym_env.step(action)

            # TD update: move V[state] a fraction alpha towards the
            # bootstrapped target r + gamma * V[next_state].
            td_target = reward + discount_factor * V[next_state]
            V[state] += alpha * (td_target - V[state])

            state = next_state
    return V

In [16]:
# Step size alpha = 1: the update V[s] += alpha * (target - V[s]) then reduces
# to V[s] = target, so each visit overwrites the estimate with the latest
# sampled TD target instead of averaging over visits.
V1 = evaluate_policy_temporal_diccerence(ENV, alpha = 1)
print_state_value_func(V1)

     POLICY EVALUATION: TEMPORAL DIFFERENCE
 V(s):
[[0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 1. 1. 0.]
 [0. 0. 1. 0. 1. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1. 0.]] 



In [17]:
# Step size alpha = 0.8: every update moves V[s] 80% of the way towards the
# sampled TD target, so the most recent samples still dominate the estimate.
V2 = evaluate_policy_temporal_diccerence(ENV, alpha = 0.8)
print_state_value_func(V2)

     POLICY EVALUATION: TEMPORAL DIFFERENCE
 V(s):
[[0.06 0.12 0.08 0.13 0.13 0.17 0.2  0.24]
 [0.07 0.08 0.09 0.09 0.17 0.18 0.2  0.23]
 [0.05 0.05 0.05 0.05 0.18 0.12 0.25 0.33]
 [0.04 0.06 0.05 0.09 0.02 0.12 0.48 0.45]
 [0.01 0.04 0.03 0.04 0.   0.26 0.52 0.63]
 [0.   0.   0.02 0.07 0.11 0.12 0.8  0.84]
 [0.   0.   0.01 0.05 0.19 0.5  0.96 0.98]
 [0.   0.   0.   0.   0.41 0.53 0.91 0.  ]] 



In [18]:
# Step size alpha = 0.5 (the function's default): every update moves V[s]
# halfway towards the sampled TD target.
V3 = evaluate_policy_temporal_diccerence(ENV, alpha = 0.5)
print_state_value_func(V3)

     POLICY EVALUATION: TEMPORAL DIFFERENCE
 V(s):
[[0.07 0.07 0.09 0.11 0.13 0.17 0.18 0.19]
 [0.06 0.06 0.07 0.1  0.13 0.14 0.19 0.2 ]
 [0.06 0.07 0.09 0.1  0.1  0.17 0.21 0.21]
 [0.04 0.05 0.06 0.09 0.07 0.16 0.3  0.31]
 [0.02 0.03 0.03 0.05 0.   0.27 0.41 0.42]
 [0.01 0.02 0.03 0.03 0.04 0.19 0.4  0.6 ]
 [0.   0.   0.01 0.02 0.1  0.18 0.29 0.58]
 [0.   0.   0.   0.   0.12 0.22 0.31 0.  ]] 



In [19]:
# Step size alpha = 0.3: every update moves V[s] only 30% of the way towards
# the sampled TD target, the smallest step size tried in this notebook.
V4 = evaluate_policy_temporal_diccerence(ENV, alpha = 0.3)
print_state_value_func(V4)

     POLICY EVALUATION: TEMPORAL DIFFERENCE
 V(s):
[[0.07 0.07 0.09 0.1  0.13 0.13 0.15 0.15]
 [0.06 0.07 0.08 0.1  0.11 0.14 0.15 0.15]
 [0.05 0.06 0.06 0.08 0.11 0.14 0.16 0.19]
 [0.05 0.05 0.06 0.07 0.06 0.12 0.15 0.2 ]
 [0.04 0.05 0.06 0.05 0.   0.15 0.22 0.28]
 [0.04 0.04 0.05 0.08 0.11 0.33 0.31 0.34]
 [0.01 0.   0.02 0.06 0.17 0.34 0.5  0.52]
 [0.   0.   0.01 0.   0.19 0.33 0.61 0.  ]] 

