In [1]:
import numpy as np
import gym

In [13]:
# Custom 8x8 grid layout, one string per row.
# Presumably follows the FrozenLake convention: 'S' start, 'H' hole, 'G' goal,
# with '-' used here for ordinary tiles — TODO confirm against the gym version in use.
MAP =  [
    'S-------',
    '--------',
    '--------',
    '--------',
    '----H---',
    '--------',
    '-H------',
    '---H---G',
]
# Grid shape used when reshaping the flat value table for printing.
MAP_SIZE = (8, 8)
# All rows concatenated into a single flat string.
MAP_STRING = ''.join(MAP)
# Action index -> arrow glyph.
# NOTE(review): presumably matches FrozenLake's action ordering — confirm.
ACTION_MAPPING = {0: '←', 1: '↓', 2: '→', 3: '↑'}

# NOTE(review): this first environment is overwritten two lines below. It presumably
# exists only so that gym.envs.toy_text.frozen_lake is loaded before MAPS is patched —
# confirm before removing it.
ENV = gym.make('FrozenLake8x8-v0')
# Replace the '8x8' entry of gym's FrozenLake map table with the custom layout.
gym.envs.toy_text.frozen_lake.MAPS['8x8'] = MAP
# Re-create the environment after the patch; is_slippery=False presumably makes
# transitions deterministic — verify against the gym documentation.
ENV = gym.make('FrozenLake8x8-v0', is_slippery=False)

# Index ranges over the environment's states and actions.
ALL_STATE = range(ENV.nS)
ALL_ACTION = range(ENV.nA)

In [14]:
def print_state_value_func(V: np.ndarray, precision=3, shape=None):
    """Print a flat state-value table as a rounded 2-D grid.

    Parameters
    ----------
    V : array-like
        State values, one entry per state, in row-major grid order.
    precision : int, optional
        Number of decimal places to round to (default 3).
    shape : tuple of int, optional
        Grid shape to reshape ``V`` into. Defaults to the module-level
        ``MAP_SIZE`` when omitted, which keeps existing callers unchanged.

    Returns
    -------
    None
        The grid is written to stdout only.
    """
    if shape is None:
        # Resolved at call time so the default keeps tracking the notebook's map.
        shape = MAP_SIZE
    # np.round is used instead of the np.round_ alias, which NumPy deprecated
    # and removed in 2.0.
    rounded = np.round(np.asarray(V), precision).reshape(shape)
    print(' V(s):\n', rounded, '\n')

In [15]:
def temporal_difference_evaluation(env,
                                   alpha = 0.5,
                                   discount_factor = 1.0,
                                   max_iteration = 9999,
                                   verbose = True):
    """Estimate the state-value function of a uniform-random policy with TD(0).

    Parameters
    ----------
    env
        Environment exposing ``nS``, ``nA``, ``reset()`` returning the initial
        state index, and ``step(action)`` returning the 4-tuple
        ``(next_state, reward, done, info)``.
    alpha : float, optional
        Step size applied to the TD error (default 0.5).
    discount_factor : float, optional
        Discount applied to the bootstrapped next-state value (default 1.0).
    max_iteration : int, optional
        Number of episodes to run (default 9999).
    verbose : bool, optional
        When True (default) the final table is printed through
        ``print_state_value_func``; pass False to suppress the printout.

    Returns
    -------
    numpy.ndarray
        Array of length ``env.nS`` holding the estimated value of each state.
    """
    n_actions = env.nA

    # Uniform-random policy: every action equally likely in every state.
    policy = np.ones([env.nS, n_actions]) / n_actions

    V = np.zeros(env.nS)

    for _ in range(max_iteration):
        state = env.reset()
        terminated = False

        while not terminated:
            # Sample from this env's own action space rather than a module-level
            # range tied to a different environment object.
            action = np.random.choice(n_actions, p=policy[state])
            next_state, reward, terminated, info = env.step(action)

            # One-step bootstrap. A state that ends the episode is never the
            # source of an update, so its entry keeps its initial value of 0.
            TD_target = reward + discount_factor * V[next_state]
            V[state] += alpha * (TD_target - V[state])

            state = next_state

    if verbose:
        print_state_value_func(V)
    return V

In [16]:
# alpha = 1: every update overwrites V[state] with the latest TD target.
# NOTE(review): no random seed is set in the visible cells, so the printed grid varies per run.
V1 = temporal_difference_evaluation(ENV, alpha = 1)

 V(s):
 [[0. 0. 0. 0. 1. 1. 1. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1. 1.]
 [0. 0. 0. 0. 0. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 1. 0.]] 



In [17]:
# Same evaluation with step size 0.8; the function prints the resulting value grid.
V2 = temporal_difference_evaluation(ENV, alpha = 0.8)

 V(s):
 [[0.074 0.077 0.11  0.117 0.141 0.242 0.275 0.294]
 [0.037 0.08  0.092 0.092 0.216 0.196 0.177 0.364]
 [0.032 0.046 0.066 0.038 0.156 0.19  0.343 0.345]
 [0.026 0.026 0.027 0.031 0.008 0.04  0.111 0.314]
 [0.008 0.013 0.043 0.044 0.    0.012 0.123 0.481]
 [0.006 0.007 0.04  0.109 0.19  0.178 0.49  0.832]
 [0.001 0.    0.064 0.031 0.083 0.257 0.716 0.966]
 [0.001 0.    0.    0.    0.01  0.57  0.938 0.   ]] 



In [18]:
# Same evaluation with step size 0.5 (the function's default alpha, passed explicitly).
V3 = temporal_difference_evaluation(ENV, alpha = 0.5)

 V(s):
 [[0.061 0.068 0.068 0.071 0.078 0.089 0.123 0.129]
 [0.048 0.061 0.061 0.066 0.078 0.087 0.135 0.131]
 [0.044 0.055 0.055 0.076 0.082 0.14  0.152 0.195]
 [0.033 0.038 0.05  0.073 0.041 0.161 0.164 0.181]
 [0.012 0.026 0.036 0.073 0.    0.12  0.219 0.286]
 [0.004 0.004 0.018 0.077 0.098 0.33  0.347 0.488]
 [0.003 0.    0.038 0.078 0.301 0.312 0.369 0.75 ]
 [0.009 0.006 0.009 0.    0.023 0.221 0.637 0.   ]] 



In [19]:
# Same evaluation with step size 0.3; the function prints the resulting value grid.
V4 = temporal_difference_evaluation(ENV, alpha = 0.3)

 V(s):
 [[0.059 0.066 0.072 0.08  0.092 0.109 0.128 0.139]
 [0.057 0.064 0.067 0.088 0.09  0.1   0.144 0.155]
 [0.048 0.05  0.064 0.061 0.091 0.139 0.186 0.17 ]
 [0.04  0.04  0.045 0.05  0.058 0.081 0.227 0.219]
 [0.025 0.033 0.036 0.039 0.    0.127 0.183 0.301]
 [0.016 0.014 0.049 0.063 0.204 0.208 0.376 0.409]
 [0.006 0.    0.045 0.096 0.18  0.301 0.472 0.609]
 [0.005 0.002 0.008 0.    0.061 0.432 0.912 0.   ]] 

