In [1]:
from pprint import pprint
import numpy as np
import gym
from gym.envs.toy_text import frozen_lake

In [9]:
ENV = gym.make('FrozenLake-v0')

MAP =  ['S---',
        '----',
        '----',
        '---G']
frozen_lake.MAPS['4x4'] = MAP
ENV = gym.make('FrozenLake-v0', is_slippery=False)

ACTION_MAPPING = { 0: '←', 1: '↓', 2: '→', 3: '↑'}

In [10]:
def print_policy(policy: np.ndarray):
    print(' POLICY: ')
    temp_policy = np.argmax(policy, axis = 1)
    temp_map = ''.join(MAP)
    string_map = list()
    for idx, action in enumerate(temp_policy):
        if temp_map[idx] == 'H':
            string_map.append('□')
        else:
            string_map.append(ACTION_MAPPING[action])
    string_map = np.array(string_map).reshape((4, 4))
    print(string_map, '\n')

In [22]:
def policy_improvement_loop(gym_env,
                            alpha = 0.5,
                            discount_factor = 1.0,
                            max_iter = 9999):
    print('     POLICY ITERATION: TEMPORAL DIFFERENCE')
    # Init policy with equal prob for all actions
    policy = np.ones([gym_env.nS, gym_env.nA]) / gym_env.nA
    policy_greedy = np.copy(policy)

    Q = np.zeros((gym_env.nS, gym_env.nA))

    for k in range(1, max_iter + 1):
        eps_greedy = 1.0 / (k + 1)
        
        def choose_action(state, gym_env = gym_env, policy = policy):
            probs = policy[state].tolist()
            action = np.random.choice(range(gym_env.nA), p=probs)
            return action
        
        state = gym_env.reset()
        action = choose_action(state)
        terminated = False
        
        while not terminated:
            
            next_state, reward, terminated, _ = gym_env.step(action)
            next_action = choose_action(next_state)

            TD_target = reward + discount_factor * Q[state, next_action]
            Q[state, action] += alpha * (TD_target - Q[state, action])
            
            if np.random.rand() < eps_greedy:
                policy[state] = policy_greedy[state]

            elif Q[state].sum() != 0:
                best_action = np.argmax(Q[state])
                policy[state] = np.zeros(gym_env.nA)
                policy[state][best_action] = 1.0
            
            state, action = next_state, next_action
            
#             print(Q)
            
    print_policy(policy)
            
    return policy, Q

In [27]:
policy, Q = policy_improvement_loop(ENV,
                                    alpha = 0.5,
                                    max_iter = 999999)

     POLICY ITERATION: TEMPORAL DIFFERENCE
 POLICY: 
[['←' '←' '←' '←']
 ['←' '←' '←' '←']
 ['←' '←' '←' '↓']
 ['←' '←' '→' '←']] 



In [28]:
Q

array([[0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.76490778, 1.43836349, 0.89736167, 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.52846234, 0.25      , 1.15697224, 0.        ],
       [0.        , 0.        , 0.        , 0.        ]])