In [2]:
import numpy as np
import gym

In [3]:
# Custom 8x8 lake layout, one string per grid row.
# 'H' cells are the ones print_policy renders as '□'.
# NOTE(review): the letters presumably follow gym's FrozenLake convention
# (S = start, H = hole, G = goal), with '-' standing in for ordinary frozen
# tiles -- confirm against the gym version in use.
MAP =  [
    'S-------',
    '--------',
    '---H----',
    '-----H--',
    '---H----',
    '-HH---H-',
    '-H--H-H-',
    '---H---G',
]
# Grid shape used to reshape flat per-state arrays before printing.
# Hard-coded: must be kept in sync with MAP by hand.
MAP_SIZE = (8, 8)
# All rows concatenated, so a flat state index can be used to look up its tile.
MAP_STRING = ''.join(MAP)
# Glyph printed for each action index.
ACTION_MAPPING = {0: '←', 1: '↓', 2: '→', 3: '↑'}

# The three statements below are order-dependent.
# NOTE(review): this first environment is overwritten two lines later; it is
# presumably created only so that gym has loaded the frozen_lake module before
# its MAPS dict is patched -- TODO confirm, then consider replacing the hack.
ENV = gym.make('FrozenLake8x8-v0')
# Overwrite gym's '8x8' entry with the custom layout. This mutates library-wide
# state, so it affects every '8x8' environment created after this line.
gym.envs.toy_text.frozen_lake.MAPS['8x8'] = MAP
# Re-create the environment so that it is built after the MAPS override, with
# is_slippery=False (presumably deterministic moves -- verify in the gym docs).
ENV = gym.make('FrozenLake8x8-v0', is_slippery=False)

# Index ranges over the environment's states and actions (from ENV.nS / ENV.nA).
ALL_STATE = range(ENV.nS)
ALL_ACTION = range(ENV.nA)

In [4]:
def print_state_value_func(V: np.ndarray, precision=2):
    """Print the state values as a grid with the shape of the map.

    Args:
        V: Flat array with one value per state. It must contain exactly as
            many entries as the MAP_SIZE grid has cells, otherwise the
            reshape raises ValueError.
        precision: Number of decimal places the values are rounded to
            before printing.
    """
    # np.round_ was deprecated in NumPy 1.25 and removed in NumPy 2.0;
    # np.round is the supported name and rounds identically.
    rounded = np.round(V, precision).reshape(MAP_SIZE)
    print(' V(s):\n', rounded, '\n')


def print_policy(policy: np.ndarray):
    """Print the greedy action of every state as a grid of arrow glyphs.

    Args:
        policy: 2-D array indexed as [state, action]; the action with the
            largest entry in each row is the one displayed. Cells whose map
            tile is 'H' are shown as '□' instead of an arrow.
    """
    best_actions = np.argmax(policy, axis=1)
    # One glyph per state, in flat (row-major) state order.
    glyphs = [
        '□' if MAP_STRING[cell] == 'H' else ACTION_MAPPING[best]
        for cell, best in enumerate(best_actions)
    ]
    grid = np.array(glyphs).reshape(MAP_SIZE)
    print(' Policy:\n', grid, '\n')

In [5]:
def compute_state_value_func(env,
                             discount_factor = 0.95,
                             theta = 1e-9,
                             max_iteration = 9999) -> np.ndarray:
    """Estimate the optimal state values V(s) by in-place value iteration.

    Each sweep visits every state, computes the action values Q(s, a) from
    the transition model and overwrites V[s] with the best one. Values
    updated earlier in a sweep are already used by later states of the same
    sweep.

    Args:
        env: Environment exposing ``nS`` (number of states), ``nA`` (number
            of actions) and ``P[state][action]``, an iterable of
            ``(prob, next_state, reward, terminated)`` tuples.
        discount_factor: Weight applied to the value of the next state.
        theta: A sweep counts as converged when no state value changed by
            more than this amount.
        max_iteration: Upper bound on the number of sweeps. With a value
            <= 0 no sweep is run and an all-zero array is returned.

    Returns:
        Array of length ``env.nS`` holding the estimated value of each state.
    """
    # Take the index ranges from the env argument itself rather than from
    # module-level ranges built for one particular environment.
    n_states, n_actions = env.nS, env.nA
    V = np.zeros(n_states)

    # Pre-initialised so the summary line below is valid even if the loop
    # body never runs.
    iterations_run = 0
    for iterations_run in range(1, max_iteration + 1):
        is_converged = True
        for state in range(n_states):
            # Action values Q(s, a) for every action available in this state.
            Q = np.zeros(n_actions)
            for action in range(n_actions):
                # One action can lead to several possible next states.
                # NOTE(review): the termination flag is not consulted, so this
                # relies on terminal states keeping a value of 0 -- verify for
                # environments other than the one used in this notebook.
                for prob, next_state, reward, _terminated in env.P[state][action]:
                    Q[action] += prob * (reward + discount_factor * V[next_state])

            old_state_value = V[state]
            V[state] = np.max(Q)

            if abs(old_state_value - V[state]) > theta:
                is_converged = False

        if is_converged:
            break

    print(f'COMPUTING V(s): finished after {iterations_run} iterations\n')
    return V

In [6]:
def create_policy(env,
                  V: np.ndarray,
                  discount_factor = 0.9) -> np.ndarray:
    """Build the deterministic greedy policy for a given state-value table.

    Args:
        env: Environment exposing ``nS``, ``nA`` and ``P[state][action]``,
            an iterable of ``(prob, next_state, reward, terminated)`` tuples.
        V: State values, indexable by state.
        discount_factor: Weight applied to the value of the next state.
            NOTE(review): this default (0.9) is not the one used by
            compute_state_value_func (0.95); pass the value V was computed
            with if the two are meant to agree.

    Returns:
        Array of shape ``(env.nS, env.nA)`` with a single 1.0 per row marking
        the action with the highest Q(s, a); ties go to the lowest action
        index.
    """
    n_states, n_actions = env.nS, env.nA
    policy = np.zeros([n_states, n_actions])
    for s, action_probs in enumerate(policy):
        # Expected return of each action: sum over its possible outcomes.
        action_values = [
            sum(p * (r + discount_factor * V[s_next])
                for p, s_next, r, _done in env.P[s][a])
            for a in range(n_actions)
        ]
        action_probs[np.argmax(action_values)] = 1.0
    return policy

In [7]:
# Run value iteration on ENV with the function's default settings
# (discount_factor=0.95), then show the resulting values laid out as the map.
V = compute_state_value_func(ENV)
print_state_value_func(V)

COMPUTING V(s): finished after 15 iterations

 V(s):
 [[0.51 0.54 0.57 0.6  0.63 0.66 0.7  0.74]
 [0.54 0.57 0.6  0.63 0.66 0.7  0.74 0.77]
 [0.57 0.6  0.63 0.   0.7  0.74 0.77 0.81]
 [0.6  0.63 0.66 0.7  0.74 0.   0.81 0.86]
 [0.57 0.6  0.63 0.   0.77 0.81 0.86 0.9 ]
 [0.54 0.   0.   0.77 0.81 0.86 0.   0.95]
 [0.57 0.   0.7  0.74 0.   0.9  0.   1.  ]
 [0.6  0.63 0.66 0.   0.9  0.95 1.   0.  ]] 



In [8]:
# Extract the greedy policy using the same discount factor that produced V:
# compute_state_value_func was run with its default of 0.95, whereas
# create_policy would otherwise fall back to its own default of 0.9.
policy = create_policy(ENV, V, discount_factor=0.95)
print_policy(policy)

 Policy:
 [['↓' '↓' '↓' '↓' '↓' '↓' '↓' '↓']
 ['↓' '↓' '↓' '→' '↓' '↓' '↓' '↓']
 ['↓' '↓' '↓' '□' '↓' '→' '↓' '↓']
 ['→' '→' '→' '→' '↓' '□' '↓' '↓']
 ['→' '→' '↑' '□' '↓' '↓' '→' '↓']
 ['↓' '□' '□' '→' '→' '↓' '□' '↓']
 ['↓' '□' '→' '↑' '□' '↓' '□' '↓']
 ['→' '→' '↑' '□' '→' '→' '→' '←']] 

