In [5]:
from pprint import pprint
import numpy as np
import gym
from gym.envs.toy_text import frozen_lake

In [6]:
# Custom 8x8 grid layout. 'H' cells are rendered as holes by print_policy;
# 'S' and 'G' presumably follow the FrozenLake start/goal convention and '-'
# stands for ordinary cells -- NOTE(review): confirm against the gym version in use.
MAP =  ['S-------',
        '--------',
        '---H----',
        '-----H--',
        '---H----',
        '-HH---H-',
        '-H--H-H-',
        '---H---G']

# Replace the library's '8x8' map entry *before* building the environment.
# (A previous version also called gym.make() ahead of this override; that
# instance was immediately overwritten below, so it has been dropped.)
frozen_lake.MAPS['8x8'] = MAP
ENV = gym.make('FrozenLake8x8-v0', is_slippery=False)

# Action index -> arrow glyph used when printing a policy.
ACTION_MAPPING = { 0: '←', 1: '↓', 2: '→', 3: '↑'}

In [46]:
def compute_state_value_func(gym_env,
                             discount_factor = 0.99,
                             eps = 1e-9,
                             max_iter = 9999) -> np.ndarray:
    """Estimate the optimal state-value function V(s) by value iteration.

    Each sweep visits every state, evaluates Q(s, a) for every action from the
    transition table ``gym_env.P[s][a]`` and stores ``max_a Q(s, a)`` back into
    ``V[s]``. Values are updated in place, so states visited later in a sweep
    already see the values refreshed earlier in the same sweep.

    Args:
        gym_env: Object exposing ``nS`` (number of states), ``nA`` (number of
            actions) and ``P[s][a]``, an iterable of
            ``(prob, next_state, reward, terminated)`` tuples.
        discount_factor: Multiplier applied to the value of the next state.
        eps: A sweep counts as converged when no state value moved by more
            than this amount.
        max_iter: Upper bound on the number of sweeps.

    Returns:
        Array of length ``nS`` holding the estimated value of every state.
        A message reporting convergence (or hitting ``max_iter``) is printed.
    """
    n_states, n_actions = gym_env.nS, gym_env.nA
    V = np.zeros(n_states)

    for sweep in range(1, max_iter + 1):
        any_change = False

        for s in range(n_states):
            # Expected return of each action, summed over its possible outcomes.
            action_values = np.zeros(n_actions)
            for a in range(n_actions):
                for p, s_next, r, _done in gym_env.P[s][a]:
                    action_values[a] += p * (r + discount_factor * V[s_next])

            previous = V[s]
            V[s] = np.max(action_values)

            # A single state moving by more than eps keeps the iteration going.
            if abs(previous - V[s]) > eps:
                any_change = True

        if not any_change:
            print(f'COMPUTING V(s): converged after {sweep} iterations\n')
            return V

    print(f'COMPUTING V(s): reached max number of iterations ({max_iter})\n')
    return V

In [47]:
def create_policy(gym_env,
                  V: np.ndarray,
                  discount_factor = 0.99) -> np.ndarray:
    """Build the greedy deterministic policy implied by a state-value function.

    For every state, Q(s, a) is computed from the transition table
    ``gym_env.P[s][a]`` and the given values ``V``; the action with the
    highest Q-value (lowest index on ties, as returned by ``np.argmax``) is
    selected.

    Args:
        gym_env: Object exposing ``nS``, ``nA`` and ``P[s][a]`` as an iterable
            of ``(prob, next_state, reward, terminated)`` tuples.
        V: State values indexed by state.
        discount_factor: Multiplier applied to the value of the next state.

    Returns:
        Array of shape ``(nS, nA)`` in which each row is one-hot: 1.0 at the
        chosen action and 0.0 elsewhere.
    """
    n_states, n_actions = gym_env.nS, gym_env.nA
    policy = np.zeros([n_states, n_actions])

    for s in range(n_states):
        # Expected return of each action, summed over its possible outcomes.
        action_values = np.zeros(n_actions)
        for a in range(n_actions):
            for p, s_next, r, _done in gym_env.P[s][a]:
                action_values[a] += p * (r + discount_factor * V[s_next])

        # Mark the greedy action; the rest of the row stays at zero.
        policy[s, np.argmax(action_values)] = 1.0

    return policy

In [48]:
def print_state_value_func(V, shape = (8, 8)):
    """Print the state values as a grid rounded to two decimals.

    Args:
        V: Flat sequence/array of state values; its length must match the
            number of cells implied by ``shape``.
        shape: Grid shape used for display. Defaults to ``(8, 8)``, the
            layout previously hard-coded here.
    """
    print(' V(s):')
    # np.round replaces the np.round_ alias, which no longer exists in NumPy 2.0.
    print(np.round(V, 2).reshape(shape), '\n')

def print_policy(policy: np.ndarray):
    """Print the policy as an 8x8 grid of arrows, with holes drawn as '□'.

    The action shown for a cell is the index of the largest entry in that
    state's row of ``policy``, translated through ``ACTION_MAPPING``. Cells
    marked 'H' in the module-level ``MAP`` are shown as '□' regardless of the
    policy.

    Args:
        policy: 2-D array with one row per state; the row's argmax is taken
            as the chosen action.
    """
    print(' POLICY: ')
    greedy_actions = np.argmax(policy, axis = 1)
    flat_map = ''.join(MAP)
    # One glyph per state, in row-major order of the map.
    glyphs = [
        '□' if flat_map[cell] == 'H' else ACTION_MAPPING[chosen]
        for cell, chosen in enumerate(greedy_actions)
    ]
    grid = np.array(glyphs).reshape((8, 8))
    print(grid, '\n')

In [50]:
# Estimate the state values for ENV with the function's default settings, then show them.
V = compute_state_value_func(ENV)
print_state_value_func(V)

# Derive the greedy policy from those values and show it as a grid of arrows.
policy = create_policy(ENV, V)
print_policy(policy)

COMPUTING V(s): converged after 15 iterations

 V(s):
[[0.88 0.89 0.9  0.9  0.91 0.92 0.93 0.94]
 [0.89 0.9  0.9  0.91 0.92 0.93 0.94 0.95]
 [0.9  0.9  0.91 0.   0.93 0.94 0.95 0.96]
 [0.9  0.91 0.92 0.93 0.94 0.   0.96 0.97]
 [0.9  0.9  0.91 0.   0.95 0.96 0.97 0.98]
 [0.89 0.   0.   0.95 0.96 0.97 0.   0.99]
 [0.9  0.   0.93 0.94 0.   0.98 0.   1.  ]
 [0.9  0.91 0.92 0.   0.98 0.99 1.   0.  ]] 

 POLICY: 
[['↓' '↓' '↓' '↓' '↓' '↓' '↓' '↓']
 ['↓' '↓' '↓' '→' '↓' '↓' '↓' '↓']
 ['↓' '↓' '↓' '□' '↓' '→' '↓' '↓']
 ['→' '→' '→' '→' '↓' '□' '↓' '↓']
 ['→' '→' '↑' '□' '↓' '↓' '→' '↓']
 ['↓' '□' '□' '→' '→' '↓' '□' '↓']
 ['↓' '□' '→' '↑' '□' '↓' '□' '↓']
 ['→' '→' '↑' '□' '→' '→' '→' '←']] 



In [51]:
def play(gym_env, policy, num_episodes):
    """Roll out a policy for a number of episodes and print summary statistics.

    In every step the action with the largest entry in ``policy[state]`` is
    taken. An episode counts as a win when its final step returns a reward of
    exactly 1.0. There is no step cap here: an episode runs until the
    environment reports that it has terminated.

    Args:
        gym_env: Environment whose ``reset()`` returns the initial state index
            and whose ``step(action)`` returns
            ``(next_state, reward, terminated, info)``.
        policy: Array indexed by state; each row's argmax is the action taken.
        num_episodes: Number of episodes to play. With 0 episodes the average
            reward is reported as 0.0 instead of raising ``ZeroDivisionError``.

    Returns:
        None. The episode count, number of wins and average reward are printed.
    """
    total_win, total_reward = 0, 0

    for _ in range(num_episodes):
        state = gym_env.reset()
        terminated = False
        while not terminated:
            action = np.argmax(policy[state])
            state, reward, terminated, _info = gym_env.step(action)
            total_reward += reward
            if terminated and reward == 1.0:
                total_win += 1

    # Guard the division so an empty run still produces a report.
    avg_reward = total_reward / num_episodes if num_episodes > 0 else 0.0

    print(f'NUM_EPISODES: {num_episodes}')
    print(f'   TOTAL_WIN: {total_win}')
    print(f'  AVG REWARD: {avg_reward}')

In [52]:
# Number of episodes to roll out when evaluating the policy.
NUM_EPISODES = 1000

# Run the policy in ENV and print the win count and average reward.
play(ENV, policy, NUM_EPISODES)

NUM_EPISODES: 1000
   TOTAL_WIN: 1000
  AVG REWARD: 1.0
