In [1]:
from pprint import pprint
import numpy as np
import gym
from gym.envs.toy_text import frozen_lake

In [2]:
ENV = gym.make('FrozenLake-v0')

MAP =  ['S-------',
        '--------',
        '--------',
        '--------',
        '----H---',
        '--------',
        '-H------',
        '---H---G']
frozen_lake.MAPS['8x8'] = MAP
ENV = gym.make('FrozenLake8x8-v0', is_slippery=False)

ACTION_MAPPING = { 0: '←', 1: '↓', 2: '→', 3: '↑'}

In [3]:
def print_state_value_func(V):
    print(' V(s):')
    print(np.round_(V, 2).reshape(8, 8), '\n')

In [4]:
def sample_episode(gym_env, policy):
    episode = list()
    state = gym_env.reset()
    terminated = False
    # Sample an episode
    while not terminated:
        action_list_prob = policy[state].tolist()
        action = np.random.choice(range(gym_env.nA), p=action_list_prob)

        next_state, reward, terminated, info = gym_env.step(action)
        episode.append((state, action, reward))
        state = next_state
    return episode

In [5]:
def evaluate_policy_using_Monte_Carlo(gym_env,
                                      MC_type = 'first_visit',
                                      discount_factor = 1.0,
                                      incremental = False,
                                      max_iter = 9999):
    # Init policy with equal prob for all actions
    policy = np.ones([gym_env.nS, gym_env.nA]) / gym_env.nA

    V = np.zeros(gym_env.nS)
    N = np.zeros_like(V)
    G = np.zeros_like(V)

    for _ in range(max_iter):
        episode = sample_episode(gym_env, policy)

        # Calulate G and N for Monte Carlo first visit
        if MC_type == 'first_visit':
            visited = np.zeros_like(V, dtype=int)
            for idx, (state, _, _) in enumerate(episode):
                if visited[state] == 1:
                    continue
                N[state] += 1
                for i, (_, _, reward) in enumerate(episode[idx:]):
                    G[state] += (discount_factor**i) * reward
                visited[state] == 1
        # Calulate G and N for Monte Carlo every visit
        elif MC_type == 'every_visit':
            for idx, (state, _, _) in enumerate(episode):
                N[state] += 1
                for i, (_, _, reward) in enumerate(episode[idx:]):
                    G[state] += (discount_factor**i) * reward
        else:
            assert False

    if incremental:
        alpha = np.divide(1, N, out = np.zeros_like(N), where = N!=0)
        V += alpha * (G - V)
    else:
        V = np.divide(G, N, out = np.zeros_like(N), where = N!=0)

    return V

In [6]:
V0 = evaluate_policy_using_Monte_Carlo(ENV,
                                       MC_type = 'first_visit',
                                       incremental = False)
print_state_value_func(V0)

 V(s):
[[0.07 0.07 0.08 0.08 0.1  0.11 0.13 0.14]
 [0.06 0.06 0.07 0.08 0.09 0.12 0.13 0.14]
 [0.06 0.06 0.06 0.07 0.08 0.12 0.14 0.16]
 [0.04 0.05 0.05 0.06 0.07 0.12 0.18 0.21]
 [0.03 0.04 0.04 0.04 0.   0.15 0.24 0.29]
 [0.02 0.02 0.04 0.06 0.1  0.23 0.34 0.44]
 [0.01 0.   0.03 0.06 0.16 0.3  0.46 0.65]
 [0.01 0.   0.01 0.   0.17 0.38 0.61 0.  ]] 



In [25]:
V1 = evaluate_policy_using_Monte_Carlo(ENV,
                                       MC_type = 'every_visit',
                                       incremental = False)
print_state_value_func(V1)

 V(s):
[[0.07 0.08 0.08 0.09 0.1  0.11 0.13 0.13]
 [0.06 0.07 0.07 0.08 0.1  0.12 0.14 0.15]
 [0.06 0.06 0.07 0.08 0.09 0.13 0.16 0.17]
 [0.04 0.05 0.05 0.06 0.07 0.12 0.18 0.21]
 [0.04 0.04 0.05 0.04 0.   0.15 0.23 0.3 ]
 [0.03 0.03 0.05 0.07 0.11 0.23 0.33 0.41]
 [0.01 0.   0.04 0.07 0.15 0.3  0.47 0.62]
 [0.02 0.02 0.02 0.   0.16 0.37 0.62 0.  ]] 



In [26]:
V2 = evaluate_policy_using_Monte_Carlo(ENV,
                                       MC_type = 'first_visit',
                                       incremental = True)
print_state_value_func(V2)

 V(s):
[[0.07 0.07 0.07 0.08 0.1  0.12 0.13 0.12]
 [0.06 0.07 0.07 0.08 0.1  0.12 0.13 0.14]
 [0.06 0.06 0.07 0.07 0.09 0.12 0.15 0.17]
 [0.05 0.05 0.05 0.06 0.07 0.12 0.18 0.21]
 [0.04 0.04 0.04 0.04 0.   0.14 0.23 0.28]
 [0.03 0.03 0.04 0.06 0.11 0.23 0.33 0.41]
 [0.01 0.   0.03 0.06 0.16 0.3  0.47 0.62]
 [0.01 0.   0.01 0.   0.16 0.35 0.62 0.  ]] 



In [27]:
V3 = evaluate_policy_using_Monte_Carlo(ENV,
                                       MC_type = 'every_visit',
                                       incremental = True)
print_state_value_func(V3)

 V(s):
[[0.06 0.06 0.07 0.08 0.09 0.11 0.12 0.12]
 [0.06 0.06 0.07 0.08 0.09 0.11 0.12 0.12]
 [0.05 0.05 0.06 0.07 0.09 0.11 0.14 0.15]
 [0.04 0.04 0.05 0.05 0.06 0.12 0.18 0.19]
 [0.03 0.03 0.04 0.04 0.   0.14 0.23 0.26]
 [0.02 0.02 0.04 0.06 0.11 0.21 0.33 0.38]
 [0.01 0.   0.03 0.06 0.15 0.29 0.47 0.63]
 [0.01 0.01 0.01 0.   0.16 0.36 0.62 0.  ]] 

