In [27]:
import numpy as np
import gym

In [28]:
# Custom 8x8 grid, one string per row (row 0 first).
# NOTE(review): presumably 'S' = start, 'H' = hole, 'G' = goal as in gym's
# FrozenLake maps, and '-' stands for an ordinary walkable tile — confirm
# against the installed gym version's frozen_lake implementation.
MAP =  [
    'S-------',
    '--------',
    '---H----',
    '-----H--',
    '--------',
    '--H---H-',
    '-H----H-',
    '---H---G',
]
# (rows, cols) of the grid; hand-maintained, must be kept in sync with MAP.
MAP_SIZE = (8, 8)
# All rows concatenated into one 64-character string (row-major order).
MAP_STRING = ''.join(MAP)
# Action index -> arrow glyph for display.
# NOTE(review): presumably mirrors gym's LEFT/DOWN/RIGHT/UP numbering — verify.
ACTION_MAPPING = {0: '←', 1: '↓', 2: '→', 3: '↑'}

# The environment is created twice on purpose-or-accident: the first ENV is
# discarded as soon as the second gym.make() below rebinds the name.
# NOTE(review): presumably the first make() is only here so that the
# gym.envs.toy_text.frozen_lake module is loaded before its MAPS dict is
# patched on the next line — confirm before removing it.
ENV = gym.make('FrozenLake8x8-v0')
# Replace the built-in '8x8' layout with the custom MAP above.
gym.envs.toy_text.frozen_lake.MAPS['8x8'] = MAP
# Re-create the environment (now with is_slippery=False) after the override.
ENV = gym.make('FrozenLake8x8-v0', is_slippery=False)

# Index ranges over the environment's states and actions.
ALL_STATE = range(ENV.nS)
ALL_ACTION = range(ENV.nA)

In [29]:
def print_state_value_func(V: np.ndarray, precision=3, shape=None):
    """Print a flat state-value vector as a rounded 2-D grid.

    Args:
        V: 1-D array of state values, one entry per state.
        precision: number of decimals to round to before printing.
        shape: (rows, cols) to reshape ``V`` into. Defaults to the
            module-level ``MAP_SIZE`` when omitted, which is what the
            original implementation always used.

    Raises:
        ValueError: if ``V`` cannot be reshaped to ``shape`` (from NumPy).
    """
    if shape is None:
        shape = MAP_SIZE
    # np.round_ was a deprecated alias that NumPy 2.0 removed; np.round is
    # the supported spelling and behaves identically.
    rounded = np.round(V, precision).reshape(shape)
    print(' V(s):\n', rounded, '\n')

In [30]:
def sample_episode(env, policy):
    """Roll out one complete episode in ``env`` following ``policy``.

    Args:
        env: environment whose ``reset()`` returns the initial state and
            whose ``step(action)`` returns the 4-tuple
            ``(next_state, reward, terminated, info)``.
        policy: table indexable by state; ``policy[state]`` is the vector
            of action probabilities for that state.

    Returns:
        List of ``(state, action, reward)`` tuples in time order, where
        ``reward`` is the reward received after taking ``action`` in
        ``state``. The final state reached is not included.
    """
    episode = []
    state = env.reset()
    terminated = False
    while not terminated:
        action_probs = policy[state]
        # Size the action set from the policy row itself rather than from a
        # module-level range tied to one particular global environment, so
        # the function works for whatever env/policy pair it is handed.
        action = np.random.choice(len(action_probs), p=action_probs)
        next_state, reward, terminated, _info = env.step(action)
        episode.append((state, action, reward))
        state = next_state
    return episode

In [31]:
def monte_carlo_evaluation(env,
                           strategy = 'first_visit',
                           discount_factor = 1.0,
                           incremental = False,
                           max_iteration = 9999):
    """Monte Carlo prediction of V(s) for the uniform-random policy.

    Samples ``max_iteration`` episodes with ``sample_episode`` and averages
    the discounted returns observed from each state. The resulting table is
    printed with ``print_state_value_func`` and returned.

    Args:
        env: environment exposing ``nS`` (number of states) and ``nA``
            (number of actions), usable by ``sample_episode``.
        strategy: ``'first_visit'`` counts only the first occurrence of a
            state in each episode; ``'every_visit'`` counts all of them.
        discount_factor: gamma applied per time step when summing rewards.
        incremental: if True, keep a running mean
            ``V[s] += (G_t - V[s]) / N[s]`` updated at every counted visit;
            if False, accumulate return sums and divide by the visit counts
            once at the end. Both yield the sample mean of the returns.
        max_iteration: number of episodes to sample.

    Returns:
        1-D ``np.ndarray`` of length ``env.nS``; states never visited are 0.

    Raises:
        ValueError: if ``strategy`` is not one of the two supported names.
    """
    # Validate up front (and with a real exception, since asserts vanish
    # under ``python -O``) instead of failing after sampling an episode.
    if strategy not in ('first_visit', 'every_visit'):
        raise ValueError(
            "strategy must be 'first_visit' or 'every_visit', got %r"
            % (strategy,)
        )
    first_visit_only = strategy == 'first_visit'

    # Init policy with equal prob for all actions
    policy = np.ones([env.nS, env.nA]) / env.nA

    V = np.zeros(env.nS)
    N = np.zeros_like(V)  # number of counted visits per state
    G = np.zeros_like(V)  # summed returns per state (batch mode only)

    for _ in range(max_iteration):
        episode = sample_episode(env, policy)

        # Discounted return from every time step, computed in one backward
        # pass: returns[t] = r_t + gamma * returns[t + 1].
        returns = np.zeros(len(episode))
        running = 0.0
        for t in range(len(episode) - 1, -1, -1):
            running = episode[t][2] + discount_factor * running
            returns[t] = running

        seen = set()
        for t, (state, _action, _reward) in enumerate(episode):
            if first_visit_only:
                if state in seen:
                    continue
                # Actually record the visit; the previous code compared
                # (``==``) instead of assigning, so repeats were never skipped.
                seen.add(state)

            N[state] += 1
            if incremental:
                # Running-mean update with step size 1 / N(s).
                V[state] += (returns[t] - V[state]) / N[state]
            else:
                G[state] += returns[t]

    if not incremental:
        # Mean return per state; unvisited states (N == 0) stay at 0.
        V = np.divide(G, N, out = np.zeros_like(N), where = N!=0)

    print_state_value_func(V)
    return V

In [32]:
# Run 1: strategy='first_visit', incremental=False.
# The call prints the value table itself; the returned array is kept in V0.
V0 = monte_carlo_evaluation(
    ENV,
    strategy = 'first_visit',
    incremental = False
)

 V(s):
 [[0.004 0.005 0.004 0.006 0.01  0.01  0.01  0.017]
 [0.004 0.004 0.003 0.005 0.008 0.011 0.015 0.018]
 [0.004 0.003 0.002 0.    0.007 0.01  0.022 0.03 ]
 [0.002 0.003 0.005 0.004 0.007 0.    0.033 0.059]
 [0.001 0.002 0.003 0.004 0.011 0.013 0.021 0.06 ]
 [0.    0.    0.    0.006 0.021 0.029 0.    0.141]
 [0.    0.    0.    0.003 0.024 0.087 0.    0.381]
 [0.    0.    0.    0.    0.013 0.174 0.405 0.   ]] 



In [33]:
# Run 2: strategy='every_visit', incremental=False.
# The call prints the value table itself; the returned array is kept in V1.
V1 = monte_carlo_evaluation(
    ENV,
    strategy = 'every_visit',
    incremental = False
)

 V(s):
 [[0.003 0.002 0.002 0.003 0.007 0.008 0.014 0.017]
 [0.003 0.002 0.003 0.003 0.007 0.007 0.01  0.016]
 [0.002 0.002 0.003 0.    0.004 0.007 0.014 0.026]
 [0.003 0.002 0.002 0.004 0.002 0.    0.027 0.048]
 [0.005 0.002 0.003 0.007 0.009 0.02  0.034 0.073]
 [0.002 0.001 0.    0.004 0.017 0.035 0.    0.156]
 [0.002 0.    0.    0.007 0.016 0.037 0.    0.351]
 [0.    0.    0.    0.    0.044 0.103 0.433 0.   ]] 



In [34]:
# Run 3: strategy='first_visit', incremental=True.
# The call prints the value table itself; the returned array is kept in V2.
V2 = monte_carlo_evaluation(
    ENV,
    strategy = 'first_visit',
    incremental = True
)

 V(s):
 [[0.002 0.003 0.004 0.004 0.006 0.013 0.018 0.025]
 [0.002 0.003 0.003 0.004 0.006 0.011 0.017 0.021]
 [0.002 0.002 0.002 0.    0.003 0.006 0.018 0.028]
 [0.002 0.002 0.002 0.004 0.009 0.    0.015 0.03 ]
 [0.003 0.002 0.002 0.005 0.014 0.02  0.015 0.046]
 [0.001 0.001 0.    0.004 0.003 0.021 0.    0.115]
 [0.    0.    0.    0.003 0.019 0.038 0.    0.269]
 [0.    0.    0.    0.    0.03  0.056 0.149 0.   ]] 



In [35]:
# Run 4: strategy='every_visit', incremental=True.
# The call prints the value table itself; the returned array is kept in V3.
V3 = monte_carlo_evaluation(
    ENV,
    strategy = 'every_visit',
    incremental = True
)

 V(s):
 [[0.002 0.003 0.004 0.005 0.006 0.009 0.008 0.007]
 [0.002 0.002 0.003 0.005 0.005 0.007 0.01  0.01 ]
 [0.001 0.001 0.002 0.    0.003 0.008 0.017 0.023]
 [0.002 0.001 0.002 0.002 0.003 0.    0.02  0.038]
 [0.001 0.001 0.003 0.008 0.016 0.019 0.026 0.075]
 [0.    0.    0.    0.009 0.026 0.033 0.    0.134]
 [0.    0.    0.01  0.016 0.038 0.057 0.    0.295]
 [0.    0.    0.    0.    0.033 0.111 0.246 0.   ]] 

