In [1]:
import gym
import numpy as np
# Alternative setup: pass is_slippery=False to gym.make to get the
# non-slippery variant of the lake (kept here for quick switching).
# env = gym.make('FrozenLake-v0', is_slippery=False)
# Build the FrozenLake environment and wrap it so that every episode is
# limited to at most 20 steps.
env = gym.wrappers.TimeLimit(gym.make('FrozenLake-v0'), max_episode_steps=20)

# We define a function called generate_episode for generating episodes
def generate_episode(policy, env):
    """Roll out a single episode in ``env`` by sampling actions from ``policy``.

    Args:
        policy: indexable by state; ``policy[state]`` must be a sequence of
            action probabilities accepted by ``np.random.choice`` as ``p``.
        env: environment whose ``reset()`` returns the initial state and whose
            ``step(action)`` returns a ``(state, reward, done, info)`` 4-tuple.

    Returns:
        A ``(states, actions, rewards)`` tuple of equally long lists, where
        entry ``t`` holds the state the agent was in, the action it sampled
        there, and the reward received for that action.
    """
    visited, chosen, received = [], [], []
    current = env.reset()
    finished = False
    while not finished:
        # Record the state we are acting from (the final state returned by
        # step() after `done` is never appended).
        visited.append(current)
        # Sample an action index according to the policy's distribution
        # for the current state.
        action_probs = policy[current]
        pick = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        chosen.append(pick)
        # Apply the action; the loop ends once the environment reports done.
        current, payoff, finished, _ = env.step(pick)
        received.append(payoff)
    return visited, chosen, received


# Now that we have learned how to generate an episode, we will see how to perform Every-Visit MC Prediction
def every_visit_mc_prediction(env, random_policy, n_episodes, gamma=1.0):
    """Estimate state values of a policy with every-visit Monte Carlo prediction.

    Episodes are sampled with ``generate_episode``; for every occurrence of a
    state in an episode, the return that followed it is folded into that
    state's running average.

    Args:
        env: environment passed to ``generate_episode``; must expose ``nS``,
            which sizes the per-state arrays (states are used as indices).
        random_policy: policy passed to ``generate_episode`` (indexable by
            state, yielding action probabilities).
        n_episodes: number of episodes to sample.
        gamma: discount factor applied to future rewards. The default of
            ``1.0`` gives the undiscounted return.

    Returns:
        ``numpy.ndarray`` of length ``env.nS`` with the estimated value of
        each state; states that were never visited keep the value ``0``.
    """
    N = np.zeros(env.nS)  # number of visits per state
    V = np.zeros(env.nS)  # running mean of observed returns per state
    for _ in range(n_episodes):
        states, _actions, rewards = generate_episode(random_policy, env)
        G = 0.0
        # Walk the episode backwards so the return from each time step can be
        # accumulated incrementally: G_t = r_t + gamma * G_{t+1}.
        for S, reward in zip(reversed(states), reversed(rewards)):
            G = gamma * G + reward
            N[S] += 1
            # Incremental mean update; every visit counts (no first-visit check).
            V[S] += (G - V[S]) / N[S]
    return V

# Uniform random policy: in every state each action has probability 1 / nA.
random_policy = np.full((env.nS, env.nA), 1.0 / env.nA)
# Estimate the value of that policy from 50,000 sampled episodes.
V = every_visit_mc_prediction(env, random_policy, n_episodes=50000)
print("Value Function:", V, sep="\n")



Value Function:
[0.01003528 0.00785823 0.01555926 0.00453195 0.01257098 0.
 0.03470743 0.         0.02898615 0.0784689  0.12968967 0.
 0.         0.16025237 0.41722906 0.        ]
