In [2]:
import gym
import numpy as np


# Build the FrozenLake environment and wrap it in a TimeLimit configured
# with max_episode_steps=20.
# To try the non-slippery variant, create the inner environment with
# gym.make('FrozenLake-v0', is_slippery=False) instead.
env = gym.wrappers.TimeLimit(
    gym.make('FrozenLake-v0'),
    max_episode_steps=20,
)

# We define a function called generate_episode for generating episodes.
def generate_episode(policy, env):
    """Roll out one episode in ``env`` by sampling actions from ``policy``.

    Args:
        policy: Indexable by state; ``policy[state]`` is the sequence of
            action probabilities used to sample the next action.
        env: Environment whose ``reset()`` returns the initial state and
            whose ``step(action)`` returns ``(state, reward, done, info)``.

    Returns:
        A ``(states, actions, rewards)`` tuple of equal-length lists, where
        entry ``t`` holds the state the agent was in, the action it took
        there, and the reward received for that action. The state reached
        by the final step is not included in ``states``.
    """
    visited, chosen, collected = [], [], []
    current = env.reset()
    finished = False
    while not finished:
        # Record the state we are acting from before stepping away from it.
        visited.append(current)
        # Sample an action index according to the policy's distribution
        # for the current state.
        action_probs = policy[current]
        picked = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        chosen.append(picked)
        # Apply the action; the loop ends once the environment reports done.
        current, gained, finished, _ = env.step(picked)
        collected.append(gained)
    return visited, chosen, collected


# Now that we have learned how to generate an episode, we will see how to perform first-visit MC prediction.

def first_visit_mc_prediction(env,random_policy, n_episodes, gamma=1.0):
    """Estimate action values Q(s, a) of a policy by first-visit Monte Carlo.

    For every sampled episode, the return following the *first* occurrence
    of each (state, action) pair is folded into a running average for that
    pair.

    Args:
        env: Environment passed to ``generate_episode``; it must expose
            ``nS`` and ``nA``, which size the value table.
        random_policy: Policy to evaluate, forwarded to
            ``generate_episode`` unchanged.
        n_episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards. The default of
            1.0 gives the undiscounted return.

    Returns:
        Array of shape ``(env.nS, env.nA)`` with the estimated action
        values. Pairs that were never visited keep the value 0.
    """
    N = np.zeros([env.nS, env.nA])
    Q = np.zeros([env.nS, env.nA])
    for _ in range(n_episodes):
        states, actions, rewards = generate_episode(random_policy, env)

        # Earliest time step at which each (state, action) pair occurs.
        # Built once per episode so the first-visit test below is a dict
        # lookup instead of a rescan of the episode prefix at every step.
        first_visit = {}
        for t, pair in enumerate(zip(states, actions)):
            first_visit.setdefault(pair, t)

        # Walk the episode backwards so the return can be accumulated
        # incrementally: G_t = r_t + gamma * G_{t+1}.
        G = 0.0
        for t in range(len(states) - 1, -1, -1):
            S, A = states[t], actions[t]
            G = gamma * G + rewards[t]
            if first_visit[(S, A)] == t:
                # Incremental mean over all first-visit returns seen so far.
                N[S, A] += 1
                Q[S, A] += (G - Q[S, A]) / N[S, A]
    return Q

# Evaluate the uniform-random policy: every action has probability 1 / nA
# in every state.
random_policy = np.ones([env.nS, env.nA]) / env.nA
Q = first_visit_mc_prediction(env, random_policy, n_episodes=500000)

# State values are the policy-weighted sum of the action values,
# V(s) = sum_a pi(a|s) * Q(s, a), computed for all states at once.
V = (random_policy * Q).sum(axis=1)

print("Value Function:")
print(V)



Value Function:
[0.01186211 0.00882561 0.01644486 0.00659898 0.01379513 0.
 0.03565378 0.         0.03115629 0.08158187 0.13113699 0.
 0.         0.16144388 0.42000066 0.        ]
