In [5]:
import gym
import numpy as np
from collections import defaultdict


# Build the default FrozenLake task and cap every episode at 20 steps so a
# rollout always terminates quickly.
# For the non-slippery variant, construct the inner environment with
# gym.make('FrozenLake-v0', is_slippery=False) instead.
env = gym.wrappers.TimeLimit(
    gym.make('FrozenLake-v0'),
    max_episode_steps=20,
)

# We define a function called generate_episode for generating episodes
def generate_episode(policy, env):
    """Roll out one episode in ``env`` by sampling actions from ``policy``.

    Args:
        policy: Table indexed by observation; ``policy[observation]`` is the
            vector of action probabilities handed to ``np.random.choice``.
        env: Environment whose ``reset()`` returns the first observation and
            whose ``step(action)`` returns ``(observation, reward, done, info)``.

    Returns:
        A ``(states, actions, rewards)`` tuple of equally long lists. Entry
        ``t`` holds the observation the action was chosen in, the action
        taken, and the reward received for it. The observation returned by
        the final step is not recorded.
    """
    visited, chosen, collected = [], [], []
    current = env.reset()
    finished = False
    while not finished:
        # Record the state before acting so that index t lines up the state,
        # the action taken in it, and the reward that action produced.
        visited.append(current)
        action_probs = policy[current]
        move = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        chosen.append(move)
        # Advance the environment; the info dict is not needed here.
        current, gain, finished, _ = env.step(move)
        collected.append(gain)

    return visited, chosen, collected


# Now that we have learned how to generate an episode, we will see how to perform Every-Visit MC Prediction
def every_visit_mc_prediction(env, random_policy, n_episodes, gamma=1.0):
    """Estimate state values of a policy with every-visit Monte Carlo.

    Episodes are sampled with ``generate_episode``. For every time step of
    every episode (not only the first visit to a state) the return that
    follows is credited to the state occupied at that step, and the value of
    a state is the mean of all returns credited to it.

    Args:
        env: Environment passed through to ``generate_episode``;
            ``env.observation_space.n`` gives the number of states, and the
            observations are used directly as array indices.
        random_policy: Policy table passed through to ``generate_episode``.
        n_episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards. The default of
            ``1.0`` gives the undiscounted sum of rewards.

    Returns:
        A 1-D NumPy array with one value estimate per state. States that were
        never visited keep the value ``0``.
    """
    n_states = env.observation_space.n
    # Keep only a running sum and a visit count per state. This yields the
    # same mean as storing every return, without re-averaging an
    # ever-growing list on each step.
    return_sums = np.zeros(n_states)
    visit_counts = np.zeros(n_states)

    for _ in range(n_episodes):
        states, _actions, rewards = generate_episode(random_policy, env)
        G = 0.0
        # Walk the episode backwards so the return from step t is obtained
        # from the return of step t + 1 in constant time.
        for state, reward in zip(reversed(states), reversed(rewards)):
            G = gamma * G + reward
            return_sums[state] += G
            visit_counts[state] += 1

    V = np.zeros(n_states)
    # Divide only where a state was visited; unvisited entries stay at zero.
    np.divide(return_sums, visit_counts, out=V, where=visit_counts > 0)
    return V

# Uniform random policy: one row per state with equal probability for every
# action. The table is sized from the environment's observation and action
# spaces, the same source every_visit_mc_prediction uses for its value table.
random_policy = (
    np.ones([env.observation_space.n, env.action_space.n]) / env.action_space.n
)
V = every_visit_mc_prediction(env, random_policy, n_episodes=50000)

print("Value Function:")
print(V)



Value Function:
[0.01141642 0.00891901 0.01562387 0.00511604 0.01412992 0.
 0.03642758 0.         0.03283372 0.08811283 0.14005474 0.
 0.         0.16936488 0.44538835 0.        ]
