# In [6]:
import gym
import numpy as np
from collections import defaultdict


# Create the FrozenLake-v0 environment with its default settings.
env = gym.make('FrozenLake-v0')
# Alternative kept for experimentation: create it with is_slippery=False instead.
# env = gym.make('FrozenLake-v0', is_slippery=False)
# Wrap the environment in a TimeLimit with max_episode_steps=20
# (presumably so every episode is cut off after at most 20 steps).
env = gym.wrappers.TimeLimit(env, max_episode_steps = 20)

# We define a function called generate_episode for generating episodes
def generate_episode(policy, env):
    """Roll out one episode in ``env``, sampling every action from ``policy``.

    Args:
        policy: Table indexed by observation; ``policy[obs]`` holds one
            probability per action and is passed to ``np.random.choice``
            as ``p``.
        env: Environment whose ``reset()`` returns the first observation and
            whose ``step(action)`` returns
            ``(observation, reward, done, info)``.

    Returns:
        A ``(states, actions, rewards)`` tuple of equal-length lists. Entry
        ``t`` holds the observation the action was chosen in, the action
        taken, and the reward received for that action.
    """
    visited, chosen, collected = [], [], []
    current = env.reset()
    finished = False
    while not finished:
        # Record the observation we are about to act in.
        visited.append(current)
        # Sample an action index according to the policy row for it.
        action_probs = policy[current]
        candidates = np.arange(len(action_probs))
        move = np.random.choice(candidates, p=action_probs)
        chosen.append(move)
        # Apply the action. The observation returned by the final step is
        # never appended, so the three lists stay the same length.
        current, gain, finished, _ = env.step(move)
        collected.append(gain)
    return visited, chosen, collected


#  Now that we have learned how to generate an episode, we will see how to perform Every-Visit MC Prediction
def every_visit_mc_prediction(env, random_policy, n_episodes, gamma=1.0):
    """Estimate the action values of a policy with every-visit Monte Carlo.

    Episodes are generated with ``generate_episode`` (defined in this file).
    For every occurrence of a state-action pair in an episode, the return
    that followed it is averaged into ``Q[state, action]``.

    Args:
        env: Environment exposing ``observation_space.n`` and
            ``action_space.n``; it is passed on to ``generate_episode``.
        random_policy: Policy table handed to ``generate_episode`` to pick
            actions.
        n_episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards. The default of
            ``1.0`` gives the plain undiscounted sum of rewards.

    Returns:
        Array of shape ``(n_states, n_actions)`` holding the mean observed
        return per state-action pair. Pairs that were never visited keep
        the initial value 0.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    Q = np.zeros([n_states, n_actions])
    # How many returns have been averaged into each Q[s, a]. Keeping a count
    # lets the mean be updated in O(1) per step instead of storing every
    # return and re-averaging the whole list each time.
    visits = np.zeros([n_states, n_actions], dtype=int)
    for _ in range(n_episodes):
        states, actions, rewards = generate_episode(random_policy, env)
        returns = 0.0
        # Walk the episode backwards so the return from step t can be built
        # from the return from step t + 1.
        for S, A, R in zip(reversed(states), reversed(actions), reversed(rewards)):
            returns = gamma * returns + R
            visits[S, A] += 1
            # Incremental mean: new_mean = old_mean + (x - old_mean) / n.
            Q[S, A] += (returns - Q[S, A]) / visits[S, A]
    return Q

# Read the state/action counts from the environment's spaces, the same way
# every_visit_mc_prediction does, instead of the nS/nA attributes.
n_states = env.observation_space.n
n_actions = env.action_space.n

# Uniform random policy: every action is equally likely in every state.
random_policy = np.ones([n_states, n_actions]) / n_actions
Q = every_visit_mc_prediction(env, random_policy, n_episodes = 50000)

# State value under the policy: V[s] = sum over a of policy[s, a] * Q[s, a].
V = np.sum(random_policy * Q, axis=1)
print("Value Function:")
print(V)



# Output from one run of the code above:
# Value Function:
# [0.00972861 0.00701211 0.01062405 0.00180085 0.01192766 0.
#  0.02715052 0.         0.02835433 0.07497612 0.12510117 0.
#  0.         0.17469534 0.41720763 0.        ]
