In [7]:
import gym
import numpy as np
import sys
from collections import defaultdict
# Build the FrozenLake-v0 environment and cap every episode at 20 steps.
# The commented-out line below is the alternative construction with
# is_slippery=False.
# env = gym.make('FrozenLake-v0', is_slippery=False)
env = gym.wrappers.TimeLimit(
    gym.make('FrozenLake-v0'),
    max_episode_steps=20,
)

def generate_episode(policy, env):
    """Roll out a single episode in ``env`` by sampling actions from ``policy``.

    Args:
        policy: Indexable by observation; ``policy[obs]`` yields the action
            probabilities used to sample the next action index.
        env: Environment whose ``reset()`` returns the first observation and
            whose ``step(action)`` returns the 4-tuple
            ``(observation, reward, done, info)``.

    Returns:
        A tuple ``(states, actions, rewards)`` of equally long lists, where
        entry ``t`` holds the observation seen, the action sampled and the
        reward received at step ``t``. The observation returned by the final
        ``step`` call is not recorded.
    """
    visited, chosen, collected = [], [], []
    current = env.reset()
    finished = False
    while not finished:
        action_probs = policy[current]
        choice = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        visited.append(current)
        chosen.append(choice)
        # The info dict returned by the environment is not used.
        current, payoff, finished, _ = env.step(choice)
        collected.append(payoff)
    return visited, chosen, collected


def first_visit_mc_prediction(env, random_policy, n_episodes, gamma=1.0):
    """Estimate action values Q(s, a) by first-visit Monte Carlo prediction.

    For every sampled episode the return following the *first* occurrence of
    each (state, action) pair is recorded; Q(s, a) is the mean of the returns
    recorded for that pair over all episodes.

    Args:
        env: Environment exposing ``observation_space.n`` and
            ``action_space.n``; it is handed to ``generate_episode``.
        random_policy: Policy handed unchanged to ``generate_episode``.
        n_episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards. The default of 1.0
            gives the undiscounted return.

    Returns:
        Array of shape ``(env.observation_space.n, env.action_space.n)`` with
        the estimated action values; pairs that were never visited stay 0.
    """
    Q = np.zeros([env.observation_space.n, env.action_space.n])
    # Running sum / count of first-visit returns per pair, so the mean does
    # not have to be recomputed from a growing list on every visit.
    return_sums = np.zeros_like(Q)
    visit_counts = np.zeros_like(Q)

    for _ in range(n_episodes):
        states, actions, rewards = generate_episode(random_policy, env)

        # Time step at which each (state, action) pair first occurs; built
        # once per episode instead of rescanning the prefix at every step.
        first_visit = {}
        for t, pair in enumerate(zip(states, actions)):
            first_visit.setdefault(pair, t)

        # Walk the episode backwards so the return accumulates incrementally.
        G = 0
        for t in range(len(states) - 1, -1, -1):
            G = gamma * G + rewards[t]
            pair = (states[t], actions[t])
            if first_visit[pair] == t:
                return_sums[pair] += G
                visit_counts[pair] += 1

    # Mean return per visited pair; unvisited entries keep their initial 0.
    np.divide(return_sums, visit_counts, out=Q, where=visit_counts > 0)
    return Q

# Evaluate the uniform-random policy: every action is equally likely in every
# state. Sizes are read from the spaces, the same attributes that
# first_visit_mc_prediction uses to shape Q.
n_states = env.observation_space.n
n_actions = env.action_space.n
random_policy = np.ones([n_states, n_actions]) / n_actions
Q = first_visit_mc_prediction(env, random_policy, n_episodes=50000)

# State values as the policy-weighted expectation of the action values:
# V(s) = sum_a pi(a | s) * Q(s, a)
V = (random_policy * Q).sum(axis=1)

print("Value Function:")
print(V)



Value Function:
[0.01025502 0.00889055 0.01683126 0.00571904 0.0131423  0.
 0.0352736  0.         0.03069913 0.07874477 0.13308878 0.
 0.         0.17770255 0.42998111 0.        ]
