# In [None]:
import gym
import matplotlib
import numpy as np
import random
import sys
from collections import defaultdict

# Shared FrozenLake environment, wrapped so that an episode is cut off after
# at most 30 steps.
# Alternative construction kept for reference:
#     gym.make('FrozenLake-v0', is_slippery=False)
env = gym.wrappers.TimeLimit(
    gym.make('FrozenLake-v0'),
    max_episode_steps=30,
)

def epsilon_greedy(state, Q, epsilon):
    """Choose an action for ``state`` with an epsilon-greedy rule.

    Args:
        state: Row index into ``Q``.
        Q: 2-D array of action values, indexed as ``Q[state, action]``.
        epsilon: Probability threshold for exploring; one uniform random
            number is drawn and compared against it on every call.

    Returns:
        A sample from the module-level ``env.action_space`` when the draw
        falls below ``epsilon``; otherwise the index of the largest entry
        in ``Q[state, :]`` (``np.argmax`` picks the first one on ties).
    """
    exploit = not (random.random() < epsilon)
    if exploit:
        return np.argmax(Q[state, :])
    # Exploration branch: relies on the global ``env``, not a parameter.
    return env.action_space.sample()

def generate_episode(Q, epsilon, env):
    """Run one episode in ``env`` following the epsilon-greedy policy on ``Q``.

    The environment is reset, then stepped until it reports ``done``.

    Args:
        Q: Action-value table handed to ``epsilon_greedy``.
        epsilon: Exploration threshold handed to ``epsilon_greedy``.
        env: Environment whose ``reset()`` returns the initial state and
            whose ``step(action)`` returns ``(state, reward, done, info)``.

    Returns:
        ``(states, actions, rewards)`` — three lists of equal length where
        ``actions[t]`` was taken in ``states[t]`` and yielded ``rewards[t]``.
        The state reached by the final step is not recorded.
    """
    states = []
    actions = []
    rewards = []

    current = env.reset()
    finished = False
    while not finished:
        chosen = epsilon_greedy(current, Q, epsilon)
        states.append(current)
        actions.append(chosen)
        current, reward, finished, _ = env.step(chosen)
        rewards.append(reward)

    return states, actions, rewards


# Now that we know how to generate an episode, we will see how to perform first-visit Monte Carlo control.

def first_visit_mc_control(env, epsilon, n_episodes, gamma=1.0):
    """Estimate action values with on-policy first-visit Monte Carlo control.

    Each episode is generated by ``generate_episode`` using the current
    ``Q`` table (so the behaviour policy improves as ``Q`` is updated).
    For every (state, action) pair, only the return following its first
    occurrence in an episode contributes to the estimate, and ``Q[s, a]``
    is the mean of all such returns seen so far.

    Args:
        env: Environment exposing ``observation_space.n`` and
            ``action_space.n``; also passed on to ``generate_episode``.
        epsilon: Exploration threshold passed on to ``generate_episode``.
        n_episodes: Number of episodes to sample.
        gamma: Discount factor applied when accumulating returns. The
            default of 1.0 gives the undiscounted sum of rewards.

    Returns:
        Array of shape ``(observation_space.n, action_space.n)`` holding
        the estimated action values.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    Q = np.zeros([n_states, n_actions])

    # Running sum and count of first-visit returns per (state, action).
    # Keeping these instead of every individual return gives constant
    # memory and constant-time mean updates.
    return_sums = np.zeros([n_states, n_actions])
    visit_counts = np.zeros([n_states, n_actions], dtype=int)

    for _ in range(n_episodes):
        states, actions, rewards = generate_episode(Q, epsilon, env)

        # Time step at which each (state, action) pair first occurs, built
        # once per episode so the first-visit test below is a dict lookup.
        first_visit = {}
        for t, pair in enumerate(zip(states, actions)):
            first_visit.setdefault(pair, t)

        # Walk the episode backwards so the return from step t is
        # available incrementally.
        returns = 0
        for t in range(len(states) - 1, -1, -1):
            returns = gamma * returns + rewards[t]
            pair = (states[t], actions[t])
            if first_visit[pair] == t:
                return_sums[pair] += returns
                visit_counts[pair] += 1
                Q[pair] = return_sums[pair] / visit_counts[pair]
    return Q

# Learn the action-value table with epsilon = 0.1 over 50000 episodes.
Q = first_visit_mc_control(env, 0.1, n_episodes=50000)

# Play a single episode acting greedily on the learned Q, rendering the
# environment before the first move and after every step.
state, done = env.reset(), False
env.render()
while not done:
    action = np.argmax(Q[state, :])
    new_state, reward, done, info = env.step(action)
    state = new_state
    env.render()

