In [1]:
import gym
import numpy as np
from bisect import bisect

In [2]:
"""
    About FrozenLake: https://gym.openai.com/envs/FrozenLake-v0/

    SFFF       (S: starting point, safe)
    FHFH       (F: frozen surface, safe)
    FFFH       (H: hole, fall to your doom)
    HFFG       (G: goal, where the frisbee is located)
    
    The episode ends when you reach the goal or fall in a hole.
    You receive a reward of 1 if you reach the goal, and zero otherwise.
"""

# Build the FrozenLake environment and cache the sizes of its discrete
# action and observation spaces for the tabular code that follows.
env = gym.make('FrozenLake-v0')
n_a, n_s = env.action_space.n, env.observation_space.n

print("Nb_Actions: {}".format(n_a))
print("Nb_States: {}".format(n_s))

Nb_Actions: 4
Nb_States: 16


In [3]:
def epsilon_action(state, epsilon, Q):
    """Choose an action for ``state`` with an epsilon-greedy rule.

    One uniform draw from [0, 1) is compared against ``epsilon``. When the
    draw is smaller, the action comes from ``env.action_space.sample()``
    (``env`` is the module-level environment); otherwise the action is the
    argmax of row ``state`` of the table ``Q``.
    """
    explore = np.random.uniform(0, 1) < epsilon
    if explore:
        return env.action_space.sample()
    return np.argmax(Q[state, :])

def normal_action(state, i, Q):
    """Choose the greedy action for ``state`` after adding Gaussian noise to Q.

    Parameters
    ----------
    state : int
        Row index into ``Q``.
    i : int
        Episode counter; the noise is scaled by ``1 / (i + 1)``, so it
        shrinks as training progresses.
    Q : numpy.ndarray
        Two-dimensional table indexed as ``Q[state, action]``.

    Returns
    -------
    Index of the largest noisy Q-value in row ``state``.
    """
    # Size the noise from the table itself rather than from a module-level
    # action count, so the function works for any table width.
    nb_actions = Q.shape[1]
    noise = np.random.randn(1, nb_actions) * (1. / (i + 1))
    # The sum has shape (1, nb_actions); argmax over the flattened array is
    # therefore the column (action) index.
    return np.argmax(Q[state, :] + noise)


def softmax(Q,s,a):
    """Return the unnormalised softmax weight of action ``a`` in state ``s``.

    The result is ``exp(Q[s, a] / tau)``, where ``tau`` is the module-level
    temperature. No normalisation is applied here.
    """
    inverse_temperature = 1/tau
    return np.exp(inverse_temperature*Q[s,a])

def softmax_action(s,Q):
    """Sample an action for state ``s`` from a softmax (Boltzmann) policy.

    Action probabilities are proportional to ``exp(Q[s, a] / tau)`` over the
    first ``n_a`` actions, with ``tau`` and ``n_a`` read from module level.

    Side effect: row ``s`` of the module-level table ``P`` is overwritten
    with the computed probabilities.

    Returns
    -------
    int
        An action index in ``[0, n_a - 1]``.
    """
    q_row = Q[s, :n_a]
    # Subtract the row maximum before exponentiating: the normalised
    # probabilities are mathematically unchanged, but exp() cannot overflow.
    weights = np.exp((q_row - np.max(q_row)) / tau)
    P[s, :] = weights / weights.sum()
    cum_probas = np.cumsum(P[s, :])
    # Inverse-CDF sampling. Rounding can leave cum_probas[-1] a hair below
    # 1.0; a draw at or above it would make bisect return n_a, which is not a
    # valid action, so clamp to the last index.
    drawn = bisect(cum_probas, np.random.uniform())
    return min(drawn, len(cum_probas) - 1)

In [4]:
def sarsa(Q, state, action, new_state, new_action, reward):
    """Return the SARSA-updated value for ``Q[state, action]``.

    The temporal-difference target bootstraps from ``Q[new_state, new_action]``,
    i.e. from the action that was actually selected for the next step. The
    step size ``alpha`` and discount ``gamma`` are read from module level.
    ``Q`` itself is not modified; the caller stores the returned value.
    """
    current = Q[state, action]
    td_error = reward + gamma * Q[new_state, new_action] - current
    return current + alpha * td_error

def q_learning(Q, state, action, new_state, new_action, reward):
    """Return the Q-learning-updated value for ``Q[state, action]``.

    The temporal-difference target bootstraps from the largest entry of row
    ``new_state``; ``new_action`` is accepted but not used, which keeps the
    signature identical to ``sarsa``. The step size ``alpha`` and discount
    ``gamma`` are read from module level. ``Q`` itself is not modified; the
    caller stores the returned value.
    """
    current = Q[state, action]
    best_next = max(Q[new_state, :])
    td_error = reward + gamma * best_next - current
    return current + alpha * td_error

In [5]:
# --- Exploration settings ---------------------------------------------------
epsilon = 0.9  # e-greedy strategy: threshold compared against a uniform draw
tau = 0.9      # softmax strategy: temperature

# --- Learning settings ------------------------------------------------------
gamma = 0.99   # discount factor
alpha = 0.85   # learning rate

# --- Tables (one row per state, one column per action) ----------------------
# Q hosts the tabular estimate of the Q-function.
Q = np.zeros((n_s, n_a))
# P holds the action probabilities written by the softmax strategy.
P = np.zeros((n_s, n_a))

# --- Run length and reporting -----------------------------------------------
nb_episodes = 100000
print_freq = 5000

# Full (summed) reward of every episode, in order.
all_rewards = []

# Main training loop: one iteration per episode. Each episode is played with
# the currently selected exploration strategy while the Q-table is updated
# after every environment step.
for i in range(nb_episodes):
    state       = env.reset() # In this case, observation = state
    full_reward = 0
    timestep    = 0
    done        = False
    # First action of the episode. Swap in one of the commented alternatives
    # to try a different exploration strategy.
    #action      = normal_action(state, i, Q)
    #action      = epsilon_action(state, epsilon, Q)
    action      = softmax_action(state, Q)

    while not done:
        #env.render()
        new_state, reward, done, info = env.step(action)

        # Select the next action. This happens before the update because the
        # SARSA rule needs it as an input.
        #new_action = normal_action(new_state, i, Q)
        #new_action = epsilon_action(new_state, epsilon, Q)
        new_action = softmax_action(new_state, Q)

        # Update the tabular estimate of the Q-function
        #Q[state, action] = sarsa(Q, state, action, new_state, new_action, reward)
        Q[state, action] = q_learning(Q, state, action, new_state, new_action, reward)

        full_reward += reward
        timestep    += 1
        state       = new_state
        action      = new_action

    all_rewards.append(full_reward)
    #print("Episode {} finished after {} timesteps".format(i, timestep))
    # Compare by value (`!=`), not identity (`is not`): identity against an
    # int literal is implementation-dependent and raises a SyntaxWarning on
    # Python 3.8+.
    # Note: the reported rate is measured on the exploratory episodes played
    # during training, not on a purely greedy policy.
    if i % print_freq == 0 and i != 0:
        print("{:6d} | Success rate of the last {} episodes: {}".format(i, print_freq, np.mean(all_rewards[-print_freq:])))

  5000 | Success rate of the last 5000 episodes: 0.0168
 10000 | Success rate of the last 5000 episodes: 0.0166
 15000 | Success rate of the last 5000 episodes: 0.0174
 20000 | Success rate of the last 5000 episodes: 0.0172
 25000 | Success rate of the last 5000 episodes: 0.02
 30000 | Success rate of the last 5000 episodes: 0.0186
 35000 | Success rate of the last 5000 episodes: 0.0212
 40000 | Success rate of the last 5000 episodes: 0.0184
 45000 | Success rate of the last 5000 episodes: 0.0196
 50000 | Success rate of the last 5000 episodes: 0.0192
 55000 | Success rate of the last 5000 episodes: 0.017
 60000 | Success rate of the last 5000 episodes: 0.0166
 65000 | Success rate of the last 5000 episodes: 0.0196
 70000 | Success rate of the last 5000 episodes: 0.0224
 75000 | Success rate of the last 5000 episodes: 0.02
 80000 | Success rate of the last 5000 episodes: 0.0166
 85000 | Success rate of the last 5000 episodes: 0.0214
 90000 | Success rate of the last 5000 episodes: 0.01