In [2]:
import numpy as np
import gym
import random

#env = gym.make("FrozenLake-v0")


from gym.envs.registration import register

# Register a 4x4, slippery FrozenLake variant under its own id so the map and
# slipperiness are stated explicitly in this notebook.
ENV_ID = 'D4x4-FrozenLake-v0'

try:
    register(
        id=ENV_ID,
        entry_point='gym.envs.toy_text.frozen_lake:FrozenLakeEnv',
        kwargs={'map_name': '4x4',
                'is_slippery': True})
except gym.error.Error as exc:
    # Some gym releases refuse to register an id twice, which is exactly what
    # happens when this cell is re-run in a live kernel. Only that specific
    # failure is tolerated; any other registration error is re-raised.
    if 're-register' not in str(exc).lower():
        raise

env = gym.make(ENV_ID)

# Number of discrete actions / states; these give the Q-table its shape.
action_size = env.action_space.n
state_size = env.observation_space.n


In [10]:
# Tunable settings for the Q-learning run -- experiment freely.

# --- Episode budget ---
total_episodes = 15000   # number of training episodes
test_episodes = 10       # number of evaluation episodes
max_steps = 99           # hard cap on steps within a single episode

# --- Q-update coefficients ---
learning_rate = 0.8      # step size applied to the temporal-difference error
gamma = 0.96             # discount factor for future reward

# --- Epsilon-greedy exploration schedule ---
max_epsilon = 1.0        # exploration probability at the start of training
min_epsilon = 0.01       # floor the exploration probability decays towards
decay_rate = 0.005       # exponential decay rate of the exploration probability
epsilon = max_epsilon    # current exploration rate; starts at the maximum

In [11]:
# Train a tabular Q-learning agent with an epsilon-greedy behaviour policy.
#   Reads:  env, state_size, action_size, total_episodes, max_steps,
#           learning_rate, gamma, max_epsilon, min_epsilon, decay_rate
#   Writes: qtable (state x action values), rewards (return of each episode),
#           epsilon (left at its final decayed value)

# Fresh learning state. epsilon is reset together with the Q-table because it
# is decayed in place below: without the reset, re-running this cell would
# rebuild an empty table but explore at the previous run's decayed rate.
qtable = np.zeros((state_size, action_size))
rewards = []
epsilon = max_epsilon

for episode in range(total_episodes):
    state = env.reset()
    done = False
    total_rewards = 0

    for step in range(max_steps):
        # Epsilon-greedy choice: exploit the current estimates when the random
        # draw exceeds epsilon, otherwise sample a random action.
        if random.uniform(0, 1) > epsilon:
            action = np.argmax(qtable[state, :])
        else:
            action = env.action_space.sample()

        # Take this action and observe the outcome
        new_state, reward, done, info = env.step(action)

        # Q(s,a) <- Q(s,a) + lr * (r + gamma * max_a' Q(s',a') - Q(s,a))
        td_target = reward + gamma * np.max(qtable[new_state, :])
        qtable[state, action] += learning_rate * (td_target - qtable[state, action])

        total_rewards += reward
        state = new_state

        if done:
            break

    # Decay epsilon to reduce exploration as training progresses
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    rewards.append(total_rewards)

# Mean return over the episodes that were actually recorded
print("Score over time: " + str(sum(rewards) / len(rewards)))
print("Q values:")
print(qtable)

Score over time: 0.4966
Q values:
[[9.65737409e-02 3.22557690e-01 6.93902749e-02 9.59346536e-02]
 [1.52921286e-02 2.20054989e-02 1.70157744e-02 4.07446322e-01]
 [1.58797737e-01 6.94436986e-03 1.24080751e-02 3.68705360e-02]
 [1.80239093e-02 1.14922141e-02 6.60136885e-03 3.56684093e-02]
 [2.48810874e-01 1.00414983e-01 3.31053706e-02 4.66440728e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.89760187e-02 1.83282382e-04 7.33043537e-05 1.19771921e-11]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [9.58670517e-02 6.34254231e-02 8.01402736e-02 2.87894513e-01]
 [2.75347551e-01 3.52809422e-01 1.40586510e-02 6.78571663e-04]
 [6.48972709e-01 6.04796469e-03 6.29582549e-03 4.02007587e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [5.83617836e-01 5.53183917e-03 4.09442058e-01 1.06835130e-01]
 [1.44304387e-01 4.89107281e-01 9.05205280e-02 2.02686102e-01]
 [0.00000000e+00 0.00

In [5]:
########################################################################
#################### Final policy animation ############################
########################################################################
# Roll out the greedy policy (argmax over the learned Q-table, no exploration)
# for test_episodes episodes. Only the terminal frame of each episode is
# rendered, followed by the number of actions it took to get there.

print("We only print the last state in each episode, to see if our agent has reached the destination or fallen into a hole")

for episode in range(test_episodes):
    # Every episode starts from its own reset, so no extra reset is needed
    # before the loop.
    state = env.reset()
    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):
        # Always take the highest-valued action for the current state
        action = np.argmax(qtable[state, :])

        new_state, reward, done, info = env.step(action)

        if done:
            env.render()

            # step is the 0-based loop index, so the action count is step + 1
            print("Number of steps", step + 1)
            break
        state = new_state
    else:
        # The step cap was reached without the environment reporting done;
        # say so instead of leaving the episode without any result line.
        print("Episode not finished after", max_steps, "steps")
env.close()

We only print the last state in each episode, to see if our agent has reached the destination or fallen into a hole
****************************************************
EPISODE  0
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 12
****************************************************
EPISODE  1
  (Up)
SFFF
FHF[41mH[0m
FFFH
HFFG
Number of steps 30
****************************************************
EPISODE  2
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
Number of steps 36
****************************************************
EPISODE  3
  (Up)
SFFF
FHF[41mH[0m
FFFH
HFFG
Number of steps 20
****************************************************
EPISODE  4
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 69
****************************************************
EPISODE  5
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 73
****************************************************
EPISODE  6
  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG
Number of steps 83
******************************************