In [0]:
import numpy as np
import gym

np.random.seed(0)

In [0]:
gamma = 0.99 # discount factor
max_epsilon_decay = 0.5 # maximum decay rate for the exploration rate 
epsilon0 = 1 # initial exploration rate
learning_rate0 = 1 
min_lr = 1e-03
n_episodes = 100000 # number of training episodes

In [3]:
env_name = "FrozenLake-v0"
env = gym.make(env_name)
env.reset()

0

In [0]:
Ns = env.observation_space.n
Na = env.action_space.n

Q = np.zeros((Ns, Na)) 

In [0]:
def choose_action(state, epsilon):
  u = np.random.rand()
  
  if u < epsilon:
    return np.random.choice(Q.shape[1])
    
  return Q[state, :].argmax()   

In [0]:
def update_q(state, new_state, reward, action, learning_rate, done):
  old = Q[state, action]
  if done:
    discounted_reward = reward
  else:
    discounted_reward = reward + gamma*max(Q[new_state, :])
  
  Q[state, action] = old + learning_rate*(discounted_reward - old)

In [0]:
def decay(episode):
  return epsilon0*(1 - episode/n_episodes*max_epsilon_decay)

def decay_lr(episode):
  return max(learning_rate0/(episode+1), min_lr)
  

In [0]:
def q_learning(show = True):
  for episode in range(n_episodes):
    state = env.reset()
    epsilon = decay(episode)
    
    if (episode%(n_episodes/10)==0):
      learning_rate = decay_lr(episode)
    
    if show and (episode%(n_episodes/10)==0):
      print("Episode : {} , epsilon : {}, learning_rate : {}".format(episode+1, epsilon, learning_rate))
    
    while True:
      action = choose_action(state, epsilon)
      
      new_state, reward, done, _ = env.step(action)
      
      update_q(state, new_state, reward, action, learning_rate, done)
      
      state = new_state
      
      if done:
        break
      
      
      

In [9]:
q_learning()

Episode : 1 , epsilon : 1.0, learning_rate : 1.0
Episode : 10001 , epsilon : 0.95, learning_rate : 0.001
Episode : 20001 , epsilon : 0.9, learning_rate : 0.001
Episode : 30001 , epsilon : 0.85, learning_rate : 0.001
Episode : 40001 , epsilon : 0.8, learning_rate : 0.001
Episode : 50001 , epsilon : 0.75, learning_rate : 0.001
Episode : 60001 , epsilon : 0.7, learning_rate : 0.001
Episode : 70001 , epsilon : 0.65, learning_rate : 0.001
Episode : 80001 , epsilon : 0.6, learning_rate : 0.001
Episode : 90001 , epsilon : 0.55, learning_rate : 0.001


In [0]:
def test(test_episodes = 1000):
  success = 0
  
  for episode in range(1, test_episodes+1):
    state = env.reset()
    total_reward = 0
    
    while True:
      action = Q[state, :].argmax()
      new_state, reward, done, _ = env.step(action)
      
      total_reward += reward
      
      if done:
        break
      
      state = new_state
    
    success += total_reward
    
  
  print("{} / {} successful episodes".format(success, test_episodes))

In [11]:
test()

762.0 / 1000 successful episodes


In [0]:
def test_and_render(episodes = 5):
  success = 0
  
  for episode in range(1, episodes+1):
    print('Episode {} \n'.format(episode))
    state = env.reset()
    total_reward = 0
    
    while True:
      env.render()
      action = Q[state, :].argmax()
      new_state, reward, done, _ = env.step(action)
      
      total_reward += reward
      
      if done:
        break
      
      state = new_state
    
    success += total_reward
    
  
  print("{} / {} successful episodes".format(success, episodes))

In [16]:
test_and_render(1)

Episode 1 


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
S