In [0]:
import gym
import numpy as np

In [0]:
# Gym environment id used to build the shared environment below.
env_name = "FrozenLake-v0"

# Shared environment instance. The functions defined in later cells take it as
# the default value of their `env` parameter (bound when each `def` executes).
env = gym.make(env_name)

In [0]:
def short_description(env = env, demo = True, demo_steps = 5):
  """Print the size of a discrete environment and optionally render a random rollout.

  Args:
    env: Gym-style environment whose `observation_space` and `action_space`
      both expose `.n`. Defaults to the notebook-level `env`.
    demo: When true, render `demo_steps` frames while taking random actions
      drawn from `env.action_space.sample()`.
    demo_steps: Number of render/step iterations performed by the demo.

  Returns:
    None. Everything is reported through `print` and `env.render()`.
  """
  # Query the spaces on the environment itself instead of on `env.env`:
  # an environment that is not wrapped need not have an `.env` attribute, and
  # this matches how q_learning_train reads the same values.
  n_states = env.observation_space.n
  n_actions = env.action_space.n

  # Report the id of the environment that was actually passed in. Fall back to
  # the notebook-level `env_name` only when the environment carries no spec id.
  spec = getattr(env, "spec", None)
  name = getattr(spec, "id", None)
  if name is None:
    name = env_name

  print("This environnement {} has {} possible states and {} possible actions".format(name, n_states, n_actions))

  if demo:
    env.reset()
    for _step in range(demo_steps):
      # Render before acting, so the first frame shows the start state.
      env.render()
      action = env.action_space.sample()
      # Only the termination flag is needed here; distinct throwaway names
      # avoid clobbering the loop variable.
      _obs, _reward, done, _info = env.step(action)

      if done:
        # Start a fresh episode so the remaining demo steps stay valid.
        env.reset()

In [17]:
# Print the state/action counts, then render the default 5-step random demo.
short_description(env)

This environnement FrozenLake-v0 has 16 possible states and 4 possible actions

[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Up)
S[41mF[0mFF
FHFH
FFFH
HFFG

[41mS[0mFFF
FHFH
FFFH
HFFG


In [0]:
def q_learning_train(env = env, n_episodes = 2, n_steps_per_episode = 5, gamma = 0.99, lr0 = 1, lambda_ = 0.05, min_eps = 0.01, pretrained_q_table = None):
  """Train a tabular Q-function with epsilon-greedy Q-learning.

  Args:
    env: Gym-style environment. `observation_space.n` and `action_space.n`
      size the table; `reset()` must return an observation and `step()` a
      4-tuple `(obs, reward, done, info)`.
    n_episodes: Number of training episodes.
    n_steps_per_episode: Maximum number of steps taken in one episode.
    gamma: Discount factor applied to the bootstrapped next-state value.
    lr0: Initial learning rate. Episode `k` (1-based) uses `min(1, lr0 / k)`.
    lambda_: Amount subtracted from epsilon per episode (linear decay from 1).
    min_eps: Lower bound for epsilon.
    pretrained_q_table: Optional array-like used as the starting table. It is
      copied, so the caller's object is never modified.

  Returns:
    A tuple `(q_table, total_rewards)`: the learned table as a float NumPy
    array and the list of undiscounted reward sums, one entry per episode.
  """
  epsilon0 = 1

  n_states = env.observation_space.n
  n_actions = env.action_space.n

  # Compare against None explicitly: the truth value of a multi-element NumPy
  # array is ambiguous and raises ValueError, which made warm starts impossible.
  if pretrained_q_table is not None:
    # np.array copies its input by default; float dtype matches np.zeros.
    q_table = np.array(pretrained_q_table, dtype=float)
  else:
    q_table = np.zeros((n_states, n_actions))

  total_rewards = []

  for episode in range(1, n_episodes + 1):
    total_reward = 0
    obs = env.reset()
    epsilon = max(epsilon0 - lambda_*episode, min_eps)
    # Cap the step size at 1. With lr0 > 1 the raw lr0/episode exceeds 1 during
    # the first episodes, which makes (1 - lr) negative and lets the update
    # overshoot its target instead of moving towards it.
    lr = min(1.0, lr0/episode)

    for _ in range(n_steps_per_episode):
      # Epsilon-greedy: explore with probability epsilon, otherwise act greedily.
      if np.random.rand() < epsilon:
        action = env.action_space.sample()
      else:
        action = q_table[obs, :].argmax()

      obs2, reward, done, _ = env.step(action)

      total_reward += reward

      if done:
        # Terminal transition: there is no next state to bootstrap from.
        target = reward
      else:
        target = reward + gamma * q_table[obs2, :].max()

      q_table[obs, action] = (1-lr)*q_table[obs, action] + lr*target

      if done:
        break

      obs = obs2

    total_rewards.append(total_reward)
    if episode % 10000 == 0:
      print("episode ", episode)

  return q_table, total_rewards
      
      

In [78]:
# Training configuration: one million episodes of at most 1000 steps each.
n_episodes = 1000000
n_steps_per_episode = 1000
# Initial learning rate; q_learning_train divides it by the 1-based episode index.
lr0 = 100

# Train from scratch (no pretrained table); lambda_ and min_eps keep their
# defaults. Progress is printed by q_learning_train every 10000 episodes.
q_table, total_rewards = q_learning_train(
    env = env, 
    n_episodes = n_episodes, 
    n_steps_per_episode = n_steps_per_episode, 
    gamma = 0.99, 
    lr0 = lr0, 
    pretrained_q_table = None
)

episode  10000
episode  20000
episode  30000
episode  40000
episode  50000
episode  60000
episode  70000
episode  80000
episode  90000
episode  100000
episode  110000
episode  120000
episode  130000
episode  140000
episode  150000
episode  160000
episode  170000
episode  180000
episode  190000
episode  200000
episode  210000
episode  220000
episode  230000
episode  240000
episode  250000
episode  260000
episode  270000
episode  280000
episode  290000
episode  300000
episode  310000
episode  320000
episode  330000
episode  340000
episode  350000
episode  360000
episode  370000
episode  380000
episode  390000
episode  400000
episode  410000
episode  420000
episode  430000
episode  440000
episode  450000
episode  460000
episode  470000
episode  480000
episode  490000
episode  500000
episode  510000
episode  520000
episode  530000
episode  540000
episode  550000
episode  560000
episode  570000
episode  580000
episode  590000
episode  600000
episode  610000
episode  620000
episode  630000
e

In [0]:
def test(env = env, q_table = q_table, n_episodes = 1, max_steps = 100):
  """Render greedy rollouts of a Q-table.

  Args:
    env: Gym-style environment with `reset()`, `render()` and a 4-tuple
      `step()`. Defaults to the notebook-level `env`.
    q_table: Array indexed as `q_table[obs, :]`; the action with the highest
      value is taken at every step. The default is the notebook-level
      `q_table` as it existed when this function was defined, so pass the
      table explicitly after retraining.
    n_episodes: Number of episodes to roll out (default 1).
    max_steps: Step cap per episode (default 100).

  Returns:
    None. The rollout is shown through `env.render()` and `print`.
  """
  for i in range(n_episodes):
    # Reset inside the loop so every episode starts from a fresh initial state
    # rather than continuing from the previous episode's terminal state.
    obs = env.reset()
    env.render()
    print("episode ", i)
    for _ in range(max_steps):
      obs, _reward, done, _info = env.step(q_table[obs, :].argmax())
      env.render()
      if done:
        break

In [80]:
# Roll out the greedy policy with the default arguments, rendering every step.
# Note: the default `q_table` is the one bound when `test` was defined.
test()


[41mS[0mFFF
FHFH
FFFH
HFFG
episode  0
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG
