## A simple implementation of TD0 and TD_lambda on OpenAI Gym FrozenLakeNotSlippery-v0

In [1]:
import gym, random
import time
from gym.envs.registration import register
register(
    id='FrozenLakeNotSlippery-v0',
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    kwargs={'map_name' : '4x4', 'is_slippery': False},
)

In [2]:
class TD0(object):
  Q = {}
  gamma = 0.5
  _alpha = 0.01
  _epsilon = 0.2
  
  def __init__(self):
    # init Q, FrozenLake environment contain a grid with 16 cells, 
    # and 4 actions to choose from 
    for i in xrange(16):
      self.Q[i] = [0.0] * 4

  def alpha(self):
    return self._alpha
  
  def epsilon(self):
    return self._epsilon

  def update_sarsa(self, s0, a0, r, s1, a1):
    if a1 != None:
      delta = r + self.gamma * self.Q[s1][a1] - self.Q[s0][a0]
    else:
      delta = r - self.Q[s0][a0]
    self.Q[s0][a0] += self.alpha() * delta
    
  def pick_action(self, state):
    if random.random() < self.epsilon():
      return random.randrange(0,4,1)
    else:
      return self.Q[state].index(max(self.Q[state]))
    
  def show_policy(self):
    action_name_map = ['left','down','right','up','none']
    for i in range(4):
      policy = ""
      for j in range(4):
        state = i*4+j
        action_index = self.Q[state].index(max(self.Q[state]))
        if self.Q[state] == [0.0]*4:
          action_index = 4
        policy += "%2d: %-10s" %(state, action_name_map[action_index])
      print policy
    
  def train(self, env, episode):
    for i in xrange(episode):
      s0 = env.reset()
      a0 = self.pick_action(s0)
      episode_ended = False
      while not episode_ended:
        (s1, reward, episode_ended, info) = env.step(a0)
        # uncomment the following 4 lines the agent should learn the shortest path
#         if reward <= 0.0:
#           reward = -0.1
#           if episode_ended:
#             reward = -0.3
        if not episode_ended:
          a1 = self.pick_action(s1)
        else:
          a1 = None
        self.update_sarsa(s0,a0,reward,s1,a1)
        s0 = s1
        a0 = a1

In [3]:
env = gym.make('FrozenLakeNotSlippery-v0')
agent = TD0()

In [4]:
for i in range(3):
  agent.train(env, 5000)
  agent.show_policy()
  print "=" * 50

 0: down       1: left       2: left       3: none      
 4: down       5: none       6: up         7: none      
 8: right      9: down      10: down      11: none      
12: none      13: right     14: right     15: none      
 0: down       1: left       2: left       3: left      
 4: down       5: none       6: up         7: none      
 8: right      9: down      10: down      11: none      
12: none      13: right     14: right     15: none      
 0: down       1: left       2: left       3: left      
 4: down       5: none       6: down       7: none      
 8: right      9: down      10: down      11: none      
12: none      13: right     14: right     15: none      


Remark: The Agent may not learn the shortest path as the cost / reward for taking a move is 0. So walking a longer route is having the same reward as walking a shorter path. To make the agent learn the shortest path, set reward to a small negative value (e.g. -0.1) and a bigger cost falling into a hole can make the agent prefer shorter path.

In [5]:
class TD_lambda(object):
  Q = {}
  _gamma = 0.5
  _lambda = 1.0
  _alpha = 0.01
  _epsilon = 0.2
  
  def __init__(self):
    # init Q, FrozenLake environment contain a grid with 16 cells, 
    # and 4 actions to choose from 
    for i in xrange(16):
      self.Q[i] = [0.0] * 4

  def alpha(self):
    # maybe a bit over kill to wrap alpha in a function,
    # but this make it easy if want to do alpha decay 
    return self._alpha
  
  def epsilon(self):
    # same as alpha, wrap in a function make it easier to do epsilon decay if needed
    return self._epsilon

  def update_eligibility(self):
    for key in self.eligibility:
      self.eligibility *= self._gamma * self._lambda

  def update_sarsa(self, s0, a0, r, s1, a1):
    if a1 != None:
      delta = r + self._gamma * self.Q[s1][a1] - self.Q[s0][a0]
    else:
      delta = r - self.Q[s0][a0]
    for (s,a) in self.eligibility:
      self.Q[s][a] += self.alpha() * delta * self.eligibility[s,a]
    
  def pick_action(self, state):
    if random.random() < self.epsilon():
      return random.randrange(0,4,1)
    else:
      return self.Q[state].index(max(self.Q[state]))
    
  def show_policy(self):
    action_name_map = ['left','down','right','up','none']
    for i in range(4):
      policy = ""
      for j in range(4):
        state = i*4+j
        action_index = self.Q[state].index(max(self.Q[state]))
        if self.Q[state] == [0.0]*4:
          action_index = 4
        policy += "%2d: %-10s" %(state, action_name_map[action_index])
      print policy
    
  def train(self, env, episode):
    for i in xrange(episode):
      s0 = env.reset()
      a0 = self.pick_action(s0)
      self.eligibility = {}
      episode_ended = False
      while not episode_ended:
        (s1, reward, episode_ended, info) = env.step(a0)
        if (s0,a0) in self.eligibility:
          self.eligibility[s0,a0] += 1
        else:
          self.eligibility[s0,a0] = 1
        if not episode_ended:
          a1 = self.pick_action(s1)
        else:
          a1 = None
        self.update_sarsa(s0,a0,reward,s1,a1)
        s0 = s1
        a0 = a1

In [6]:
env = gym.make('FrozenLakeNotSlippery-v0')
agent_lambda = TD_lambda()

In [7]:
for i in range(3):
  agent_lambda.train(env, 3000)
  agent_lambda.show_policy()
  print "=" * 50

 0: none       1: none       2: none       3: none      
 4: none       5: none       6: none       7: none      
 8: none       9: none      10: none      11: none      
12: none      13: none      14: none      15: none      
 0: right      1: right      2: down       3: down      
 4: down       5: none       6: down       7: none      
 8: right      9: right     10: down      11: none      
12: none      13: up        14: right     15: none      
 0: right      1: right      2: down       3: down      
 4: down       5: none       6: down       7: none      
 8: right      9: right     10: down      11: none      
12: none      13: up        14: right     15: none      


Somehow TD_lambda tend to converge with the shortest path. The reason maybe the longer route the agent walk, the more likely it fall into a hole before hitting goal state. So the shorter path would hit goal state more often and recevive more reward.