In [1]:
import numpy as np
import gym

Taxi-v2

This task was introduced in [Dietterich2000] to illustrate some issues in hierarchical reinforcement learning.
There are 4 locations (labeled by different letters), and your job is to pick up the passenger at one location and 
drop them off at another. You receive +20 points for a successful drop-off and lose 1 point for every timestep it 
takes. There is also a 10-point penalty for illegal pick-up and drop-off actions.

<img src="Taxi-v2.jpeg">

In [59]:
# Tabular Q-learning on Taxi with an epsilon-greedy behaviour policy.
# Produces `Q_table`, a (num_states x num_actions) array of learned action values.
env = gym.make('Taxi-v2')

# Fix every source of randomness so that Restart & Run All reproduces the same Q-table.
SEED = 0
env.seed(SEED)                     # seeds the environment's own RNG (used by env.reset())
rng = np.random.RandomState(SEED)  # drives the exploration coin flip and the random actions

# Actions (by index): 0=South, 1=North, 2=East, 3=West, 4=PickUp, 5=DropOff
# State: a single integer index in [0, num_states)
num_states = env.observation_space.n
num_actions = env.action_space.n

# Init Q-Table: one row per state, one column per action, all values start at zero
Q_table = np.zeros([num_states, num_actions])

learning_rate = 0.1      # step size of each Q-value update
discount_factor = 0.9    # weight of the best next-state value in the update target
epsilon = 1.0            # current exploration probability (starts fully random)
max_epsilon = 1.0        # exploration probability at episode 0
min_epsilon = 0.01       # floor that epsilon decays towards
decay_rate  = 0.01       # exponential decay rate of epsilon per episode

n_episodes = 100000

# Run exactly n_episodes training episodes.
for episode in range(n_episodes):
    state = env.reset()
    done = False

    while not done:
        # Epsilon-greedy: explore with probability epsilon, otherwise act greedily.
        random_val = rng.rand()

        if random_val < epsilon:
            # Uniform random action index, drawn from the seeded generator.
            action = rng.randint(num_actions)
        else:
            action = np.argmax(Q_table[state])

        # Perform the action
        next_state, reward, done, info = env.step(action)

        # Best value obtainable from the next state. It is used even when `done`
        # is True; rows that are never updated keep their initial value of zero.
        next_max = np.max(Q_table[next_state])

        # Q-learning update: move Q(s, a) towards reward + discount * max_a' Q(s', a')
        td_target = reward + discount_factor * next_max
        Q_table[state, action] += learning_rate * (td_target - Q_table[state, action])

        # Assign new state
        state = next_state

    # Reduce exploration: exponential decay from max_epsilon towards min_epsilon
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(- decay_rate * episode)

In [60]:
# Show the learned Q-table: one row per state, one column per action.
# NumPy abbreviates the middle rows with "..." when printing a large array.
print(Q_table)

[[  0.           0.           0.           0.           0.
    0.        ]
 [ 47.46225639  65.02064408  49.95175889  65.01475303  90.442349
   54.91057046]
 [ 66.01222297  80.29993667  74.92028896  86.00444059 114.0029
   70.12415486]
 ...
 [ -1.13473865  -1.11237674  -1.12801184  78.11030105  -2.744029
   -1.94332301]
 [ -2.21904139  -2.21143642  -2.21409153  51.59672803  -5.4776448
   -4.8688699 ]
 [ 15.715845    -0.40951     -0.64353051 178.96185107  -6.14289511
   -4.69459   ]]


In [62]:
# Evaluate the learned policy: always take the highest-valued action in Q_table,
# with no exploration and no Q-table updates.
#
# NOTE: `average_reward` and `average_epochs` are running totals over all
# episodes; they are only turned into averages in the final print.
average_reward = 0.0
average_epochs = 0.0
finish_cnt = 0

# Run exactly n_episodes evaluation episodes.
for episode in range(n_episodes):
    state = env.reset()
    epochs = 0
    total_reward = 0
    done = False

    while not done:
        # Greedy action for the current state
        action = np.argmax(Q_table[state])

        # Perform the action
        next_state, reward, done, info = env.step(action)

        # Assign new state
        state = next_state

        epochs += 1
        total_reward += reward

    # The loop above only exits once `done` is True, so every episode is counted
    # here, whatever caused the environment to report `done`.
    average_reward += total_reward
    average_epochs += epochs
    finish_cnt += 1

if finish_cnt:
    print("Average reward: %f, average actions taken: %f" % (average_reward/finish_cnt, average_epochs/finish_cnt))
else:
    # Avoid dividing by zero when n_episodes is 0.
    print("No evaluation episodes were run.")

Average reward: 8.457935, average actions taken: 12.542065
