In [80]:
import numpy as np
import gym

Taxi-v2

This task was introduced in [Dietterich2000] to illustrate some issues in hierarchical reinforcement learning.
There are 4 locations (labeled by different letters) and your job is to pick up the passenger at one location and 
drop him off in another. You receive +20 points for a successful dropoff, and lose 1 point for every timestep it 
takes. There is also a 10 point penalty for illegal pick-up and drop-off actions.

<img src="Taxi-v2.jpeg">

In [190]:
# env = gym.make('Taxi-v2')
# env = gym.make('FrozenLake8x8-v0')
env = gym.make('FrozenLake-v0')

# State: 
num_states = env.observation_space.n
num_actions = env.action_space.n

# Init Q-Table
Q_table = np.zeros([num_states, num_actions])

learning_rate = 0.1
discount_factor = 0.9
epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.01

n_episodes = 50000

for episode in range(1, n_episodes + 1):
    state = env.reset()  
    done = False
    
    while (not done):    
        # Either explore or exploit (will exploit more increasingly)
        random_val = np.random.rand()
        if (random_val < epsilon):    
            action = env.action_space.sample()
        else:
            action = np.argmax(Q_table[state, :])
            
        # Perform the action
        next_state, reward, done, info = env.step(action)        
              
        # Maximum expected future reward one steps from the next state
        next_max = np.max(Q_table[next_state, :])
          
        # Update Q-Table
        Q_table[state, action] = Q_table[state, action] + learning_rate * (reward + discount_factor * next_max - Q_table[state, action])
        
        # Assign new state
        state = next_state
        
    # Reduce exploration
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * (1 - episode/n_episodes)  

In [191]:
average_reward = 0.0
average_epochs = 0.0
finish_cnt = 0
n_episodes = 1000

for episode in range(n_episodes + 1):
    state = env.reset()  
    epochs = 0
    total_reward = 0
    done = False
    
    while (not done): 
        action = np.argmax(Q_table[state, :])
        
        # Perform the action
        next_state, reward, done, info = env.step(action)
                 
        # Assign new state
        state = next_state
        
        epochs += 1
        total_reward += reward
        
        if (done):
            average_reward += total_reward
            average_epochs += epochs
            finish_cnt += 1

print("Average reward: %f, average actions taken: %f finish: %d" % (average_reward/finish_cnt, average_epochs/finish_cnt, finish_cnt))

Average reward: 0.715285, average actions taken: 69.714286 finish: 1001


In [183]:
print(Q_table)

[[4.52035237e-03 4.70296191e-03 6.09736695e-03 4.75517842e-03]
 [5.47048381e-03 6.29643334e-03 8.23650948e-03 6.00344311e-03]
 [7.96347253e-03 8.71237969e-03 9.24765669e-03 7.77873189e-03]
 [1.09761018e-02 1.23460747e-02 1.52560980e-02 1.25226926e-02]
 [1.55142478e-02 1.67716916e-02 2.24987691e-02 1.63991416e-02]
 [2.17418273e-02 2.06577770e-02 2.91763675e-02 2.16018805e-02]
 [2.79974610e-02 3.41058555e-02 2.57508664e-02 2.74073908e-02]
 [2.69446292e-02 3.17310063e-02 4.15629293e-02 2.96504718e-02]
 [3.59937354e-03 3.70231736e-03 4.13905104e-03 5.50781009e-03]
 [4.52915409e-03 4.62468613e-03 6.76266526e-03 5.55954502e-03]
 [6.83354526e-03 6.77284017e-03 6.81898286e-03 9.77669662e-03]
 [6.68137185e-03 9.28624574e-03 1.04958371e-02 1.37548119e-02]
 [1.55093686e-02 1.68393610e-02 2.56230989e-02 1.74540976e-02]
 [2.16301990e-02 3.60905904e-02 2.61742975e-02 2.15535161e-02]
 [3.17361681e-02 3.29947378e-02 4.28365552e-02 3.13438048e-02]
 [3.61984074e-02 3.92796103e-02 5.26774843e-02 3.569122