In [80]:
import numpy as np
import gym

Taxi-v2

This task was introduced in [Dietterich2000] to illustrate some issues in hierarchical reinforcement learning.
There are 4 locations (labeled by different letters) and your job is to pick up the passenger at one location and 
drop him off in another. You receive +20 points for a successful dropoff, and lose 1 point for every timestep it 
takes. There is also a 10 point penalty for illegal pick-up and drop-off actions.

<img src="Taxi-v2.jpeg">

In [192]:
# env = gym.make('Taxi-v2')
# env = gym.make('FrozenLake8x8-v0')
env = gym.make('FrozenLake-v0')

# State: 
num_states = env.observation_space.n
num_actions = env.action_space.n

# Init Q-Table
Q_table = np.zeros([num_states, num_actions])

learning_rate = 0.1
discount_factor = 0.9
epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.01

n_episodes = 50000

for episode in range(1, n_episodes + 1):
    state = env.reset()  
    done = False
    
    while (not done):    
        # Either explore or exploit (will exploit more increasingly)
        random_val = np.random.rand()
        if (random_val < epsilon):    
            action = env.action_space.sample()
        else:
            action = np.argmax(Q_table[state, :])
            
        # Perform the action
        next_state, reward, done, info = env.step(action)        
              
        # Maximum expected future reward one steps from the next state
        next_max = np.max(Q_table[next_state, :])
          
        # Update Q-Table
        Q_table[state, action] = Q_table[state, action] + learning_rate * (reward + discount_factor * next_max - Q_table[state, action])
        
        # Assign new state
        state = next_state
        
    # Reduce exploration
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * (1 - episode/n_episodes)  

In [194]:
average_reward = 0.0
average_epochs = 0.0
finish_cnt = 0
n_episodes = 1000

for episode in range(n_episodes + 1):
    state = env.reset()  
    epochs = 0
    total_reward = 0
    done = False
    
    while (not done): 
        action = np.argmax(Q_table[state, :])
        
        # Perform the action
        next_state, reward, done, info = env.step(action)
                 
        # Assign new state
        state = next_state
        
        epochs += 1
        total_reward += reward
        
        if (done):
            average_reward += total_reward
            average_epochs += epochs
            finish_cnt += 1

print("Average reward: %f, average actions taken: %f finish: %d" % (average_reward/finish_cnt, average_epochs/finish_cnt, finish_cnt))

Average reward: 0.432567, average actions taken: 29.178821 finish: 1001
