In [1]:
import gym
import random

In [2]:
# Build the Taxi-v3 environment and keep the object exposed by the wrapper's `.env` attribute.
# NOTE(review): presumably this unwraps gym's step-limit wrapper so episodes are not cut short — confirm.
streets = gym.make("Taxi-v3").env
# Draw the current grid (taxi, passenger and destination).
streets.render()

+---------+
|R: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+



In [3]:
# Taxi v3
# This task was introduced in [Dietterich2000] to illustrate some issues in hierarchical reinforcement learning. 

# There are 4 locations (labeled by different letters) 
# and your job is to pick up the passenger at one location and drop him off in another. 

# You receive +20 points for a successful dropoff, 
# and lose 1 point for every timestep it takes. 
# There is also a 10 point penalty for illegal pick-up and drop-off actions.

In [4]:
# letter colored in blue -> passenger location
# letter colored in magenta -> passenger's destination
# solid line -> wall that taxi can't cross
# filled rectangle -> empty taxi (yellow) / taxi with passenger (green)

In [5]:
# Total number of states in this world = 25 * 4 * 5 = 500

# Taxi location (5 rows * 5 columns = 25 positions)
# Current destination (one of the 4 lettered locations)
# Passenger location (5 options: waiting at one of the 4 lettered locations, or inside the taxi)

In [6]:
# At every state, there are 6 possible actions

# Actions are numbered from 0:
# 0-3: move south, north, east, west
# 4-5: pick up or drop off a passenger

In [7]:
# Initial state: taxi at row 2, column 3; passenger waiting at location 2 (Y);
# destination is location 0 (R). encode() packs these four values into one state id.
initial_state = streets.encode(2, 3, 2, 0)

# Put the environment directly into that state instead of a random one.
streets.s = initial_state

# Draw the grid so the chosen state can be checked visually.
streets.render()

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+



In [8]:
# Transition table for the initial state:
# {action: [(probability, next_state, reward, done)]}
streets.P[initial_state]

{0: [(1.0, 368, -1, False)],
 1: [(1.0, 168, -1, False)],
 2: [(1.0, 288, -1, False)],
 3: [(1.0, 248, -1, False)],
 4: [(1.0, 268, -10, False)],
 5: [(1.0, 268, -10, False)]}

In [14]:
# Number of distinct states in the environment (used as the row count of the Q-table).
streets.observation_space.n

500

In [9]:
# Number of actions available in every state (used as the column count of the Q-table).
streets.action_space.n

6

In [13]:
# Start a new episode; the returned integer is the id of the new starting state.
streets.reset()

461

In [15]:
# Train a Q-table holding one value per (state, action) pair: 500 states x 6 actions.

import numpy as np

q_table = np.zeros([streets.observation_space.n, streets.action_space.n])

learning_rate = 0.1     # how strongly each new estimate overrides the old value
discount_factor = 0.6   # weight given to the best value reachable from the next state
exploration = 0.1       # probability of trying a random action instead of the best known one
epochs = 10000          # number of training episodes
SEED = 42               # fixed seed so a fresh "Run All" reproduces the same table

# Seed every source of randomness used during training:
# - a private generator for the explore/exploit decision and the random action,
#   so the global `random` module state is left untouched;
# - the environment itself, which picks the random starting state on reset().
rng = random.Random(SEED)
streets.seed(SEED)

for taxi_run in range(epochs):
    state = streets.reset()
    done = False

    while not done:
        if rng.random() < exploration:
            # Explore: pick any of the available actions uniformly at random.
            action = rng.randrange(streets.action_space.n)
        else:
            # Exploit: use the action with the highest q-value for this state.
            action = np.argmax(q_table[state])

        next_state, reward, done, info = streets.step(action)

        prev_q = q_table[state, action]
        next_max_q = np.max(q_table[next_state])
        # Blend the old value with the new target (reward + discounted best next value).
        # Algebraically the same as:
        #   prev_q + learning_rate * (reward + discount_factor * next_max_q - prev_q)
        new_q = (1 - learning_rate) * prev_q + learning_rate * (reward + discount_factor * next_max_q)
        q_table[state, action] = new_q

        state = next_state

In [16]:
# Inspect the trained Q-table: one row per state, one column per action.
q_table

array([[  0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ],
       [ -2.38389781,  -2.33872065,  -2.39877567,  -2.35402867,
         -2.27325184, -10.41479709],
       [ -1.82538288,  -1.53277134,  -1.84006224,  -1.50561802,
         -0.7504    ,  -8.30517845],
       ...,
       [ -1.13794621,   0.27450256,  -1.12506805,  -1.152714  ,
         -1.96      ,  -1.96      ],
       [ -2.10052887,  -2.0893114 ,  -2.09733911,  -2.08765694,
         -4.386029  ,  -2.8816    ],
       [  0.06353497,   0.12182945,  -0.196     ,  10.83514968,
         -1.        ,  -1.        ]])

In [18]:
from IPython.display import clear_output
from time import sleep

# Replay a few trips using only the learned Q-table (no exploration, no learning).
# Playback settings are named here so they can be tuned in one place.
num_trips = 10         # how many trips to replay
max_trip_length = 25   # stop a trip after this many steps even if it has not finished
step_delay = .5        # seconds to pause after each rendered step
trip_delay = 2         # seconds to pause between trips

for tripnum in range(1, num_trips + 1):
    state = streets.reset()

    done = False
    trip_length = 0

    while not done and trip_length < max_trip_length:
        # Always take the best known action for the current state.
        action = np.argmax(q_table[state])
        next_state, reward, done, info = streets.step(action)
        # Redraw in place so the output cell shows an animation rather than a scroll of frames.
        clear_output(wait=True)
        print(f"Trip number {tripnum} Step {trip_length}")
        print(streets.render(mode="ansi"))
        sleep(step_delay)
        state = next_state
        trip_length += 1

    sleep(trip_delay)

Trip number 10 Step 17
+---------+
|R: | : :[35m[34;1m[43mG[0m[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

