In [2]:
import gym
env = gym.make('Taxi-v3')

In [5]:
env.render()

AttributeError: 'TaxiEnv' object has no attribute 's'

## Environment

- The filled square represents the taxi, which is yellow without a passenger and green with a passenger.
- The pipe ("|") represents a wall which the taxi cannot cross.
- R, G, Y, B are the possible pickup and destination locations. The blue letter represents the current passenger pick-up location, and the purple letter is the current destination.


## States / observation space
You'll also notice there are four (4) locations that we can pick up and drop off a passenger: R, G, Y, B or [(0,0), (0,4), (4,0), (4,3)] in (row, col) coordinates. Our illustrated passenger is in location Y and they wish to go to location R.

When we also account for one (1) additional passenger state of being inside the taxi, we can take all combinations of passenger locations and destination locations to come to a total number of states for our taxi environment; there's four (4) destinations and five (4 + 1) passenger locations.

So, our taxi environment has 5×5×5×4=500 total possible states.

## Action space

The agent encounters one of the 500 states and it takes an action. The action in our case can be to move in a direction or decide to pickup/dropoff a passenger.

In other words, we have six possible actions:

0 = south
1 = north
2 = east
3 = west
4 = pickup
5 = dropoff

This is the action space: the set of all the actions that our agent can take in a given state.

You'll notice in the illustration above, that the taxi cannot perform certain actions in certain states due to walls. In environment's code, we will simply provide a -1 penalty for every wall hit and the taxi won't move anywhere. This will just rack up penalties causing the taxi to consider going around the wall.

In [3]:
print('# states: ', env.observation_space.n)
print('# actions: ', env.action_space.n)

# states:  500
# actions:  6


In [4]:
env.render()

+---------+
|[34;1mR[0m: | :[43m [0m:[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+



In [5]:
epochs = 0
penalties, reward = 0, 0
env.reset()

frames = [] # for rendering
done = False

while not done:
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)
    
    if reward == -10:
        penalties += 1
        
    frames.append({
        'frame':env.render(mode='ansi'),
        'state':state,
        'action':action,
        'reward':reward,
    })
    epochs += 1    

In [6]:
print('Timesteps taken: {}'.format(epochs))
print('Penalties incurred: {}'.format(penalties))

Timesteps taken: 1890
Penalties incurred: 616


In [7]:
#frames

In [18]:
from IPython.display import clear_output
from time import sleep

def print_frames(frames):
    for i, frame in enumerate(frames):
        clear_output(wait=True)
        print_a_frame(frame, i)
        sleep(.5)
        
def print_a_frame(frame, idx=None):
    clear_output(wait=True)
    print(frame['frame'])
    if idx:
        print('Timestep: {}'.format(idx+1))
    print('State: {}'.format(frame['state']))
    print('Action: {}'.format(frame['action']))
    print('Reward: {}'.format(frame['reward']))

In [9]:
print_frames(frames)

+---------+
|R: | : :G|
| :[42m_[0m| : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)

Timestep: 140
State: 138
Action: 1
Reward: -1


KeyboardInterrupt: 

In [10]:
import numpy as np
class QLearning():
    def __init__(self, gym_env, epsilon, gamma, alpha):
        self.gym_env = gym_env
        self._q_table = np.zeros([self.gym_env.observation_space.n, self.gym_env.action_space.n])
        self.epsilon = epsilon
        self.gamma = gamma
        self.alpha = alpha
        
    def take_action(self, state):
        if random.uniform(0, 1) < self.epsilon:
            action = self.gym_env.action_space.sample() # Explore action space
        else:
            action = np.argmax(self._q_table[state]) # Exploit learned values
        return action
    
    def infer_action(self, state):
        action = np.argmax(self._q_table[state])
        return action
    
    def update_q_table(self, state, action, next_state, reward):
        old_value = self._q_table[state, action]
        next_state_max_q_value = np.max(self._q_table[next_state])
        
        new_value = (1 - self.alpha) * old_value + self.alpha * (reward + self.gamma * next_state_max_q_value)
        self._q_table[state, action] = new_value

In [11]:
# Hyperparameters
alpha = 0.1
gamma = 0.6
epsilon = 0.1
num_of_episodes = 100001

In [12]:
q_learn = QLearning(env, epsilon, gamma, alpha)

In [13]:
%%time
"""Training the agent"""

import random
from IPython.display import clear_output

# For plotting metrics
all_epochs = []
all_penalties = []

for i in range(1, num_of_episodes):
    state = env.reset()

    epochs, penalties, reward, = 0, 0, 0
    done = False
    
    while not done:
        #get action with epsilon 
        action = q_learn.take_action(state)
        next_state, reward, done, info = env.step(action) 
        q_learn.update_q_table(state, action, next_state, reward)
        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1
        
    if i % 100 == 0:
        clear_output(wait=True)
        print('Episode: {}'.format(i))

print("Training finished.\n")

Episode: 100000
Training finished.

CPU times: user 1min 53s, sys: 38.6 s, total: 2min 32s
Wall time: 1min 55s


In [14]:
q_learn._q_table[328]

array([ -2.40417504,  -2.27325184,  -2.4044241 ,  -2.36219966,
       -10.51808624, -10.8681522 ])

In [15]:
"""Evaluate agent's performance after Q-learning"""

total_epochs, total_penalties = 0, 0
num_of_episodes = 100
frames = []

for _ in range(num_of_episodes):
    state = env.reset()
    epochs, penalties, reward = 0, 0, 0
    
    done = False
    
    while not done:
        #import pdb; pdb.set_trace()
        action = q_learn.infer_action(state)
        state, reward, done, info = env.step(action)
        frames.append({
            'frame':env.render(mode='ansi'),
            'state':state,
            'action':action,
            'reward':reward,
        })
        
        if reward == -10:
            penalties += 1

        epochs += 1

    total_penalties += penalties
    total_epochs += epochs

print('Results after {} num_of_episodes:'.format(num_of_episodes))
print('\tAverage timesteps per episode: {}'.format(total_epochs / num_of_episodes))
print('\tAverage penalties per episode: {}'.format(total_penalties / num_of_episodes))

Results after 100 num_of_episodes:
	Average timesteps per episode: 13.18
	Average penalties per episode: 0.0


In [16]:
len(frames)

1318

In [None]:
print_frames(frames)

+---------+
|[35mR[0m: | : :[42mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Pickup)

Timestep: 118
State: 96
Action: 4
Reward: -1
