# Taxi-v3

In [5]:
import gym
# Build the Taxi-v3 environment and keep the object exposed by its `.env` attribute.
# NOTE(review): presumably `.env` strips gym's step-limit wrapper so that an episode
# only ends when the task is solved — confirm against the installed gym version.
env = gym.make("Taxi-v3").env

array([b'+', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'-', b'+'],
      dtype='|S1')

In [26]:
# Show the map description with its first/last row and column (the border) sliced off.
env.desc[1:-1, 1:-1]

array([[b'R', b':', b' ', b'|', b' ', b':', b' ', b':', b'G'],
       [b' ', b':', b' ', b'|', b' ', b':', b' ', b':', b' '],
       [b' ', b':', b' ', b':', b' ', b':', b' ', b':', b' '],
       [b' ', b'|', b' ', b':', b' ', b'|', b' ', b':', b' '],
       [b'Y', b'|', b' ', b':', b' ', b'|', b'B', b':', b' ']],
      dtype='|S1')

In [38]:
# Interior of the map description, i.e. everything inside the border.
cells = env.desc[1:-1, 1:-1]

# Visit the grid row by row and print each character next to its (row, column) index.
n_rows, n_cols = cells.shape
for x in range(n_rows):
    for y in range(n_cols):
        cell = cells[x, y]
        print('(', x, ',', y, ') - ', cell)

( 0 , 0 ) -  b'R'
( 0 , 1 ) -  b':'
( 0 , 2 ) -  b' '
( 0 , 3 ) -  b'|'
( 0 , 4 ) -  b' '
( 0 , 5 ) -  b':'
( 0 , 6 ) -  b' '
( 0 , 7 ) -  b':'
( 0 , 8 ) -  b'G'
( 1 , 0 ) -  b' '
( 1 , 1 ) -  b':'
( 1 , 2 ) -  b' '
( 1 , 3 ) -  b'|'
( 1 , 4 ) -  b' '
( 1 , 5 ) -  b':'
( 1 , 6 ) -  b' '
( 1 , 7 ) -  b':'
( 1 , 8 ) -  b' '
( 2 , 0 ) -  b' '
( 2 , 1 ) -  b':'
( 2 , 2 ) -  b' '
( 2 , 3 ) -  b':'
( 2 , 4 ) -  b' '
( 2 , 5 ) -  b':'
( 2 , 6 ) -  b' '
( 2 , 7 ) -  b':'
( 2 , 8 ) -  b' '
( 3 , 0 ) -  b' '
( 3 , 1 ) -  b'|'
( 3 , 2 ) -  b' '
( 3 , 3 ) -  b':'
( 3 , 4 ) -  b' '
( 3 , 5 ) -  b'|'
( 3 , 6 ) -  b' '
( 3 , 7 ) -  b':'
( 3 , 8 ) -  b' '
( 4 , 0 ) -  b'Y'
( 4 , 1 ) -  b'|'
( 4 , 2 ) -  b' '
( 4 , 3 ) -  b':'
( 4 , 4 ) -  b' '
( 4 , 5 ) -  b'|'
( 4 , 6 ) -  b'B'
( 4 , 7 ) -  b':'
( 4 , 8 ) -  b' '


In [2]:
# Reset the environment, then draw its current state.
env.reset()
env.render()

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : |[43m [0m: |
|[34;1mY[0m| : |[35mB[0m: |
+---------+



## Without RL (Random)

In [26]:
env.s = 328  # start from the state used in the illustration

# Episode bookkeeping: steps taken, number of -10 rewards seen, last reward.
epochs, penalties, reward = 0, 0, 0
frames = []  # one record per step, consumed later by the animation

# Keep sampling random actions until the environment reports the episode is over.
while True:
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)

    # A reward of -10 is counted as one penalty.
    penalties += 1 if reward == -10 else 0

    # Store the rendered frame together with what led to it.
    snapshot = {
        'frame': env.render(mode='ansi'),
        'state': state,
        'action': action,
        'reward': reward,
    }
    frames.append(snapshot)
    epochs += 1

    if done:
        break

print("Timesteps taken: {}".format(epochs))
print("Penalties incurred: {}".format(penalties))

Timesteps taken: 3611
Penalties incurred: 1105


In [28]:
from IPython.display import clear_output
from time import sleep

def print_frames(frames, delay=.1):
    """Replay recorded frames one after another in the cell output.

    Each frame replaces the previous one, so the sequence reads as an animation.

    Args:
        frames: iterable of dicts providing the keys 'frame', 'state',
            'action' and 'reward'.
        delay: seconds to pause after printing each frame. Defaults to 0.1,
            the pause that was previously hard-coded.
    """
    for i, frame in enumerate(frames):
        # wait=True defers the clear until new output arrives.
        clear_output(wait=True)
        print(frame['frame'])
        print(f"Timestep: {i + 1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(delay)
        
# Replay the frames captured during the random-agent episode.
print_frames(frames)

+---------+
|[35mR[0m: | : :G|
| : | : :[42m_[0m|
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Timestep: 1002
State: 196
Action: 5
Reward: -10


## Q-Learning (Q-Table)

In [11]:
import numpy as np
# Q-table with one row per state and one column per action, initialised to zero.
q_table = np.zeros([env.observation_space.n, env.action_space.n])

In [12]:
%%time
"""Training the agent"""

import random
from IPython.display import clear_output
import matplotlib.pyplot as plt
import seaborn as sns
from time import sleep
%matplotlib inline

# Hyperparameters
alpha = 0.1    # weight given to the new estimate in each Q-table update
gamma = 0.6    # discount applied to the best value of the next state
epsilon = 0.1  # probability of taking a random (exploratory) action

# For plotting metrics: one entry per training episode
all_epochs = []
all_penalties = []

for i in range(1, 100001):
    state = env.reset()

    epochs, penalties, reward = 0, 0, 0
    done = False

    while not done:
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()  # Explore action space
        else:
            action = np.argmax(q_table[state])  # Exploit learned values

        next_state, reward, done, info = env.step(action)

        # Blend the old estimate with the observed reward plus the discounted
        # best value currently stored for the next state.
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])

        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value

        # A reward of -10 is counted as one penalty.
        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1

    # Record this episode's length and penalty count; these lists were
    # previously created but never filled, leaving nothing to plot.
    all_epochs.append(epochs)
    all_penalties.append(penalties)

    # Refresh the progress line every 100 episodes instead of flooding the output.
    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("Training finished.\n")

Episode: 100000
Training finished.

CPU times: user 1min 8s, sys: 14.9 s, total: 1min 23s
Wall time: 1min 7s


In [14]:
"""Evaluate agent's performance after Q-learning"""

episodes = 100

# Running totals across all evaluation episodes.
total_epochs = total_penalties = total_rewards = 0

for _ in range(episodes):
    state = env.reset()

    # Per-episode counters.
    epochs = penalties = rewards = 0
    reward = 0
    done = False

    # Always take the highest-valued action from the learned Q-table.
    while not done:
        action = np.argmax(q_table[state])
        state, reward, done, info = env.step(action)

        # A reward of -10 is counted as one penalty.
        penalties += 1 if reward == -10 else 0
        rewards += reward
        epochs += 1

    total_epochs += epochs
    total_penalties += penalties
    total_rewards += rewards

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")
print(f"Average reward per episode: {total_rewards / episodes}")

NameError: name 'total_rewards' is not defined

## Deep Q-Learning (DQN)

In [27]:
import gym
import numpy as np

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, Embedding, Reshape
from tensorflow.keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

In [10]:
ENV_NAME = 'Taxi-v3'

# Get the environment and extract the number of actions.
# NOTE: this rebinds `env`, replacing the object created at the top of the notebook
# (that one was taken from `.env`; this one is used exactly as gym.make returns it).
env = gym.make(ENV_NAME)
# Seed NumPy's global RNG and the environment with the same fixed value.
np.random.seed(123)
env.seed(123)
# Number of discrete actions; used as the width of the network's output layer.
action_size = env.action_space.n

In [28]:
# Number of discrete states, read from the environment instead of hard-coding 500
# (the value for Taxi-v3), mirroring how `action_size` is obtained.
state_size = int(env.observation_space.n)

# Q-network: takes a single integer state id and outputs one value per action.
model = Sequential()
# Learn a 10-dimensional vector for every state id.
model.add(Embedding(state_size, 10, input_length=1))
# Drop the length-1 sequence axis: (batch, 1, 10) -> (batch, 10).
model.add(Reshape((10,)))
model.add(Dense(50, activation='relu'))
model.add(Dense(50, activation='relu'))
model.add(Dense(50, activation='relu'))
# Linear output so the predicted values are unbounded.
model.add(Dense(action_size, activation='linear'))
# summary() prints the table itself and returns None, so wrapping it in print()
# would add a stray "None" line to the output.
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1, 10)             5000      
_________________________________________________________________
reshape_1 (Reshape)          (None, 10)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 50)                550       
_________________________________________________________________
dense_11 (Dense)             (None, 50)                2550      
_________________________________________________________________
dense_12 (Dense)             (None, 50)                2550      
_________________________________________________________________
dense_13 (Dense)             (None, 6)                 306       
Total params: 10,956
Trainable params: 10,956
Non-trainable params: 0
__________________________________________________

In [4]:
# Experience memory capped at 50000 entries, with one observation per sample window.
memory = SequentialMemory(limit=50000, window_length=1)
# Epsilon-greedy action selection using the policy class's default settings.
policy = EpsGreedyQPolicy()
dqn = DQNAgent(
    model=model,
    nb_actions=action_size,
    memory=memory,
    nb_steps_warmup=500,
    target_model_update=1e-2,
    policy=policy,
)
# `learning_rate` is the supported name of Adam's step-size argument; the old
# `lr` alias is deprecated and rejected by newer Keras releases.
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])

In [5]:
# Train on `env` for up to 1,000,000 steps without rendering, capping each episode
# at 99 steps and logging a summary line every 100,000 steps.
dqn.fit(env, nb_steps=1000000, visualize=False, verbose=1, nb_max_episode_steps=99, log_interval=100000)

Training for 1000000 steps ...
Interval 1 (0 steps performed)
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
1065 episodes - episode_reward: -124.972 [-891.000, 14.000] - loss: 4.205 - mae: 25.872 - mean_q: -26.502 - prob: 1.000

Interval 2 (100000 steps performed)
4659 episodes - episode_reward: -7.689 [-630.000, 15.000] - loss: 0.896 - mae: 9.606 - mean_q: 3.227 - prob: 1.000

Interval 3 (200000 steps performed)
6798 episodes - episode_reward: 2.367 [-68.000, 15.000] - loss: 0.003 - mae: 7.362 - mean_q: 12.680 - prob: 1.000

Interval 4 (300000 steps performed)
6770 episodes - episode_reward: 2.321 [-93.000, 15.000] - loss: 0.002 - mae: 7.363 - mean_q: 12.690 - prob: 1.000

Interval 5 (400000 steps performed)
6777 episodes - episode_reward: 2.283 [-135.000, 15.000] - loss: 0.002 - mae: 7.359 - mean_q: 12.696 - prob: 1.000

Interval 6 (500000 steps performed)
6789 episodes - episode_reward: 2.392 [-87.000, 15.000] - 

<tensorflow.python.keras.callbacks.History at 0x7f9d946cc550>

In [6]:
# Save the agent's weights to 'dqn_<ENV_NAME>_weights.h5f', replacing any existing file.
dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

In [10]:
# Run 5 evaluation episodes on `env` (at most 99 steps each) without rendering.
dqn.test(env, nb_episodes=5 , visualize=False, nb_max_episode_steps=99)

Testing for 5 episodes ...
Episode 1: reward: 6.000, steps: 15
Episode 2: reward: 6.000, steps: 15
Episode 3: reward: 8.000, steps: 13
Episode 4: reward: 7.000, steps: 14
Episode 5: reward: 5.000, steps: 16


<tensorflow.python.keras.callbacks.History at 0x7f9d08f25910>