In this notebook, we'll implement a Q-learning agent that plays OpenAI Gym's Taxi-v2.

### How to play the game?

The goal of the game is for our agent to pick up the passenger at one location and drop them off at the destination as quickly as possible.


### Rules
There are 4 locations (labeled by different letters) and your job is to pick up the passenger at one location and drop him off in another.

* +20 points for a successful dropoff
* -1 point for every timestep taken.
* -10 points for every illegal pick-up or drop-off action (e.g. if you don't drop the passenger off at one of the 3 other locations)


In [1]:
import numpy as np
import gym
import random


In [5]:
# Instantiate the Taxi environment and draw its current grid.
ENV_NAME = "Taxi-v2"
env = gym.make(ENV_NAME)
env.render()

+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|[43mY[0m| : |B: |
+---------+



In [6]:
# Both spaces are discrete; `.n` is the number of distinct values in each.
action_size = env.action_space.n
state_size = env.observation_space.n

print(f"Possible actions :{action_size}")
print(f"State size : {state_size}")

Possible actions :6
State size : 500


In [7]:
# Q-table indexed as q_table[state, action]; every entry starts at zero.
q_table = np.zeros(shape=(state_size, action_size))
print(q_table)

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


In [8]:
# --- Run lengths ---
total_episodes = 50_000      # number of training episodes
total_test_episodes = 100    # number of evaluation episodes
max_steps = 99               # step cap per episode

# --- Q-learning update ---
learning_rate = 0.7          # step size applied to the TD error
gamma = 0.618                # discount applied to the next state's best Q-value

# --- Epsilon-greedy exploration ---
max_epsilon = 1.0            # upper bound of the exploration rate
min_epsilon = 0.01           # lower bound of the exploration rate
epsilon = max_epsilon        # current exploration rate, starts at the upper bound
decay_rate = 0.01            # rate constant used in the exponential epsilon schedule


In [None]:
%%time
# Train the Q-table with epsilon-greedy Q-learning.
#
# Each episode resets the environment and runs for at most `max_steps` steps.
# After every step the Q-value of the (state, action) pair just taken is moved
# towards `reward + gamma * max_a Q(next_state, a)` by `learning_rate`.
for episode in range(total_episodes):
    state = env.reset()
    done = False

    for step in range(max_steps):
        # Draw a number in [0, 1]; above epsilon we exploit, otherwise explore.
        exp_tradeoff = random.uniform(0, 1)

        if exp_tradeoff > epsilon:  # exploit: best known action for this state
            action = np.argmax(q_table[state, :])
        else:  # explore: random action
            action = env.action_space.sample()

        obs, reward, done, info = env.step(action)

        # Temporal-difference update of Q(state, action).
        td_target = reward + gamma * np.max(q_table[obs, :])
        q_table[state, action] += learning_rate * (td_target - q_table[state, action])

        state = obs

        if done:
            break

    # Decay exploration once per episode. The exponent must be negative so that
    # epsilon falls from max_epsilon towards min_epsilon as training progresses.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

In [None]:
%%time
# Evaluate the learned policy: act greedily w.r.t. the Q-table for
# `total_test_episodes` episodes and report the mean episode return.
rewards = []

for episode in range(total_test_episodes):
    state = env.reset()
    done = False
    total_rewards = 0

    for step in range(max_steps):
        # Always take the highest-valued action; no exploration during testing.
        action = np.argmax(q_table[state, :])
        obs, reward, done, info = env.step(action)
        total_rewards += reward
        env.render()

        if done:
            break

        state = obs

    # Record the return of every episode, including ones that hit the step cap,
    # so that the average below really is taken over `total_test_episodes`.
    rewards.append(total_rewards)

env.close()
print('Score over time: '+ str(sum(rewards)/total_test_episodes))

In [None]:
# Replay a single greedy episode, rendering every step.
# The state must come from this reset; reusing a leftover `state` from an
# earlier cell would select actions for a position the taxi is not in.
state = env.reset()
total_rewards = 0

for step in range(max_steps):
    action = np.argmax(q_table[state, :])
    obs, reward, done, info = env.step(action)
    total_rewards += reward
    env.render()

    if done:
        break

    state = obs

# Reported here rather than appended to `rewards`, so re-running this cell
# does not alter the statistics collected by the evaluation cell.
print('Episode reward: ' + str(total_rewards))