# OpenAI Taxi v2

## State 0: Importing Dependencies

In [13]:
import numpy as np
import gym
import random

## State 1: Create Environment

In [14]:
# Build the Taxi environment and draw its current grid.
ENV_ID = "Taxi-v2"
env = gym.make(ENV_ID)
env.render()

+---------+
|[34;1mR[0m: | : :G|
| : : : : |
|[43m [0m: : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+



## State 2: Create the Q-table and initialize it

In [15]:
# Sizes of the discrete action and observation spaces; the Q-table below is
# shaped (state_space, action_space).
action_space = env.action_space.n
# Print the variable that was just assigned: `action_size` is never defined in
# this notebook and raises NameError on a fresh kernel.
print("Action space's size: ", action_space)

state_space = env.observation_space.n
print("State space's size: ", state_space)

Action space's size:  6
State space's size:  500


In [16]:
# One row per state, one column per action; every Q-value starts at zero.
qtable_shape = (state_space, action_space)
qtable = np.zeros(shape=qtable_shape)
print(qtable)

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


## State 3: Create Hyperparameters

In [17]:
# --- Episode budget ---
total_episodes = 50000       # number of training episodes
total_test_episodes = 100    # number of evaluation episodes
max_steps = 99               # step cap per episode (training and evaluation)

# --- Q-learning update ---
learning_rate = 0.7          # scales each temporal-difference correction
gamma = 0.618                # discount applied to the best next-state Q-value

# --- Epsilon-greedy exploration schedule ---
max_epsilon = 1.0            # exploration probability at the start
min_epsilon = 0.01           # floor the exploration probability decays towards
decay_rate = 0.01            # exponential decay rate per episode
epsilon = max_epsilon        # current exploration probability

## State 4: Q-learning Algorithm

In [1]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
# Reads: env, qtable, total_episodes, max_steps, learning_rate, gamma,
#        epsilon, min_epsilon, max_epsilon, decay_rate.
# Writes: qtable (in place) and epsilon (decayed after every episode).
for episode in range(total_episodes):
    # Start every episode from a freshly reset environment.
    state = env.reset()

    for step in range(max_steps):
        # Draw a number in [0, 1]: exploit the Q-table when it exceeds
        # epsilon, otherwise explore with a random action.
        if random.uniform(0, 1) > epsilon:
            action = np.argmax(qtable[state, :])
        else:
            action = env.action_space.sample()

        # Apply the action and observe the outcome.
        new_state, reward, done, info = env.step(action)

        # Bellman update:
        # Q(s, a) <- Q(s, a) + lr * (r + gamma * max_a' Q(s', a') - Q(s, a))
        td_target = reward + gamma * np.max(qtable[new_state, :])
        qtable[state, action] += learning_rate * (td_target - qtable[state, action])

        state = new_state

        if done:
            break

    # Decay epsilon so later episodes exploit more. The exponent uses the
    # number of completed episodes (episode + 1) instead of mutating the loop
    # variable, which yields the same schedule.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * (episode + 1))
        
        

NameError: name 'total_episodes' is not defined

## State 5: Use Q-table to play Taxi v2

In [25]:
# Evaluate the learned Q-table by always taking the greedy action.
# Reads: env, qtable, total_test_episodes, max_steps.
# Produces: `rewards`, one cumulative reward per test episode, and prints the mean.
rewards = []

for episode in range(total_test_episodes):
    state = env.reset()
    total_rewards = 0

    print("*************************************")
    print("Episode: ", episode)

    for step in range(max_steps):
        env.render()

        # Take the action with the biggest expected future reward.
        action = np.argmax(qtable[state, :])
        new_state, reward, done, info = env.step(action)

        total_rewards += reward

        if done:
            break

        state = new_state

    # Record every episode, including ones that hit the step cap without
    # finishing; otherwise those episodes would count as 0 in the average
    # and inflate the reported score.
    rewards.append(total_rewards)
    print("Score: ", total_rewards)

env.close()

# Average over the episodes actually recorded; guard against a zero-episode run.
mean_score = sum(rewards) / len(rewards) if rewards else float("nan")
print("Score over time: " + str(mean_score))
        
        
    


*************************************
Episode:  0
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
| | :[43m [0m| : |
|[34;1mY[0m| : |B: |
+---------+

+---------+
|[35mR[0m: | : :G|
| : : : : |
| : :[43m [0m: : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (North)
+---------+
|[35mR[0m: | : :G|
| : : : : |
| :[43m [0m: : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :G|
| : : : : |
|[43m [0m: : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
|[43m [0m| : | : |
|[34;1mY[0m| : |B: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[34;1m[43mY[0m[0m| : |B: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[42mY[0m| : |B: |
+---------+
  (Pickup)
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
|[42m_[0m| : | : |
|Y| : |B: |
+---------+
  (North

  (Pickup)
+---------+
|R:[42m_[0m| : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (East)
+---------+
|R: | : :G|
| :[42m_[0m: : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : : : : |
| :[42m_[0m: : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : : : : |
| : :[42m_[0m: : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (East)
+---------+
|R: | : :G|
| : : : : |
| : : :[42m_[0m: |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (East)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : |[42m_[0m: |
|Y| : |[35mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35m[42mB[0m[0m: |
+---------+
  (South)
Score:  8
*************************************
Episode:  65
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| |[43m [0m: | : |
|[34;1mY[0m| : |B: |
+---------+

+---------+
|R: | : :[35mG[0m|
