In [8]:
import numpy as np
import gym
import random

In [9]:
# Build the Taxi-v2 environment that every later cell shares as `env`.
env = gym.make('Taxi-v2')
# Draw the current grid as text. No explicit reset() precedes this call, so it
# relies on the environment already holding an initial state after
# construction -- TODO confirm for the installed gym version.
env.render()

+---------+
|R: | : :[34;1mG[0m|
| : : : :[43m [0m|
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+



In [10]:
# Both spaces expose their cardinality through `.n`; these two numbers
# fix the dimensions of the Q-table built in the next cell.
state_size = env.observation_space.n
action_size = env.action_space.n

# Report the sizes (actions first, then states).
print("Action size: ", action_size)
print("State size: ", state_size)

Action size:  6
State size:  500


In [11]:
# One row per state, one column per action; every estimate starts at zero.
Q_table = np.zeros(shape=(state_size, action_size), dtype=float)
print(Q_table)

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


In [12]:
# --- Episode budget ---
train_episodes = 50000    # number of episodes run by the training loop
test_episodes = 100       # number of episodes run by the evaluation loop
max_steps = 99            # cap on steps per episode (training and evaluation)

# --- Q-learning update coefficients ---
learning_rate = 0.7       # alpha: weight given to each new TD error
gamma = 0.618             # discount applied to the next state's best Q-value

# --- Epsilon-greedy exploration schedule ---
# After each training episode epsilon is recomputed as
#   min_epsilon + (max_epsilon - min_epsilon) * exp(-decay_rate * episode)
max_epsilon = 1.0         # upper bound of the schedule
min_epsilon = 0.01        # floor the schedule decays towards
decay_rate = 0.01         # exponential decay constant, per episode
epsilon = max_epsilon     # current exploration probability; starts at the maximum

In [13]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
# Reads:   env, train_episodes, max_steps, learning_rate, gamma,
#          min_epsilon, max_epsilon, decay_rate
# Updates: Q_table (in place) and epsilon (re-assigned after every episode)
for episode in range(train_episodes):
    # Reset the environment to begin a new episode
    state = env.reset()

    for step in range(max_steps):
        # Exploration-exploitation trade-off: draw u in [0, 1] and compare
        # it with the current epsilon.
        exp_exp_tradeoff = random.uniform(0, 1)
        if exp_exp_tradeoff > epsilon:
            # Exploit: take the action with the largest Q-value for this state
            action = np.argmax(Q_table[state, :])
        else:
            # Explore: take a random action
            action = env.action_space.sample()

        # Take the action and observe the outcome
        new_state, reward, done, info = env.step(action)

        # Q(s,a) <- Q(s,a) + alpha * [R(s,a) + gamma * max_a' Q(s',a') - Q(s,a)]
        td_target = reward + gamma * np.max(Q_table[new_state, :])
        td_error = td_target - Q_table[state, action]
        Q_table[state, action] += learning_rate * td_error

        # Move on to the next state
        state = new_state

        # The environment signalled the end of the episode
        if done:
            break

    # Decay epsilon exponentially in the episode index, from max_epsilon
    # towards min_epsilon.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

In [14]:
# Evaluate the learned Q-table by acting greedily (no exploration) for
# `test_episodes` episodes, rendering every step, and report the mean score.
rewards = []   # one total reward per evaluation episode

for episode in range(test_episodes):
    state = env.reset()
    episode_rewards = 0
    print("************************************")
    print("Episode ", episode)

    for step in range(max_steps):
        env.render()

        # Act greedily: pick the action with the highest Q-value for this state
        action = np.argmax(Q_table[state, :])

        # Apply the action and observe the outcome
        new_state, reward, done, info = env.step(action)

        episode_rewards += reward   # accumulate this episode's reward
        state = new_state           # advance to the next state

        if done:
            break

    # Record the score whether the episode terminated or hit the max_steps
    # cap. Recording only terminated episodes would drop the unfinished ones
    # from the total while still counting them in the average's denominator.
    rewards.append(episode_rewards)
    print("Score ", episode_rewards)

env.close()

# Mean total reward per evaluation episode (skipped if no episodes were run).
if rewards:
    print("Average score ", sum(rewards) / len(rewards))

************************************
Episode  0
+---------+
|R: | : :[35mG[0m|
| : : : :[43m [0m|
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+

+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : :[43m [0m|
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : :[43m [0m: |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (West)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : |[43m [0m: |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[34;1m[43mB[0m[0m: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[42mB[0m: |
+---------+
  (Pickup)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : |[42m_[0m: |
|Y| : |B: |
+---------+
  (North)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : :[42m_[0m: |
| | : | : |
|Y| : |B: |
+---------+
  (North)
+---------