# Step 1: Import libraries
- NumPy for the Q-table
- OpenAI Gym for the environment
- `random` for random number generation

In [1]:
import numpy as np
import gym
import random

# Step 2: Initialize the environment

In [2]:
# Create the Taxi environment; `env` is the global used by every later cell.
# NOTE(review): the "Taxi-v2" id may not exist in newer gym releases — confirm the installed version.
env = gym.make("Taxi-v2")
# Print the current board as text.
env.render()

+---------+
|[34;1mR[0m: | : :G|
| : : : :[43m [0m|
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+



### Tính không gian states và actions

In [3]:
# Sizes of the discrete action and observation spaces; they define the Q-table shape.
state_space = env.observation_space.n
action_space = env.action_space.n

# Report the number of actions first, then the number of states.
print(action_space)
print(state_space)

6
500


### Khởi tạo Q-Table

In [4]:
# One row per state, one column per action, all values starting at zero.
q_table = np.zeros((state_space, action_space))
print(q_table)


[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


### Khởi tạo tham số epsilon, decay_rate


In [5]:
# Parameters of the exponential epsilon schedule used during training:
# epsilon(k) = min_epsilon + (max_epsilon - min_epsilon) * exp(-decay_rate * k)
min_epsilon = 0.01  # floor that epsilon decays towards
max_epsilon = 1.0   # starting point of the schedule
decay_rate = 0.01   # exponential decay rate per episode

# Step 3: Training function

In [6]:
def traning(max_episodes, max_steps, learning_rate, gamma, epsilon,
            show_episodes=(0, 99, 499, 999, 49999)):
    """Train the global ``q_table`` on the global ``env`` with tabular Q-learning.

    Actions are chosen epsilon-greedily. The ``epsilon`` argument is used for
    the first episode only; after every episode it is replaced by the schedule
    ``min_epsilon + (max_epsilon - min_epsilon) * exp(-decay_rate * (k + 1))``
    built from the module-level ``min_epsilon``, ``max_epsilon`` and
    ``decay_rate``.

    Args:
        max_episodes: Number of episodes to run.
        max_steps: Maximum number of steps per episode.
        learning_rate: Weight given to the new estimate in each update.
        gamma: Discount factor applied to the best next-state value.
        epsilon: Exploration probability for the first episode.
        show_episodes: Episode indices whose steps are printed and rendered.
            Defaults to the previously hard-coded set; pass ``()`` to train
            silently.

    Returns:
        None. ``q_table`` is updated in place.
    """
    # Set membership is checked once per episode instead of several times per step.
    verbose_episodes = frozenset(show_episodes)

    for i_episode in range(max_episodes):
        verbose = i_episode in verbose_episodes
        sum_reward = 0
        cur_state = env.reset()
        done = False
        if verbose:
            print('Start episode {}:'.format(i_episode))
            env.render()
            print('state={}'.format(cur_state))

        for i_step in range(max_steps):
            if random.random() < epsilon:
                # exploration
                action = env.action_space.sample()
            else:
                # exploitation
                action = np.argmax(q_table[cur_state, :])
            # simulate action, observe new_state, reward,..
            new_state, reward, done, info = env.step(action)

            # Compute the update once so the logged value and the stored value
            # cannot drift apart.
            old_value = q_table[cur_state, action]
            best_next = np.max(q_table[new_state])
            new_value = (1 - learning_rate) * old_value + \
                learning_rate * (reward + gamma * best_next)

            if verbose:
                print('Step {}:'.format(i_step))
                print('action={}, reward={}, new_state={}'.format(action, reward, new_state))
                print('{}=(1-{})*{}+{}*({}+{}*{})'.format(new_value, learning_rate,
                                                         old_value, learning_rate, reward,
                                                         gamma, best_next))
                env.render()
            # update q-table
            q_table[cur_state, action] = new_value
            # update state
            cur_state = new_state
            sum_reward += reward

            if done or i_step == max_steps - 1:
                if verbose:
                    print('End episode {}! with sum_reward:{}'.format(i_episode, sum_reward))
                break
        # update epsilon(k): decay exploration once per episode
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * (i_episode + 1))
                

# Step 4: Playing function

In [11]:
def playing(episode_num, max_steps=30, delay=1):
    """Run greedy episodes on the global ``env`` using the global ``q_table``.

    Every step picks the highest-valued action for the current state, renders
    the board, and accumulates the reward. An episode stops when the
    environment reports ``done`` or after ``max_steps`` steps; the summary
    line is printed only in the ``done`` case.

    Args:
        episode_num: Number of episodes to play.
        max_steps: Step cap per episode (default 30, the former hard-coded value).
        delay: Seconds to pause before each step so the rendering can be
            followed (default 1); values <= 0 disable the pause.

    Returns:
        None. Progress is printed and rendered.
    """
    # Imported once per call rather than on every step of the inner loop.
    import time

    for i_episode in range(episode_num):
        sum_reward = 0
        cur_state = env.reset()
        print('Start episode {}:'.format(i_episode))
        env.render()

        for i_step in range(max_steps):
            if delay > 0:
                time.sleep(delay)
            print('step {}:'.format(i_step))
            # Pure exploitation: no random exploration while playing.
            action = np.argmax(q_table[cur_state])
            new_state, reward, done, _ = env.step(action)
            env.render()
            cur_state = new_state
            sum_reward += reward
            if done:
                print('End episode {}! with sum_reward={}'.format(i_episode, sum_reward))
                break


### Lựa chọn hyperparameter:
- max_episodes = 50000
- max_steps = 100
- learning_rate = 0.7
- gamma = 0.681
- epsilon = 1.0

In [8]:
# Run training with the hyperparameters listed above, spelled out by name.
traning(
    max_episodes=50000,
    max_steps=100,
    learning_rate=0.7,
    gamma=0.681,
    epsilon=1.0,
)

Start episode 0:
+---------+
|[34;1mR[0m: | : :G|
|[43m [0m: : : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+

state=103
Step 0:
action=4, reward=-10, new_state=103
-7.0=(1-0.7)*0.0+0.7*(-10+0.681*0.0)
+---------+
|[34;1mR[0m: | : :G|
|[43m [0m: : : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (Pickup)
Step 1:
action=5, reward=-10, new_state=103
-7.0=(1-0.7)*0.0+0.7*(-10+0.681*0.0)
+---------+
|[34;1mR[0m: | : :G|
|[43m [0m: : : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (Dropoff)
Step 2:
action=0, reward=-1, new_state=203
-0.7=(1-0.7)*0.0+0.7*(-1+0.681*0.0)
+---------+
|[34;1mR[0m: | : :G|
| : : : : |
|[43m [0m: : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (South)
Step 3:
action=3, reward=-1, new_state=203
-0.7=(1-0.7)*0.0+0.7*(-1+0.681*0.0)
+---------+
|[34;1mR[0m: | : :G|
| : : : : |
|[43m [0m: : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (West)
Step 4:
action=3, reward=-1, new_state=203
-0

-10.494648619=(1-0.7)*-10.06369+0.7*(-10+0.681*-0.9975700000000001)
+---------+
|R: | : :G|
| : :[43m [0m: : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (Dropoff)
Step 7:
action=5, reward=-10, new_state=151
-10.6239362047=(1-0.7)*-10.494648619+0.7*(-10+0.681*-0.9975700000000001)
+---------+
|R: | : :G|
| : :[43m [0m: : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (Dropoff)
Step 8:
action=4, reward=-10, new_state=151
-9.575541619=(1-0.7)*-7.0+0.7*(-10+0.681*-0.9975700000000001)
+---------+
|R: | : :G|
| : :[43m [0m: : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (Pickup)
Step 9:
action=3, reward=-1, new_state=131
-1.6816036489000001=(1-0.7)*-0.9975700000000001+0.7*(-1+0.681*-1.431367)
+---------+
|R: | : :G|
| :[43m [0m: : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (West)
Step 10:
action=0, reward=-1, new_state=231
-1.807669213=(1-0.7)*-1.6159002999999998+0.7*(-1+0.68

Start episode 499:
+---------+
|[35mR[0m: | : :G|
|[43m [0m: : : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+

state=108
Step 0:
action=0, reward=-1, new_state=208
-0.09198431025094005=(1-0.7)*-0.09737882462172538+0.7*(-1+0.681*1.3367512841107143)
+---------+
|[35mR[0m: | : :G|
| : : : : |
|[43m [0m: : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (South)
Step 1:
action=0, reward=-1, new_state=308
1.3367540868260461=(1-0.7)*1.3367512841107143+0.7*(-1+0.681*3.4313587195150657)
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
|[43m [0m| : | : |
|[34;1mY[0m| : |B: |
+---------+
  (South)
Step 2:
action=0, reward=-1, new_state=408
3.431359806334133=(1-0.7)*3.4313587195150657+0.7*(-1+0.681*6.507136963456289)
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[34;1m[43mY[0m[0m| : |B: |
+---------+
  (South)
Step 3:
action=4, reward=-1, new_state=416
6.507137448525474=(1-0.7)*6.507136963456289+0.7*(-1+0.681*11.0236969991369

In [9]:
# Display the Q-table after training (one row per state, one column per action).
q_table

array([[  0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ],
       [ -1.06106789,  -0.08984107,  -2.17716929,  -0.11085477,
          1.33675689,  -9.08966898],
       [  1.33431743,   3.43099769,   1.3367343 ,   3.43099302,
          6.50713814,  -5.56867113],
       ...,
       [ -2.03196759,   3.20579081,  -1.99396202,  -2.07793538,
         -9.73      , -10.52416982],
       [ -2.63105407,  -0.09181404,  -1.64739908,  -2.69892435,
        -11.07588497, -10.54749575],
       [ -0.7       ,  -0.7       ,  -0.7       ,  41.69592476,
         -7.        ,   0.        ]])

In [12]:
# Play a single greedy episode with the trained Q-table.
playing(episode_num=1)

Start episode 0:
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
|[43m [0m| : | : |
|[34;1mY[0m| : |B: |
+---------+

step 0:
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[34;1m[43mY[0m[0m| : |B: |
+---------+
  (South)
step 1:
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[42mY[0m| : |B: |
+---------+
  (Pickup)
step 2:
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
|[42m_[0m| : | : |
|Y| : |B: |
+---------+
  (North)
step 3:
+---------+
|[35mR[0m: | : :G|
| : : : : |
|[42m_[0m: : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
step 4:
+---------+
|[35mR[0m: | : :G|
|[42m_[0m: : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
step 5:
+---------+
|[35m[42mR[0m[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
step 6:
+---------+
|[35m[42mR[0m[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)
End episode 0! 

In [13]:
# Reset the environment and print the resulting board.
env.reset()
env.render()

+---------+
|R: | : :G|
| : : : : |
| : : : : |
|[43m [0m| : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+

