# Step 1: Import libraries
- NumPy for the Q-table
- OpenAI Gym for the environment
- `random` for random number generation

In [1]:
import numpy as np
import gym
import random

# Step 2: Initialize the environment

In [2]:
# Create the "Taxi-v2" environment; the functions below use it as the global `env`.
env = gym.make("Taxi-v2")
# Draw the current board. This is called without a prior env.reset(); the
# recorded cell output shows that the freshly created environment renders fine.
env.render()

+---------+
|R: | : :G|
| : : : : |
|[43m [0m: : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+



### Tính không gian states và actions

In [3]:
# Read the sizes of the discrete action and state spaces from the environment;
# they determine the shape of the Q-table.
action_space, state_space = env.action_space.n, env.observation_space.n
print(action_space)
print(state_space)

6
500


### Khởi tạo Q-Table

In [4]:
# Q-table with one row per state and one column per action, all zeros to start.
q_table = np.zeros((state_space, action_space))
print(q_table)


[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


### Khởi tạo tham số epsilon, decay_rate


In [5]:
# Bounds of the exploration probability used by the epsilon-greedy policy.
min_epsilon, max_epsilon = 0.01, 1.0
# Rate of the exponential decay that moves epsilon from max_epsilon to min_epsilon.
decay_rate = 0.01

# Step 3: Training function

In [6]:
def traning(max_episodes, max_steps, learning_rate, gamma, epsilon):
    """Train the global ``q_table`` on the global ``env`` with tabular Q-learning.

    Actions are picked epsilon-greedily. After every episode epsilon is
    recomputed as
    ``min_epsilon + (max_epsilon - min_epsilon) * exp(-decay_rate * (i_episode + 1))``
    using the module-level ``min_epsilon``, ``max_epsilon`` and ``decay_rate``.

    Args:
        max_episodes: number of episodes to run.
        max_steps: maximum number of environment steps per episode.
        learning_rate: weight of the new estimate in the Q-value update.
        gamma: discount applied to the best Q-value of the next state.
        epsilon: exploration probability used for the first episode.

    Returns:
        None. ``q_table`` is updated in place; the end of every episode is
        printed, and a few selected episodes are additionally rendered step
        by step.
    """
    # Episodes (0-based index) whose every step is rendered for inspection.
    show_episodes = [99, 499, 999, 49999]
    for i_episode in range(max_episodes):
        sum_reward = 0
        # NOTE(review): assumes the classic Gym API in which reset() returns the
        # state itself and step() returns a 4-tuple - confirm for the installed version.
        cur_state = env.reset()
        show = i_episode in show_episodes
        if show:
            print('Start episode {}:'.format(i_episode))
            env.render()

        for i_step in range(max_steps):
            if random.random() < epsilon:
                # exploration: pick a random action
                action = env.action_space.sample()
            else:
                # exploitation: pick the action with the highest Q-value
                action = np.argmax(q_table[cur_state, :])
            # simulate the action and observe the new state and reward
            new_state, reward, done, info = env.step(action)
            if show:
                print('Step {}:'.format(i_step + 1))
                env.render()
            # Q-learning update: blend the old value with the bootstrapped target
            q_table[cur_state, action] = (1 - learning_rate) * q_table[cur_state, action] + \
                learning_rate * (reward + gamma * np.max(q_table[new_state]))
            # move on to the new state
            cur_state = new_state
            sum_reward += reward
            # stop when the environment reports done or the step budget is used up
            if done or i_step == max_steps - 1:
                print('End episode {}! with sum_reward:{}'.format(i_episode, sum_reward))
                break
        # decay epsilon for the next episode
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * (i_episode + 1))
                

# Step 4: Playing function

In [7]:
def playing(episode_num, max_steps=30):
    """Run episodes on the global ``env`` using the greedy policy from ``q_table``.

    Every step is printed and rendered; no learning takes place.

    Args:
        episode_num: number of episodes to play.
        max_steps: maximum number of steps per episode (default 30).

    Returns:
        None.
    """
    for i_episode in range(episode_num):
        cur_state = env.reset()
        print('Start episode {}:'.format(i_episode))
        env.render()

        for i_step in range(max_steps):
            print('step {}:'.format(i_step))
            # always take the action with the highest learned Q-value
            action = np.argmax(q_table[cur_state])
            cur_state, _reward, done, _info = env.step(action)
            env.render()
            if done:
                print('End episode {}!'.format(i_episode))
                break


### Lựa chọn hyperparameter:
- max_episodes = 50000
- max_steps = 100
- learning_rate = 0.7
- gamma = 0.681
- epsilon = 1.0

In [8]:
# Train with the hyperparameters listed above.
traning(max_episodes=50000, max_steps=100, learning_rate=0.7, gamma=0.681, epsilon=1.0)

In [9]:
# Play a single episode with the trained Q-table.
playing(episode_num=1)

Start episode 0:
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[34;1mY[0m| :[43m [0m|[35mB[0m: |
+---------+

step 0:
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | :[43m [0m| : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (North)
step 1:
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| |[43m [0m: | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (West)
step 2:
+---------+
|R: | : :G|
| : : : : |
| :[43m [0m: : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (North)
step 3:
+---------+
|R: | : :G|
| : : : : |
|[43m [0m: : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (West)
step 4:
+---------+
|R: | : :G|
| : : : : |
| : : : : |
|[43m [0m| : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (South)
step 5:
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[34;1m[43mY[0m[0m| : |[35mB[0m: |
+---------+
  (South)
step 6:
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[42mY[0m| : |