In [1]:
import time
import random
import math
import itertools

In [2]:
import numpy as np

import gym
from gym.envs.registration import register

# Q learning

Q learning requires creating a table that has `number_of_actions` columns and `number_of_states` rows.

The action space (`number_of_actions`) has to be discrete. That means an environment with a continuous action such as `set the car speed` is not suitable for tabular `Q learning`.

The state space (`number_of_states`) also has to be discrete.

One example environment that is suitable for `Q learning` is `FrozenLake-v0`.

Its `action_space` contains 4 actions (move north, south, east, and west).

The `observation_space` contains 16 states (the position of the agent in a 4x4 grid world).

In [3]:
class QTable:
    """Tabular Q-learning value store for discrete states and actions.

    The table has one row per state and one column per action; every entry
    starts at 0.
    """

    def __init__(self, num_states, num_actions, alpha=0.2, gamma=0.8):
        """
        `num_states`: number of rows of the table

        `num_actions`: number of columns of the table

        `alpha`: learning rate

        `gamma`: discount factor
        """
        self.num_states = num_states
        self.num_actions = num_actions
        self.alpha = alpha  # learning rate
        self.gamma = gamma  # discount factor
        # Initialize Q table with 0.
        # `np.float` was only an alias of the builtin `float`; it was
        # deprecated in NumPy 1.20 and removed in 1.24, so the dtype is
        # spelled explicitly to keep the class constructible.
        self.q_table = np.zeros((num_states, num_actions), dtype=np.float64)

    def update_table(self, state, action, reward, new_state):
        """Apply one Q-learning update for the transition that was observed.

        The entry for (`state`, `action`) is moved towards the target
        `reward + gamma * max(Q[new_state])` by a fraction `alpha`:

            Q[s, a] = (1 - alpha) * Q[s, a] + alpha * target

        which is algebraically the same as
        `Q[s, a] + alpha * (target - Q[s, a])`.
        """
        best_next_value = np.max(self.q_table[new_state])
        target = reward + self.gamma * best_next_value
        self.q_table[state, action] = (
            (1 - self.alpha) * self.q_table[state, action]
            + self.alpha * target
        )

    def get_next_action(self, state):
        """Return the index of the highest-valued action for `state`.

        Ties go to the lowest action index (`np.argmax` returns the first
        maximum).
        """
        return np.argmax(self.q_table[state])

### Register an un-slippery version of `FrozenLake`

In [4]:
# Add a non-slippery 4x4 FrozenLake variant to gym's registry under its own id
# so it can be created with `gym.make` like any built-in environment.
register(
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    id='FrozenLakeNotSlippery-v0',
    kwargs=dict(map_name='4x4', is_slippery=False),
)

In [15]:
# Swap which of the two `gym.make` lines is commented out to train on the
# registered non-slippery variant instead of the stock `FrozenLake-v0`.
# env = gym.make('FrozenLakeNotSlippery-v0')
env = gym.make('FrozenLake-v0')
# Show the grid so the state indices used below can be checked against it.
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [6]:
# Terminal cells of the 4x4 map as row-major state indices
# (state = row * 4 + column): the `H` cells and the `G` cell of the grid.
hole_states, goal_states = [5, 7, 11, 12], [15]

# Shaped rewards that the training loop feeds to the Q-table in place of the
# reward returned by the environment.
hole_reward, goal_reward = -10, 10

In [163]:
# Train a tabular Q-learning agent with an epsilon-greedy policy.
# Re-running this cell starts from a fresh, all-zero table and the initial
# epsilon, because both are re-created here.

#==========Create a q-table==========#
num_states = env.observation_space.n
num_actions = env.action_space.n
q_table = QTable(num_states, num_actions)

num_episodes = 1000
# Upper bound on steps per episode enforced by this loop.
# NOTE(review): the environment may end an episode earlier through `done`
# (e.g. its own step limit) — TODO confirm.
num_timesteps = 128
# Probability of taking a random (exploratory) action at the start of training
epsilon = 0.8
# Linear decay: epsilon shrinks by this amount after every episode, so it
# reaches roughly 0 after the last one.
decrease_step_per_episode = epsilon / num_episodes

# One list of per-step dicts per episode.
training_log = []
# Number of episodes (exploratory ones included) that ended on the goal.
success_count = 0
for episode in range(num_episodes):
    print(f'Episode {episode+1}/{num_episodes} ', end='', flush=True)

    episode_log = []
    state = env.reset()
    for timestep in range(num_timesteps):
        # One dash per step taken, printed on the episode's line.
        print('-', end='', flush=True)
        #==========Taking action: Explore or Exploit==========#

        if random.uniform(0, 1) < epsilon:
            # Explore: select a random action
            action = random.randrange(num_actions)
            isRand = True
        else:
            # Exploit: select the action with max value (future reward)
            action = q_table.get_next_action(state)
            isRand = False

        # The environment's own `reward` is not used for learning; the shaped
        # `mReward` computed below is what goes into the table.
        new_state, reward, done, info = env.step(action)

        if done and (new_state in hole_states):
            # if fall into hole
            mReward = hole_reward
            print('FAILED', end='', flush=True)
        elif done and (new_state in goal_states):
            # reach the goal
            mReward = goal_reward
            success_count += 1
            print('SUCCESS', end='', flush=True)
        else:
            # haven't fallen into hole
            # (also reached when `done` is set on a cell that is neither a
            # hole nor the goal; nothing is printed for that case)
            if state == new_state:
                # the agent slams into the wall (not moving to get points)
                mReward = -1
            else:
                # NOTE
                # do not give reward to the agent for repeated tasks
                # the agent will try to exploit them
                mReward = 0

        #==========Updating the q-table==========#
        q_table.update_table(state, action, mReward, new_state)

        # Keep the full transition so the run can be inspected afterwards.
        episode_log.append({
            'state': state,
            'action': action,
            'isRand': isRand,
            'reward': mReward,
            'new_state': new_state
        })

        state = new_state
        if done:
            break

    # Explore a little less in the next episode.
    epsilon -= decrease_step_per_episode
    training_log.append(episode_log)
    # Terminate the progress line of this episode.
    print()
    
print(f'Completed {success_count}/{num_episodes}')

Episode 1/1000 ---------------------------FAILED
Episode 2/1000 ------FAILED
Episode 3/1000 ----FAILED
Episode 4/1000 --------FAILED
Episode 5/1000 ---FAILED
Episode 6/1000 -----------------FAILED
Episode 7/1000 -----FAILED
Episode 8/1000 --------------------------FAILED
Episode 9/1000 -------FAILED
Episode 10/1000 ---------FAILED
Episode 11/1000 ------FAILED
Episode 12/1000 -------------FAILED
Episode 13/1000 --FAILED
Episode 14/1000 --------------FAILED
Episode 15/1000 -------------------FAILED
Episode 16/1000 ------FAILED
Episode 17/1000 --------------FAILED
Episode 18/1000 ---FAILED
Episode 19/1000 -----FAILED
Episode 20/1000 ---FAILED
Episode 21/1000 ----------FAILED
Episode 22/1000 -------FAILED
Episode 23/1000 ------FAILED
Episode 24/1000 --------FAILED
Episode 25/1000 ---FAILED
Episode 26/1000 -----FAILED
Episode 27/1000 -----FAILED
Episode 28/1000 --FAILED
Episode 29/1000 ------FAILED
Episode 30/1000 --------FAILED
Episode 31/1000 ---------FAILED
Episode 32/1000 --------------

Episode 475/1000 ----FAILED
Episode 476/1000 ---FAILED
Episode 477/1000 -----------FAILED
Episode 478/1000 --FAILED
Episode 479/1000 -------FAILED
Episode 480/1000 ------FAILED
Episode 481/1000 -----FAILED
Episode 482/1000 ----------FAILED
Episode 483/1000 --FAILED
Episode 484/1000 --FAILED
Episode 485/1000 -------------------FAILED
Episode 486/1000 ---FAILED
Episode 487/1000 --------------FAILED
Episode 488/1000 -------------FAILED
Episode 489/1000 -----FAILED
Episode 490/1000 --FAILED
Episode 491/1000 ------FAILED
Episode 492/1000 -----------FAILED
Episode 493/1000 -------------FAILED
Episode 494/1000 -----------------FAILED
Episode 495/1000 ---FAILED
Episode 496/1000 -------FAILED
Episode 497/1000 --------------SUCCESS
Episode 498/1000 -----------FAILED
Episode 499/1000 -------------------------SUCCESS
Episode 500/1000 ------FAILED
Episode 501/1000 ---FAILED
Episode 502/1000 ---FAILED
Episode 503/1000 --FAILED
Episode 504/1000 ---FAILED
Episode 505/1000 ---------------FAILED
Episode

Episode 862/1000 -----FAILED
Episode 863/1000 --------------FAILED
Episode 864/1000 --------------------------FAILED
Episode 865/1000 --------------------------------------FAILED
Episode 866/1000 ---------------------------------------FAILED
Episode 867/1000 -----------------------------------------FAILED
Episode 868/1000 ----------------------------------------------------------------------------------------------------
Episode 869/1000 ----------------------------------------------------------------------------------------------------
Episode 870/1000 ------------------------------------------------------------FAILED
Episode 871/1000 ------------------SUCCESS
Episode 872/1000 --------------------------------------------------SUCCESS
Episode 873/1000 ----------------------------------------FAILED
Episode 874/1000 ----------------FAILED
Episode 875/1000 --------------------FAILED
Episode 876/1000 ---------------FAILED
Episode 877/1000 -------FAILED
Episode 878/1000 --------------------

## Test Q-table (the `agent`)

In [164]:
# Learned action values: one row per state, one column per action.
q_table.q_table

array([[-2.06952362, -1.18700144, -1.9975492 , -2.01918666],
       [-6.63367372, -6.10958537, -4.73099712, -1.18219919],
       [-3.01577792, -2.80847895, -2.63912003, -2.09304602],
       [-4.97200811, -4.82077609, -4.23417131, -2.41452257],
       [-1.1113254 , -3.68132603, -3.6246798 , -4.40315683],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [-7.62319521, -8.36069396, -4.00623342, -7.08713114],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [-4.42125827, -2.55609826, -5.29127536, -0.61116544],
       [-3.44395802,  1.06824206, -5.9806985 , -2.42490136],
       [ 2.17252481, -1.96198101, -6.6115975 , -4.56528322],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [-5.7740482 , -2.60441218,  1.64062806, -3.06680809],
       [ 1.6155042 ,  6.79209893,  3.07980494,  1.58253021],
       [ 0.        ,  0.        ,  0.        ,  0.        ]])

In [165]:
# Start a fresh episode for the manual roll-out and show the starting grid.
state = env.reset()
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


Run this `cell` repeatedly to observe the `agent`

In [194]:
# Take one greedy step: the table picks the action for the current `state`
# and the raw step result is printed as returned by the environment.
# NOTE(review): nothing here stops the cell from stepping again once `done`
# is True — reset the environment before starting another roll-out.
obs = env.step(q_table.get_next_action(state))
print(obs)

# Advance the tracked state; `new_state` is kept as a separate name as well.
state, reward, done, info = obs
new_state = state
env.render()

(15, 0, True, {'prob': 1.0})
  (Left)
SFFF
FHFH
FFFH
HFF[41mG[0m
