In [1]:
import time
import random
import math
import itertools

In [2]:
import numpy as np

import gym
from gym.envs.registration import register

In [3]:
class QTable:
    """Tabular action-value function for discrete states and actions.

    The table is a `(num_states, num_actions)` float array; entry
    `[state, action]` is the current estimate of the discounted return
    obtained by taking `action` in `state`.
    """

    def __init__(self, num_states, num_actions, alpha=0.2, gamma=0.8):
        """
        `num_states`: number of rows of the table (one per state)

        `num_actions`: number of columns of the table (one per action)

        `alpha`: learning rate

        `gamma`: discount factor
        """
        self.num_states = num_states
        self.num_actions = num_actions
        self.alpha = alpha  # learning rate
        self.gamma = gamma  # discount factor
        # Initialize Q table with 0.
        # `np.float` was only an alias of the builtin `float` and no longer
        # exists in current NumPy releases; `np.float64` is the same dtype.
        self.q_table = np.zeros((num_states, num_actions), dtype=np.float64)

    def update_table(self, state, action, reward, new_state):
        """Apply one Q-learning update for the transition
        `(state, action) -> new_state` that yielded `reward`.

        The new estimate blends the old one with the bootstrapped target:

            Q[s, a] = (1 - alpha) * Q[s, a] + alpha * target
            target  = reward + gamma * max(Q[new_state])

        which is algebraically the same as
        `Q[s, a] + alpha * (target - Q[s, a])`.
        """
        target = reward + self.gamma * np.max(self.q_table[new_state])
        self.q_table[state, action] = (
            (1 - self.alpha) * self.q_table[state, action]
            + self.alpha * target
        )

    def get_next_action(self, state):
        """Return the index of the highest-valued action for `state`.

        Ties are resolved by `np.argmax`, i.e. the lowest action index wins.
        """
        return np.argmax(self.q_table[state])

### Register a non-slippery (deterministic) version of `FrozenLake`

In [4]:
# Register a custom environment id that points at gym's FrozenLakeEnv,
# configured with the 4x4 map and `is_slippery` switched off.
# NOTE(review): running this cell twice in the same kernel may raise or warn
# because the id is already registered — confirm for the installed gym version.
register(
    id='FrozenLakeNotSlippery-v0',
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    kwargs={'map_name': '4x4', 'is_slippery': False},
)

In [5]:
# Instantiate the environment under the custom id 'FrozenLakeNotSlippery-v0'
# and draw its current board.
env = gym.make('FrozenLakeNotSlippery-v0')
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [6]:
# State indices of the holes (H) and the goal (G) on the rendered 4x4 map,
# with cells numbered row by row starting at 0 in the top-left corner.
hole_states = [5, 7, 11, 12]
goal_states = [15]

# Shaped rewards the training loop assigns when an episode ends in a hole
# or at the goal (used instead of the reward returned by `env.step`).
hole_reward = -10
goal_reward = 10

In [7]:
#==========Create a q-table==========#
num_states = env.observation_space.n
num_actions = env.action_space.n
q_table = QTable(num_states, num_actions)

num_episodes = 100
num_timesteps = 128  # upper bound on the number of steps in one episode
# set the percent you want to explore
# (probability of picking a random action; decays linearly to ~0 over training)
epsilon = 0.8
decrease_step_per_episode = epsilon / num_episodes

# Seed the RNG that drives every explore/exploit draw in this cell so that
# re-running it (or "Restart & Run All") reproduces the same training run.
random.seed(0)

# One entry per episode; each entry is the list of per-step records.
training_log = []
for episode in range(num_episodes):
    print(f'Episode {episode+1}/{num_episodes} ', end='', flush=True)

    episode_log = []
    state = env.reset()
    for timestep in range(num_timesteps):
        print('-', end='', flush=True)
        #==========Taking action: Explore or Exploit==========#

        if random.uniform(0, 1) < epsilon:
            # Explore: select a random action
            action = random.randrange(num_actions)
            isRand = True
        else:
            # Exploit: select the action with max value (future reward)
            action = q_table.get_next_action(state)
            isRand = False

        # The environment's own `reward` is ignored below; the shaped
        # `mReward` is what the table learns from.
        new_state, reward, done, info = env.step(action)

        if done and (new_state in hole_states):
            # if fall into hole
            mReward = hole_reward
        elif done and (new_state in goal_states):
            # reach the goal
            mReward = goal_reward
        elif state == new_state:
            # the agent slams into the wall (not moving to get points)
            mReward = -1
        else:
            # NOTE
            # do not give reward to the agent for repeated tasks
            # the agent will try to exploit them
            mReward = 0

        #==========Updating the q-table==========#
        q_table.update_table(state, action, mReward, new_state)

        episode_log.append({
            'state': state,
            'action': action,
            'isRand': isRand,
            'reward': mReward,
            'new_state': new_state
        })

        state = new_state
        if done:
            break

    # Linear decay; clamp at 0 so accumulated floating-point error can never
    # leave a (meaningless) negative exploration probability behind.
    epsilon = max(0.0, epsilon - decrease_step_per_episode)
    training_log.append(episode_log)
    print()

Episode 1/100 -------
Episode 2/100 ----------
Episode 3/100 ----------
Episode 4/100 --------
Episode 5/100 --
Episode 6/100 ------------
Episode 7/100 ------
Episode 8/100 ----------------
Episode 9/100 --
Episode 10/100 ----
Episode 11/100 ----------------
Episode 12/100 -----
Episode 13/100 --
Episode 14/100 ----------------
Episode 15/100 --
Episode 16/100 ---------------
Episode 17/100 ------
Episode 18/100 -----
Episode 19/100 ----
Episode 20/100 --------
Episode 21/100 ---------
Episode 22/100 ------
Episode 23/100 --
Episode 24/100 --
Episode 25/100 --
Episode 26/100 --------
Episode 27/100 ----
Episode 28/100 ------
Episode 29/100 ------------
Episode 30/100 --------------
Episode 31/100 ----
Episode 32/100 --
Episode 33/100 ---------------
Episode 34/100 -----------
Episode 35/100 --
Episode 36/100 --
Episode 37/100 --------
Episode 38/100 ----
Episode 39/100 -----
Episode 40/100 -----
Episode 41/100 --------------
Episode 42/100 ---------
Episode 43/100 --------
Episode 44/

## Test Q-table (the `agent`)

In [8]:
# Start a new episode: `state` receives the value returned by `env.reset()`,
# then the board is drawn.
state = env.reset()
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


Run the next cell repeatedly to watch the trained agent move one greedy step at a time.

In [14]:
# Ask the learned table for its best action in the current state,
# apply it to the environment and show the raw step result.
greedy_action = q_table.get_next_action(state)
obs = env.step(greedy_action)
print(obs)

# Unpack the step result and advance `state` so the next run of this cell
# continues from where this one stopped; then draw the board.
new_state, reward, done, info = obs
state = new_state
env.render()

(15, 1.0, True, {'prob': 1.0})
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
