In [17]:
import gym

import numpy as np
import time
import random

from IPython.display import clear_output

In [18]:
# Step 1:Prepare the environment

env = gym.make("FrozenLake-v0")
actions = env.action_space.n
states = env.observation_space.n
env.reset()
actions, states


(4, 16)

In [19]:
# Create the q table
table = np.zeros((states, actions))
table

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [20]:
# Step 2 : Hyperparams
gamma = 0.99
learning_rate = 0.1 

# Exploration rate - This will be updated during training
explorate_start = 1
explorate_end = 0.01
explorate_decay = 0.001
exploration = 1

max_episodes = 10000
max_steps_episode = 100

rewards = []

In [21]:
# Training

for episode in range(max_episodes):
    episode_reward = 0
    state = env.reset()
    done = False
    
    for step in range(max_steps_episode):
        # Choose an action based on explore or exploit
        if exploration > random.random():
            # Explore
            action = random.randrange(actions)
            
        else:
            # exploit using existing information
            # Find the action in the current state which has the max q value
            action = np.argmax(table[state, :])
        
        # Take action
        new_state, reward, done, info  = env.step(action)
        episode_reward+=new_state
        
        # Update table[state, action]
        # The optimal q value is the max of all [new_state, action] values in the new state.
        # Note that this is not the np.argmax() which is the action index
        optimal_q_value = np.max(table[new_state, :])
        table[state, action] =  (1-learning_rate)*table[state, action] + learning_rate*(reward + gamma*optimal_q_value)
        
        # Update the state
        state = new_state
        
        if done:
            rewards.append(episode_reward)
            break
    
    # Update exploration after each episode
    exploration = explorate_end + (explorate_start-explorate_end)*(np.exp(-explorate_decay*episode))
    

In [22]:
# Sanity check on rewards

for i in range(0, len(rewards), 100):
    print("Average rewards per 100 episodes", np.average(rewards[i:i+100]))


Average rewards per 100 episodes 23.23
Average rewards per 100 episodes 22.21
Average rewards per 100 episodes 26.91
Average rewards per 100 episodes 31.66
Average rewards per 100 episodes 33.61
Average rewards per 100 episodes 35.13
Average rewards per 100 episodes 44.31
Average rewards per 100 episodes 46.96
Average rewards per 100 episodes 45.93
Average rewards per 100 episodes 72.06
Average rewards per 100 episodes 61.51
Average rewards per 100 episodes 71.72
Average rewards per 100 episodes 70.1
Average rewards per 100 episodes 91.65
Average rewards per 100 episodes 85.5
Average rewards per 100 episodes 95.24
Average rewards per 100 episodes 107.6
Average rewards per 100 episodes 105.71
Average rewards per 100 episodes 108.35
Average rewards per 100 episodes 125.91
Average rewards per 100 episodes 110.54
Average rewards per 100 episodes 125.06
Average rewards per 100 episodes 117.78
Average rewards per 100 episodes 155.26
Average rewards per 100 episodes 178.16
Average rewards per

In [None]:
# Test

for episode in range(5):
    state = env.reset()
    done = False
    clear_output(wait=True)
    time.sleep(1)
    
    for step in range(max_steps_episode):
        clear_output(wait=True)
        env.render()
        time.sleep(0.3)
        
        # Best action
        action = np.argmax(table[state, :])
        
        # Apply action
        new_state, reward, done, info = env.step(action)
        state = new_state
        if done:
            clear_output(wait=True)
            env.render()
            if reward == 1:
                print("Won")
                time.sleep(3)
            else:
                print("Lost")
                time.sleep(3)
            clear_output(wait=True)
            break
        
    

  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
