In [59]:
import numpy as np
import random
import gym
from tqdm import tqdm_notebook as tqdm
from IPython.display import clear_output
import time

# Reinforcement Learning With the Frozen Lake Game

The agent controls the movement of a character in a grid world. Some tiles of the grid are walkable, and others lead to the agent falling into the water. Additionally, the movement direction of the agent is uncertain and only partially depends on the chosen direction. The agent is rewarded for finding a walkable path to a goal tile.

#### Example Board:
* SFFF       (S: starting point, safe)
* FHFH       (F: frozen surface, safe)
* FFFH       (H: hole, fall to your doom)
* HFFG       (G: goal, where the frisbee is located)

The episode ends when you reach the goal or fall in a hole. You receive a reward of 1 if you reach the goal, and zero otherwise.

###### https://gym.openai.com/envs/FrozenLake-v0/

# Load Game Environment

In [12]:
environment = gym.make("FrozenLake-v0")

  result = entry_point.load(False)


# Create Q-Table

In [3]:
action_space = environment.action_space.n
state_space = environment.observation_space.n
q_table = np.zeros((state_space, action_space))

# Learning HyperParams

In [43]:
total_episodes = 20000
learning_rate = 0.2
max_steps = 100
discount_rate = 0.99

# Exploitation vs Exploration params.
exploration_rate = 1
max_exploration_probability = 1
min_exploration_probability = 0.01
exploration_probability_decay_rate = 0.005

# Q-Learning Algorithm
* Continue learning until either the algorithm runs out of time(steps) or the game ends (death, reaches end, etc.).
* For each step choose some action (a) from the state (s) based on the values stored in the q-table (Q(s,a)).
* Perform chosen action and observe new state (s') and reward (r).
* Update q-table using the Bellman equation.

In [44]:
rewards = []
for episode in tqdm(range(total_episodes)):
    state = environment.reset()
    step = 0
    done = False
    total_reward = 0
    
    for step in range(max_steps):
        # First the algorithm must decide whether or not to take a random
            # action to explore or the best possible action to exploit.
            
        exploration_exploitation_value = random.uniform(0,1)
        
        if exploration_exploitation_value > exploration_rate:
            action = np.argmax(q_table[state,:])
        else:
            action = environment.action_space.sample()
                
        # After choosing an action we update the environment with that action.
        new_state, reward, done, info = environment.step(action)

        # Now we must update the q table with the bellman equation.
        q_table[state, action] = q_table[state, action] * (1 - learning_rate) + \
            learning_rate * (reward + discount_rate * np.max(q_table[new_state,:]))
        
        total_reward += reward
        state = new_state
        
        if done: break
        
    exploration_rate = min_exploration_probability + (max_exploration_probability - min_exploration_probability) * np.exp(-exploration_probability_decay_rate * episode)
        
    rewards.append(total_reward)

n = 1000
rewards_per_n_episodes = np.split(np.array(rewards), total_episodes / n)
for index, reward in enumerate(rewards_per_n_episodes):
    print("After {} episodes of training the average reward was: {} .".format(n * index, str(sum(reward) / n)))

HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))

After 0 episodes of training the average reward was: 0.326 .
After 1000 episodes of training the average reward was: 0.651 .
After 2000 episodes of training the average reward was: 0.671 .
After 3000 episodes of training the average reward was: 0.668 .
After 4000 episodes of training the average reward was: 0.651 .
After 5000 episodes of training the average reward was: 0.666 .
After 6000 episodes of training the average reward was: 0.678 .
After 7000 episodes of training the average reward was: 0.68 .
After 8000 episodes of training the average reward was: 0.678 .
After 9000 episodes of training the average reward was: 0.696 .
After 10000 episodes of training the average reward was: 0.645 .
After 11000 episodes of training the average reward was: 0.668 .
After 12000 episodes of training the average reward was: 0.672 .
After 13000 episodes of training the average reward was: 0.692 .
After 14000 episodes of training the average reward was: 0.645 .
After 15000 episodes of training the av

# Watch the algorithm play FrozenLake

In [61]:
environment.reset()


state = environment.reset()
step = 0
done = False

for step in range(max_steps):
    action = np.argmax(q_table[state,:])

    new_state, reward, done, info = environment.step(action)

    environment.render()
    state = new_state
    
    if done: break
    
    time.sleep(0.2)
    clear_output(wait=True)

            
environment.close()
        

  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
