# Q-learning with FrozenLake

Based on https://github.com/ioarun/openai-gym/blob/master/frozenlake/frozenlake-qlearning.py 

Environment: https://www.gymlibrary.dev/environments/toy_text/frozen_lake/

In [9]:
import numpy as np
import random
import math

First we need to install pygame.

In [10]:
!pip install pygame

Defaulting to user installation because normal site-packages is not writeable
Collecting pygame
  Downloading pygame-2.1.2-cp39-cp39-win_amd64.whl (8.4 MB)
Installing collected packages: pygame
Successfully installed pygame-2.1.2


In [None]:
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

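Next, we install the requirements needed to render the environment. (The cell also installs `gym[atari]`, although FrozenLake itself is a toy-text environment and not an Atari game.)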

In [None]:
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1
!pip install pyglet==1.2.4

Next, we define the functions used to record a video of the environment and to show it inside the Colab notebook.

In [2]:
import gym
from gym.wrappers import RecordVideo 
import glob
import io
import base64
from IPython.display import HTML
from pyvirtualdisplay import Display
from IPython import display as ipythondisplay

# Start an invisible (visible=0) virtual display of 1400x900 pixels.
# NOTE(review): presumably needed so rendering works on headless machines such as Colab — confirm.
display = Display(visible=0, size=(1400, 900))
display.start()

"""
Utility functions to enable video recording of gym environment 
and displaying it.
To enable video, just do "env = wrap_env(env)"
"""


def show_video():
    """Embed the first recording found in the ``video`` folder in the notebook.

    Searches for ``video/*.mp4`` relative to the current working directory.
    If at least one file matches, the first match returned by ``glob`` is
    read, base64-encoded and displayed inline as an autoplaying, looping HTML
    ``<video>`` element. Otherwise ``"Could not find video"`` is printed.
    """
    mp4list = glob.glob('video/*.mp4')
    if not mp4list:
        print("Could not find video")
        return

    # Read-only binary mode is sufficient, and the ``with`` block guarantees
    # that the file handle is closed even if reading fails.
    with open(mp4list[0], 'rb') as video_file:
        encoded = base64.b64encode(video_file.read())
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))


def wrap_env(env):
    """Return *env* wrapped in a ``RecordVideo`` that uses ``./video`` as its video folder."""
    return RecordVideo(env, './video')


ModuleNotFoundError: No module named 'pyvirtualdisplay'

## Problem description

In [None]:
'''
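The agent controls the movement of a character in a grid world. Some tiles of the grid are walkable, and others lead to the agent falling into the water. Additionally, in the default (slippery) version of the environment, the movement direction of the agent is uncertain and only partially depends on the chosen direction; this notebook creates the environment with is_slippery=False, so every move goes exactly in the chosen direction. The agent is rewarded for finding a walkable path to a goal tile.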

FrozenLake-v1 (called frozenlake-v0 in older gym versions) is a 4x4 grid world that looks as follows:
SFFF       
FHFH       
FFFH       
HFFG       

Meaning of the letters:
S: starting point, safe
F: frozen surface, safe
H: hole, fall to your doom
G: goal, where the frisbee is located

The 16 states (position of the agent): 
State 0: upper left corner (Start)
...
State 15: Lower right corner (Goal)

The 4 actions (moves of the agent):
LEFT = 0,
DOWN = 1,
RIGHT = 2,
UP = 3.

Reward:
The episode ends when you reach the goal or fall into the water. 
You receive a reward of 1 if you reach the goal, and 0 otherwise.

Effect of actions:
        def inc(row, col, a):
            if a == LEFT:
                col = max(col-1,0)
            elif a == DOWN:
                row = min(row+1,nrow-1)
            elif a == RIGHT:
                col = min(col+1,ncol-1)
            elif a == UP:
                row = max(row-1,0)
            return (row, col)
'''

## Define environment

In [6]:
# Create the FrozenLake environment without slipping (is_slippery=False) and with
# "rgb_array" rendering, then wrap it with wrap_env so that it is recorded to ./video.
env = wrap_env(gym.make("FrozenLake-v1",is_slippery=False,new_step_api=True,render_mode="rgb_array"))

NameError: name 'wrap_env' is not defined

In [7]:
# Start a new episode; the training loop further down uses the return value as the initial state.
env.reset()

NameError: name 'env' is not defined

In [8]:
# Render the environment in its current state.
env.render()

NameError: name 'env' is not defined

If you don't see the video, click on the Folder icon in the left sidebar in Colab, open the 'video' folder, download the mp4 file and play it locally.

## Actions

In [None]:
# Draw one random action from the action space; the training loop uses this call for exploration.
env.action_space.sample()

## Initialization

In [None]:
import numpy as np
import random

In [4]:
# Training hyperparameters. The values listed as "alternatives" were left in the
# original cell as commented-out numbers (presumably settings tried earlier).
num_episodes = 15000  # number of training episodes (alternatives: 20000, 60000)
gamma = 0.95  # discount factor applied to the best next-state Q-value (alternative: 0.99)
learning_rate = 0.7  # step size of each Q-value update (alternatives: 0.95, 0.85)
epsilon = 0.5  # probability of choosing a random action (alternatives: 1, 0.15, 0.1)

# Initialize the Q-table with zeros: one row per state (16) and one column per action (4).
Q = np.zeros([16, 4])
Q

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

## Training the Q-table

In [5]:
# Tabular Q-learning: play num_episodes episodes and update Q after every step.
for _ in range(num_episodes):
    state = env.reset()
    done = False
    while not done:
        # Epsilon-greedy action selection: with probability epsilon take a
        # random action (explore), otherwise the best known action (exploit).
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q[state, :])

        # Perform the action and receive the feedback from the environment.
        # NOTE(review): the result is unpacked as a 4-tuple although the
        # environment is created with new_step_api=True — confirm that the
        # RecordVideo wrapper returns 4 values for the installed gym version.
        new_state, reward, done, info = env.step(action)

        # Learn from the experience: move Q[state, action] towards
        # reward + gamma * max_a Q[new_state, a] by a fraction learning_rate.
        update = reward + (gamma * np.max(Q[new_state, :])) - Q[state, action]
        Q[state, action] += learning_rate * update
        state = new_state

NameError: name 'env' is not defined

In [None]:
Q

## Sanity check

In [None]:
'''
Let us sanity check some of the Q-values. 
First we recall what the environment looks like:
SFFF       
FHFH       
FFFH       
HFFG       

And what the 4 actions are:
LEFT = 0
DOWN = 1
RIGHT = 2
UP = 3
'''

'\nLet us sanity check some of the Q-values. \nFirst we recall what the environment looks like:\nSFFF       \nFHFH       \nFFFH       \nHFFG       \n\nAnd what the 4 actions are:\nLEFT = 0\nDOWN = 1\nRIGHT = 2\nUP = 3\n'

In [None]:
np.argmax(Q[0])
# Expected: 1 (DOWN) or 2 (RIGHT) — from the start tile both moves begin a shortest path to the goal.

In [None]:
np.argmax(Q[3])
# Expected: 0 (LEFT) — state 3 is the upper right corner, and DOWN leads into the hole at state 7.

In [None]:
np.argmax(Q[10])
# Expected: 1 (DOWN) — this moves to state 14, right next to the goal; RIGHT would fall into the hole at state 11.

In [None]:
np.argmax(Q[14])
# Expected: 2 (RIGHT) — this steps directly onto the goal (state 15).

## Using the Q-table

In [None]:
# Is our Q good enough to guide us from start to goal without falling into the water?
# Roll out the greedy policy (always the action with the highest Q-value) for at
# most 10 steps in a freshly created, video-recorded environment.
env = wrap_env(gym.make("FrozenLake-v1",is_slippery=False))
state = env.reset()

for step in range(10):
    env.render()
    # Take the action (index) with the maximum expected discounted future reward given that state
    # (To compare against a random policy, use instead: action = env.action_space.sample())
    action = np.argmax(Q[state,:])
    print("Step ",step,": Action ",action)
    state, reward, done, info = env.step(action)

    # Stop as soon as the environment reports the end of the episode.
    if done:
        break

# NOTE(review): env.close() is disabled below — confirm that the recording is
# finalized before show_video() when the episode does not end within the 10 steps.
#env.close()
show_video()

Note: `show_video()` always displays the first mp4 file it finds in the `video` folder. If older recordings are still in that folder, one of them may be shown instead; delete all videos from the folder before running the cell above to be sure that the latest FrozenLake video is shown.