# Frozen lake

Read environment description: 
https://gym.openai.com/envs/FrozenLake-v0/

In [242]:
import gym
import numpy as np
import random

In [441]:
env = gym.make('FrozenLake-v0')
state = env.reset()
print('State:', state)
env.render()

State: 0

[41mS[0mFFF
FHFH
FFFH
HFFG


In [501]:
(state, reward, done, _) = env.step(3)
print((state, reward, done))
env.render()

(0, 0.0, False)
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG


# Monte-Carlo (v1)

## Simulate random sequences

In [616]:
gamma = 0.9

def run_episode_random():
    env = gym.make('FrozenLake-v0')
    observations = []
    state = env.reset()
    while True:
        action = env.action_space.sample()
        (next_state, reward, done, _) = env.step(action)
        observations.append([state, action, next_state, reward, done, 0.])
        if done:
            break
        state = next_state
        
    g = 0.
    for i in reversed(range(len(observations))):
        g = g * gamma + observations[i][3]
        observations[i][5] = g
    
    return observations

In [534]:
run_episode_random()

[[0, 1, 4, 0.0, False, 0.0],
 [4, 2, 0, 0.0, False, 0.0],
 [0, 1, 4, 0.0, False, 0.0],
 [4, 2, 8, 0.0, False, 0.0],
 [8, 0, 12, 0.0, True, 0.0]]

In [611]:
sum_rewards =np.zeros((16, 4))
counts = np.zeros((16, 4))

for i in range(10000):
    observations = run_episode_random()
    for obs in observations:
        state = obs[0]
        action = obs[1]
        g = obs[5]
        counts[state, action] += 1
        sum_rewards[state, action] += g
        
q = sum_rewards/counts

  del sys.path[0]


In [605]:
q

array([[0.0148462 , 0.01254439, 0.01190693, 0.01202221],
       [0.00505569, 0.00842471, 0.00777432, 0.01141634],
       [0.01791014, 0.0165918 , 0.01894413, 0.00933774],
       [0.00790739, 0.00855484, 0.00638967, 0.01219068],
       [0.02350841, 0.01811445, 0.01387072, 0.01263384],
       [       nan,        nan,        nan,        nan],
       [0.04883013, 0.04321282, 0.03818029, 0.00239562],
       [       nan,        nan,        nan,        nan],
       [0.02065839, 0.04878105, 0.03393753, 0.05339776],
       [0.08031372, 0.10282985, 0.11987337, 0.07174003],
       [0.14430159, 0.14968003, 0.17355491, 0.05090913],
       [       nan,        nan,        nan,        nan],
       [       nan,        nan,        nan,        nan],
       [0.06249295, 0.17137273, 0.25648404, 0.16876812],
       [0.15170053, 0.52348244, 0.48350566, 0.35708213],
       [       nan,        nan,        nan,        nan]])

## Act according to q

In [549]:
env = gym.make('FrozenLake-v0')
state = env.reset()
print('State:', state)
env.render()

State: 0

[41mS[0mFFF
FHFH
FFFH
HFFG


In [595]:
action = np.argmax(q[state])
(state, reward, done, _) = env.step(action)
env.render()

  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG


## Evaluate

In [596]:
def run_episode_q():
    env = gym.make('FrozenLake-v0')
    observations = []
    state = env.reset()
    while True:
        action = np.argmax(q[state])
        (next_state, reward, done, _) = env.step(action)
        observations.append([state, action, next_state, reward, done])
        if done:
            break
        state = next_state
    return observations

In [597]:
# Random agent
episodes = 1000
sum = 0.
for i in range(episodes):
    observations = run_episode_random()
    sum += observations[-1][3]
print('Average reward: ', sum/episodes)

Average reward:  0.011


In [615]:
# Q-greedy agent
episodes = 1000
sum = 0.
for i in range(episodes):
    observations = run_episode_q()
    sum += observations[-1][3]
print('Average reward: ', sum/episodes)

Average reward:  0.092


# Monte-Carlo (v2)

In [696]:
counts = np.zeros((16, 4))
sum_rewards =np.zeros((16, 4))
epsilon = 0.1
gamma = 0.99
epochs = 0

def act_eps_greedy(state):
    if random.random() < epsilon:
        return random.randint(0, 3)
    else:
        return np.argmax(sum_rewards[state] / counts[state])

def run_episode():
    env = gym.make('FrozenLake-v0')
    observations = []
    state = env.reset()
    while True:
        action = act_eps_greedy(state)
        (next_state, reward, done, _) = env.step(action)
        observations.append([state, action, next_state, reward, done, 0.])
        if done:
            break
        state = next_state
        
    g = 0.
    for i in reversed(range(len(observations))):
        g = g * gamma + observations[i][3]
        observations[i][5] = g
    
    return observations

In [697]:
while True:

    sum = 0.
    episodes = 1000

    for _ in range(episodes):
        observations = run_episode()
        for obs in observations:
            state = obs[0]
            action = obs[1]
            g = obs[5]
            counts[state, action] += 1
            sum_rewards[state, action] += g
            sum += obs[3]

    epochs += 1
    print(epochs, '- Average reward: ', sum/episodes)

  # This is added back by InteractiveShellApp.init_path()


1 - Average reward:  0.087
2 - Average reward:  0.272
3 - Average reward:  0.34
4 - Average reward:  0.369
5 - Average reward:  0.385
6 - Average reward:  0.375
7 - Average reward:  0.351
8 - Average reward:  0.351
9 - Average reward:  0.353
10 - Average reward:  0.355
11 - Average reward:  0.364
12 - Average reward:  0.357
13 - Average reward:  0.373
14 - Average reward:  0.362
15 - Average reward:  0.375
16 - Average reward:  0.378
17 - Average reward:  0.38


KeyboardInterrupt: 

In [628]:
epsilon=0.1