# Frozen lake

Read the environment description:
https://gym.openai.com/envs/FrozenLake-v0/

In [1]:
import gym
import numpy as np
import random

# Q-learning / Sarsa

In [17]:
# Hyper-parameters read by the policy and by the training cells.
gamma = 0.99          # discount factor applied to the bootstrapped next value
learning_rate = 0.1   # step size of each tabular update
epsilon = 0.1         # initial exploration probability (training cells overwrite it)

# Number of training epochs completed so far; training cells increment it.
epochs = 0

# Tabular action values: one row per state (16) and one column per action (4).
q = np.zeros(shape=(16, 4))
# Same-shaped accumulator; not referenced by the other cells visible here.
sum_rewards = np.zeros_like(q)

def act_eps_greedy(state):
    """Pick an action for `state` with an epsilon-greedy rule.

    Reads the module-level `epsilon` and `q`. With probability `epsilon` a
    uniformly random action index in 0..3 is returned; otherwise the index
    of the largest entry of `q[state]` (first one on ties, per np.argmax).
    """
    explore = random.random() < epsilon
    if not explore:
        # Greedy branch: exploit the current value estimates.
        return np.argmax(q[state])
    return random.randint(0, 3)

def run_episode(env=None):
    """Play one episode with the epsilon-greedy policy and record it.

    Args:
        env: Optional environment object exposing `reset()` returning the
            initial state and `step(action)` returning a 4-tuple
            `(next_state, reward, done, info)`. When omitted, a fresh
            'FrozenLake-v0' environment is created for this call and closed
            before returning, so repeated calls do not leak environments.
            A caller-supplied environment is left open for reuse.

    Returns:
        A list of `[state, action, next_state, reward, done]` transitions in
        the order they occurred. The final entry is the first transition for
        which `done` was truthy.
    """
    owns_env = env is None
    if owns_env:
        env = gym.make('FrozenLake-v0')
    try:
        observations = []
        state = env.reset()
        while True:
            action = act_eps_greedy(state)
            next_state, reward, done, _ = env.step(action)
            observations.append([state, action, next_state, reward, done])
            if done:
                break
            state = next_state
        return observations
    finally:
        # Only release what this call created; never close a shared env.
        if owns_env:
            env.close()

In [18]:
# Sarsa: on-policy, one-step tabular update of the module-level `q`.
# Each epoch plays `episodes` episodes with the epsilon-greedy policy, then
# replays every recorded transition with
#     q[s, a] += learning_rate * (reward + gamma * q[s', a'] - q[s, a])
# where a' is the action that was actually taken next in the same episode.
episodes = 1000   # episodes per epoch (also the divisor of the reported average)
n_epochs = 25     # epochs per execution of this cell; bounded so the cell
                  # terminates on its own. Re-run the cell to keep training.

for _epoch in range(n_epochs):
    total_reward = 0.
    # Exploration decays exponentially with the global epoch counter and is
    # floored at 1%. `epsilon` is module-level because act_eps_greedy reads it.
    epsilon = np.maximum(np.exp(-epochs/2), 0.01)

    for _ in range(episodes):
        observations = run_episode()
        for i, (state, action, next_state, reward, done) in enumerate(observations):
            # A terminal transition has no successor action: target is the reward.
            target_q = reward
            if not done:
                # run_episode stops at the first `done` transition, so a
                # non-terminal entry always has a successor at i+1.
                next_action = observations[i+1][1]
                next_q = q[next_state, next_action]
                target_q = reward + gamma * next_q
            q[state, action] += learning_rate * (target_q - q[state, action])
            total_reward += reward

    epochs += 1
    print(epochs, '- Average reward:', total_reward/episodes, ' (eps', epsilon, ')')

1 - Average reward: 0.014  (eps 1.0 )
2 - Average reward: 0.046  (eps 0.6065306597126334 )
3 - Average reward: 0.1  (eps 0.36787944117144233 )
4 - Average reward: 0.174  (eps 0.22313016014842982 )
5 - Average reward: 0.333  (eps 0.1353352832366127 )
6 - Average reward: 0.409  (eps 0.0820849986238988 )
7 - Average reward: 0.512  (eps 0.049787068367863944 )
8 - Average reward: 0.533  (eps 0.0301973834223185 )
9 - Average reward: 0.614  (eps 0.01831563888873418 )
10 - Average reward: 0.678  (eps 0.011108996538242306 )
11 - Average reward: 0.641  (eps 0.01 )
12 - Average reward: 0.663  (eps 0.01 )
13 - Average reward: 0.691  (eps 0.01 )
14 - Average reward: 0.669  (eps 0.01 )
15 - Average reward: 0.664  (eps 0.01 )
16 - Average reward: 0.682  (eps 0.01 )
17 - Average reward: 0.676  (eps 0.01 )
18 - Average reward: 0.664  (eps 0.01 )
19 - Average reward: 0.667  (eps 0.01 )
20 - Average reward: 0.635  (eps 0.01 )
21 - Average reward: 0.66  (eps 0.01 )
22 - Average reward: 0.622  (eps 0.01 )


KeyboardInterrupt: 

In [None]:
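# Q-learning (disabled) - unstable without target updates?
# NOTE(review): before re-enabling, check the code below against the Sarsa cell:
#   - it uses `alpha`, which is not defined in the cells above (Sarsa uses `learning_rate`);
#   - the target `next_q + reward` omits the `gamma` discount;
#   - it bootstraps from `next_state` even when the transition is terminal (`done` is ignored).
# These differences may explain the instability independently of target updates.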
# while True:
#     total_reward = 0.
#     episodes = 1000
#     epsilon = np.maximum(np.exp(-epochs/5), 0.01)

#     for _ in range(episodes):
#         observations = run_episode()
#         for (state, action, next_state, reward, _) in observations:
#             next_q = np.max(q[next_state])
#             target_q = next_q + reward
#             q[state, action] += alpha * (target_q - q[state, action])
#             total_reward += reward

#     epochs += 1
#     print(epochs, '- Average reward:', total_reward/episodes, ' (eps', epsilon, ')')