# Frozen lake

Read environment description: 
https://gym.openai.com/envs/FrozenLake-v0/

In [242]:
import gym
import numpy as np
import random

In [104]:
env = gym.make('FrozenLake-v0')
state = env.reset()
print('State:', state)
env.render()

State: 0

[41mS[0mFFF
FHFH
FFFH
HFFG


In [117]:
(state, reward, done, _) = env.step(2)
print((state, reward, done))
env.render()

(5, 0, True)
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG


# Monte-Carlo (v1)

## Simulate random sequences

In [215]:
def run_episode_random():
    env = gym.make('FrozenLake-v0')
    observations = []
    state = env.reset()
    while True:
        action = env.action_space.sample()
        (next_state, reward, done, _) = env.step(action)
        observations.append([state, action, next_state, reward, done])
        if done:
            break
        state = next_state
    return observations

In [159]:
run_episode_random()

[[0, 1, 1, 0.0, False],
 [1, 3, 1, 0.0, False],
 [1, 1, 2, 0.0, False],
 [2, 3, 3, 0.0, False],
 [3, 0, 7, 0.0, True]]

In [220]:
counts = np.zeros((16, 4))
sum_rewards =np.zeros((16, 4))

for i in range(10000):
    observations = run_episode_random()
    last_reward = observations[-1][3]
    for obs in observations:
        state = obs[0]
        action = obs[1]
        counts[state, action] += 1
        sum_rewards[state, action] += last_reward
        
q = sum_rewards/counts

  del sys.path[0]


In [221]:
counts

array([[8262., 8227., 8057., 8410.],
       [3170., 3191., 3132., 3262.],
       [1475., 1430., 1427., 1436.],
       [ 697.,  656.,  700.,  670.],
       [3080., 3031., 3087., 3229.],
       [   0.,    0.,    0.,    0.],
       [ 472.,  429.,  427.,  414.],
       [   0.,    0.,    0.,    0.],
       [1180., 1133., 1157., 1132.],
       [ 370.,  401.,  387.,  370.],
       [ 224.,  229.,  240.,  228.],
       [   0.,    0.,    0.,    0.],
       [   0.,    0.,    0.,    0.],
       [ 174.,  176.,  184.,  180.],
       [ 125.,  150.,  144.,  140.],
       [   0.,    0.,    0.,    0.]])

In [222]:
sum_rewards

array([[121., 116., 115., 111.],
       [ 27.,  44.,  35.,  54.],
       [ 45.,  28.,  45.,  19.],
       [  7.,   7.,   2.,   8.],
       [ 72.,  41.,  53.,  20.],
       [  0.,   0.,   0.,   0.],
       [ 29.,  22.,  24.,   3.],
       [  0.,   0.,   0.,   0.],
       [ 16.,  45.,  37.,  52.],
       [ 22.,  43.,  42.,  24.],
       [ 53.,  45.,  43.,   7.],
       [  0.,   0.,   0.,   0.],
       [  0.,   0.,   0.,   0.],
       [ 14.,  34.,  43.,  30.],
       [ 35.,  75.,  78.,  63.],
       [  0.,   0.,   0.,   0.]])

In [223]:
q

array([[0.01464536, 0.01409991, 0.0142733 , 0.01319857],
       [0.00851735, 0.01378878, 0.01117497, 0.01655426],
       [0.03050847, 0.01958042, 0.03153469, 0.0132312 ],
       [0.01004304, 0.01067073, 0.00285714, 0.0119403 ],
       [0.02337662, 0.01352689, 0.01716877, 0.00619387],
       [       nan,        nan,        nan,        nan],
       [0.06144068, 0.05128205, 0.05620609, 0.00724638],
       [       nan,        nan,        nan,        nan],
       [0.01355932, 0.03971756, 0.03197926, 0.0459364 ],
       [0.05945946, 0.10723192, 0.10852713, 0.06486486],
       [0.23660714, 0.19650655, 0.17916667, 0.03070175],
       [       nan,        nan,        nan,        nan],
       [       nan,        nan,        nan,        nan],
       [0.08045977, 0.19318182, 0.23369565, 0.16666667],
       [0.28      , 0.5       , 0.54166667, 0.45      ],
       [       nan,        nan,        nan,        nan]])

## Act according to q

In [224]:
env = gym.make('FrozenLake-v0')
state = env.reset()
print('State:', state)
env.render()

State: 0

[41mS[0mFFF
FHFH
FFFH
HFFG


In [225]:
action = np.argmax(q[state])
(state, reward, done, _) = env.step(action)
env.render()

  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG


## Evaluate

In [174]:
def run_episode_q():
    env = gym.make('FrozenLake-v0')
    observations = []
    state = env.reset()
    while True:
        action = np.argmax(q[state])
        (next_state, reward, done, _) = env.step(action)
        observations.append([state, action, next_state, reward, done])
        if done:
            break
        state = next_state
    return observations

In [227]:
# Random agent
episodes = 1000
sum = 0.
for i in range(episodes):
    observations = run_episode_random()
    sum += observations[-1][3]
print('Average reward: ', sum/episodes)

Average reward:  0.018


In [230]:
# Q-greedy agent
episodes = 1000
sum = 0.
for i in range(episodes):
    observations = run_episode_q()
    sum += observations[-1][3]
print('Average reward: ', sum/episodes)

Average reward:  0.288


# Monte-Carlo (v2)

In [316]:
counts = np.zeros((16, 4))
sum_rewards =np.zeros((16, 4))
epsilon = 0.1
epochs = 0

def act_eps_greedy(state):
    if random.random() < epsilon:
        return random.randint(0, 3)
    else:
        return np.argmax(sum_rewards[state] / counts[state])

def run_episode():
    env = gym.make('FrozenLake-v0')
    observations = []
    state = env.reset()
    while True:
        action = act_eps_greedy(state)
        (next_state, reward, done, _) = env.step(action)
        observations.append([state, action, next_state, reward, done])
        if done:
            break
        state = next_state
    return observations

In [322]:
while True:

    sum = 0.
    episodes = 1000

    for _ in range(episodes):
        observations = run_episode()
        last_reward = observations[-1][3]
        sum += last_reward
        for obs in observations:
            state = obs[0]
            action = obs[1]
            counts[state, action] += 1
            sum_rewards[state, action] += last_reward

    epochs += 1
    print(epochs, '- Average reward: ', sum/episodes)

94 - Average reward:  0.297
95 - Average reward:  0.317
96 - Average reward:  0.282
97 - Average reward:  0.314
98 - Average reward:  0.316
99 - Average reward:  0.292
100 - Average reward:  0.32
101 - Average reward:  0.3
102 - Average reward:  0.31
103 - Average reward:  0.294
104 - Average reward:  0.325
105 - Average reward:  0.308
106 - Average reward:  0.292
107 - Average reward:  0.305
108 - Average reward:  0.3
109 - Average reward:  0.281
110 - Average reward:  0.303
111 - Average reward:  0.308
112 - Average reward:  0.301
113 - Average reward:  0.291
114 - Average reward:  0.312
115 - Average reward:  0.291
116 - Average reward:  0.313
117 - Average reward:  0.309


KeyboardInterrupt: 

In [321]:
epsilon=0.1