<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#FrozenLake-v0-Environment-Solution" data-toc-modified-id="FrozenLake-v0-Environment-Solution-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>FrozenLake-v0 Environment Solution</a></span><ul class="toc-item"><li><span><a href="#Random-Walk" data-toc-modified-id="Random-Walk-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Random Walk</a></span></li><li><span><a href="#Q-learning" data-toc-modified-id="Q-learning-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Q-learning</a></span></li><li><span><a href="#Deep-Q-learning" data-toc-modified-id="Deep-Q-learning-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Deep Q-learning</a></span></li></ul></li></ul></div>

# FrozenLake-v0 Environment Solution

## Random Walk

In [114]:
import gym
import random

env = gym.make("FrozenLake-v0")
env.reset()
env.render()
reward=0.00
forbidden=[5,7,11,12]
actions = {
    'Left': 0,
    'Down': 1,
    'Right': 2,
    'Up': 3}
counter=0
done = True
while done:
    counter=counter+1
    winning_sequence=[random.choice(["Left","Down","Right"]),
                      random.choice(["Left","Down","Right"]),
                      random.choice(["Left","Down"]),
                      random.choice(["Left","Down","Right","Up"])]
    for a in winning_sequence:
        new_state, reward, done, info = env.step(actions[a])
        print()
        env.render()
        print("Reward: {:.2f}".format(reward))
        if new_state in forbidden:
            env.reset()
            break
        if new_state==15:
            done=False
            break
print("no.of attempts",counter)
print("the winning sequence",winning_sequence)


[41mS[0mFFF
FHFH
FFFH
HFFG

  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
Reward: 0.00

  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
Reward: 0.00

  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
Reward: 0.00

  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
Reward: 0.00

  (Down)
[41mS[0mFFF
FHFH
FFFH
HFFG
Reward: 0.00

  (Right)
SFFF
[41mF[0mHFH
FFFH
HFFG
Reward: 0.00

  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
Reward: 0.00

  (Right)
SFFF
[41mF[0mHFH
FFFH
HFFG
Reward: 0.00
no.of attempts 2
the winning sequence ['Down', 'Right', 'Left', 'Right']


## Q-learning
Please find the original solution [here](https://gym.openai.com/evaluations/eval_OAbMaV0TKe71Cq5Mtof7g/).

In [115]:
# https://gym.openai.com/evaluations/eval_OAbMaV0TKe71Cq5Mtof7g/
# used tutorial https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-0-q-learning-with-tables-and-neural-networks-d195264329d0
# author Akhan Ismailov
# Q-learning algorithm, line 24

import gym
import numpy as np
from gym import wrappers
from tqdm import tqdm

In [116]:
Q = np.zeros([env.observation_space.n, env.action_space.n])
lr = 0.85
gamma = 0.99
num_episodes = 2000
num_iterations = 200
rewards = np.zeros(num_episodes)

for episode in tqdm(range(num_episodes)):
    state = env.reset()
    for iteration in range(num_iterations):
        action = np.argmax( Q[state, :] + np.random.randn(1, env.action_space.n)*(1./(episode+1)) )
        state_new, reward, done, _ = env.step(action)
        Q[state, action] = Q[state, action] + lr*(reward + gamma*np.max(Q[state_new,:]) - Q[state, action])
        state = state_new

        if done or iteration == num_iterations-1:
            rewards[episode] = reward

        if done:
            break

def find_conseq_max():
    sum_cur = sum(rewards[0:100])
    maxx = sum_cur
    for i in range(100, num_episodes):
        sum_cur += rewards[i] - rewards[i-100]
        maxx = max(maxx, sum_cur)
    return maxx / 100

# From the website, solved after 413 episodes. Best 100-episode average reward was 0.82 ± 0.04.

print(find_conseq_max())

100%|██████████| 2000/2000 [00:07<00:00, 254.11it/s]

0.76





## Deep Q-learning 

In [117]:
env.reset()
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [118]:
print("Number of actions: %d" % env.action_space.n)
print("Number of states: %d" % env.observation_space.n)

Number of actions: 4
Number of states: 16


In [119]:
# ！pip install keras==2.3.1 The latest keras version that doesn't requrie TF 2.2+
# !git clone https://github.com/wau/keras-rl2.git
# !cd keras-rl
# !python install .

In [120]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, Embedding, Reshape
from tensorflow.keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.policy import Policy, BoltzmannQPolicy
from rl.memory import SequentialMemory

In [121]:
class DecayEpsGreedyQPolicy(Policy):
    def __init__(self, max_eps=.1, min_eps=.05, lamb=0.001):
        super(DecayEpsGreedyQPolicy, self).__init__()
        self.max_eps = max_eps
        self.lambd = lamb
        self._steps = 0
        self.min_eps = min_eps

    def select_action(self, q_values):
        assert q_values.ndim == 1
        nb_actions = q_values.shape[0]
        eps = self.min_eps + (self.max_eps - self.min_eps) * \
            np.exp(-self.lambd * self._steps)
        self._steps += 1
        if self._steps % 1e3 == 0:
            print("Current eps:", eps)
        if np.random.uniform() < eps:
            action = np.random.random_integers(0, nb_actions - 1)
        else:
            action = np.argmax(q_values)
        return action

In [134]:
np.set_printoptions(threshold=np.inf)
np.set_printoptions(precision=4)

# Get the environment and extract the number of actions.
env.reset()
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

def get_keras_model(action_space_shape):
    model = Sequential()
    model.add(Embedding(16, 4, input_length=1))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(4, activation='relu'))
    model.add(Reshape((4,)))
    print(model.summary())
    return model

model = get_keras_model(nb_actions)

memory = SequentialMemory(limit=10000, window_length=1)
policy = DecayEpsGreedyQPolicy(max_eps=0.9, min_eps=0, lamb=1 / (1e4))
dqn = DQNAgent(model=model, nb_actions=nb_actions,
               memory=memory, nb_steps_warmup=500,
               target_model_update=1e-2, policy=policy,
               enable_double_dqn=False, batch_size=512
               )
dqn.compile(Adam());

Model: "sequential_54"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_48 (Embedding)     (None, 1, 4)              64        
_________________________________________________________________
dense_101 (Dense)            (None, 1, 64)             320       
_________________________________________________________________
dense_102 (Dense)            (None, 1, 16)             1040      
_________________________________________________________________
dense_103 (Dense)            (None, 1, 4)              68        
_________________________________________________________________
reshape_49 (Reshape)         (None, 4)                 0         
Total params: 1,492
Trainable params: 1,492
Non-trainable params: 0
_________________________________________________________________
None


In [137]:
%%time
dqn.fit(env, nb_steps=4000, visualize=False, verbose=1)

Training for 4000 steps ...
Interval 1 (0 steps performed)
   21/10000 [..............................] - ETA: 1:19 - reward: 0.0000e+00



  788/10000 [=>............................] - ETA: 4:27 - reward: 0.0013Current eps: 0.7369313672223718
 1694/10000 [====>.........................] - ETA: 6:40 - reward: 0.0012Current eps: 0.6668030755872005
CPU times: user 4min 8s, sys: 7.14 s, total: 4min 15s
Wall time: 3min 51s


<tensorflow.python.keras.callbacks.History at 0x7f769556e6d0>

In [124]:
ENV_NAME = 'FrozenLake-v0'

In [125]:
dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

In [138]:
# Finally, evaluate our algorithm for 20 episodes.
dqn.test(env, nb_episodes=20, visualize=False)

Testing for 20 episodes ...
Episode 1: reward: 1.000, steps: 16
Episode 2: reward: 0.000, steps: 20
Episode 3: reward: 0.000, steps: 11
Episode 4: reward: 1.000, steps: 24
Episode 5: reward: 1.000, steps: 20
Episode 6: reward: 0.000, steps: 16
Episode 7: reward: 1.000, steps: 34
Episode 8: reward: 1.000, steps: 79
Episode 9: reward: 1.000, steps: 38
Episode 10: reward: 1.000, steps: 19
Episode 11: reward: 0.000, steps: 14
Episode 12: reward: 0.000, steps: 36
Episode 13: reward: 1.000, steps: 46
Episode 14: reward: 0.000, steps: 13
Episode 15: reward: 1.000, steps: 46
Episode 16: reward: 0.000, steps: 100
Episode 17: reward: 1.000, steps: 24
Episode 18: reward: 0.000, steps: 42
Episode 19: reward: 0.000, steps: 10
Episode 20: reward: 0.000, steps: 62


<tensorflow.python.keras.callbacks.History at 0x7f76950d0ed0>