In [1]:
# !pip3 install box2d-py

In [2]:
import os
from tqdm import tqdm
import pandas as pd
import gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import random
from collections import deque
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
torch.cuda.is_available()

True

## Task
This environment is a continuous control tasks in the Box2D simulator. The goal is to train an agent to control the landing of a rocket into a landing pad. In this environment, landing outside the landing pad is possible. Fuel is infinite, so an agent can learn to fly and then land on its first attempt. The environment is found in this [link](https://gym.openai.com/envs/LunarLander-v2/).

### Actions
The agent has to decide between four actions:
    Action space (Discrete)
- 0- Do nothing
- 1- Fire left engine
- 2- Fire down engine
- 3- Fire right engine

### States
The state of the lander is encoded in 8 variables:
- x position
- y position
- x velocity
- y velocity
- angle
- angular velocity
- left leg touching ground
- right leg touching ground

### Rewards
As the agent observes the current state of the environment and chooses
an action, the environment *transitions* to a new state, and also
returns a reward that indicates the consequences of the action.
This environment rewards the agent for the following:
- -100 lander crashed or lands outside landing pad (ends an episode)
- +100 lander comes to rest within landing pad (ends an episode)
- +10 for each leg currently on the ground (lifting a leg incurs a -10 reward)
- -0.3 for each frame the main engine is used
- -0.03 for using the side engines
- There are miscellaneous positive (negative) rewards for decreasing (increasing) the distance to the landing pads.

The rewards incentivise the agent for landing inside the landing pad on both legs, while using the least amount of fuel as possible.



# Deep Reinforcement Learning using DQN

The DQN algorithm implemented on this notebook was copied from Doc. Damian's DQN architecture discussed in class.

## Initialize environment

In [4]:
# intializing
seed = 0

# load the environment from openai gym
env = gym.make("LunarLander-v2").env
env.seed(seed)
state = env.reset()


## Creating the model

In [5]:
def create_q_model(num_observations, num_actions):
    inputs = layers.Input(shape=(num_observations))
    layer1 = layers.Dense(64, activation="relu")(inputs)
    layer2 = layers.Dense(64, activation="relu")(layer1)
    action = layers.Dense(num_actions, activation=None)(layer2)
    return keras.Model(inputs=inputs, outputs=action)

class ReplayBuffer(object):
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        self.memory.append((*args))

    def sample(self, batch_size):
        samples = random.sample(self.memory, batch_size)
        action_sample = [sample[0] for sample in samples]
        state_sample = np.array([sample[1] for sample in samples])
        state_next_sample = np.array([sample[2] for sample in samples])
        rewards_sample = [sample[3] for sample in samples]
        done_sample = tf.convert_to_tensor([float(sample[4]) for sample in samples])

        return (
            action_sample,
            state_sample,
            state_next_sample,
            rewards_sample,
            done_sample,
        )

    def __len__(self):
        return len(self.memory)

class Agent:
    "Interacts with the environment"

    def __init__(self, num_observations, num_actions):
        self.num_observations = num_observations
        self.num_actions = num_actions

        # The first model makes the predictions for Q-values which are used to make a action.
        self.model_policy = create_q_model(num_observations, num_actions)
        self.model_target = create_q_model(num_observations, num_actions)
        # Deepmind paper used RMSProp however then Adam optimizer is faster
        self.optimizer = keras.optimizers.Adam(learning_rate=1e-3)
        self.memory = ReplayBuffer(buffer_size)
        self.step_count = 0

    def step(self, action, state, state_next, reward, done):
        # Save actions and states in replay buffer
        self.memory.push((action, state, state_next, reward, done))

        self.step_count += 1
        # Update every `train_freq` frame if `batch_size` samples available
        if self.step_count % train_freq == 0 and len(self.memory) > batch_size:
            # sample the replay buffer
            experience_sample = self.memory.sample(batch_size)
            self.learn(experience_sample)

        if self.step_count % update_target_network == 0:
            # update the the target network with new weights
            self.model_target.set_weights(self.model_policy.get_weights())

    def act(self, state, eps=0):
        # Use epsilon-greedy for exploration
        if epsilon > np.random.random():
            # Take random action
            action = np.random.choice(self.num_actions)
        else:
            # Predict action Q-values from state
            action_probs = self.model_policy(state[np.newaxis], training=False)
            # Take best action
            action = tf.argmax(action_probs[0]).numpy()
        return action

    def learn(self, experiences):
        loss_function = keras.losses.Huber()  # Using huber loss for stability

        (
            action_sample,
            state_sample,
            state_next_sample,
            rewards_sample,
            done_sample,
        ) = experiences
        # Build the updated Q-values for the sampled future states
        # Use the target model for stability
        future_rewards = self.model_target.predict(state_next_sample)
        # Q value = reward + discount factor * expected future reward
        updated_q_values = rewards_sample + gamma * tf.reduce_max(
            future_rewards, axis=1
        ) * (1 - done_sample)
        # final frame has no future reward

        # Create a mask so we only calculate loss on the updated Q-values
        masks = tf.one_hot(action_sample, self.num_actions)

        with tf.GradientTape() as tape:
            # Train the model on the states and updated Q-values
            q_values = self.model_policy(state_sample)

            # Apply the masks to the Q-values to get the Q-value for action taken
            q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
            # Calculate loss between new Q-value and old Q-value
            loss = loss_function(updated_q_values, q_action)

        # Backpropagation
        grads = tape.gradient(loss, self.model_policy.trainable_variables)
        self.optimizer.apply_gradients(
            zip(grads, self.model_policy.trainable_variables)
        )

## Training the agent

In [7]:
# Configuration paramaters for the whole setup
gamma = 0.99  # Discount factor for past rewards
epsilon_min = 0.05  # Minimum epsilon greedy parameter
epsilon_max = 1.0  # Maximum epsilon greedy parameter
epsilon = epsilon_max  # Epsilon greedy parameter
batch_size = 256  # Size of batch taken from replay buffer
max_steps_per_episode = 1000  # just a safety constraint
exploration_fraction = 0.6  # Fraction of frames for exploration
buffer_size = int(1e6)  # Maximum replay length
train_freq = 4  # Train the model after 4 actions
update_target_network = 200  # How often to update the target network


episode_rewards = [0.0]
episode_running_mean_list = []
episode_list = []
frame_count_list = []

num_timesteps = 1_000_000  # longer to train
epsilon_greedy_frames = num_timesteps * exploration_fraction

agent = Agent(num_observations=8, num_actions=4)
state = env.reset()
step_count = 0
for frame_count in tqdm(range(1, num_timesteps + 1)):
    action = agent.act(state, epsilon)

    # Apply the sampled action in our environment
    state_next, reward, done, _ = env.step(action)
    
    agent.step(action, state, state_next, reward, done)
    state = state_next
    episode_rewards[-1] += reward

    # Linear Decay probability of taking random action
    epsilon -= (epsilon_max - epsilon_min) / epsilon_greedy_frames
    epsilon = max(epsilon, epsilon_min)

    # Log details
    if frame_count % (5000) == 0:
        running_mean_reward = np.mean(episode_rewards[-20:])
        episodes = len(episode_rewards)
        episode_running_mean_list.append(running_mean_reward)
        episode_list.append(episodes)
        frame_count_list.append(frame_count)

        df = pd.DataFrame([episode_running_mean_list, episode_list, frame_count_list]).T
        df.columns = ['episode_running_mean_list', 'episode_list', 'frame_count_list']

    if done:
        state = env.reset()
        episode_rewards.append(0)


    if frame_count in [5000, 300_000, 400_000, 500_000, 600_000, 700_000, 800_000, 900_000, 1000_000]:
        agent.model_policy.save('models/dqn_'+str(frame_count)+".h5")
        rewards_done = pd.DataFrame(episode_rewards)
        df.to_csv('models/dqn_mean_rewards_'+str(frame_count)+'.csv')
        rewards_done.to_csv('models/dqn_episode_rewards_'+str(frame_count)+'.csv')

  0%|          | 4992/1000000 [01:30<5:11:37, 53.22it/s] 



 27%|██▋       | 271392/1000000 [1:37:54<6:12:14, 32.62it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 30%|██▉       | 299863/1000000 [1:59:34<10:39:25, 18.25it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 32%|███▏      | 317866/1000000 [2:22:05<18:26:47, 10.27it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config 



 40%|████      | 401738/1000000 [3:28:58<5:06:41, 32.51it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 50%|████▉     | 499992/1000000 [4:25:10<3:35:19, 38.70it/s] 



 60%|█████▉    | 599996/1000000 [5:08:20<3:07:52, 35.49it/s]



 70%|██████▉   | 699996/1000000 [5:53:48<2:16:55, 36.52it/s]



 80%|███████▉  | 799999/1000000 [6:46:26<2:57:05, 18.82it/s]



 90%|████████▉ | 899996/1000000 [7:40:37<48:09, 34.61it/s]  



100%|█████████▉| 999999/1000000 [8:26:55<00:00, 34.53it/s]  



100%|██████████| 1000000/1000000 [8:26:55<00:00, 32.88it/s]


## Evaluating the trained agent

In [8]:
def evaluate_policy_dqn(policy, env_name, seed, eval_episodes=10, render=False):
    eval_env = gym.make(env_name)
    eval_env.seed(seed + 100)
    avg_reward = 0.
    for _ in range(eval_episodes):
        state, done = eval_env.reset(), False
        while not done:
            action_probs = model_policy(state[np.newaxis], training=False)
            action = tf.argmax(action_probs[0]).numpy()
            if render:
                eval_env.render()
            state, reward, done, _ = eval_env.step(action)
            avg_reward += reward
    avg_reward /= eval_episodes
    return avg_reward

In [10]:
# DQN
env_name = "LunarLander-v2"
seed = 0
model_policy = keras.models.load_model("models/dqn_1000000.h5", compile=False)
evaluate_policy_dqn(model_policy, env_name, seed, eval_episodes=10, render=False)

82.97793510116728