In [1]:
%matplotlib inline

import gym
from gym.wrappers import Monitor
import itertools
import numpy as np
import os
import random
import sys
import tensorflow as tf

if "../" not in sys.path:
  sys.path.append("../")

from lib import plotting
from collections import deque, namedtuple

In [2]:
# Create the Breakout environment that the cells below reset and step through.
env = gym.envs.make("Breakout-v0")

In [3]:
# Atari Actions: 0 (noop), 1 (fire), 2 (left) and 3 (right) are valid actions.
# The Q-network emits one value per entry of this list, and a sampled action
# index is mapped back through it before being passed to env.step().
VALID_ACTIONS = [0, 1, 2, 3]

In [4]:
class StateProcessor():
    """
    Processes raw Atari images: converts them to grayscale, crops the
    playing field and resizes the result to 84x84.
    """
    def __init__(self):
        pass

    def process(self, state):
        """
        Converts a single raw frame into the network's per-frame input.

        Args:
            state: A [210, 160, 3] Atari RGB State

        Returns:
            A processed [84, 84] state representing grayscale values.
        """
        output = tf.image.rgb_to_grayscale(state)
        # Keep the 160x160 window starting at row 34, column 0.
        output = tf.image.crop_to_bounding_box(output, 34, 0, 160, 160)
        output = tf.image.resize(
            output, [84, 84], method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
        # Drop the size-1 channel axis left over from the grayscale conversion.
        output = tf.squeeze(output)
        return output

In [5]:
class Estimator():
    """Q-Value Estimator neural network.

    This network is used for both the Q-Network and the Target Network.

    Args:
        name: Suffix used for this estimator's summary directory.
        checkpoint_path: Optional directory. When given, the model, optimizer,
            step counter and episode counter are tracked by a checkpoint
            manager that keeps the 3 most recent checkpoints.
        summaries_dir: Optional directory. When given, Tensorboard summaries
            are written to `<summaries_dir>/summaries_<name>`.
    """
    def __init__(self, name, checkpoint_path=None, summaries_dir=None):
        # `tf_global_step` is the checkpointed counter; `global_step` is the
        # Python-side counter used as the summary step.
        self.tf_global_step = tf.Variable(0)
        self.global_step = 0
        self.episode = tf.Variable(0)
        # Writes Tensorboard summaries to disk
        self.summary_writer = None
        self._build_model()
        self.ckpt = None
        self.ckm = None
        if checkpoint_path:
            self.ckpt = tf.train.Checkpoint(optimizer=self.optimizer,
                                            model=self.model,
                                            tf_global_step=self.tf_global_step,
                                            episode=self.episode)
            self.ckm = tf.train.CheckpointManager(self.ckpt, checkpoint_path, max_to_keep=3)
        if summaries_dir:
            summary_dir = os.path.join(summaries_dir, "summaries_{}".format(name))
            os.makedirs(summary_dir, exist_ok=True)
            self.summary_writer = tf.summary.create_file_writer(summary_dir)

    def _build_model(self):
        """
        Builds the Keras model and its optimizer.
        """
        self.model = tf.keras.Sequential([
            tf.keras.layers.Conv2D(32, 8, 4, activation='relu'),
            tf.keras.layers.Conv2D(64, 4, 2, activation='relu'),
            tf.keras.layers.Conv2D(64, 3, 1, activation='relu'),
            tf.keras.layers.Flatten(),
            # The hidden fully connected layer needs a non-linearity; without
            # it the two Dense layers collapse into a single linear map.
            tf.keras.layers.Dense(512, activation='relu'),
            tf.keras.layers.Dense(len(VALID_ACTIONS))
        ])
        # Same values as before, spelled out by keyword.
        self.optimizer = tf.keras.optimizers.RMSprop(
            learning_rate=0.00025, rho=0.99, momentum=0.0, epsilon=1e-6)

    def predict(self, s):
        """
        Predicts action values.

        Args:
          s: Batch of stacked frames with raw pixel values; it is divided by
            255 before being fed to the model.

        Returns:
          The model output for the batch, one value per valid action.
        """
        X = s / 255.0
        return self.model(X)

    def save(self):
        """Increments the episode counter and, if checkpointing is enabled, saves a checkpoint."""
        self.episode.assign_add(1)
        if self.ckm:
            self.ckm.save()

    def load(self):
        """Restores the latest checkpoint, if checkpointing is enabled and one exists."""
        if self.ckpt:
            self.ckpt.restore(self.ckm.latest_checkpoint)
            if self.ckm.latest_checkpoint:
                # Keep the Python-side counter a plain int.
                self.global_step = int(self.tf_global_step.numpy())
                print("Restored from {}".format(self.ckm.latest_checkpoint))
                print("Starting from step {}".format(self.global_step))
            else:
                print("Initializing from scratch.")

    def update(self, s, a, y):
        """
        Updates the estimator towards the given targets.

        Args:
          s: Batch of stacked frames with raw pixel values (same layout as
            accepted by `predict`)
          a: Chosen actions of shape [batch_size]
          y: Targets of shape [batch_size]

        Returns:
          The calculated loss on the batch.
        """
        with tf.GradientTape() as tape:
            X = s / 255.0
            batch_size = tf.shape(X)[0]
            predictions = self.model(X)
            # Pick Q(s, a) for each row by indexing into the flattened predictions.
            gather_indices = tf.range(batch_size) * tf.shape(predictions)[1] + a
            action_predictions = tf.gather(tf.reshape(predictions, [-1]), gather_indices)
            # Calculate the loss
            losses = tf.math.squared_difference(y, action_predictions)
            loss = tf.reduce_mean(losses)

        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
        self.global_step = self.global_step + 1
        # The checkpointed counter is only advanced when checkpointing is enabled.
        if self.ckpt:
            self.tf_global_step.assign_add(1)
        # Summaries for Tensorboard
        if self.summary_writer:
            with self.summary_writer.as_default():
                tf.summary.scalar("loss", loss, step=self.global_step)
                tf.summary.histogram("loss_hist", losses, step=self.global_step)
                tf.summary.histogram("q_values_hist", predictions, step=self.global_step)
                tf.summary.scalar("max_q_value", tf.reduce_max(predictions), step=self.global_step)
                self.summary_writer.flush()

        return loss

In [6]:
# Smoke test: run one prediction and one training step on a tiny batch.
sp = StateProcessor()
e = Estimator(name="test")

# Repeat the first processed frame four times along the last axis to form one
# network input, then duplicate that input to get a batch of two.
observation = env.reset()
observation_p = sp.process(observation)
observation = np.stack([observation_p for _ in range(4)], axis=2)
observations = np.array([observation for _ in range(2)])

# Test Prediction
print(e.predict(observations))

# Test training step
y = np.array([10.0, 10.0])
a = np.array([1, 3])
print(e.update(observations, a, y))

tf.Tensor(
[[-0.11248106 -0.03248368 -0.00904444  0.08085841]
 [-0.11248107 -0.03248368 -0.00904444  0.08085841]], shape=(2, 4), dtype=float32)
tf.Tensor(99.52005, shape=(), dtype=float32)


In [7]:
def make_epsilon_greedy_policy(estimator, nA):
    """
    Builds an epsilon-greedy policy on top of a Q-function approximator.

    Args:
        estimator: Object whose `predict(batch)` returns q values, one row per
            observation in the batch.
        nA: Number of actions in the environment.

    Returns:
        A function `policy_fn(observation, epsilon)` that returns the action
        probabilities as a numpy array of length nA: every action gets
        epsilon / nA and the greedy action gets the remaining 1 - epsilon on top.
    """
    def policy_fn(observation, epsilon):
        # Exploration mass is spread evenly over all actions.
        probs = np.full(nA, epsilon / nA, dtype=float)
        # The estimator works on batches, so wrap the observation in a batch of one.
        single_batch = np.expand_dims(observation, 0)
        q_values = estimator.predict(single_batch)[0]
        greedy_action = np.argmax(q_values)
        probs[greedy_action] += (1.0 - epsilon)
        return probs
    return policy_fn

In [15]:
# Scratch check: displays the integers 0..31; the result is not stored.
np.arange(32)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])

In [8]:
def _reset_stacked_state(env, state_processor):
    """Resets `env` and returns the first processed frame repeated 4 times along axis 2."""
    frame = state_processor.process(env.reset())
    return np.stack([frame] * 4, axis=2)


def _copy_model_weights(q_estimator, target_estimator, sample_state):
    """Copies the Q-network weights into the target network.

    The Keras models create their variables on the first forward pass, so both
    networks are run once on `sample_state` before the weights are transferred.
    """
    batch = np.expand_dims(sample_state, 0)
    q_estimator.predict(batch)
    target_estimator.predict(batch)
    target_estimator.model.set_weights(q_estimator.model.get_weights())


def deep_q_learning(env,
                    q_estimator,
                    target_estimator,
                    state_processor,
                    num_episodes,
                    experiment_dir,
                    total_t = 0,
                    replay_memory_size=500000,
                    replay_memory_init_size=50000,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=500000,
                    batch_size=32,
                    record_video_every=50):
    """
    Double Q-Learning algorithm for off-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    This is a generator: it yields once after every finished episode.

    Args:
        env: OpenAI environment
        q_estimator: Estimator object used for the q values
        target_estimator: Estimator object used for the targets
        state_processor: A StateProcessor object
        num_episodes: Number of episodes to run for
        experiment_dir: Directory under which the "checkpoints" and "monitor"
          sub-directories are created
        total_t: Kept for interface compatibility. The value passed in is
          overwritten with `q_estimator.global_step` after `q_estimator.load()`.
        replay_memory_size: Size of the replay memory
        replay_memory_init_size: Number of experiences to sample when
          initializing the replay memory.
        update_target_estimator_every: Copy parameters from the Q estimator to the
          target estimator every N steps
        discount_factor: Gamma discount factor
        epsilon_start: Chance to sample a random action when taking an action.
          Epsilon is decayed over time and this is the start value
        epsilon_end: The final minimum value of epsilon after decaying is done
        epsilon_decay_steps: Number of steps to decay epsilon over
        batch_size: Size of batches to sample from the replay memory
        record_video_every: Record a video every N episodes

    Yields:
        (total_t, EpisodeStats) after each episode, where the EpisodeStats
        arrays are truncated to the episodes seen so far.

    Returns:
        The full EpisodeStats object (as the generator's return value).
    """

    Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])

    # The replay memory
    replay_memory = []

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    # Create directories for checkpoints and monitor output
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    monitor_path = os.path.join(experiment_dir, "monitor")
    os.makedirs(checkpoint_dir, exist_ok=True)
    os.makedirs(monitor_path, exist_ok=True)

    # Resume from the latest checkpoint, if any, and continue its step count.
    q_estimator.load()
    total_t = q_estimator.global_step

    # The epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    # The policy we're following
    policy = make_epsilon_greedy_policy(
        q_estimator,
        len(VALID_ACTIONS))

    # Populate the replay memory with initial experience
    print("Populating replay memory...")
    state = _reset_stacked_state(env, state_processor)

    # Start with the target network equal to the Q-network. clone_model() would
    # only copy the architecture and leave the target with fresh random weights.
    _copy_model_weights(q_estimator, target_estimator, state)

    for _ in range(replay_memory_init_size):
        action_probs = policy(state, epsilons[min(total_t, epsilon_decay_steps-1)])
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        next_state, reward, done, _info = env.step(VALID_ACTIONS[action])
        next_state = state_processor.process(next_state)
        # Slide the frame window: drop the oldest frame, append the newest.
        next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2)
        replay_memory.append(Transition(state, action, reward, next_state, done))
        if done:
            state = _reset_stacked_state(env, state_processor)
        else:
            state = next_state

    print("Replay memory filled!")
    # Record videos
    env = Monitor(env,
                  directory=monitor_path,
                  resume=True,
                  video_callable=lambda count: count % record_video_every == 0)

    # May be None when the estimator was created without a summaries_dir.
    writer = q_estimator.summary_writer

    try:
        for i_episode in range(int(q_estimator.episode.numpy()), num_episodes):

            # Reset the environment
            state = _reset_stacked_state(env, state_processor)
            loss = None

            # One step in the environment
            for t in itertools.count():

                # Epsilon for this time step
                epsilon = epsilons[min(total_t, epsilon_decay_steps-1)]

                # Add epsilon to Tensorboard
                if writer is not None:
                    with writer.as_default():
                        tf.summary.scalar("epsilon", epsilon, step=total_t)

                # Periodically refresh the target network from the Q-network
                if total_t % update_target_estimator_every == 0:
                    _copy_model_weights(q_estimator, target_estimator, state)

                # Print out which step we're on, useful for debugging.
                print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(
                        t, total_t, i_episode + 1, num_episodes, loss), end="")
                sys.stdout.flush()

                # Take a step in the environment
                action_probs = policy(state, epsilon)
                action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
                next_state, reward, done, _info = env.step(VALID_ACTIONS[action])
                next_state = state_processor.process(next_state)
                next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2)

                # If our replay memory is full, pop the first element
                if len(replay_memory) >= replay_memory_size:
                    replay_memory.pop(0)

                # Save transition to replay memory
                replay_memory.append(Transition(state, action, reward, next_state, done))

                # Update statistics
                stats.episode_rewards[i_episode] += reward
                stats.episode_lengths[i_episode] = t

                # Sample a minibatch from the replay memory
                samples = random.sample(replay_memory, batch_size)
                states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(np.array, zip(*samples))

                # Calculate q values and targets
                # This is where Double Q-Learning comes in: the Q-network picks
                # the next action, the target network evaluates it.
                q_values_next = np.asarray(q_estimator.predict(next_states_batch))
                best_actions = np.argmax(q_values_next, axis=1)
                q_values_next_target = np.asarray(target_estimator.predict(next_states_batch))
                # Plain NumPy indexing avoids mixing integer dtypes in a tf.stack.
                selected_q = q_values_next_target[np.arange(len(best_actions)), best_actions]
                not_done = np.invert(done_batch).astype(np.float32)
                targets_batch = (reward_batch + not_done * discount_factor * selected_q).astype(np.float32)

                # Perform gradient descent update
                loss = q_estimator.update(states_batch, action_batch, targets_batch)

                if done:
                    break

                state = next_state
                total_t += 1

            # Save the current checkpoint
            q_estimator.save()

            # Add summaries to tensorboard
            if writer is not None:
                with writer.as_default():
                    tf.summary.scalar("episode_reward", stats.episode_rewards[i_episode], step=total_t)
                    tf.summary.scalar("episode_length", stats.episode_lengths[i_episode], step=total_t)
                    writer.flush()

            yield total_t, plotting.EpisodeStats(
                episode_lengths=stats.episode_lengths[:i_episode+1],
                episode_rewards=stats.episode_rewards[:i_episode+1])
    finally:
        # Close the Monitor wrapper even if the consumer stops iterating early.
        env.close()

    return stats

In [None]:
# Where we save our checkpoints and graphs
experiment_dir = os.path.abspath("./experiments_DDQN/{}".format(env.spec.id))
checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
# Create global step variables
# NOTE(review): global_step_q / global_step_t are not referenced by anything
# else in this cell — confirm whether they are still needed.
global_step_q = 0
global_step_t = 0
    
# Create estimators. Only the Q estimator is given a checkpoint directory;
# both write summaries under experiment_dir.
q_estimator = Estimator(name="q",checkpoint_path=checkpoint_dir,summaries_dir=experiment_dir)
target_estimator = Estimator(name="t",summaries_dir=experiment_dir)

# State processor
state_processor = StateProcessor()

# Run it! deep_q_learning is a generator that yields once per finished episode,
# so the loop body below runs after every episode.
for t, stats in deep_q_learning(env,
                                q_estimator=q_estimator,
                                target_estimator=target_estimator,
                                state_processor=state_processor,
                                experiment_dir=experiment_dir,
                                num_episodes=10000,
                                replay_memory_size=500000,
                                replay_memory_init_size=50000,
                                update_target_estimator_every=10000,
                                epsilon_start=1.0,
                                epsilon_end=0.1,
                                epsilon_decay_steps=500000,
                                discount_factor=0.99,
                                batch_size=32):
    print("\nEpisode Reward: {}".format(stats.episode_rewards[-1]))