# DQN on Pong

Done with reference to Denny Britz's DQN implementation found at
https://github.com/dennybritz/reinforcement-learning/blob/master/DQN/Deep%20Q%20Learning.ipynb

In [2]:
import tensorflow as tf
import numpy as np
import gym
from gym.wrappers import Monitor
import itertools
import os

  from ._conv import register_converters as _register_converters


In [3]:
class ReplayBuffer:
    """
    A fixed-capacity replay buffer for implementing experience replay.

    Experiences are kept in a ring buffer: once `size` experiences are stored,
    each new one overwrites the oldest (FIFO eviction) in O(1), without
    shifting the remaining elements.

    Attributes:
        size: maximum number of experiences held at once
        buffer: list of stored experiences, each a 5-element list
            [state, action, reward, next_state, is_terminal]. Once the buffer
            is full the list is in ring order, not insertion order.
    """
    def __init__(self, size):
        """
        Args:
            size: the size of the replay buffer, items will be evicted in a FIFO manner

        Raises:
            ValueError: if size is not positive
        """
        if size <= 0:
            raise ValueError("size must be a positive integer, got {!r}".format(size))
        self.size = size
        self.buffer = []
        # index of the slot the next experience is written to
        self._next_slot = 0

    def add(self, state, action, reward, next_state, is_terminal):
        """
        Args:
            state
            action
            reward
            next_state
            is_terminal

        Store experience into replay buffer, evicting old experience if buffer is full
        """
        experience = [state, action, reward, next_state, is_terminal]
        if len(self.buffer) < self.size:
            self.buffer.append(experience)
        else:
            # buffer is full: _next_slot points at the oldest entry, overwrite it
            self.buffer[self._next_slot] = experience
        self._next_slot = (self._next_slot + 1) % self.size

    def sample(self, no_of_samples):
        """
        Args:
            no_of_sample: number of samples desired

        Return:
            object array of shape [no_of_sample, 5]; each row is
            (state, action, reward, next_state, is_terminal)

        Samples uniformly, with replacement, from replay buffer
        """
        idx = np.random.choice(len(self.buffer), no_of_samples)
        # Copy only the selected rows. Converting the whole buffer to an array
        # on every call costs O(len(buffer)), and NumPy refuses to build an
        # array from ragged rows unless dtype=object is requested explicitly.
        batch = np.empty((len(idx), 5), dtype=object)
        for row, buffer_index in enumerate(idx):
            for col, item in enumerate(self.buffer[buffer_index]):
                # element-wise assignment stores arrays as objects (no broadcasting)
                batch[row, col] = item
        return batch

In [4]:
# from https://github.com/dennybritz/reinforcement-learning/blob/master/DQN/Deep%20Q%20Learning.ipynb
class StateProcessor():
    """
    Turns a raw Atari RGB frame into a small grayscale frame.

    The TensorFlow ops are built once in the constructor; `process` only runs them.
    """
    def __init__(self):
        # Assemble the preprocessing pipeline inside its own scope
        with tf.variable_scope("state_processor"):
            self.input_state = tf.placeholder(shape=[210, 160, 3], dtype=tf.uint8)
            gray = tf.image.rgb_to_grayscale(self.input_state)
            # keep a 160x160 window starting 34 rows from the top
            cropped = tf.image.crop_to_bounding_box(gray, 34, 0, 160, 160)
            resized = tf.image.resize_images(
                cropped, [84, 84], method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
            # drop the single-channel axis
            self.output = tf.squeeze(resized)

    def process(self, sess, state):
        """
        Args:
            sess: A Tensorflow session object
            state: A [210, 160, 3] Atari RGB State

        Returns:
            A processed [84, 84] state representing grayscale values
            (the channel axis is squeezed away).
        """
        feed = {self.input_state: state}
        return sess.run(self.output, feed)

In [5]:
def weights_init(shape):
    """
    Create the variable 'W' in the current variable scope, initialised from a
    truncated normal distribution (mean 0, stddev 0.1) of the given shape.
    """
    initial_value = tf.truncated_normal(shape, mean=0, stddev=0.1)
    return tf.get_variable('W', initializer=initial_value)

def bias_init(shape):
    """
    Create the variable 'b' in the current variable scope, with every entry
    of the given shape initialised to 0.1.
    """
    initial_value = tf.constant(0.1, shape=shape)
    return tf.get_variable('b', initializer=initial_value)

In [6]:
class Qnetwork:
    """
    Convolutional Q-value network: three conv layers followed by two fully
    connected layers, producing one action value per action.
    """
    def __init__(self, frame_size, no_of_frame, no_of_actions, global_step, scope='Estimator', summaries_dir=None):
        """
        Args:
            frame_size: width and height of a single frame
            no_of_frame: number of frames stacked
            no_of_actions: number of actions (i.e output neurons), this varies from game to game
            global_step: step variable incremented by the train op; its value is
                also used as the step of the summaries written by `update`
            scope: name of scope. Used to distinguish target and estimator network
            summaries_dir: if given, Tensorboard summaries are written to
                <summaries_dir>/summaries_<scope>; otherwise no writer is created
        """
        self.scope = scope
        self.summary_writer = None
        self.step = 0
        # Keep the step tensor on the instance so `update` does not have to rely
        # on a module-level variable that happens to be called `global_step`.
        self.global_step = global_step
        with tf.variable_scope(scope):
            # build summary writer
            if summaries_dir:
                summary_dir = os.path.join(summaries_dir, "summaries_{}".format(scope))
                if not os.path.exists(summary_dir):
                    os.makedirs(summary_dir)
                self.summary_writer = tf.summary.FileWriter(summary_dir)
            
            
            self.X = tf.placeholder(tf.uint8, shape=[None, frame_size, frame_size, no_of_frame], name='X')
            self.targets = tf.placeholder(tf.float32, shape=[None], name='targets')
            self.selected_actions = tf.placeholder(tf.int32, shape=[None], name='actions')
            # scale pixel values from [0, 255] to [0, 1]
            X = tf.to_float(self.X) / 255.0
            batch_size = tf.shape(self.X)[0]
            
            with tf.variable_scope('conv1'):
                W1 = weights_init([8,8, no_of_frame, 32])
                b1 = bias_init([32])
                conv1 = tf.nn.relu(tf.nn.conv2d(X, W1, strides=[1,4,4,1], padding='VALID') + b1)
            with tf.variable_scope('conv2'):
                W2 = weights_init([4,4, 32, 64])
                b2 = bias_init([64])
                conv2 = tf.nn.relu(tf.nn.conv2d(conv1, W2, strides=[1,2,2,1], padding='VALID') + b2)
            with tf.variable_scope('conv3'):
                W3 = weights_init([3,3, 64, 64])
                b3 = bias_init([64])
                conv3 = tf.nn.relu(tf.nn.conv2d(conv2, W3, strides=[1,1,1,1], padding='VALID') + b3)
                # spatial width after the three VALID convolutions:
                # (w - 8)//4 + 1, then (w - 4)//2 + 1, then (w - 3) + 1
                final_conv_width = (((frame_size - 8)//4 + 1 - 4)//2 + 1 - 3) + 1
            with tf.variable_scope('fc4'):
                W4 = weights_init([final_conv_width**2 * 64, 512])
                b4 = bias_init([512])
                flattened = tf.reshape(conv3, [-1, final_conv_width**2 * 64])
                fc4 = tf.nn.relu(tf.matmul(flattened, W4) + b4)
            with tf.variable_scope('fc5'):
                W5 = weights_init([512, no_of_actions])
                b5 = bias_init([no_of_actions])
                self.predictions = tf.matmul(fc4, W5) + b5

            # compute loss
            # we need to extract the action values of selected actions
            # to do that, we will flatten the predictions into a 1d array
            # we then transform the action index to an index compatible with this 1d array
            # we transform the action index by adding the action index to the offset for each row
            # reference from
            # https://github.com/dennybritz/reinforcement-learning/blob/master/DQN/Deep%20Q%20Learning.ipynb
            with tf.variable_scope('loss'):
                gather_indices = tf.range(batch_size) * tf.shape(self.predictions)[1] + self.selected_actions
                self.action_predictions = tf.gather(tf.reshape(self.predictions, [-1]), gather_indices)

                self.losses = tf.squared_difference(self.targets, self.action_predictions)
                self.loss = tf.reduce_mean(self.losses)

            # create train op
            # I am neglecting error clipping that was used in the paper
            # NOTE(review): the optimizer is built inside the 'loss' scope, as
            # before. Moving it may rename the optimizer's own variables and so
            # break loading of existing checkpoints -- verify before changing.
                self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6)
                self.train_op = self.optimizer.minimize(self.loss, 
                                                        var_list=[W1, b1, W2, b2, W3, b3, W4, b4, W5, b5],
                                                        global_step=global_step)
            
            # Summaries for Tensorboard
            self.summaries = tf.summary.merge([
                tf.summary.scalar("loss", self.loss),
                tf.summary.histogram("loss_hist", self.losses),
                tf.summary.histogram("q_values_hist", self.predictions),
                tf.summary.scalar("max_q_value", tf.reduce_max(self.predictions))
            ])
            
    def predict(self, sess, s):
        """
        Predicts action values.

        Args:
          sess: Tensorflow session
          s: State input of shape [batch_size, frame_size, frame_size, no_of_frames]

        Returns:
          Tensor of shape [batch_size, no_of_actions] containing the estimated 
          action values.
        """
        return sess.run(self.predictions, { self.X: s })

    def update(self, sess, s, a, targets):
        """
        Updates the network towards the given targets.

        Runs one training step and, when a summary writer was created, records
        the merged summaries at the global step returned by the same run.

        Args:
          sess: Tensorflow session object
          s: State input of shape [batch_size, frame_size, frame_size, no_of_frames]
          a: Chosen actions of shape [batch_size]
          targets: Targets of shape [batch_size]

        Returns:
          The calculated loss on the batch.
        """
        feed_dict = { self.X: s, self.targets: targets, self.selected_actions: a }
        summaries, _, loss, step = sess.run(
            [self.summaries, self.train_op, self.loss, self.global_step],
            feed_dict)
        if self.summary_writer:
            self.summary_writer.add_summary(summaries, step)
        return loss


In [7]:
# from https://github.com/dennybritz/reinforcement-learning/blob/master/DQN/Deep%20Q%20Learning.ipynb
def copy_model_parameters(sess, network1, network2):
    """
    Copies the trainable parameters of one network to another.

    Variables are paired by sorting each network's trainable variables by
    name, so both networks must create the same variables under their scope.

    Args:
      sess: Tensorflow session instance
      network1: Network to copy the parameters from (uses its `scope` attribute)
      network2: Network to copy the parameters to (uses its `scope` attribute)

    Raises:
      ValueError: if the two scopes hold a different number of trainable variables

    NOTE(review): new assign ops are created on every call, so the graph grows
    each time this runs; consider building the ops once if it is called often.
    """
    def scoped_variables(scope):
        # Match on "<scope>/" rather than the bare scope name, so that e.g.
        # scope "q" does not also pick up variables that live under "q2/".
        prefix = scope.rstrip("/") + "/"
        matches = [v for v in tf.trainable_variables() if v.name.startswith(prefix)]
        return sorted(matches, key=lambda v: v.name)

    source_vars = scoped_variables(network1.scope)
    target_vars = scoped_variables(network2.scope)

    # zip() would silently drop the unmatched tail and leave part of the
    # target network stale, so fail loudly instead.
    if len(source_vars) != len(target_vars):
        raise ValueError(
            "Cannot copy parameters: scope {!r} has {} trainable variables but scope {!r} has {}".format(
                network1.scope, len(source_vars), network2.scope, len(target_vars)))

    update_ops = [dst.assign(src) for src, dst in zip(source_vars, target_vars)]

    sess.run(update_ops)

In [8]:
def select_action(sess, env, q_network, state, epsilon, e_greedy=True):
    """
    Pick an action for a single state.

    With e_greedy enabled, a random action from env.action_space is returned
    with probability epsilon; otherwise the action with the highest value
    predicted by q_network is returned.
    """
    # the random draw only happens when e_greedy is on (short-circuit)
    explore = e_greedy and np.random.uniform() < epsilon
    if not explore:
        # the network expects a batch, so add a leading axis of size 1
        batch_of_one = np.expand_dims(state, 0)
        return np.argmax(q_network.predict(sess, batch_of_one))
    return env.action_space.sample()

In [9]:
def train_dqn(sess,
              env,
              q_network,
              target_network,
              state_processor,
              num_episodes,
              global_step,
              experiment_dir,
              replay_buffer,
              buffer_init_size,
              target_interval,
              frame_skip,
              epsilon_start,
              epsilon_end,
              epsilon_decay_length,
              gamma,
              batch_size,
              record_video=False):

    """
    Trains q_network with DQN: experience replay plus a periodically synced
    target network, acting epsilon-greedily with a linearly decaying epsilon.

    Args:
        sess: Tensorflow session used for every graph evaluation
        env: gym environment; it is wrapped in a Monitor once the replay
            buffer has been populated
        q_network: network being trained, also used to select actions
        target_network: network used to compute the bootstrap targets
        state_processor: object whose process(sess, frame) returns a processed frame
        num_episodes: number of episodes to train for
        global_step: step variable; its value in `sess` seeds the step counter
            (so training resumes where a restored checkpoint stopped)
        experiment_dir: directory under which "checkpoints" and "monitor" are created
        replay_buffer: buffer providing add(...) and sample(n)
        buffer_init_size: no of frames used to initialize replay buffer
        target_interval: number of steps between copies of q_network into target_network
        frame_skip: no. of frames to skip between decision (paper used 4)
        epsilon_start: epsilon at step 0
        epsilon_end: epsilon once the decay has finished
        epsilon_decay_length: number of steps over which epsilon is decayed linearly
        gamma: discount factor
        batch_size: number of experiences sampled per training step
        record_video: when True, the Monitor may record videos during the last
            4 episodes and every 5000th episode. Defaults to False, which
            records nothing.

    Returns:
        (episode_rewards, episode_lengths): two arrays of length num_episodes
    """
    history_length = 4  # number of consecutive frames stacked into one state

    def initial_frames(environment):
        """Reset the environment; return its first processed frame repeated to fill the history."""
        first_frame = state_processor.process(sess, environment.reset())
        # Keep the frames as arrays rather than nested Python lists, so that
        # the stacked states keep the processor's dtype.
        # NOTE(review): presumably uint8 given the uint8 input and
        # nearest-neighbour resize; nested lists would stack to a wider
        # integer dtype -- confirm.
        return [first_frame] * history_length

    def advance(environment, frames, state, chosen_action):
        """Step the environment, push the new frame into `frames`, and store the transition."""
        observation, reward, done, _ = environment.step(chosen_action)
        environment.render()
        frames.pop(0)
        frames.append(state_processor.process(sess, observation))
        next_state = np.stack(frames, axis=2)
        replay_buffer.add(state, chosen_action, reward, next_state, done)
        return next_state, reward, done

    # Keeps track of useful statistics
    episode_lengths=np.zeros(num_episodes)
    episode_rewards=np.zeros(num_episodes)

    # Create directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "monitor")

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)

    saver = tf.train.Saver()
    # Load a previous checkpoint if we find one
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint {}...\n".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)

    total_t = sess.run(global_step)

    # The epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_length)

    # The target network may have been built without a summary writer
    summary_writer = q_network.summary_writer

    # init replay buffer with transitions from a uniformly random policy
    print("Populating replay memory...")
    frames = initial_frames(env)

    for i in range(buffer_init_size):
        if i % 1000 == 0:
            print("\rpopulating buffer: %d" %i, end="")
        current_state = np.stack(frames, axis=2)
        action = env.action_space.sample()
        _, _, done = advance(env, frames, current_state, action)
        if done:
            frames = initial_frames(env)

    print("\nPopulated replay memory")

    # this can only be done after populating replay buffer
    should_record = False
    # The callable reads should_record each time it is called, so flipping the
    # flag per episode below controls recording. Output goes to the monitor
    # directory created above.
    env = Monitor(env, directory=monitor_path, resume=True,
                  video_callable=lambda count: should_record and count % 10 == 0)

    action = None

    for i_episode in range(num_episodes):
        # Save the current checkpoint
        saver.save(sess, checkpoint_path)

        # recording is opt-in: last 4 episodes and every 5000th episode
        should_record = bool(record_video) and (
            i_episode >= num_episodes - 4 or i_episode % 5000 == 0)

        # Reset the environment
        frames = initial_frames(env)
        loss = None
        current_state = np.stack(frames, axis=2)
        for t in itertools.count():
            # Epsilon for this time step
            epsilon = epsilons[min(total_t, epsilon_decay_length-1)]

            # Add epsilon to Tensorboard
            if summary_writer:
                episode_summary = tf.Summary()
                episode_summary.value.add(simple_value=epsilon, tag="epsilon")
                summary_writer.add_summary(episode_summary, total_t)

            # update the target network
            if total_t % target_interval == 0:
                copy_model_parameters(sess, q_network, target_network)

            if t % frame_skip == 0:
                # only make decision every k (frame_skip) steps
                action = select_action(sess, env, q_network, current_state, epsilon, e_greedy=True)

            # Print out which step we're on, useful for debugging.
            print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(
                    t, total_t, i_episode + 1, num_episodes, loss), end="")

            next_state, reward, done = advance(env, frames, current_state, action)

            # Update statistics
            episode_rewards[i_episode] += reward
            episode_lengths[i_episode] = t

            # train on a sampled minibatch; each row is
            # (state, action, reward, next_state, is_terminal)
            samples = replay_buffer.sample(batch_size)
            states, actions, rewards, next_states, terminals = zip(*samples)
            # one forward pass of the target network for the whole batch
            next_q = target_network.predict(sess, np.stack(next_states))
            # rewards are clipped to {-1, 0, 1}
            clipped_rewards = np.sign(np.asarray(rewards, dtype=np.float32))
            # terminal transitions do not bootstrap from the next state
            not_terminal = 1.0 - np.asarray(terminals, dtype=np.float32)
            targets = clipped_rewards + gamma * np.max(next_q, axis=1) * not_terminal
            loss = q_network.update(sess,
                                    np.stack(states),
                                    np.asarray(actions, dtype=np.int32),
                                    targets)

            current_state = next_state
            total_t += 1

            if done:
                break
        print("\nEpisode {}, Reward: {}".format(i_episode + 1, episode_rewards[i_episode]))
        # Add summaries to tensorboard
        if summary_writer:
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=episode_rewards[i_episode], node_name="episode_reward", tag="episode_reward")
            episode_summary.value.add(simple_value=episode_lengths[i_episode], node_name="episode_length", tag="episode_length")
            summary_writer.add_summary(episode_summary, total_t)
            summary_writer.flush()

    env.render(close=True)
    env.close()
    return episode_rewards, episode_lengths

In [None]:
# Clear the default graph before building the networks for this run
tf.reset_default_graph()

global_step = tf.Variable(0, trainable=False, name='global_step')

env = gym.make('Pong-v0')

# Where we save our checkpoints and graphs
experiment_dir = os.path.abspath("./experiments/{}".format(env.spec.id))

# Both networks see 4 stacked 84x84 frames; only the estimator writes summaries
n_actions = env.action_space.n
q_network = Qnetwork(84, 4, n_actions, global_step, scope='estimator', summaries_dir=experiment_dir)
target_network = Qnetwork(84, 4, n_actions, global_step, scope='target')

state_processor = StateProcessor()

replay_buffer = ReplayBuffer(size=500000)

# Everything train_dqn needs apart from the session
training_options = dict(
    env=env,
    q_network=q_network,
    target_network=target_network,
    state_processor=state_processor,
    num_episodes=5000,
    global_step=global_step,
    experiment_dir=experiment_dir,
    replay_buffer=replay_buffer,
    buffer_init_size=100000,
    target_interval=10000,
    frame_skip=1,
    epsilon_start=1,
    epsilon_end=0.1,
    epsilon_decay_length=500000,
    gamma=0.99,
    batch_size=32,
)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # copy network to target
    copy_model_parameters(sess, q_network, target_network)
    rewards, lengths = train_dqn(sess=sess, **training_options)

[2018-02-12 17:48:48,602] Making new env: Pong-v0
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Loading model checkpoint /home/aiskyscraper/dqn/experiments/Pong-v0/checkpoints/model...

INFO:tensorflow:Restoring parameters from /home/aiskyscraper/dqn/experiments/Pong-v0/checkpoints/model


[2018-02-12 17:48:49,745] Restoring parameters from /home/aiskyscraper/dqn/experiments/Pong-v0/checkpoints/model


Populating replay memory...
populating buffer: 2000