In [1]:
import gym
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim
from tensorflow.contrib.layers import xavier_initializer_conv2d

import os
import itertools
from skimage.transform import resize
from skimage.color import rgb2gray

In [2]:
# Gym environment the agent is trained on
env = gym.make('Pong-v0')
# Number of consecutive frames stacked on the last axis of a network input state
NUM_STACKED_FRAMES = 4
# Number of actions the agent chooses between (width of the Q-network output)
NUM_ACTIONS = 2
# Maps an agent action index to the action id passed to env.step
# NOTE(review): presumably ids 2 and 3 are the two paddle moves in Pong - confirm
# against the environment's action meanings
VALID_ACTIONS = [2, 3]

[2017-01-03 21:13:16,923] Making new env: Pong-v0


In [3]:
def preprocess(img):
    '''
    Reduce a raw RGB frame to the format stored in the replay memory.

    Args:
        img: RGB image as returned by the environment

    Returns:
        The frame converted to grayscale and resized to 84x84
    '''
    # Maybe try preprocess on TF? Faster on GPU?
    gray_frame = rgb2gray(img)
    return resize(gray_frame, (84, 84))

In [4]:
class ReplayMemory:
    '''
    Fixed-capacity circular buffer that stores experience (replays).

    Each slot holds a single frame together with the action taken on it,
    the reward received and the terminal flag. Stacked states are only
    assembled when sampling, from consecutive slots.

    Args:
        max_replays: capacity of the buffer (number of single frames)
        num_stacked_frames: frames per sampled state. When None (default)
        the module-level NUM_STACKED_FRAMES is read at sampling time
        frame_shape: shape of one stored frame, (84, 84) by default
    '''
    def __init__(self, max_replays, num_stacked_frames=None, frame_shape=(84, 84)):
        self.max_replays = max_replays
        self.num_stacked_frames = num_stacked_frames
        self.frame_shape = tuple(frame_shape)
        # Allocate memory
        self.imgs = np.empty((max_replays,) + self.frame_shape, dtype=np.float32)
        self.actions = np.empty(max_replays, dtype=np.int32)
        self.rewards = np.empty(max_replays, dtype=np.float32)
        # Builtin bool instead of np.bool: that alias was removed from NumPy
        self.done = np.empty(max_replays, dtype=bool)
        # Create "pointers"
        self.current = 0  # slot the next replay is written to
        self.bottom = 0   # slot holding the oldest replay
        self.size = 0     # number of replays currently stored

    def _stack_size(self):
        ''' Number of frames per sampled state '''
        if self.num_stacked_frames is None:
            return NUM_STACKED_FRAMES
        return self.num_stacked_frames

    def add_replay(self, img, action, reward, done):
        '''
        Add experience to memory, overwriting the oldest one when full.

        Args:
            img: frame the action was taken on, of shape frame_shape
            action: index of the action taken
            reward: reward received for the action
            done: whether the action ended the episode
        '''
        self.imgs[self.current] = img
        self.actions[self.current] = action
        self.rewards[self.current] = reward
        self.done[self.current] = done
        # Update memory actual size
        if self.size == self.max_replays:
            # Buffer is full: the oldest replay was just overwritten
            self.bottom = (self.bottom + 1) % self.max_replays
        else:
            self.size += 1
        # Update current memory "pointer"
        self.current = (self.current + 1) % self.max_replays

    def sample(self, batch_size):
        '''
        Sample random transitions built from sequential frames.

        Args:
            batch_size: number of transitions to return

        Returns:
            A tuple (pre_states, actions, rewards, pos_states, done) where
            both state arrays have shape (batch_size,) + frame_shape +
            (frames per state,) and the others have shape (batch_size,).
            Action, reward and done belong to the last frame of the
            pre state.

        Raises:
            ValueError: if the memory holds too few replays to build
            a transition
        '''
        n_frames = self._stack_size()
        low = self.bottom
        high = self.bottom + self.size - n_frames - 1
        if high <= low:
            raise ValueError('Not enough replays to sample from: '
                             '{} stored'.format(self.size))

        states_shape = (batch_size, n_frames) + self.frame_shape
        batch_pre_states = np.empty(states_shape, dtype=np.float32)
        batch_pos_states = np.empty(states_shape, dtype=np.float32)
        batch_actions = np.empty(batch_size, dtype=np.int32)
        batch_rewards = np.empty(batch_size, dtype=np.float32)
        batch_done = np.empty(batch_size, dtype=bool)

        i_batch = 0
        while i_batch < batch_size:
            # Get a random idx to construct a single transition. Indices may
            # exceed the capacity; mode='wrap' below maps them onto the buffer
            start_idx = np.random.randint(low, high)
            pre_states_idx = np.arange(start_idx, start_idx + n_frames)
            # Only the last frame of pre states can be a finished episode,
            # otherwise the state would mix frames of two episodes
            if np.any(self.done.take(pre_states_idx[:-1], axis=0, mode='wrap')):
                continue
            # The next state is the same window shifted by one frame
            pos_states_idx = pre_states_idx + 1

            # Save transitions
            pre_end_idx = pre_states_idx[-1]
            batch_pre_states[i_batch] = self.imgs.take(pre_states_idx, axis=0, mode='wrap')
            batch_pos_states[i_batch] = self.imgs.take(pos_states_idx, axis=0, mode='wrap')
            batch_actions[i_batch] = self.actions.take(pre_end_idx, axis=0, mode='wrap')
            batch_rewards[i_batch] = self.rewards.take(pre_end_idx, axis=0, mode='wrap')
            batch_done[i_batch] = self.done.take(pre_end_idx, axis=0, mode='wrap')
            i_batch += 1

        # Roll the frames axis to the end: (batch_size, 84, 84, n_frames)
        # for the default frame shape
        return (np.rollaxis(batch_pre_states, 1, batch_pre_states.ndim),
                batch_actions,
                batch_rewards,
                np.rollaxis(batch_pos_states, 1, batch_pos_states.ndim),
                batch_done)

In [5]:
class QNetwork:
    '''
    Creates a neural network to approximate Q values.

    The network is three convolutional layers followed by a fully
    connected layer of 512 units and a linear output layer with one
    value per action. The action values are exposed as self.values.

    Args:
        name: scope name
        trainable: Indicates if the graph should contain train
        operations or not, e.g., the target network don't need
        to be trainable
        learning_rate: The step size used by the optimizer.
        Required when trainable is True
        clip_grads: Indicates if gradients should be clipped in
        a (-1, 1) range. Prevents big changes to the network

    Raises:
        ValueError: if trainable is True and learning_rate is None
    '''
    def __init__(self, name, trainable, learning_rate=None, clip_grads=None):
        if trainable and learning_rate is None:
            raise ValueError('learning_rate is required when trainable=True')

        with tf.variable_scope(name):
            # Create placeholders
            self.states = tf.placeholder(name='states',
                                         shape=(None, 84, 84, NUM_STACKED_FRAMES),
                                         dtype=tf.float32)
            # TD targets, one per sample. The shape must be the 1-tuple
            # (None,): a bare (None) is just None and leaves the rank unknown,
            # which would let a (batch, 1) feed broadcast silently in the loss
            self.targets = tf.placeholder(name='targets',
                                          shape=(None,),
                                          dtype=tf.float32)
            # Picked actions, one index per sample
            self.actions = tf.placeholder(name='actions',
                                          shape=(None,),
                                          dtype=tf.int32)

            # Convolutional layers: (num_outputs, kernel_size, stride)
            with slim.arg_scope([slim.conv2d],
                                weights_initializer=xavier_initializer_conv2d()):
                self.conv = slim.stack(self.states, slim.conv2d, [
                        (32, (8, 8), 4),
                        (64, (4, 4), 2),
                        (64, (3, 3), 1)
                    ])
            # Fully connected layer
            self.fc = slim.fully_connected(slim.flatten(self.conv), 512)
            # Output layer (linear, one value per action)
            self.values = slim.fully_connected(inputs=self.fc,
                                               num_outputs=NUM_ACTIONS,
                                               activation_fn=None)

            # Add training operations
            if trainable:
                batch_size = tf.shape(self.states)[0]
                # Select the ids of picked actions in the flattened values
                # action_ids = (i_batch * NUM_ACTIONS) + action
                action_ids = tf.range(batch_size) * tf.shape(self.values)[1] + self.actions
                # Only use the value of picked action
                picked_actions_value = tf.gather(tf.reshape(self.values, [-1]),
                                                 action_ids)
                # Use mean squared error
                self.loss = tf.reduce_mean(tf.squared_difference(self.targets,
                                                                 picked_actions_value))
                # Compute and apply gradients
                opt = tf.train.AdamOptimizer(learning_rate)
#                opt = tf.train.RMSPropOptimizer(learning_rate, 0.99, 0.0, 1e-6)
                # Only the variables created under this network's scope
                local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, name)
                self.grads_and_vars = opt.compute_gradients(self.loss, local_vars)
                if clip_grads:
                    self.grads_and_vars = [(tf.clip_by_value(grad, -1, 1), var)
                                           for grad, var in self.grads_and_vars]
                # Create or use an existing global step
                self.global_step = slim.get_or_create_global_step()
                self.train_op = opt.apply_gradients(self.grads_and_vars,
                                                    self.global_step)

    def predict(self, sess, states):
        '''
        Compute the action values of a batch of states.

        Args:
            sess: The current tensorflow session
            states: batch of states matching the states placeholder

        Returns:
            Array with one row of action values per state
        '''
        return sess.run(self.values, feed_dict={self.states: states})

    def update(self, sess, states, actions, targets):
        '''
        Run one training step. Only valid on a trainable network.

        Args:
            sess: The current tensorflow session
            states: batch of states
            actions: index of the action picked on each state
            targets: TD target for each picked action
        '''
        feed_dict = {self.states: states,
                     self.actions: actions,
                     self.targets: targets}
        sess.run(self.train_op, feed_dict=feed_dict)

    def create_summary(self, sess, log_dir):
        '''
        Create summary operations for visualization with tensorboard

        Args:
            sess: The current tensorflow session
            log_dir: directory the summaries are written to, created
            if it does not exist

        Returns:
            A function for writing the summary

        Raises:
            ValueError: if the network was not built with trainable=True,
            since the loss and the global step are part of the summary
        '''
        if not hasattr(self, 'train_op'):
            raise ValueError('create_summary requires a network built '
                             'with trainable=True')

        if not os.path.exists(log_dir):
            os.makedirs(log_dir)
        self.writer = tf.summary.FileWriter(log_dir, sess.graph)

        # Create placeholders to track some statistics
        self.episode_reward = tf.placeholder(name='episode_reward',
                                             shape=(),
                                             dtype=tf.float32)
        self.episode_length = tf.placeholder(name='episode_length',
                                             shape=(),
                                             dtype=tf.float32)

        # Add summary operations
        tf.summary.histogram('convolution', self.conv)
        tf.summary.histogram('last_hidden', self.fc)
        tf.summary.histogram('q_values', self.values)
        tf.summary.scalar('max_q_value', tf.reduce_max(self.values))
        tf.summary.scalar('loss', self.loss)
        tf.summary.scalar('reward', self.episode_reward)
        tf.summary.scalar('episode_length', self.episode_length)

        # Merge all summaries
        self.merged = tf.summary.merge_all()

        def summary_writer(states, actions, targets, ep_reward, ep_length):
            ''' Evaluate the summaries on a batch and write them at the global step '''
            feed_dict = {self.states: states,
                         self.actions: actions,
                         self.targets: targets,
                         self.episode_reward: ep_reward,
                         self.episode_length: ep_length}
            summary, step = sess.run([self.merged, self.global_step],
                                     feed_dict=feed_dict)
            self.writer.add_summary(summary, step)

        return summary_writer

In [6]:
def copy_vars(sess, from_scope, to_scope):
    '''
    Create operations to copy variables (weights) between two graphs

    Args:
        sess: The current tensorflow session
        from_scope: name of graph to copy variables from
        to_scope: name of graph to copy variables to

    Returns:
        A function without arguments that runs the copy in sess

    Raises:
        ValueError: if a scope has no trainable variables or the scopes
        have a different number of them
    '''
    # Get variables within defined scope, sorted by name so that the pairing
    # does not depend on the order the variables were created in
    from_vars = sorted(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope),
                       key=lambda var: var.name)
    to_vars = sorted(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope),
                     key=lambda var: var.name)
    # zip would silently drop unpaired variables, so fail loudly instead
    if not from_vars or len(from_vars) != len(to_vars):
        raise ValueError('Cannot copy {} variables from scope {!r} to {} variables '
                         'in scope {!r}'.format(len(from_vars), from_scope,
                                                len(to_vars), to_scope))
    # Create operations that copy the variables
    op_holder = [to_var.assign(from_var) for from_var, to_var in zip(from_vars, to_vars)]

    def run_op():
        # Runs the operation
        sess.run(op_holder)

    return run_op

In [7]:
def epolicy(action_values, epsilon, num_actions=None):
    '''
    Creates action probabilities based on a epsilon-greedy policy

    Args:
        action_values: estimated value of each action for a single state,
        e.g. of shape (1, num_actions). Singleton axes are squeezed out
        epsilon: probability mass spread uniformly over all actions
        num_actions: number of actions. When None (default) the
        module-level NUM_ACTIONS is used

    Returns:
        Array of num_actions probabilities: epsilon / num_actions for
        every action plus (1 - epsilon) on the best one
    '''
    if num_actions is None:
        num_actions = NUM_ACTIONS
    # Every action gets an equal share of epsilon...
    action_probs = (np.ones(num_actions) * epsilon) / num_actions
    # ...and the greedy action gets the remaining probability
    best_action = np.argmax(np.squeeze(action_values))
    action_probs[best_action] += (1 - epsilon)
    return action_probs

In [8]:
def test_updates(learning_rate=3e-4):
    '''
    Test if weights updates are affecting mostly the chosen action

    For each action, a fresh set of weights takes one training step
    towards a target of 10 on random states, and the mean change of the
    Q values is printed.

    Args:
        learning_rate: step size of the tested network

    Raises:
        AssertionError: if the value of the trained action did not
        increase on average
    '''
    tf.reset_default_graph()
    test_net = QNetwork(name='test', learning_rate=learning_rate,
                        clip_grads=True, trainable=True)
    num_samples = 100
    # Generates random states
    test_states = np.random.random((num_samples, 84, 84, NUM_STACKED_FRAMES))
    # Generate fake TD targets
    test_targets = 10 * np.ones(num_samples)
    for action in range(NUM_ACTIONS):
        # Integer ids, matching the dtype of the actions placeholder
        test_actions = np.full(num_samples, action, dtype=np.int32)
        with tf.Session() as sess:
            # Every action starts from newly initialized weights
            tf.global_variables_initializer().run()
            # Compare Q values before and after update
            old_val = test_net.predict(sess, test_states)
            test_net.update(sess, test_states, test_actions, test_targets)
            new_val = test_net.predict(sess, test_states)

        # Mean change of each action value over the batch
        mean_change = np.mean(new_val - old_val, axis=0)
        # New values should be closer to target
        print('Action {} value should increase:'.format(action), end=' ')
        print(mean_change)
        # Check the trained action itself, not the mean over all actions
        assert mean_change[action] > 0, 'Wrong weights updates'

In [9]:
def test_copy():
    '''
    Test the copy of variables from one graph to another
    Compare the predictions before and after update,
    change main graph weights then compare predictions again

    Raises:
        AssertionError: if the target network does not match the main
        network after a copy, or changes without a copy
    '''
    # Create graphs
    tf.reset_default_graph()
    net_main = QNetwork(name='main', learning_rate=3e-4, clip_grads=True, trainable=True)
    net_target = QNetwork(name='target', trainable=False)
    # Create a random state
    test_state = np.random.random((1, 84, 84, NUM_STACKED_FRAMES))
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        update_target = copy_vars(sess, 'main', 'target')

        def show_predictions():
            ''' Print and return the predictions of main and target networks '''
            main_values = net_main.predict(sess, test_state)
            target_values = net_target.predict(sess, test_state)
            print(main_values)
            print(target_values)
            return main_values, target_values

        show_predictions()
        print('Copying variables...')
        update_target()
        main_values, copied_values = show_predictions()
        assert np.allclose(main_values, copied_values), 'Copy did not sync networks'
        print('Updating main network...')
        net_main.update(sess, test_state, [0], [100])
        main_values, target_values = show_predictions()
        # Training the main network must leave the target network untouched
        assert np.allclose(target_values, copied_values), 'Target changed without copy'
        assert not np.allclose(main_values, target_values), 'Main network did not change'
        print('Copying variables...')
        update_target()
        main_values, target_values = show_predictions()
        assert np.allclose(main_values, target_values), 'Copy did not sync networks'

In [10]:
# Sanity check: a training step should raise the value of the trained action
test_updates()

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Action 0 value should increase: [ 1.14318228  0.00145897]
Action 1 value should increase: [-0.00912934  1.10451889]


In [11]:
# Sanity check: print main/target predictions around weight copies and an update
test_copy()

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


[[-0.21061181  0.09048082]]
[[-0.13393851 -0.07399513]]
Copying variables...
[[-0.21061181  0.09048082]]
[[-0.21061181  0.09048082]]
Updating main network...
[[ 0.80587202  0.05765251]]
[[-0.21061181  0.09048082]]
Copying variables...
[[ 0.80587202  0.05765251]]
[[ 0.80587202  0.05765251]]


In [12]:
# Capacity of the replay memory, in single frames
replay_capacity = 500000
# Number of random-agent steps stored before training starts
min_replays = 50000
# Episode at which the decaying part of epsilon has shrunk by a factor of epsilon_min
stop_exploration = 2500
# Number of training episodes
num_episodes = 5000
# Exploration rate at the first episode
epsilon_max = 1
# Exploration rate the exponential decay converges to
epsilon_min = 0.1
# Number of transitions sampled from the replay memory per training step
batch_size = 32
# Weight of the next state value in the TD target
discount_factor = 0.99
# Number of training steps between copies of the main weights to the target network
update_target_frequency = 10000
# Directory where checkpoints are saved to and restored from
save_dir = 'checkpoints'

In [13]:
# Creates networks: the main one is trained, the target one only receives
# periodic copies of the main weights and provides the TD targets
tf.reset_default_graph()
net_main = QNetwork(name='main', trainable=True,
                    learning_rate=3e-4, clip_grads=True)
net_target = QNetwork(name='target', trainable=False)

# Create saving directory
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
save_path = os.path.join(save_dir, 'graph.ckpt')

# Populate replay memory with random agent
print('Populating replay memory...')
replays = ReplayMemory(replay_capacity)
state = env.reset()
# Create equal probability for picking any action
action_probs = np.ones(NUM_ACTIONS) / NUM_ACTIONS
for _ in range(min_replays):
    # Pick a random action
    action = np.random.choice(np.arange(NUM_ACTIONS), p=action_probs)
    valid_action = VALID_ACTIONS[action]
    next_state, reward, done, _ = env.step(valid_action)
    replays.add_replay(preprocess(state), action, reward, done)
    # Continue from the first frame of a new episode after a terminal step
    # (the reset frame must not be overwritten by the terminal next_state)
    if done:
        state = env.reset()
    else:
        state = next_state

with tf.Session() as sess:
    tf.global_variables_initializer().run()
    # Update target network
    update_target = copy_vars(sess, 'main', 'target')
    update_target()
    # Reload last checkpoint if it exists
    saver = tf.train.Saver()
    last_checkpoint = tf.train.latest_checkpoint(save_dir)
    if last_checkpoint:
        saver.restore(sess, last_checkpoint)
    # Create summary writer
    write_summary = net_main.create_summary(sess, 'summaries')

    # Calculate epsilon step size
    epsilon_step = - np.log(epsilon_min) / stop_exploration

    steps_sum = 0
    print('Started training...')
    for i_episode in range(num_episodes):
        # Exponentially decay epsilon
        epsilon = epsilon_min + (epsilon_max - epsilon_min) \
        * np.exp(-epsilon_step * i_episode)

        state = preprocess(env.reset())
        # The first state repeats the initial frame on every stacked slot
        state_buffer = np.stack([state] * NUM_STACKED_FRAMES, axis=2)
        ep_reward_sum = 0
        # Repeat until episode is finished
        for i_step in itertools.count():
            # Select an action
            action_values = net_main.predict(sess, state_buffer[np.newaxis, ...])
            action_probs = epolicy(action_values, epsilon)
            action = np.random.choice(np.arange(NUM_ACTIONS), p=action_probs)
            valid_action = VALID_ACTIONS[action]
            # Do the action
            next_state, reward, done, _ = env.step(valid_action)
            next_state = preprocess(next_state)
            ep_reward_sum += reward
            # Record experience: the frame the action was taken on
            replays.add_replay(state, action, reward, done)

            # Sample replays to train on
            b_states, b_actions, b_rewards, b_next_states, b_done = \
            replays.sample(batch_size)
            # Perform Q learning (using target network)
            value_next = net_target.predict(sess, b_next_states)
            value_next_max = np.max(value_next, axis=1)
            # The xor flips the done flags: terminal transitions
            # get no value from the next state
            b_td_targets = b_rewards + np.bitwise_xor(b_done, 1) \
            * (discount_factor * value_next_max)
            # Update main weights
            net_main.update(sess, b_states, b_actions, b_td_targets)

            # Update target network every update_target_frequency steps
            steps_sum = (steps_sum + 1) % update_target_frequency
            if steps_sum == 0:
                print('\nUpdating target network...')
                update_target()

            # Update state
            if done:
                break
            # Advance the current frame, otherwise every replay of the
            # episode would store the initial frame
            state = next_state
            # Drop the oldest frame of the stack and append the new one
            state_buffer = np.append(state_buffer[:, :, 1:],
                                     next_state[:, :, np.newaxis],
                                     axis=2)

        # Write summaries and save model
        # i_step is 0-based, so the episode lasted i_step + 1 steps
        write_summary(b_states, b_actions, b_td_targets, ep_reward_sum, i_step + 1)
        saver.save(sess, save_path)

        print('\rEpisode {}/{}'.format(i_episode, num_episodes), end=' | ')
        print('Episode reward: {}'.format(ep_reward_sum), end='')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Populating replay memory...
Started training...
Updating target network...
Updating target network...
Updating target network...
Updating target network...
Updating target network...
Episode 0/10 | Episode reward: -21.0Updating target network...
Updating target network...
Updating target network...
Updating target network...
Updating target network...
Updating target network...
Episode 1/10 | Episode reward: -21.0Updating target network...
Updating target network...
Updating target network...
Updating target network...
Updating target network...
Updating target network...
Updating target network...
Episode 2/10 | Episode reward: -21.0Updating target network...
Updating target network...
Updating target network...
Updating target network...
Updating target network...
Updating target network...
Episode 3/10 | Episode reward: -21.0

KeyboardInterrupt: 