In [1]:
import gym
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim

from skimage.transform import resize
from skimage.color import rgb2gray

import os
import itertools

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
env = gym.make('Pong-v0')
VALID_ACTIONS = [2, 3]
NUM_ACTIONS = len(VALID_ACTIONS)

[2017-01-10 15:55:10,362] Making new env: Pong-v0


In [3]:
def preprocess(img):
    return resize(rgb2gray(img), (84, 84))    

In [4]:
def preprocess(I):
    """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector """
    I = I[35:195] # crop
    I = I[::2,::2,0] # downsample by factor of 2
    I[I == 144] = 0 # erase background (background type 1)
    I[I == 109] = 0 # erase background (background type 2)
    I[I != 0] = 1 # everything else (paddles, ball) just set to 1
    return I

In [5]:
def reward_return(rewards, discount_factor=0.99):
    ''' Calculate the return from step t '''
    G = []
    G_sum = 0
    # Start calculating from the last reward
    for t, reward in enumerate(reversed(rewards)):
        G_sum = G_sum * discount_factor + reward
        G.append(G_sum)
    # Revert list again
    G.reverse()
    # Normalize returns
    G = (G - np.mean(G)) / np.std(G)
    return G

In [6]:
def shared_network(inputs, reuse):
    '''
    Builds 2 convolutional layers and a fully connected layer at the end
    
    Args:
        inputs: Input image for the network 
    
    Returns:
        The last convolutional layer and the fully connect layer
    '''
    with tf.variable_scope('shared', reuse=reuse):
        # Convolutional layers
        conv = slim.stack(inputs, slim.conv2d, [
                (16, 8, 4),
                (32, 4, 3)
            ])

        # Fully connected layer
        flatten = slim.flatten(conv)
        fc = slim.fully_connected(flatten, 216)
        
        return conv, fc

In [7]:
class Policy:
    def __init__(self, learning_rate, reuse=False):
        # Placeholders
        self.states = tf.placeholder(name='states',
                                     shape=(None, 80, 80),
                                     dtype=tf.float32)
        self.returns = tf.placeholder(name='returns',
                                      shape=(None),
                                      dtype=tf.float32)
        self.actions = tf.placeholder(name='chosen_action',
                                      shape=(None),
                                      dtype=tf.int32)

        # Create shared network
        self.conv, self.fc = shared_network(self.states, reuse)
        with tf.variable_scope('policy'):
            # Final/output layer
            self.output = slim.fully_connected(self.fc,
                                               NUM_ACTIONS,
                                               activation_fn=tf.nn.softmax)        
        
        # Optimization process (to increase likelihood of a good action)
        batch_size = tf.shape(self.states)[0]
        # Select the ids of picked actions
        # action_ids = (i_batch * NUM_ACTIONS) + action
        action_ids = tf.range(batch_size) * tf.shape(self.output)[1] + self.actions
        # Select probability of chosen actions
        chosen_actions = tf.gather(tf.reshape(self.output, [-1]), action_ids)
        eligibility = tf.log(chosen_actions)
        # Change the likelihood of taken action using the return (self.returns)        
        self.loss = - tf.reduce_mean(self.returns * eligibility)
        opt = tf.train.AdamOptimizer(learning_rate)
        # We should perform gradient ascent in the likelihood of specified action
        # which is the same as performing gradient descent on the negative of the loss
        local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'policy') \
                     + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'shared') 
        grads_and_vars = opt.compute_gradients(self.loss, local_vars)
        self.global_step = slim.get_or_create_global_step()
        self.train_op = opt.apply_gradients(grads_and_vars, self.global_step)        
        
        # Add summaries
        tf.summary.histogram('convolution', self.conv)
        tf.summary.histogram('last_hidden', self.fc)
        tf.summary.histogram('action_probs', self.output)
        tf.summary.scalar('policy_loss', self.loss)
        
    def predict(self, sess, states):
        return sess.run(self.output, feed_dict={self.states: states})
    
    def update(self, sess, states, actions, returns):
        feed_dict = {self.states: states,
                     self.actions: actions,
                     self.returns: returns}
        sess.run(self.train_op, feed_dict=feed_dict)

In [8]:
class ValueNet:
    def __init__(self, learning_rate, reuse=False):
        # Placeholders
        self.states = tf.placeholder(name='states',
                                     shape=(None, 80, 80),
                                     dtype=tf.float32)
        # TD targets
        self.targets = tf.placeholder(name='targets',
                                      shape=(None),
                                      dtype=tf.float32)
        
        # Get or create shared network
        self.conv, self.fc = shared_network(self.states, reuse)
        # Final/output layer
        with tf.variable_scope('value_net'):
            self.output = slim.fully_connected(inputs=self.fc,
                                               num_outputs=1,
                                               activation_fn=None)
        
        # Loss (mean squared error)
        self.loss = tf.reduce_mean(tf.squared_difference(self.targets, self.output))
        opt = tf.train.AdamOptimizer(learning_rate)
        local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'value_net') \
                   + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'shared') 
        grads_and_vars = opt.compute_gradients(self.loss, local_vars)
        self.global_step = slim.get_or_create_global_step()
        self.train_op = opt.apply_gradients(grads_and_vars)#, self.global_step)        
        
        # Add summaries
        tf.summary.histogram('state_values', self.output)
        tf.summary.scalar('average_state_value', tf.reduce_mean(self.output))
        tf.summary.scalar('max_state_value', tf.reduce_max(self.output))
        tf.summary.scalar('baseline_loss', self.loss)
        
    def predict(self, sess, states):
        return sess.run(self.output, feed_dict={self.states: states})
    
    def update(self, sess, states, targets):
        feed_dict = {self.states: states,
                     self.targets: targets}
        sess.run(self.train_op, feed_dict=feed_dict)

In [9]:
def test_updates(learning_rate, batch_size=100):
    ''' Test if the weigth updates are giving the desired outputs '''
    # Create a new policy
    tf.reset_default_graph()
    policy = Policy(learning_rate=learning_rate, reuse=False)
    baseline = ValueNet(learning_rate=learning_rate, reuse=True)
    # Generate states
    state = np.random.random((batch_size, 80, 80))
    fake_returns = [(100, 'increase'), (-100, 'decrease')]
    print('Testing policy updates...')
    for action in range(NUM_ACTIONS):
        actions = action * np.ones(batch_size)
        for fake_return, expected in fake_returns:
            # Reinitialize session because ADAM optimizer builds momentum
            with tf.Session() as sess:
                tf.global_variables_initializer().run()
                # Compare new and old probabilities
                old_probs = policy.predict(sess, state)
                policy.update(sess, state, actions, [fake_return])
                new_probs = policy.predict(sess, state)        
                print('Action {} probability should {}:'.format(action, expected), end=' ')
                print(np.mean(new_probs - old_probs, axis=0))
    
    print('\nTesting baseline updates...')
    targets = 100 * np.ones(batch_size)
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        old_value = baseline.predict(sess, state)
        baseline.update(sess, state, targets)
        new_value = baseline.predict(sess, state)
        value_change = np.mean(new_value - old_value)
        print('Value of states should increase: {}'.format(value_change))

In [10]:
def summary_writer_op(sess, logdir, policy_net, value_net):
    '''
    Merge all summaries and returns an function to
    write the summary
    '''
    # Check if path already exists
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    writer = tf.summary.FileWriter(logdir, sess.graph)
    
    # Create placeholders to track some statistics
    episode_reward = tf.placeholder(name='episode_reward',
                                         shape=(),
                                         dtype=tf.float32)
    episode_length = tf.placeholder(name='episode_length',
                                         shape=(),
                                         dtype=tf.float32)
    # Create some summaries
    tf.summary.scalar('reward', episode_reward)
    tf.summary.scalar('episode_length', episode_length)
    
    # Merge all summaries
    merged = tf.summary.merge_all()    
    global_step = slim.get_or_create_global_step()
    
    def summary_writer(states, actions, returns, targets, reward, length):
        feed_dict = {
            policy_net.states: states,
            policy_net.actions: actions,
            policy_net.returns: returns,
            value_net.states: states,
            value_net.targets: targets,
            episode_reward: reward,
            episode_length: length
        }        
        summary, step = sess.run([merged, global_step],
                                 feed_dict=feed_dict)
        # Write summary
        writer.add_summary(summary, step)
        
    return summary_writer

In [11]:
test_updates(3e-4)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Testing policy updates...
Action 0 probability should increase: [ 0.09512234 -0.09512234]
Action 0 probability should decrease: [-0.09389946  0.09389948]
Action 1 probability should increase: [-0.07670062  0.07670062]
Action 1 probability should decrease: [ 0.05909234 -0.05909234]

Testing baseline updates...
Value of states should increase: 0.25253012776374817


In [None]:
num_episodes = 500000
logdir = 'summaries/run1'
savedir = 'checkpoints'
learning_rate = 3e-4

In [None]:
# Create networks
tf.reset_default_graph()
policy = Policy(learning_rate=learning_rate, 
                reuse=False)
baseline = ValueNet(learning_rate=learning_rate,
                    reuse=True)

# Create checkpoint directory
if not os.path.exists(savedir):
    os.makedirs(savedir)
save_path = os.path.join(savedir, 'graph.ckpt')

with tf.Session() as sess:
    # Create tensorflow saver
    saver = tf.train.Saver()
    # Verify if a checkpoint already exists
    latest_checkpoint = tf.train.latest_checkpoint(savedir)
    if latest_checkpoint is not None:
        print('Loading latest checkpoint...')
        saver.restore(sess, latest_checkpoint)
    else:
        tf.global_variables_initializer().run()
    # Create write summary op
    write_summary = summary_writer_op(sess=sess,
                                      logdir=logdir,
                                      policy_net=policy,
                                      value_net=baseline)
    
    print('Started training...')
    for i_episode in range(num_episodes + 1):
        state = env.reset()
        state = preprocess(state)
        states = []
        actions = []
        rewards = []
        # Use a black image for first state
        states.append(np.zeros((80, 80)))

        # Repeat until episode is finished
        for i_step in itertools.count():
#            env.render()
            # Choose an action
            action_probs = np.squeeze(policy.predict(sess, states[-1][np.newaxis, ...]))
            action = np.random.choice(np.arange(NUM_ACTIONS), p=action_probs)
            valid_action = VALID_ACTIONS[action]
            # Do the action
            next_state, reward, done, _ = env.step(valid_action)
            next_state = preprocess(next_state)
            # Store experience
            actions.append(action)
            rewards.append(reward)
            #Update state
            if done:
                break    
            states.append(next_state - state)
            state = next_state

        # Calculate episode returns
        returns = reward_return(rewards)
        states = np.array(states) 
        actions = np.array(actions)
        
        # Calculate baseline values
        states_value = baseline.predict(sess, states)
        base_returns = returns - np.squeeze(states_value)
        
        # Update baseline weights
        baseline.update(sess, states, returns)
        # Update policy weights
        policy.update(sess, states, actions, base_returns)       
        
        # Write summaries
        write_summary(states=states,
                      actions=actions,
                      returns=base_returns,
                      targets=returns,
                      reward=np.sum(rewards),
                      length=i_step)
        
        # Save graph
        if i_episode % 100 == 0:
            saver.save(sess, save_path)
        
        # Print information
        print('\rEpisode {}/{}'.format(i_episode, num_episodes), end=' | ')
        print('Reward: {}'.format(np.sum(rewards)), end='')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Loading latest checkpoint...
Started training...
Episode 5670/500000 | Reward: -13.0