In [1]:
import gym
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim

from skimage.transform import resize
from skimage.color import rgb2gray

import itertools

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
env = gym.make('Pong-v0')
VALID_ACTIONS = [2, 3]
NUM_ACTIONS = len(VALID_ACTIONS)

[2017-01-08 19:16:40,359] Making new env: Pong-v0


In [3]:
def preprocess(img):
    return resize(rgb2gray(img), (84, 84))    

In [4]:
def preprocess(I):
    """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector """
    I = I[35:195] # crop
    I = I[::2,::2,0] # downsample by factor of 2
    I[I == 144] = 0 # erase background (background type 1)
    I[I == 109] = 0 # erase background (background type 2)
    I[I != 0] = 1 # everything else (paddles, ball) just set to 1
    return I

In [5]:
def shared_network(inputs, reuse):
    '''
    Builds 2 convolutional layers and a fully connected layer at the end
    
    Args:
        inputs: Input image for the network 
    
    Returns:
        The last convolutional layer and the fully connect layer
    '''
    with tf.variable_scope('shared', reuse=reuse):
        # Convolutional layers
        conv = slim.stack(inputs, slim.conv2d, [
                (16, 8, 4),
                (32, 4, 3)
            ])

        # Fully connected layer
        flatten = slim.flatten(conv)
        fc = slim.fully_connected(flatten, 216)
        
        return conv, fc

In [18]:
class Policy:
    def __init__(self, learning_rate, reuse=False):
        # Placeholders
        self.states = tf.placeholder(name='states',
                                     shape=(None, 80, 80),
                                     dtype=tf.float32)
        self.returns = tf.placeholder(name='returns',
                                      shape=(None),
                                      dtype=tf.float32)
        self.actions = tf.placeholder(name='chosen_action',
                                      shape=(None),
                                      dtype=tf.int32)

        # Create shared network
        self.conv, self.fc = shared_network(self.states, reuse)
        with tf.variable_scope('policy'):
            # Final/output layer
            self.output = slim.fully_connected(self.fc,
                                               NUM_ACTIONS,
                                               activation_fn=tf.nn.softmax)        
        
        # Optimization process (to increase likelihood of a good action)
        batch_size = tf.shape(self.states)[0]
        # Select the ids of picked actions
        # action_ids = (i_batch * NUM_ACTIONS) + action
        action_ids = tf.range(batch_size) * tf.shape(self.output)[1] + self.actions
        # Select probability of chosen actions
        chosen_actions = tf.gather(tf.reshape(self.output, [-1]), action_ids)
        eligibility = tf.log(chosen_actions)
        # Change the likelihood of taken action using the return (self.returns)        
        self.loss = - tf.reduce_mean(self.returns * eligibility)
        opt = tf.train.AdamOptimizer(learning_rate)
        # We should perform gradient ascent in the likelihood of specified action
        # which is the same as performing gradient descent on the negative of the loss
        local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'policy') \
                     + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'shared') 
        grads_and_vars = opt.compute_gradients(self.loss, local_vars)
        self.global_step = slim.get_or_create_global_step()
        self.train_op = opt.apply_gradients(grads_and_vars, self.global_step)        
        
    def predict(self, sess, states):
        return sess.run(self.output, feed_dict={self.states: states})
    
    def update(self, sess, states, actions, returns):
        feed_dict = {self.states: states,
                     self.actions: actions,
                     self.returns: returns}
        sess.run(self.train_op, feed_dict=feed_dict)

In [7]:
def test_updates(learning_rate, batch_size=100):
    ''' Test if the weigth updates are giving the desired outputs '''
    # Create a new policy
    tf.reset_default_graph()
    policy = Policy(learning_rate=learning_rate)
    # Generate states
    state = np.random.random((batch_size, 80, 80))
    fake_returns = [(100, 'increase'), (-100, 'decrease')]
    for action in range(NUM_ACTIONS):
        actions = action * np.ones(batch_size)
        for fake_return, expected in fake_returns:
            # Reinitialize session because ADAM optimizer builds momentum
            with tf.Session() as sess:
                tf.global_variables_initializer().run()
                # Compare new and old probabilities
                old_probs = policy.predict(sess, state)
                policy.update(sess, state, actions, [fake_return])
                new_probs = policy.predict(sess, state)        
                print('Action {} probability should {}:'.format(action, expected), end=' ')
                print(np.mean(new_probs - old_probs, axis=0))

In [19]:
test_updates(3e-4)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Action 0 probability should increase: [ 0.04927379 -0.0492738 ]
Action 0 probability should decrease: [-0.05674707  0.05674708]
Action 1 probability should increase: [-0.05092872  0.05092873]
Action 1 probability should decrease: [ 0.04037581 -0.04037582]


In [25]:
class ValueNet:
    def __init__(self, learning_rate, reuse=False):
        # Placeholders
        self.states = tf.placeholder(name='states',
                                     shape=(None, 80, 80),
                                     dtype=tf.float32)
        # TD targets
        self.targets = tf.placeholder(name='targets',
                                      shape=(None),
                                      dtype=tf.float32)
        
        # Get or create shared network
        self.conv, self.fc = shared_network(self.states, reuse)
        # Final/output layer
        with tf.variable_scope('value_net'):
            self.output = slim.fully_connected(inputs=self.fc,
                                               num_outputs=1,
                                               activation_fn=None)
        
        # Loss (mean squared error)
        self.loss = tf.reduce_mean(tf.squared_difference(self.targets, self.output))
        opt = tf.train.AdamOptimizer(learning_rate)
        local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'value_net') \
                   + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'shared') 
        grads_and_vars = opt.compute_gradients(self.loss, local_vars)
        self.global_step = slim.get_or_create_global_step()
        self.train_op = opt.apply_gradients(grads_and_vars, self.global_step)        
        
    def predict(self, sess, states):
        return sess.run(self.output, feed_dict={self.states: states})
    
    def update(self, sess, states, targets):
        feed_dict = {self.states: states,
                     self.targets: targets}
        sess.run(self.train_op, feed_dict=feed_dict)

In [29]:
tf.reset_default_graph()
policy = Policy(3e-4)
baseline = ValueNet(3e-4, True)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [30]:
g = tf.get_default_graph()
[op.name for op in g.get_operations()]

['states',
 'returns',
 'chosen_action',
 'shared/Stack/convolution_1/weights',
 'shared/Stack/convolution_1/weights/Initializer/random_uniform/shape',
 'shared/Stack/convolution_1/weights/Initializer/random_uniform/min',
 'shared/Stack/convolution_1/weights/Initializer/random_uniform/max',
 'shared/Stack/convolution_1/weights/Initializer/random_uniform/RandomUniform',
 'shared/Stack/convolution_1/weights/Initializer/random_uniform/sub',
 'shared/Stack/convolution_1/weights/Initializer/random_uniform/mul',
 'shared/Stack/convolution_1/weights/Initializer/random_uniform',
 'shared/Stack/convolution_1/weights/Assign',
 'shared/Stack/convolution_1/weights/read',
 'shared/Stack/convolution_1/convolution/Shape',
 'shared/Stack/convolution_1/convolution/dilation_rate',
 'shared/Stack/convolution_1/convolution/ExpandDims/dim',
 'shared/Stack/convolution_1/convolution/ExpandDims',
 'shared/Stack/convolution_1/convolution/ExpandDims_1/dim',
 'shared/Stack/convolution_1/convolution/ExpandDims_1'

In [31]:
with tf.Session() as sess:
    writer = tf.summary.FileWriter('logs/stats/', sess.graph)

In [13]:
tf.reset_default_graph()
mypolicy = Policy(3e-4)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [43]:
with tf.Session() as sess:
    tf.global_variables_initializer().run()
    probs = mypolicy.predict(sess, states[:100])

In [45]:
probs.shape

(100, 2)

In [40]:
state = env.reset()
state = preprocess(state)
# Use a black image for first state
states = []
states.append(np.zeros((80, 80)))

# Repeat until episode is finished
for i_step in itertools.count():
#    action_probs = policy.predict(sess, states[-1])    
    next_state, reward, done, _ = env.step(2)
    next_state = preprocess(next_state)
    states.append(next_state - state)
    #Update state
    if done:
        break    
    state = next_state
    
states = np.array(states)    

In [None]:
plt.imshow(states[259], cmap='gray')