In [1]:
import gym
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim

from skimage.transform import resize
from skimage.color import rgb2gray

import itertools

import matplotlib.pyplot as plt
%matplotlib inline

In [53]:
# Subset of environment action ids the policy chooses between; its length sizes
# the policy's output layer. Presumably 2/3 are paddle up/down in Pong --
# TODO confirm with env.unwrapped.get_action_meanings().
VALID_ACTIONS = [2, 3]
NUM_ACTIONS = len(VALID_ACTIONS)

# Atari Pong environment used for every rollout in this notebook.
env = gym.make('Pong-v0')

[2017-01-07 19:32:16,241] Making new env: Pong-v0


In [3]:
def preprocess(img):
    """Convert an RGB frame to grayscale and rescale it to 84x84.

    NOTE(review): a later cell defines another function with the same name,
    so when the notebook is run top to bottom this version is shadowed.
    """
    gray_frame = rgb2gray(img)
    target_shape = (84, 84)
    return resize(gray_frame, target_shape)

In [4]:
def preprocess(I):
    """Preprocess a 210x160x3 uint8 frame into an 80x80 binary float32 image.

    The frame is cropped to rows 35..194, downsampled by a factor of 2 on both
    spatial axes (keeping only channel 0), the two background colours are
    erased and every remaining pixel is set to 1.

    Args:
        I: array-like frame of shape (210, 160, 3).

    Returns:
        A new (80, 80) float32 array containing only 0.0 and 1.0. The input
        frame is left untouched, and the float dtype makes frame differences
        (``next - previous``) yield -1 instead of wrapping around to 255 as
        they would with unsigned integers.
    """
    frame = np.asarray(I)
    frame = frame[35:195]  # crop
    # Downsample by a factor of 2; astype() copies, so the masking below does
    # not write through the view into the caller's array.
    frame = frame[::2, ::2, 0].astype(np.float32)
    frame[frame == 144] = 0  # erase background (background type 1)
    frame[frame == 109] = 0  # erase background (background type 2)
    frame[frame != 0] = 1  # everything else (paddles, ball) just set to 1
    return frame

In [62]:
class Policy:
    """Softmax policy network trained with a return-weighted log-likelihood loss.

    The network maps a batch of 80x80 frames to a probability distribution
    over ``NUM_ACTIONS`` actions. ``update`` increases the log-probability of
    the chosen actions in proportion to the supplied returns (and decreases it
    for negative returns).

    Attributes:
        states:   float32 placeholder, shape (batch, 80, 80).
        returns:  float32 placeholder, shape (batch,).
        actions:  int32 placeholder, shape (batch,), indices into the output.
        output:   softmax action probabilities, shape (batch, NUM_ACTIONS).
        loss:     scalar, negative mean of ``returns * log(p(action))``.
        train_op: one Adam step on ``loss`` over the 'policy' variables.
    """

    def __init__(self, learning_rate):
        """Build the graph.

        Args:
            learning_rate: step size passed to ``tf.train.AdamOptimizer``.
        """
        with tf.variable_scope('policy'):
            # Placeholders
            self.states = tf.placeholder(name='states', shape=(None, 80, 80), dtype=tf.float32)
            # NOTE: `(None)` is just `None` (unknown rank); a 1-D batch axis
            # must be written as the tuple `(None,)`.
            self.returns = tf.placeholder(name='returns', shape=(None,), dtype=tf.float32)
            self.actions = tf.placeholder(name='chosen_action', shape=(None,), dtype=tf.int32)

            # Network structure
            # slim.conv2d works on [batch, height, width, channels] tensors,
            # so give the single-channel frames an explicit channel axis.
            images = tf.expand_dims(self.states, -1)
            # Convolutional layers: (num_outputs, kernel_size, stride)
            self.conv = slim.stack(images, slim.conv2d, [
                    (16, 8, 4),
                    (32, 4, 3)
                ])

            # Fully connected layer
            flatten = slim.flatten(self.conv)
            self.fc = slim.fully_connected(flatten, 216)
            # Final/output layer
            self.output = slim.fully_connected(self.fc, NUM_ACTIONS, activation_fn=tf.nn.softmax)

        # Optimization process (to increase likelihood of a good action)
        batch_size = tf.shape(self.states)[0]
        # Select the ids of picked actions in the flattened output:
        # action_ids = (i_batch * NUM_ACTIONS) + action
        action_ids = tf.range(batch_size) * tf.shape(self.output)[1] + self.actions
        # Select probability of chosen actions
        chosen_actions = tf.gather(tf.reshape(self.output, [-1]), action_ids)
        eligibility = tf.log(chosen_actions)
        # Change the likelihood of taken action using the return (self.returns)
        self.loss = - tf.reduce_mean(self.returns * eligibility)
        opt = tf.train.AdamOptimizer(learning_rate)
        # We want gradient ascent on the return-weighted log-likelihood, which
        # is the same as gradient descent on its negative (self.loss).
        local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'policy')
        grads_and_vars = opt.compute_gradients(self.loss, local_vars)
        self.train_op = opt.apply_gradients(grads_and_vars)

    def predict(self, sess, states):
        """Return the action probabilities for a batch of states.

        Args:
            sess: TensorFlow session in which the variables are initialised.
            states: array-like of shape (batch, 80, 80).

        Returns:
            Array of shape (batch, NUM_ACTIONS).
        """
        return sess.run(self.output, feed_dict={self.states: states})

    def update(self, sess, states, actions, returns):
        """Run one optimisation step.

        Args:
            sess: TensorFlow session in which the variables are initialised.
            states: array-like of shape (batch, 80, 80).
            actions: action index taken in each state, shape (batch,).
            returns: return credited to each action, shape (batch,).
        """
        feed_dict = {self.states: states,
                     self.actions: actions,
                     self.returns: returns}
        sess.run(self.train_op, feed_dict=feed_dict)

In [63]:
def test_updates(learning_rate, batch_size=100):
    ''' Test if the weight updates are giving the desired outputs

    For every action and for a large positive / negative fake return, a single
    update is applied from freshly initialised weights and the mean change in
    the predicted probabilities is printed next to the expected direction.

    Args:
        learning_rate: learning rate handed to the Policy under test.
        batch_size: number of random 80x80 states used for the check.
    '''
    # Create a new policy
    tf.reset_default_graph()
    policy = Policy(learning_rate=learning_rate)
    # Generate states
    state = np.random.random((batch_size, 80, 80))
    fake_returns = [(100, 'increase'), (-100, 'decrease')]
    for action in range(NUM_ACTIONS):
        # Integer action ids, one per state, matching the int32 placeholder
        actions = np.full(batch_size, action, dtype=np.int32)
        for fake_return, expected in fake_returns:
            # One return per state instead of relying on broadcasting
            returns = np.full(batch_size, fake_return, dtype=np.float32)
            # Reinitialize session because ADAM optimizer builds momentum
            with tf.Session() as sess:
                tf.global_variables_initializer().run()
                # Compare new and old probabilities
                old_probs = policy.predict(sess, state)
                policy.update(sess, state, actions, returns)
                new_probs = policy.predict(sess, state)
                print('Action {} probability should {}:'.format(action, expected), end=' ')
                print(np.mean(new_probs - old_probs, axis=0))

In [64]:
# Sanity-check the direction of the policy updates with the default batch size.
test_updates(learning_rate=3e-4)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Action 0 probability should increase: [ 0.03868949 -0.03868948]
Action 0 probability should decrease: [-0.05844676  0.05844676]
Action 1 probability should increase: [-0.06640681  0.06640682]
Action 1 probability should decrease: [ 0.06081842 -0.06081842]


In [13]:
# Clear the default graph, then build a fresh policy network in it.
tf.reset_default_graph()
mypolicy = Policy(learning_rate=3e-4)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [43]:
# NOTE(review): `states` is produced by the episode-rollout cell further down
# in this notebook, so that cell has to be executed before this one.
with tf.Session() as sess:
    # Freshly initialised (untrained) weights
    sess.run(tf.global_variables_initializer())
    first_states = states[:100]
    probs = mypolicy.predict(sess, first_states)

In [45]:
# One row of action probabilities per input state
np.shape(probs)

(100, 2)

In [40]:
# Play one full episode and record the difference between consecutive
# preprocessed frames as the state representation.
state = env.reset()
state = preprocess(state)
# Use a black image for first state
states = []
states.append(np.zeros((80, 80)))

# Repeat until episode is finished
for i_step in itertools.count():
    # TODO: sample the action from the policy instead of always sending 2
    next_state, reward, done, _ = env.step(2)
    next_state = preprocess(next_state)
    # Cast before subtracting: on unsigned-integer frames 0 - 1 would wrap
    # around to 255 instead of giving -1.
    states.append(next_state.astype(np.float32) - state.astype(np.float32))
    # Update state
    if done:
        break
    state = next_state

states = np.array(states)

In [None]:
# Visual check of one recorded frame difference (index picked by hand)
plt.imshow(X=states[259], cmap='gray')