# Demonstration of reinforcement learning

Code written following Andrej Karpathy's description at http://karpathy.github.io/2016/05/31/rl/.

In [1]:
%matplotlib inline

In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [3]:
from glob import glob

import threading

import numpy as np
import tensorflow as tf

import matplotlib.pyplot as plt

In [4]:
from IPython.display import HTML

In [5]:
import gym
import gym.wrappers

In [6]:
# Create the Pong environment used by the random-play cell below.
env = gym.make("Pong-v0")

INFO:gym.envs.registration:Making new env: Pong-v0
[2017-05-31 12:44:07,535] Making new env: Pong-v0


In [7]:
%%time
# Play one complete episode with uniformly sampled actions, remembering
# every (frame, reward) pair so the episode can be summarised afterwards.
env.reset()
steps = []
done = False
while not done:
    step_img, reward, done, extra = env.step(env.action_space.sample())
    steps.append((step_img, reward))

# Episode length in environment steps
print(len(steps))

# Total (undiscounted) reward collected during the episode
print(np.sum([frame_and_reward[1] for frame_and_reward in steps]))

1422
-20.0
CPU times: user 660 ms, sys: 40 ms, total: 700 ms
Wall time: 699 ms


In [8]:
# Same random rollout as above, but wrapped in a Monitor that writes its
# recordings into ./tmpgym/ (force=True clears files from a previous run).
env = gym.make("Pong-v0")
env = gym.wrappers.Monitor(env, "./tmpgym/", force=True)

steps = []
try:
    env.reset()
    while True:
        step_img, reward, done, extra = env.step(env.action_space.sample())
        steps.append((step_img, reward))
        if done:
            break
finally:
    # Always close the monitor, even if the rollout fails or is interrupted,
    # so the recorder is finalised and its resources are released.
    env.close()

INFO:gym.envs.registration:Making new env: Pong-v0
[2017-05-31 12:44:08,802] Making new env: Pong-v0
INFO:gym.wrappers.monitoring:Clearing 4 monitor files from previous run (because force=True was provided)
[2017-05-31 12:44:08,983] Clearing 4 monitor files from previous run (because force=True was provided)
INFO:gym.monitoring.video_recorder:Starting new video recorder writing to /pio/lscratch/1/jch/private/Dropbox/work/II/summer17/nnets17/lectures/tmpgym/openaigym.video.0.4543.video000000.mp4
[2017-05-31 12:44:08,992] Starting new video recorder writing to /pio/lscratch/1/jch/private/Dropbox/work/II/summer17/nnets17/lectures/tmpgym/openaigym.video.0.4543.video000000.mp4
INFO:gym.wrappers.monitoring:Finished writing results. You can upload them to the scoreboard via gym.upload('/pio/lscratch/1/jch/private/Dropbox/work/II/summer17/nnets17/lectures/tmpgym')
[2017-05-31 12:44:10,001] Finished writing results. You can upload them to the scoreboard via gym.upload('/pio/lscratch/1/jch/priva

In [9]:
# HTML snippet for one embedded video; %s is replaced by the file path.
video_template = """
<video width="320" height="240" controls>
  <source src="%s" type="video/mp4">
  Your browser does not support the video tag.
</video>
"""

# One <video> element per recording found directly inside ./tmpgym/
videos = [video_template % (f,) for f in glob("./tmpgym/*.mp4")]

HTML(''.join(videos))

In [10]:
# Start from an empty default TensorFlow graph and a fresh Pong environment.
tf.reset_default_graph()
env = gym.make("Pong-v0")

INFO:gym.envs.registration:Making new env: Pong-v0
[2017-05-31 12:44:10,206] Making new env: Pong-v0


In [11]:
# Seed the environment without passing an explicit seed; the returned list
# of seeds is displayed as the cell output.
env.seed()

[12057213055951277873L, 408342734L]

In [12]:
# Build the training graph from scratch.
tf.reset_default_graph()
envs = []

# Number of worker environments / threads, and the capacity of the job queues.
NWORKERS = 12
MAX_ROLLOUTS_IN_BATCH = 100

# One independent Pong environment per worker, each with its own seed.
for i in range(NWORKERS):
    envs.append(gym.make("Pong-v0"))
    # Seed the environment that was just appended. Indexing with -i would
    # count backwards from the end and seed an earlier environment for
    # i >= 2, leaving the new one unseeded.
    envs[-1].seed(i)

# Add space for a monitoring env
envs.append(None)

def preproc_state(state):
    """Turn a raw frame into a binary float32 mask.

    Keeps rows 35..194 and subsamples rows and columns by a factor of two,
    then looks only at channel 0. The result is 1.0 wherever that channel
    differs from both 144 and 119 (presumably the two background colours --
    TODO confirm) and 0.0 elsewhere.
    """
    channel0 = state[35:195:2, ::2, 0]
    foreground = np.logical_and(channel0 != 144, channel0 != 119)
    return foreground.astype('float32')


def discount_rewards(r, gamma=0.99):
    """Compute discounted returns for a 1D float array of rewards.

    Walks the array from the last step to the first, accumulating
    ``running_add = running_add * gamma + r[t]``. Any non-zero reward resets
    the running sum first, because in Pong a non-zero reward marks the end of
    a point (a game boundary).

    Args:
        r: 1D numpy array of per-step rewards.
        gamma: discount factor applied per step.

    Returns:
        Array with the same shape and dtype as ``r`` holding the discounted
        return for every step.
    """
    discounted_r = np.zeros_like(r)
    running_add = 0
    # `range` works on both Python 2 and 3 (`xrange` is Python-2-only).
    for t in range(r.size - 1, -1, -1):
        if r[t] != 0:
            running_add = 0 # reset the sum, since this was a game boundary (pong specific!)
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r

# Inputs of a rollout: which entry of `envs` to play in, and the first
# (already preprocessed, 80x80) frame of the episode.
env_id = tf.placeholder('int32', (), "env_id")
cur_state = tf.placeholder('float32', (80, 80), "cur_state")
# There is no frame before the first one, so an all-zero frame stands in for it.
prev_state = tf.zeros_like(cur_state)

# Frame history of the episode; entries 0 and 1 are the zero frame and the
# first frame. clear_after_read=False lets an entry be read more than once.
states = tf.TensorArray('float32', size=2, dynamic_size=True, clear_after_read=False)
states = states.write(0, prev_state)
states = states.write(1, cur_state)

# NOTE(review): the tf.while_loop call further down passes a literal False
# as the initial `done` value rather than this tensor.
done = tf.constant(False)

# Per-step reward collected from the environment.
rewards = tf.TensorArray('float32', size=0, dynamic_size=True)

# Per-step network output (logit) and the action sampled from it.
action_logits = tf.TensorArray('float32', size=0, dynamic_size=True)
actions = tf.TensorArray('int32', size=0, dynamic_size=True)

def net(prev_state, cur_state):
    """Policy network: maps a frame difference to a single action logit.

    The input is ``cur_state - prev_state`` flattened to shape (1, 80*80).
    It passes through one ReLU hidden layer with 200 units and a linear
    output layer with one unit. Both layers use Xavier initialisation and
    live in the "network" variable scope.

    Args:
        prev_state: previous preprocessed frame (80x80 float tensor).
        cur_state: current preprocessed frame (80x80 float tensor).

    Returns:
        Tensor of shape (1, 1) holding the action logit.
    """
    with tf.variable_scope("network"):
        input_ = tf.reshape(cur_state - prev_state, (1, 80*80))
        # tf.layers.dense names its weight initialiser `kernel_initializer`;
        # `weights_initializer` belongs to tf.contrib.layers.fully_connected
        # and raises a TypeError here.
        hidden = tf.layers.dense(input_, 200, activation=tf.nn.relu,
                                 kernel_initializer=tf.contrib.layers.xavier_initializer())
        action_logit = tf.layers.dense(hidden, 1,
                                       kernel_initializer=tf.contrib.layers.xavier_initializer())
    return action_logit

def while_cond(step, done, *args):
    """Loop condition of the rollout: keep stepping while `done` is not True."""
    still_running = tf.not_equal(done, True)
    return still_running

def while_body(step, done, states, rewards, action_logits, actions):
    """Graph body of one rollout step.

    Runs the policy network on the two most recent frames, samples a binary
    action from the resulting logit, advances the environment selected by
    the `env_id` placeholder through `tf.py_func`, and records the new
    frame, the reward, the logit and the action in their TensorArrays.

    Returns the loop variables for the next iteration: the incremented step
    counter, the `done` flag reported by the environment, and the updated
    TensorArrays.
    """
    def env_step_fun(env_id, action):
        """Python-side step: advance environment `env_id` by one action."""
        # A truthy sampled action is sent as environment action 2, otherwise 3
        # (presumably the two paddle directions in Pong -- TODO confirm).
        state, reward, done, _ = envs[env_id].step(action=2 if action else 3)
        return preproc_state(state), np.array(reward, dtype='float32'), done
    
    # `states` is offset by one: entry `step` is the previous frame and
    # entry `step+1` the current one.
    prev_state = states.read(step)
    cur_state = states.read(step+1)
    
    
    action_logit = net(prev_state, cur_state)
    action_logits = action_logits.write(step, action_logit)
    
    # Sample the action: 1 with probability sigmoid(logit), else 0.
    action = tf.cast(tf.random_uniform(()) < tf.nn.sigmoid(action_logit), tf.int32)
    actions = actions.write(step, action)
    
    # Step the (Python) environment from inside the graph.
    state, reward, done = tf.py_func(env_step_fun, [env_id, action], 
                                     [tf.float32, tf.float32, tf.bool])
    
    # Declare the static shapes of the py_func outputs explicitly.
    state.set_shape(cur_state.shape)
    reward.set_shape(())
    done.set_shape(())
    
    # The new frame becomes the "current" frame of the next iteration.
    states = states.write(step+2, state)
    rewards = rewards.write(step, reward)
    
    return step + 1, done, states, rewards, action_logits, actions
    
# Unroll a whole episode inside the graph: the loop runs until the
# environment reports `done`.
(steps, done, states, rewards, action_logits, actions
) = tf.while_loop(while_cond, while_body, [0, False, states, rewards, action_logits, actions])

# Turn the per-step TensorArrays into ordinary tensors (one entry per step).
states = states.stack()
rewards = tf.reshape(rewards.stack(), (-1,))
actions = tf.reshape(actions.stack(), (-1,))
action_logits = tf.reshape(action_logits.stack(), (-1,))

# Discounted returns are computed in numpy; py_func drops the static shape,
# so restore it from `rewards`.
rewards_discounted = tf.py_func(discount_rewards, [rewards], tf.float32, stateful=False)
rewards_discounted.set_shape(rewards.shape)

# Normalise the returns to zero mean and unit variance over the episode.
rewards_mean, rewards_var = tf.nn.moments(rewards_discounted, axes=[0])
rewards_discounted = (rewards_discounted - rewards_mean) / tf.sqrt(rewards_var)

# Negative log-likelihood of each sampled action under the policy. The
# keyword is `labels` in TensorFlow >= 1.0; the older `targets` name is
# rejected there.
action_neg_likelihood = tf.nn.sigmoid_cross_entropy_with_logits(logits=action_logits,
                                                                labels=tf.cast(actions, tf.float32))
# Weight every action's negative log-likelihood by its normalised return.
loss = tf.reduce_sum(action_neg_likelihood * rewards_discounted)

# One non-trainable, zero-initialised accumulator per trainable variable
# (same name, shape and dtype, under the 'accums' scope), plus a scalar that
# accumulates the total reward of the rollouts in the current batch.
accums = []
with tf.variable_scope('accums'):
    for var in tf.trainable_variables():
        accums.append(tf.get_variable(var.name.split(':')[0],
                                      var.get_shape(),
                                      var.dtype,
                                      tf.zeros_initializer(),
                                      trainable=False))
    reward_accum = tf.get_variable('rewards', (), tf.float32, tf.zeros_initializer(), trainable=False)

# Gradient of the single-rollout loss with respect to the network weights.
grads = tf.gradients(loss, tf.trainable_variables())

# Total number of rollouts ever played (never reset).
rollout_count = tf.Variable(0, name='rollout_count', trainable=False)

# Number of rollouts accumulated since the last reset.
grad_count = tf.Variable(0, name='grad_count_count', trainable=False)

# Run by every worker thread after a rollout: add that rollout's gradients
# and reward to the accumulators. Several workers execute this op
# concurrently, so the counters are updated under a lock just like the
# gradient accumulators; an unlocked assign_add may lose updates.
grad_update_op = tf.group(tf.assign_add(rollout_count, 1, use_locking=True),
                          tf.assign_add(grad_count, 1, use_locking=True),
                          tf.assign_add(reward_accum, tf.reduce_sum(rewards), use_locking=True),
                          *[tf.assign_add(a, g, use_locking=True)
                            for a, g in zip(accums, grads)])

global_step = tf.Variable(0, name='global_step', trainable=False)

# Apply the mean accumulated gradient with Adam. grad_count must be non-zero
# when this op runs, otherwise the division yields NaN/inf.
trainer = tf.train.AdamOptimizer()
train_op = trainer.apply_gradients(zip([a / tf.cast(grad_count, 'float32') for a in accums],
                                       tf.trainable_variables()), global_step)

# Clear the per-batch statistics and gradient accumulators.
reset_accums_op = tf.group(tf.assign(grad_count, 0),
                           tf.assign(reward_accum, 0.0),
                           *[tf.assign(a, tf.zeros_like(a)) for a in accums])

#train_op = tf.group(reset_accums_op)

INFO:gym.envs.registration:Making new env: Pong-v0
[2017-05-31 12:44:10,908] Making new env: Pong-v0
INFO:gym.envs.registration:Making new env: Pong-v0
[2017-05-31 12:44:11,331] Making new env: Pong-v0
INFO:gym.envs.registration:Making new env: Pong-v0
[2017-05-31 12:44:11,705] Making new env: Pong-v0
INFO:gym.envs.registration:Making new env: Pong-v0
[2017-05-31 12:44:12,441] Making new env: Pong-v0
INFO:gym.envs.registration:Making new env: Pong-v0
[2017-05-31 12:44:12,835] Making new env: Pong-v0
INFO:gym.envs.registration:Making new env: Pong-v0
[2017-05-31 12:44:13,260] Making new env: Pong-v0
INFO:gym.envs.registration:Making new env: Pong-v0
[2017-05-31 12:44:13,645] Making new env: Pong-v0
INFO:gym.envs.registration:Making new env: Pong-v0
[2017-05-31 12:44:14,046] Making new env: Pong-v0
INFO:gym.envs.registration:Making new env: Pong-v0
[2017-05-31 12:44:14,405] Making new env: Pong-v0
INFO:gym.envs.registration:Making new env: Pong-v0
[2017-05-31 12:44:14,768] Making new env

In [13]:
# Scalar int fed either as a job id (enqueue) or as an element count (dequeue_many).
int_placeholder = tf.placeholder('int32', (), 'int_placeholder')


def _make_job_queue(feed):
    """Create an int32 scalar FIFO queue plus its enqueue / dequeue / dequeue_many ops."""
    queue = tf.FIFOQueue(MAX_ROLLOUTS_IN_BATCH, [tf.int32], shapes=[()])
    return queue, queue.enqueue(feed), queue.dequeue(), queue.dequeue_many(feed)


# Jobs handed out by the training loop to the worker threads.
work_queue, work_enqueue, work_deque_one, work_deque_all = _make_job_queue(int_placeholder)

# Completion tickets handed back by the worker threads.
done_queue, done_enqueue, done_deque_one, done_deque_all = _make_job_queue(int_placeholder)


In [17]:
# Checkpoint writer covering all global variables; in addition to the most
# recent checkpoints, one checkpoint per hour of training is kept.
saver = tf.train.Saver(tf.global_variables(), keep_checkpoint_every_n_hours=1)

Instructions for updating:
Please use tf.global_variables instead.


Instructions for updating:
Please use tf.global_variables instead.
[2017-05-31 14:08:38,645] From <ipython-input-17-c4ac887da2a7>:1: all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Please use tf.global_variables instead.


In [14]:
# Open the session shared by the worker threads and the training loop, and
# initialise every variable of the graph.
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

In [15]:
def env_simulator(env_id_v):
    """Worker loop: play rollouts in environment `env_id_v` forever.

    Each iteration blocks until a job id is available on the work queue,
    resets the environment, runs one full rollout that adds its gradients
    to the shared accumulators, and then reports the job id on the done
    queue.
    """
    while True:
        # Wait for the training loop to hand out a job
        job = sess.run(work_deque_one)

        # Play one episode starting from a freshly reset environment
        first_frame = preproc_state(envs[env_id_v].reset())
        rollout_feed = {cur_state: first_frame, env_id: env_id_v}
        sess.run([grad_update_op], feed_dict=rollout_feed)

        # Signal completion of this job
        sess.run([done_enqueue], feed_dict={int_placeholder: job})
    
# Start one simulator thread per worker environment.
threads = []
for i in range(NWORKERS):
    thread = threading.Thread(target=env_simulator, args=(i,))
    # The workers loop forever, blocked on the work queue; as daemon threads
    # they do not keep the interpreter alive when the kernel shuts down.
    thread.daemon = True
    threads.append(thread)
    thread.start()

In [24]:
# Write a checkpoint by hand. The step suffix is read from the `global_step`
# variable itself, so this cell does not depend on `global_step_v`, which
# only exists after the training loop has run.
saver.save(sess, "./tmpgym/tf_save", global_step=global_step)

'./tmpgym/tf_save-320'

In [None]:
BATCH_SIZE = 10

# Accumulators may still hold values when training is resumed, so clear them first
sess.run(reset_accums_op)

# Statistics fetched after every batch of rollouts
stat_fetches = [grad_count, reward_accum, global_step, rollout_count]

while True:
    # Hand out BATCH_SIZE jobs to the worker threads ...
    for i in range(BATCH_SIZE):
        sess.run(work_enqueue, feed_dict={int_placeholder: i})
    # ... and block until the same number of completions has come back
    sess.run(done_deque_all, feed_dict={int_placeholder: BATCH_SIZE})

    grad_count_v, reward_accum_v, global_step_v, rollout_count_v = sess.run(stat_fetches)

    # Every job must have contributed exactly one gradient
    assert grad_count_v == BATCH_SIZE

    mean_reward = reward_accum_v / grad_count_v
    print ("After %d steps (%d rollouts) the mean reward is %f" % 
           (global_step_v, rollout_count_v, mean_reward))

    # Apply the averaged gradient, then start the next batch from zero
    sess.run(train_op)
    sess.run(reset_accums_op)

    # Checkpoint whenever the step read before this update is a multiple of 10
    if not global_step_v % 10:
        saver.save(sess, "./tmpgym/tf_save", global_step=global_step_v)

gcv: 10
After 321 steps (3230 rollouts) the mean reward is -8.400000
gcv: 10
After 322 steps (3240 rollouts) the mean reward is -5.700000
gcv: 10
After 323 steps (3250 rollouts) the mean reward is -6.800000
gcv: 10
After 324 steps (3260 rollouts) the mean reward is -4.500000
gcv: 10
After 325 steps (3270 rollouts) the mean reward is -9.400000
gcv: 10
After 326 steps (3280 rollouts) the mean reward is -5.200000


In [26]:
# Play a game and record a video!

# The last slot of `envs` is reserved for the monitoring environment.
env_monitor = len(envs) - 1
envs[env_monitor] = gym.wrappers.Monitor(gym.make("Pong-v0"), "./tmpgym/videos", force=True)
try:
    # Fetching only `rewards` plays one episode with the current policy
    # without touching the gradient accumulators.
    sess.run(rewards, {cur_state: preproc_state(envs[env_monitor].reset()),
                       env_id: env_monitor})
finally:
    # Close the monitor even if the rollout fails, so the video is finalised.
    envs[env_monitor].close()

INFO:gym.envs.registration:Making new env: Pong-v0
[2017-05-31 14:16:53,122] Making new env: Pong-v0
INFO:gym.wrappers.monitoring:Creating monitor directory ./tmpgym/videos
[2017-05-31 14:16:53,423] Creating monitor directory ./tmpgym/videos
INFO:gym.monitoring.video_recorder:Starting new video recorder writing to /pio/lscratch/1/jch/private/Dropbox/work/II/summer17/nnets17/lectures/tmpgym/videos/openaigym.video.1.4543.video000000.mp4
[2017-05-31 14:16:53,433] Starting new video recorder writing to /pio/lscratch/1/jch/private/Dropbox/work/II/summer17/nnets17/lectures/tmpgym/videos/openaigym.video.1.4543.video000000.mp4
INFO:gym.wrappers.monitoring:Finished writing results. You can upload them to the scoreboard via gym.upload('/pio/lscratch/1/jch/private/Dropbox/work/II/summer17/nnets17/lectures/tmpgym/videos')
[2017-05-31 14:17:00,706] Finished writing results. You can upload them to the scoreboard via gym.upload('/pio/lscratch/1/jch/private/Dropbox/work/II/summer17/nnets17/lectures/tm

In [29]:
# HTML snippet for one labelled video: the first %s is the caption (the file
# path), the second %s the video source.
video_template = """
%s <br>
<video width="320" height="240" controls>
  <source src="%s" type="video/mp4">
  Your browser does not support the video tag.
</video><br>
"""

# One labelled <video> element per recording in any sub-directory of ./tmpgym/
videos = [video_template % (f, f) for f in glob("./tmpgym/*/*.mp4")]

HTML(''.join(videos))