Based on the article "Asynchronous Methods for Deep Reinforcement Learning" (A3C): https://arxiv.org/pdf/1602.01783v1.pdf
and on the reference code at https://gist.github.com/kkweon/67bc50fed5d4dc08b844d45479bebe75

In [1]:
import tensorflow as tf
import numpy as np
import gym
from threading import Thread
from time import sleep
from os.path import dirname
from glob import glob
from IPython.display import HTML

In [2]:
# Action ids the agent may send to the environment (presumably the Pong/Atari
# action codes -- confirm against the gym action meanings).
possible_movements = np.array([1, 2, 3, 4, 5])
# Size of the policy output layer: one probability per allowed action.
output_dim = len(possible_movements)
# Smallest action id; Agent.train subtracts it to obtain zero-based indices.
first_number = possible_movements[0]
# Shape [H, W, C] of a preprocessed frame, used as the network input shape.
procces_image_size = [80, 80, 1]
# Episode limit consulted by Agent.run.
max_episode_number = 1000

# Checkpoint prefix written by run_pong when it shuts down.
save_path = "some_model/model"
# Checkpoint prefix restored by run_pong if a checkpoint exists in its directory.
load_path = "some_model/model"
# Number of Agent worker threads (one gym environment each).
n_threads = 3

# When True, the environment of the first worker is wrapped in gym.wrappers.Monitor.
monitor = True
# Directory handed to the Monitor wrapper; play_videos globs for it as a prefix.
monitor_name = "monitor"

terminate_time = 2 # run_pong sleeps for terminate_time hours, then saves and shuts down

Helper functions for running Pong: frame preprocessing, reward discounting and copying weights between networks

In [12]:
def procces_image(image):
    """Turn a raw Pong frame into a small binary foreground mask.

    Rows 35..194 are kept and every other row and column is dropped, which
    gives an 80x80 grid for a 210x160 frame. Only the first colour channel is
    inspected: pixels whose value is 144 or 119 (the two values treated as
    background) become 0.0, every other pixel becomes 1.0.

    Args:
        image (3-D array): Raw frame, (210, 160, 3) for Pong.

    Returns:
        image (3-D array): float32 mask of shape (80, 80, 1).
    """
    # Crop, downsample and select the first channel in one indexing step.
    first_channel = image[35:195:2, ::2, 0]
    is_foreground = np.logical_and(first_channel != 144, first_channel != 119)
    # Append a trailing channel axis so the result is (H, W, 1).
    return is_foreground.astype('float32')[:, :, np.newaxis]

def discount_reward(rewards, gamma=0.99):
    """Returns discounted, standardized rewards.

    Args:
        rewards (1-D array): Reward array (a list or an integer array is
            accepted and is processed in floating point)
        gamma (float): Discounted rate

    Returns:
        discounted_rewards: same shape as `rewards`. Floating-point inputs
            keep their dtype, anything else is returned as float64. The
            values are shifted to zero mean and scaled by their standard
            deviation.

    Notes:
        In Pong the reward can be {-1, 0, 1}.

        A reward of -1 or 1 means a point was scored and the game has been
        reset. Therefore it is necessary to reset `running_add` to 0
        whenever the reward is nonzero, so that returns never leak across
        point boundaries.

        The scaling divides by `std + 1e-8`: when every discounted reward is
        identical (for example an all-zero segment) the standard deviation is
        0 and a plain division would fill the result with NaN.
    """
    rewards = np.asarray(rewards)
    # An integer buffer would truncate the discounted sums and reject the
    # in-place float normalisation below, so always accumulate in floats.
    if np.issubdtype(rewards.dtype, np.floating):
        out_dtype = rewards.dtype
    else:
        out_dtype = np.float64
    advantage = np.zeros(rewards.shape, dtype=out_dtype)
    if advantage.size == 0:
        # Nothing to discount; mean/std of an empty array would be NaN.
        return advantage

    running_add = 0
    for t in range(len(rewards) - 1, -1, -1):
        if rewards[t] != 0:
            running_add = 0 # reset the sum, since this was a game boundary (pong specific!)
        running_add = running_add * gamma + rewards[t]
        advantage[t] = running_add
    advantage -= advantage.mean()
    advantage /= advantage.std() + 1e-8
    return advantage


def copy_src_to_dst(from_scope, to_scope):
    """Builds the operations that copy trainable weights between two scopes.

    The trainable variables of both scopes are paired in collection order and
    one assign operation is created per pair.

    Args:
        from_scope (str): Name of the scope to read weights from
            (expected to be "global")
        to_scope (str): Name of the scope to write weights to
            (expected to be the scope of a worker thread)

    Returns:
        list: One assign operation per (source, destination) variable pair
    """
    trainable = tf.GraphKeys.TRAINABLE_VARIABLES
    sources = tf.get_collection(trainable, from_scope)
    targets = tf.get_collection(trainable, to_scope)
    return [target.assign(source) for source, target in zip(sources, targets)]


In [13]:
class Agent(Thread):
    """Worker thread that plays Pong with its own local network and sends the
    resulting gradients to the shared global network.

    Besides its constructor arguments the class reads two module-level
    settings: `max_episode_number` (in `run`) and `first_number` (in
    `choose_action` and `train`).
    """

    def __init__(self, session, env, coord, name, global_network, input_shape=[80, 80, 1], output_dim=3, logdir=None):
        """Agent worker thread
        Args:
            session (tf.Session): Tensorflow session needs to be shared
            env (gym.env): Gym environment
            coord (tf.train.Coordinator): Tensorflow Queue Coordinator
            name (str): Name of this worker
            global_network (A3CNetwork): Global network that needs to be updated
            input_shape (list): Required for local A3CNetwork (H, W, C)
            output_dim (int): Number of actions
            logdir (str, optional): If logdir is given, will write summary
        """
        super(Agent, self).__init__()
        # Local copy of the network, built in a variable scope named after the worker.
        self.local = A3CNetwork(name, input_shape, output_dim, logdir)
        # Assign operations that overwrite the local weights with the global ones.
        self.global_to_local = copy_src_to_dst("global", name)
        self.global_network = global_network

        self.input_shape = input_shape
        self.output_dim = output_dim
        self.env = env
        self.sess = session
        self.coord = coord
        self.name = name
        self.logdir = logdir

    def play_episode(self, episode_number):
        """Plays one full episode and trains on it in segments.

        The network never sees raw frames: its input is the difference
        between the current preprocessed frame and the previous one. A
        training update is issued once `time_max` points have been scored
        since the last update (a "time step" is counted only when a reward is
        nonzero or the episode ends), and once more when the episode ends.

        Args:
            episode_number (int): Index of the episode, used only for logging
        """
        # Update local weights
        self.sess.run(self.global_to_local)

        states = []
        actions = []
        rewards = []

        state = procces_image(self.env.reset())

        done = False
        total_reward = 0
        time_step = 0  # scored points (or episode end) since the last update
        time_max = np.random.randint(15, 25)
        action_repeat = 1  # environment steps taken per chosen action
        old_states = 1  # number of previous frames subtracted from the new one
        states_old = [np.zeros_like(state) for _ in range(old_states)]
        states_old[-1] = state
        state_diff = np.zeros_like(state)
        while not done:

            action = self.choose_action(state_diff)
            sum_reward = 0
            for _ in range(action_repeat):
                state_new, reward, done, _info = self.env.step(action)
                sum_reward += reward
                if done:
                    break

            state_new = procces_image(state_new)
            total_reward += sum_reward

            # Store the input the action was chosen from, not the new frame.
            states.append(state_diff)
            actions.append(action)
            rewards.append(sum_reward)

            state_diff = state_new - np.sum(states_old, axis=0)

            # Slide the frame history: drop the oldest frame, append the newest.
            states_old.pop(0)
            states_old.append(state_new)

            if sum_reward != 0 or done:
                time_step += 1

                if time_step >= time_max or done:
                    self.train(states, actions, rewards)
                    # Pick up whatever all workers have pushed to the global network.
                    self.sess.run(self.global_to_local)
                    states, actions, rewards = [], [], []
                    time_step = 0

        print("Agent -> {}, reward = {}, episode = {}".format(self.name, total_reward, episode_number))

    def run(self):
        """Thread body: plays episodes until the coordinator requests a stop
        or `max_episode_number` episodes have been played, whichever comes
        first.

        The stop request is only checked between episodes, so an episode in
        progress always runs to completion.
        """
        episode_number = 0
        # Both conditions must hold to keep playing. Combining them with `or`
        # would ignore the episode limit and, past that limit, the stop request.
        while not self.coord.should_stop() and episode_number < max_episode_number:
            self.play_episode(episode_number)
            episode_number += 1

    def choose_action(self, states):
        """Samples an action id from the local policy.

        Args:
            states (proccesed image): (H, W, 1)/(input_shape)

        Returns:
            Action id in `first_number .. first_number + output_dim - 1`,
            drawn according to the policy's action probabilities
        """
        states = np.reshape(states, [-1] + self.input_shape)
        feed = { self.local.states: states }

        action = self.sess.run(self.local.action_prob, feed)
        action = np.squeeze(action)

        # Network outputs are zero-based; shift them by the same offset that
        # train() removes again.
        return np.random.choice(np.arange(self.output_dim) + first_number, p=action)

    def train(self, states, actions, rewards):
        """Computes gradients on the local network and applies them to the
        global network.

        Args:
            states (list): Network inputs collected by play_episode
            actions (list): Action ids chosen for those inputs
            rewards (list): Reward received after each action
        """
        states = np.array(states)
        actions = np.array(actions) - first_number # in atari we don't start from 0
        rewards = np.array(rewards)

        # Calculate value for each state
        feed = { self.local.states: states }
        values = self.sess.run(self.local.values, feed)

        # Smears rewards over all actions (we don't know exact which action did good move)
        rewards = discount_reward(rewards, gamma=0.99)

        # Advantage = standardized return minus predicted value, standardized again.
        advantage = rewards - values
        advantage -= np.mean(advantage)
        advantage /= np.std(advantage) + 1e-8

        feed = {
            self.local.states: states,
            self.local.actions: actions,
            self.local.rewards: rewards,
            self.local.advantage: advantage
        }

        gradients = self.sess.run(self.local.gradients, feed)

        # Feed each local gradient into the matching placeholder of the global
        # network; both lists are paired in order.
        feed = []
        for (grad, _), (placeholder, _) in zip(gradients, self.global_network.gradients_placeholders):
            feed.append((placeholder, grad))

        feed = dict(feed)
        self.sess.run(self.global_network.apply_gradients, feed)


In [16]:
class A3CNetwork(object):
    """Actor-critic network: two convolution layers and one dense layer
    shared by a softmax policy head and a scalar value head.

    Every instance also builds its own RMSProp optimizer, the gradients of
    its loss, and an `apply_gradients` operation that takes gradient values
    through placeholders (so gradients computed on one network can be
    applied to another).
    """

    def __init__(self, name, input_shape, output_dim, logdir=None):
        """Network structure is defined here
        Args:
            name (str): The name of scope
            input_shape (list): The shape of input image [H, W, C]
            output_dim (int): Number of actions
            logdir (str, optional): directory to save summaries
                TODO: create a summary op
        """
        with tf.variable_scope(name):
            self.states = tf.placeholder(tf.float32, shape=[None] + input_shape, name="states")
            # Zero-based action indices.
            self.actions = tf.placeholder(tf.uint8, shape=[None], name="actions")
            self.rewards = tf.placeholder(tf.float32, shape=[None], name="rewards")
            self.advantage = tf.placeholder(tf.float32, shape=[None], name="advantage")

            action_onehot = tf.one_hot(self.actions, output_dim, name="action_onehot")
            net = self.states

            with tf.variable_scope("layer1"):
                net = tf.layers.conv2d(net,
                                       filters=16,
                                       kernel_size=(8, 8),
                                       strides=(4, 4),
                                       name="conv")
                net = tf.nn.relu(net, name="relu")

            with tf.variable_scope("layer2"):
                net = tf.layers.conv2d(net,
                                       filters=32,
                                       kernel_size=(4, 4),
                                       strides=(2, 2),
                                       name="conv")
                net = tf.nn.relu(net, name="relu")

            with tf.variable_scope("fc1"):
                net = tf.contrib.layers.flatten(net)
                net = tf.layers.dense(net, 256, name='dense')
                net = tf.nn.relu(net, name='relu')

            # actor network
            actions = tf.layers.dense(net, output_dim, name="final_fc")
            self.action_prob = tf.nn.softmax(actions, name="action_prob")
            # Probability the policy assigned to the action that was actually taken.
            single_action_prob = tf.reduce_sum(self.action_prob * action_onehot, axis=1)

            # Policy entropy; only used by the disabled objective below.
            entropy = - self.action_prob * tf.log(self.action_prob + 1e-7)
            entropy = tf.reduce_sum(entropy, axis=1)

            # The 1e-7 keeps the logarithm finite when a probability is 0.
            log_action_prob = tf.log(single_action_prob + 1e-7)
            maximize_objective = log_action_prob * self.advantage
#             maximize_objective = log_action_prob * self.advantage + entropy * 0.005
            self.actor_loss = - tf.reduce_mean(maximize_objective)

            # value network
            # Squeeze only the unit output axis: without an explicit axis a
            # batch holding a single sample would lose its batch dimension too.
            self.values = tf.squeeze(tf.layers.dense(net, 1, name="values"), axis=1)
            self.value_loss = tf.losses.mean_squared_error(labels=self.rewards,
                                                           predictions=self.values)

            self.total_loss = self.actor_loss + self.value_loss * .5
            self.optimizer = tf.train.RMSPropOptimizer(learning_rate=0.0001, decay=.99)

        var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name)
        self.gradients = self.optimizer.compute_gradients(self.total_loss, var_list)
        self.gradients_placeholders = []

        # One placeholder per variable, shaped like the variable, so that
        # externally computed gradient values can be fed to apply_gradients.
        for grad, var in self.gradients:
            self.gradients_placeholders.append((tf.placeholder(var.dtype, shape=var.get_shape()), var))
        self.apply_gradients = self.optimizer.apply_gradients(self.gradients_placeholders)

        if logdir:
            loss_summary = tf.summary.scalar("total_loss", self.total_loss)
            value_summary = tf.summary.histogram("values", self.values)

            self.summary_op = tf.summary.merge([loss_summary, value_summary])
            self.summary_writer = tf.summary.FileWriter(logdir)


In [5]:
def run_pong():
    """Trains the Pong agents for `terminate_time` hours (or until interrupted).

    Builds the global network and `n_threads` worker agents (one gym
    environment each), restores the global weights from `load_path` when a
    checkpoint exists there, otherwise initializes all variables, and starts
    the workers. After the time limit, or on a keyboard/system interrupt,
    the global variables are saved to `save_path`.

    Threads, environments and the session are released in a `finally`
    clause, so they are also cleaned up when an unexpected exception aborts
    the run; such an exception still propagates to the caller.
    """
    tf.reset_default_graph()
    sess = tf.InteractiveSession()
    coord = tf.train.Coordinator()

    thread_list = []
    env_list = []
    # Becomes True once the global variables hold real values (restored or
    # initialized); saving before that point would fail.
    variables_ready = False

    try:
        global_network = A3CNetwork(name="global", input_shape=procces_image_size, output_dim=output_dim)

        for thread_id in range(n_threads):
            env = gym.make("Pong-v0")

            # Only the first worker's environment is recorded.
            if thread_id == 0 and monitor:
                env = gym.wrappers.Monitor(env, monitor_name, force=True)

            # Registered before the agent is built so the environment is
            # closed even if building the agent fails.
            env_list.append(env)

            single_agent = Agent(env=env, session=sess, coord=coord,
                                 name="thread_{}".format(thread_id), global_network=global_network,
                                 input_shape=procces_image_size, output_dim=output_dim)

            thread_list.append(single_agent)

        if tf.train.get_checkpoint_state(dirname(load_path)):
            # Only variables under the "global" scope are restored; each
            # worker copies them into its local network before playing.
            var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "global")
            saver = tf.train.Saver(var_list=var_list)
            saver.restore(sess, load_path)
            print("Load from", load_path)
            print("Model restored to global")
        else:
            init = tf.global_variables_initializer()
            sess.run(init)
            print("No model is found")
        variables_ready = True

        for t in thread_list:
            t.start()

        print("Keyboard or System interupt to close")
        sleep(60*60*terminate_time)
        # Time limit reached: take the same save-and-shutdown path as an interrupt.
        raise SystemExit
        # coord.wait_for_stop()

    except (KeyboardInterrupt, SystemExit):
        if variables_ready:
            var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "global")
            saver = tf.train.Saver(var_list=var_list)
            saver.save(sess, save_path)
            print()
            print("=" * 10)
            print('Checkpoint Saved to {}'.format(save_path))
            print("=" * 10)
        else:
            print("Interrupted before the model was ready, nothing to save")

    finally:
        print("Closing threads")
        coord.request_stop()
        coord.join(thread_list)

        print("Closing environments")
        for env in env_list:
            env.close()

        sess.close()


In [7]:
# Experiment settings for this run; run_pong and the Agent class read them as globals.
# Alternative with three actions only (output_dim would become 3):
# possible_movements = np.array([1, 2, 3])
possible_movements = np.array([1, 2, 3, 4, 5])
# Size of the policy output layer: one probability per allowed action.
output_dim = len(possible_movements)
# Smallest action id; Agent.train subtracts it to obtain zero-based indices.
first_number = possible_movements[0]
# Shape [H, W, C] of a preprocessed frame, used as the network input shape.
procces_image_size = [80, 80, 1]
# Episode limit consulted by Agent.run.
max_episode_number = 1000

# Checkpoint prefix written when run_pong shuts down.
save_path = "tmp_model/model"
# Checkpoint prefix restored at start-up if a checkpoint exists in its directory.
load_path = "dim5_model_v0/model"
# Number of Agent worker threads.
n_threads = 3

# Record the first worker's environment into the `monitor_name` directory.
monitor = True
monitor_name = "tmp_monitor"

terminate_time = 2 # run_pong sleeps for terminate_time hours, then saves and shuts down

run_pong()

[2017-06-29 11:37:35,237] Making new env: Pong-v0
[2017-06-29 11:37:35,424] Clearing 2 monitor files from previous run (because force=True was provided)
[2017-06-29 11:37:35,811] Making new env: Pong-v0
[2017-06-29 11:37:36,317] Making new env: Pong-v0


INFO:tensorflow:Restoring parameters from dim5_model_v0/model


[2017-06-29 11:37:37,067] Restoring parameters from dim5_model_v0/model


('Load from', 'dim5_model_v0/model')
Model restored to global
Keyboard or System interupt to close


[2017-06-29 11:37:37,270] Starting new video recorder writing to /home/ivan/Dropbox/Faks/5_godina/nn_assignments/pong/tmp_monitor/openaigym.video.0.32278.video000000.mp4


Agent -> thread_1, reward = 2.0, episode = 0
Agent -> thread_2, reward = -5.0, episode = 0


[2017-06-29 11:38:00,741] Starting new video recorder writing to /home/ivan/Dropbox/Faks/5_godina/nn_assignments/pong/tmp_monitor/openaigym.video.0.32278.video000001.mp4


Agent -> thread_0, reward = -3.0, episode = 0
Agent -> thread_1, reward = 3.0, episode = 1
Agent -> thread_2, reward = -1.0, episode = 1
Agent -> thread_0, reward = -3.0, episode = 1
Agent -> thread_1, reward = -8.0, episode = 2
Agent -> thread_2, reward = -5.0, episode = 2
Agent -> thread_0, reward = -5.0, episode = 2
()
Checkpoint Saved to tmp_model/model
Closing threads
Agent -> thread_1, reward = -8.0, episode = 3
Agent -> thread_2, reward = -9.0, episode = 3
Agent -> thread_0, reward = -4.0, episode = 3


[2017-06-29 11:39:23,269] Finished writing results. You can upload them to the scoreboard via gym.upload('/home/ivan/Dropbox/Faks/5_godina/nn_assignments/pong/tmp_monitor')


Closing environments


In [13]:
def play_videos():
    """Builds an HTML page that embeds every recorded episode video.

    Looks for `*.mp4` files inside every directory whose name starts with
    the module-level `monitor_name`. Each video is labelled with its path
    minus the leading `monitor_name`. The files are listed in sorted path
    order, because `glob` itself returns matches in arbitrary order.

    Returns:
        IPython.display.HTML: One <video> element per file found (empty
            markup when there are no recordings)
    """
    video_template = """
    %s <br>
    <video width="320" height="240" controls>
      <source src="%s" type="video/mp4">
      Your browser does not support the video tag.
    </video><br>
    """

    videos = [
        video_template % (path[len(monitor_name):], path)
        for path in sorted(glob(monitor_name + "*/*.mp4"))
    ]

    return HTML(''.join(videos))

# Last expression of the cell, so the returned HTML object is shown as the cell output.
play_videos()


In [17]:
# Same experiment as the previous run, but with 5 worker threads and a
# separate monitor directory.
# Alternative with three actions only (output_dim would become 3):
# possible_movements = np.array([1, 2, 3])
possible_movements = np.array([1, 2, 3, 4, 5])
# Size of the policy output layer: one probability per allowed action.
output_dim = len(possible_movements)
# Smallest action id; Agent.train subtracts it to obtain zero-based indices.
first_number = possible_movements[0]
# Shape [H, W, C] of a preprocessed frame, used as the network input shape.
procces_image_size = [80, 80, 1]
# Episode limit consulted by Agent.run.
max_episode_number = 1000

# Checkpoint prefix written when run_pong shuts down.
save_path = "tmp_model/model"
# Checkpoint prefix restored at start-up if a checkpoint exists in its directory.
load_path = "dim5_model_v0/model"
# Number of Agent worker threads.
n_threads = 5

# Record the first worker's environment into the `monitor_name` directory.
monitor = True
monitor_name = "tmp2_monitor"

terminate_time = 2 # run_pong sleeps for terminate_time hours, then saves and shuts down

run_pong()

[2017-06-29 13:52:36,190] Making new env: Pong-v0
[2017-06-29 13:52:36,360] Clearing 8 monitor files from previous run (because force=True was provided)
[2017-06-29 13:52:36,811] Making new env: Pong-v0
[2017-06-29 13:52:37,254] Making new env: Pong-v0
[2017-06-29 13:52:37,693] Making new env: Pong-v0
[2017-06-29 13:52:38,288] Making new env: Pong-v0


INFO:tensorflow:Restoring parameters from dim5_model_v0/model


[2017-06-29 13:52:38,770] Restoring parameters from dim5_model_v0/model


('Load from', 'dim5_model_v0/model')
Model restored to global
Keyboard or System interupt to close


[2017-06-29 13:52:39,016] Starting new video recorder writing to /home/ivan/Dropbox/Faks/5_godina/nn_assignments/pong/tmp2_monitor/openaigym.video.3.8333.video000000.mp4


Agent -> thread_4, reward = -4.0, episode = 0
Agent -> thread_1, reward = -8.0, episode = 0
Agent -> thread_2, reward = -2.0, episode = 0
Agent -> thread_3, reward = -2.0, episode = 0


[2017-06-29 13:53:21,351] Starting new video recorder writing to /home/ivan/Dropbox/Faks/5_godina/nn_assignments/pong/tmp2_monitor/openaigym.video.3.8333.video000001.mp4


Agent -> thread_0, reward = -4.0, episode = 0
Agent -> thread_4, reward = -10.0, episode = 1
Agent -> thread_1, reward = -7.0, episode = 1
Agent -> thread_2, reward = -2.0, episode = 1
Agent -> thread_3, reward = 3.0, episode = 1
Agent -> thread_0, reward = -10.0, episode = 1
Agent -> thread_2, reward = -13.0, episode = 2
Agent -> thread_4, reward = 1.0, episode = 2
Agent -> thread_1, reward = 1.0, episode = 2
Agent -> thread_3, reward = -7.0, episode = 2
Agent -> thread_0, reward = -1.0, episode = 2
Agent -> thread_4, reward = -7.0, episode = 3
Agent -> thread_2, reward = -3.0, episode = 3
Agent -> thread_1, reward = -7.0, episode = 3
Agent -> thread_3, reward = -4.0, episode = 3
Agent -> thread_0, reward = -7.0, episode = 3
Agent -> thread_2, reward = -7.0, episode = 4
Agent -> thread_4, reward = -8.0, episode = 4
Agent -> thread_1, reward = -3.0, episode = 4
Agent -> thread_0, reward = -8.0, episode = 4
Agent -> thread_2, reward = -10.0, episode = 5
Agent -> thread_3, reward = -2.0,

[2017-06-29 13:57:22,394] Starting new video recorder writing to /home/ivan/Dropbox/Faks/5_godina/nn_assignments/pong/tmp2_monitor/openaigym.video.3.8333.video000008.mp4


Agent -> thread_0, reward = -3.0, episode = 7
Agent -> thread_2, reward = -10.0, episode = 8
Agent -> thread_1, reward = -7.0, episode = 8
Agent -> thread_3, reward = -1.0, episode = 7
Agent -> thread_4, reward = -9.0, episode = 8
Agent -> thread_0, reward = 3.0, episode = 8
Agent -> thread_2, reward = -8.0, episode = 9
Agent -> thread_1, reward = -4.0, episode = 9
Agent -> thread_3, reward = -8.0, episode = 8
Agent -> thread_4, reward = -3.0, episode = 9
Agent -> thread_0, reward = -9.0, episode = 9
Agent -> thread_2, reward = 3.0, episode = 10
Agent -> thread_1, reward = -5.0, episode = 10
Agent -> thread_3, reward = -4.0, episode = 9
Agent -> thread_4, reward = -4.0, episode = 10
Agent -> thread_2, reward = -9.0, episode = 11
Agent -> thread_0, reward = -2.0, episode = 10
Agent -> thread_3, reward = -6.0, episode = 10
Agent -> thread_1, reward = -3.0, episode = 11
Agent -> thread_4, reward = -3.0, episode = 11
Agent -> thread_0, reward = -10.0, episode = 11
Agent -> thread_2, reward

[2017-06-29 14:08:13,073] Starting new video recorder writing to /home/ivan/Dropbox/Faks/5_godina/nn_assignments/pong/tmp2_monitor/openaigym.video.3.8333.video000027.mp4


Agent -> thread_0, reward = -3.0, episode = 26
Agent -> thread_1, reward = -5.0, episode = 27
Agent -> thread_4, reward = -3.0, episode = 27
Agent -> thread_2, reward = -9.0, episode = 28
Agent -> thread_3, reward = -4.0, episode = 26
Agent -> thread_0, reward = -6.0, episode = 27
Agent -> thread_1, reward = -8.0, episode = 28
Agent -> thread_4, reward = -7.0, episode = 28
Agent -> thread_3, reward = -7.0, episode = 27
Agent -> thread_2, reward = -1.0, episode = 29
Agent -> thread_1, reward = -15.0, episode = 29
Agent -> thread_0, reward = -7.0, episode = 28
Agent -> thread_4, reward = -9.0, episode = 29
()
Checkpoint Saved to tmp_model/model
Closing threads
Agent -> thread_2, reward = -10.0, episode = 30
Agent -> thread_0, reward = -13.0, episode = 29
Agent -> thread_3, reward = -6.0, episode = 28
Agent -> thread_1, reward = -1.0, episode = 30
Agent -> thread_4, reward = -10.0, episode = 30


[2017-06-29 14:10:13,493] Finished writing results. You can upload them to the scoreboard via gym.upload('/home/ivan/Dropbox/Faks/5_godina/nn_assignments/pong/tmp2_monitor')


Closing environments
