In [2]:
import threading
import multiprocessing
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.contrib.layers as layers
import tensorflow.contrib.slim as slim
import scipy.signal
%matplotlib inline
from helper import *
from vizdoom import *

from random import choice
from time import sleep
from time import time

  from ._conv import register_converters as _register_converters


Helper Functions

In [9]:
# Builds the ops that copy one set of trainable variables onto another.
# Used to set worker network parameters to those of global network.
def update_target_graph(from_scope,to_scope):
    """Return a list of assign ops copying `from_scope` variables into `to_scope`.

    Variables are looked up in the TRAINABLE_VARIABLES collection filtered by
    each scope and paired positionally with `zip`, so any surplus variables on
    either side are silently ignored.
    """
    collection_key = tf.GraphKeys.TRAINABLE_VARIABLES
    source_vars = tf.get_collection(collection_key, from_scope)
    destination_vars = tf.get_collection(collection_key, to_scope)

    return [
        destination.assign(source)
        for source, destination in zip(source_vars, destination_vars)
    ]

# Processes Doom screen image to produce cropped and resized image.
def process_frame(frame):
    """Crop a screen buffer, resize it to 84x84 and flatten it.

    Args:
        frame: array-like image whose first two axes are (rows, columns).
            10 rows are trimmed from the top and bottom and 30 columns from
            the left and right before resizing.

    Returns:
        A 1-D numpy array holding the resized pixels divided by 255.0.
    """
    cropped = frame[10:-10,30:-30]

    # The notebook only imports `scipy.signal`, so `scipy.misc` is not
    # guaranteed to be loaded; `imresize` was also removed in SciPy 1.3.
    # Import it locally and fall back to ndimage when it is unavailable.
    try:
        from scipy.misc import imresize
    except ImportError:
        imresize = None

    if imresize is not None:
        resized = imresize(cropped,[84,84])
    else:
        from scipy import ndimage
        cropped = np.asarray(cropped)
        # Scale only the two spatial axes; any trailing axes (e.g. colour
        # channels) keep their size.
        factors = [84.0 / cropped.shape[0], 84.0 / cropped.shape[1]]
        factors += [1.0] * (cropped.ndim - 2)
        # order=1 is linear interpolation. Unlike imresize, this path does not
        # rescale non-8-bit input to the 0-255 range first.
        resized = ndimage.zoom(cropped, factors, order=1)

    return np.reshape(resized,[np.prod(resized.shape)]) / 255.0

# Discounting function used to calculate discounted returns.
def discount(x, gamma):
    """Return the discounted cumulative sum of `x` along axis 0.

    Element i of the result is x[i] + gamma * x[i+1] + gamma**2 * x[i+2] + ...
    The sequence is reversed, run through a first-order IIR filter
    (y[n] = x[n] + gamma * y[n-1]) and reversed back.
    """
    reversed_input = x[::-1]
    numerator = [1]
    denominator = [1, -gamma]
    accumulated = scipy.signal.lfilter(numerator, denominator, reversed_input, axis=0)
    return accumulated[::-1]

#Used to initialize weights for policy and value output layers
def normalized_columns_initializer(std=1.0):
    """Build an initializer whose columns each have L2 norm equal to `std`.

    The returned callable draws standard-normal values of the requested shape,
    rescales every column (axis 0) to norm `std`, and wraps the result in a
    `tf.constant`. The `dtype` and `partition_info` arguments are accepted but
    not used.
    """
    def _initializer(shape, dtype=None, partition_info=None):
        weights = np.random.randn(*shape).astype(np.float32)
        column_norms = np.sqrt(np.square(weights).sum(axis=0, keepdims=True))
        weights *= std / column_norms
        return tf.constant(weights)
    return _initializer

In [10]:
class Agent(object):
    """Agent that acts through a graph built by `build_graph` and learns from a replay buffer.

    Each call to `act` stores the previous transition in `replay_buffer`,
    and, once more than `warmup_steps` calls have been made, samples a batch
    and runs one value, critic and actor update followed by a target update.
    """

    def __init__(self,
                 actor,
                 critic,
                 value,
                 obs_dim,
                 num_actions,
                 replay_buffer,
                 batch_size=4,
                 action_scale=2.0,
                 gamma=0.9,
                 tau=0.01,
                 actor_lr=3*1e-3,
                 critic_lr=3*1e-3,
                 value_lr=3*1e-3,
                 reg_factor=1e-3,
                 warmup_steps=10 * 200):
        """Store hyper-parameters and build the training/acting callables.

        Args:
            actor, critic, value: network builders forwarded to `build_graph`.
            obs_dim: observation dimensions forwarded to `build_graph`.
            num_actions: size of the action vector.
            replay_buffer: object providing `append(obs_t=, action=, reward=,
                obs_tp1=, done=)` and `sample(batch_size)`.
            batch_size: number of transitions sampled per update.
            action_scale: multiplier applied to the action returned by `act`.
            gamma, tau, actor_lr, critic_lr, value_lr, reg_factor: forwarded
                to `build_graph`.
            warmup_steps: number of `act` calls that must be exceeded before
                network updates start (previously hard-coded as 10 * 200).
        """
        self.batch_size = batch_size
        self.num_actions = num_actions
        self.gamma = gamma
        self.obs_dim = obs_dim
        self.action_scale = action_scale
        self.warmup_steps = warmup_steps
        self.last_obs = None
        # Initialised here so the attribute always exists; uses the same
        # "empty" value that stop_episode resets it to.
        self.last_action = []
        self.t = 0
        self.replay_buffer = replay_buffer

        self._act,\
        self._train_actor,\
        self._train_critic,\
        self._train_value,\
        self._update_target = build_graph(
            actor=actor,
            critic=critic,
            value=value,
            obs_dim=obs_dim,
            num_actions=num_actions,
            batch_size=batch_size,
            gamma=gamma,
            tau=tau,
            actor_lr=actor_lr,
            critic_lr=critic_lr,
            value_lr=value_lr,
            reg_factor=reg_factor
        )

        # Per-episode loss histories; summed and printed by stop_episode.
        self.actor_errors = []
        self.critic_errors = []
        self.value_errors = []

    def act(self, obs, reward, training=True):
        """Choose an action for `obs` and, when training, learn from past data.

        Args:
            obs: sequence whose first element is the current observation.
            reward: reward received for the previous action.
            training: when False the greedy action is used and nothing is
                stored or trained.

        Returns:
            The chosen action multiplied by `action_scale`.
        """
        obs = obs[0]
        # _act returns [sampled, greedy]; both are clipped to [-1, 1].
        action, greedy_action = np.clip(self._act([obs]), -1, 1)
        action = action[0]
        greedy_action = greedy_action[0]

        if not training:
            action = greedy_action

        if training and self.t > self.warmup_steps:
            # sample experiences
            obs_t,\
            actions,\
            rewards,\
            obs_tp1,\
            dones = self.replay_buffer.sample(self.batch_size)

            # update networks
            value_error = self._train_value(obs_t, actions)
            critic_error = self._train_critic(
                obs_t, actions, rewards, obs_tp1, dones)
            actor_error = self._train_actor(obs_t, actions)

            # store errors through episode
            self.value_errors.append(value_error)
            self.critic_errors.append(critic_error)
            self.actor_errors.append(actor_error)

            # update target networks
            self._update_target()

        # The transition for the previous step can only be completed now that
        # its successor observation and reward are known.
        if training and self.last_obs is not None:
            self.replay_buffer.append(
                obs_t=self.last_obs,
                action=self.last_action,
                reward=reward,
                obs_tp1=obs,
                done=False
            )

        self.t += 1
        self.last_obs = obs
        self.last_action = action
        return action * self.action_scale

    def stop_episode(self, obs, reward, training=True):
        """Record the terminal transition, report summed losses and reset episode state.

        Args:
            obs: sequence whose first element is the final observation.
            reward: reward received for the last action.
            training: when True the terminal transition is stored (if there
                is a previous observation) and the summed errors are printed.
        """
        obs = obs[0]
        if training:
            # Skip the append when no action was taken this episode, otherwise
            # a transition with obs_t=None would be written to the buffer.
            if self.last_obs is not None:
                self.replay_buffer.append(
                    obs_t=self.last_obs,
                    action=self.last_action,
                    reward=reward,
                    obs_tp1=obs,
                    done=True
                )
            print('actor error: {}, critic error: {}, value error: {}'.format(
                sum(self.actor_errors), sum(self.critic_errors), sum(self.value_errors)))
        self.last_obs = None
        self.last_action = []
        self.value_errors = []
        self.critic_errors = []
        self.actor_errors = []


Build SAC Network

In [12]:
class SAC_Network():
    """Builds actor, critic and value sub-networks on top of one input tensor.

    After construction the instance exposes:
      action          - tanh-squashed sample from the policy distribution
      greedy_action   - tanh of the distribution mean
      log_prob        - log-probability of the sampled action with the tanh correction
      regularizer     - the l2 regularizer applied to the actor's hidden kernels
      critic_network  - critic output computed from (inpt, action)
      value_network   - value output computed from inpt
    """
    def __init__(self,hiddens,inpt,num_actions,initializer=tf.contrib.layers.xavier_initializer(),reg_factor=1e-3,scope='sac_network',reuse=None):
        """Create all three sub-networks.

        Args:
            hiddens: iterable of hidden-layer widths, shared by all three networks.
            inpt: input tensor. The critic concatenates it with the rank-2
                action tensor, so it is expected to be rank 2 as well.
            num_actions: size of the action vector.
            initializer: kernel initializer for every dense layer.
            reg_factor: scale of the l2 regularizer on the actor's hidden kernels.
            scope: variable scope that all variables are created under. The
                original body referenced `scope`/`reuse` without defining
                them, which raised NameError on construction.
            reuse: forwarded to `tf.variable_scope`.
        """
        #make actor network
        with tf.variable_scope(scope, reuse=reuse):
            # l2 regularizer
            regularizer = layers.l2_regularizer(scale=reg_factor)
            out = inpt
            for hidden in hiddens:
                out = tf.layers.dense(
                    out, hidden,
                    bias_initializer=tf.constant_initializer(0.0),
                    kernel_initializer=initializer,
                    kernel_regularizer=regularizer)
                out = tf.nn.relu(out)

            # mean value of normal distribution
            mu = tf.layers.dense(
                out, num_actions, kernel_initializer=initializer, name='mu')
            greedy_action = tf.nn.tanh(mu)

            # log of the standard deviation (exponentiated below)
            sigma = tf.layers.dense(
                out, num_actions, kernel_initializer=initializer, name='sigma')

            # sample actions from normal distribution
            dist = tf.distributions.Normal(mu, tf.exp(sigma))
            # NOTE(review): sample(num_actions) draws num_actions samples per
            # batch element before the reshape; a single draw (`dist.sample()`)
            # may have been intended - confirm before changing.
            out = tf.reshape(dist.sample(num_actions), [-1, num_actions])
            out = tf.stop_gradient(out)
            action = tf.nn.tanh(out)
            # change-of-variables correction for the tanh squashing
            log_prob = dist.log_prob(out) - tf.log(1 - action ** 2 + 1e-6)
        self.action= action
        self.greedy_action = greedy_action
        self.log_prob = log_prob
        self.regularizer=regularizer

        #make critic network
        # Built inside `scope` like the other two networks so that variable
        # collection filtered by scope also picks up the critic.
        with tf.variable_scope(scope, reuse=reuse):
            # `action` is rank 2 ([-1, num_actions]), so the previous axis=3
            # could never be valid; join on the last axis instead.
            out = tf.concat([inpt, action], axis=-1)
            for hidden in hiddens:
                out = tf.layers.dense(out, hidden,
                                      bias_initializer=tf.constant_initializer(0.0),
                                      kernel_initializer=initializer)
                out = tf.nn.relu(out)
            out = tf.layers.dense(out, 1, kernel_initializer=initializer)
        self.critic_network=out
        #make value network
        with tf.variable_scope(scope, reuse=reuse):
            out = inpt
            for hidden in hiddens:
                out = tf.layers.dense(
                    out, hidden,
                    bias_initializer=tf.constant_initializer(0.0),
                    kernel_initializer=initializer)
                out = tf.nn.relu(out)

            out = tf.layers.dense(out, 1, kernel_initializer=initializer)
        self.value_network=out
            

# def make_actor_network(hiddens):
#     return lambda *args, **kwargs: _make_actor_network(hiddens, *args, **kwargs)

# def make_critic_network(hiddens):
#     return lambda *args, **kwargs: _make_critic_network(hiddens, *args, **kwargs)

# def make_value_network(hiddens):
#     return lambda *args, **kwargs: _make_value_network(hiddens, *args, **kwargs)


Build graph for updating

In [13]:
def build_graph(actor,
                critic,
                value,
                obs_dim,
                num_actions,
                batch_size,
                gamma,
                tau,
                actor_lr,
                critic_lr,
                value_lr,
                reg_factor,
                scope='sac',
                reuse=None):
    """Build the training graph and return callables that run it.

    Args:
        actor: callable invoked as actor(obs, num_actions, reg_factor=, scope=);
            must return (policy, greedy_policy, log_pi, regularizer).
        critic: callable invoked as critic(obs, action, scope=, reuse=).
        value: callable invoked as value(obs, scope=).
        obs_dim: indexable with at least three entries; they become the
            trailing dimensions of the observation placeholders.
        num_actions: width of the action placeholder.
        batch_size: accepted but not used inside this function.
        gamma: discount factor used in the critic target.
        tau: interpolation weight of the soft target-value update.
        actor_lr, critic_lr, value_lr: Adam learning rates.
        reg_factor: forwarded to `actor`.
        scope: outer variable scope; also used as the prefix when collecting
            each network's trainable variables.
        reuse: forwarded to `tf.variable_scope`.

    Returns:
        (act, train_actor, train_critic, train_value, update_target): five
        closures that execute ops on `tf.get_default_session()`.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # input placeholders
        print("OBS DIM in graph " + str(obs_dim))
        obs_t_input = tf.placeholder(tf.float32, shape=[None,obs_dim[0],obs_dim[1],obs_dim[2]], name='obs_t')
        act_t_ph = tf.placeholder(tf.float32, [None, num_actions], name='action')
        rew_t_ph = tf.placeholder(tf.float32, [None], name='reward')
        obs_tp1_input = tf.placeholder(tf.float32, shape=[None,obs_dim[0],obs_dim[1],obs_dim[2]], name='obs_tp1')
        done_mask_ph = tf.placeholder(tf.float32, [None], name='done')
        print("obs_t_input " + str(obs_t_input))
        # actor network
        policy_t, greedy_policy_t, log_pi_t, reg = actor(
            obs_t_input, num_actions, reg_factor=reg_factor, scope='actor')
        actor_func_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, '{}/actor'.format(scope))
        print("finished setting actor network")
        # critic network: once on the fed action, once (sharing weights via
        # reuse=True) on the action produced by the actor
        q_t = critic(obs_t_input, act_t_ph, scope='critic')
        q_t_with_actor = critic(
            obs_t_input, policy_t, scope='critic', reuse=True)
        critic_func_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, '{}/critic'.format(scope))
        print("finished setting critic network")
        # value network
        v_t = value(obs_t_input, scope='value')
        value_func_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, '{}/value'.format(scope))
        print("finished setting value network")
        # target value network (separate variables, evaluated on next observations)
        v_tp1 = value(obs_tp1_input, scope='target_value')
        target_func_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, '{}/target_value'.format(scope))
        print("finished setting target value network")
        # value loss: regress v_t onto (q_t - log_pi_t), with the target held constant
        with tf.variable_scope('value_loss'):
            target = q_t - log_pi_t
            value_loss = tf.reduce_mean(
                0.5 * tf.square(v_t - tf.stop_gradient(target)))

        # critic loss: regress q_t onto reward + gamma * v_tp1, masked at episode end
        # NOTE(review): rew_t_ph and done_mask_ph are rank 1 ([None]); if
        # v_tp1 / q_t come out with a trailing dimension of 1 this expression
        # would broadcast to [batch, batch] - confirm the network output shapes.
        with tf.variable_scope('critic_loss'):
            target = rew_t_ph + gamma * v_tp1 * (1.0 - done_mask_ph)
            critic_loss = tf.reduce_mean(
                0.5 * tf.square(q_t - tf.stop_gradient(target)))

        # policy loss: log_pi_t weighted by the constant (log_pi_t - (q - v)),
        # plus an l2 term
        with tf.variable_scope('policy_loss'):
            target = q_t_with_actor - v_t
            actor_loss = 0.5 * tf.reduce_mean(
                log_pi_t * tf.stop_gradient(log_pi_t - target))
            # NOTE(review): `reg` is applied to the REGULARIZATION_LOSSES
            # collection (already-computed loss tensors), not to weight
            # variables - confirm this is intended.
            reg_variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = layers.apply_regularization(reg, reg_variables)
            actor_loss = actor_loss + l2_loss

        # optimize operations: one Adam optimizer per network, each restricted
        # to that network's own variables
        critic_optimizer = tf.train.AdamOptimizer(critic_lr)
        critic_optimize_expr = critic_optimizer.minimize(
            critic_loss, var_list=critic_func_vars)
        actor_optimizer = tf.train.AdamOptimizer(actor_lr)
        actor_optimize_expr = actor_optimizer.minimize(
            actor_loss, var_list=actor_func_vars)
        value_optimizer = tf.train.AdamOptimizer(value_lr)
        value_optimize_expr = value_optimizer.minimize(
            value_loss, var_list=value_func_vars)

        # soft update of the target value network
        with tf.variable_scope('update_value_target'):
            update_target_expr = []
            # sort both lists by name so corresponding variables are paired
            sorted_vars = sorted(value_func_vars, key=lambda v: v.name)
            sorted_target_vars = sorted(target_func_vars, key=lambda v: v.name)
            # target <- tau * value + (1 - tau) * target
            for var, var_target in zip(sorted_vars, sorted_target_vars):
                new_var = tau * var + (1 - tau) * var_target
                update_target_expr.append(var_target.assign(new_var))
            update_target_expr = tf.group(*update_target_expr)

        def act(obs):
            # Returns [sampled action, greedy action] for the given observations.
            feed_dict = {
                obs_t_input: obs
            }
            # NOTE(review): the three prints below run on every call and dump
            # the full observation batch - they look like leftover debugging.
            print("policy t " + str(policy_t))
            print("greedy policy "  + str(greedy_policy_t))
            # feed_dict=tf.reshape(feed_dict,[1,240,320,3])
            print("feed_dict "  + str(feed_dict))
            return tf.get_default_session().run(
                [policy_t, greedy_policy_t], feed_dict=feed_dict)

        def train_actor(obs, action):
            # Runs one actor optimisation step and returns the actor loss.
            feed_dict = {
                obs_t_input: obs,
                act_t_ph: action
            }
            loss_val, _ = tf.get_default_session().run(
                [actor_loss, actor_optimize_expr], feed_dict=feed_dict)
            return loss_val

        def train_critic(obs_t, action, rew, obs_tp1, done):
            # Runs one critic optimisation step and returns the critic loss.
            feed_dict = {
                obs_t_input: obs_t,
                act_t_ph: action,
                rew_t_ph: rew,
                obs_tp1_input: obs_tp1,
                done_mask_ph: done
            }
            loss_val, _ = tf.get_default_session().run(
                [critic_loss, critic_optimize_expr], feed_dict=feed_dict)
            return loss_val

        def train_value(obs_t, action):
            # Runs one value optimisation step and returns the value loss.
            feed_dict = {
                obs_t_input: obs_t,
                act_t_ph: action
            }
            loss_val, _ = tf.get_default_session().run(
                [value_loss, value_optimize_expr], feed_dict=feed_dict)
            return loss_val

        def update_target():
            # Applies the soft update to every target value variable.
            tf.get_default_session().run(update_target_expr)

        return act, train_actor, train_critic, train_value, update_target

In [8]:
class Worker():
    """Runs Doom episodes with a local network copy and trains from the collected rollouts.

    NOTE(review): this class reads attributes from `self.local_SAC`
    (`inputs`, `state_in`, `state_out`, `state_init`, `policy`, `value`,
    `target_v`, `actions`, `advantages`, `value_loss`, `policy_loss`,
    `entropy`, `grad_norms`, `var_norms`, `apply_grads`) that the
    `SAC_Network` class defined in this notebook does not set. The network
    and the worker need to be reconciled before this can run.
    """
    def __init__(self,game,name,s_size,a_size,trainer,model_path,global_episodes):
        """Create the local network, the sync op and configure the Doom game.

        Args:
            game: ViZDoom game instance; configured and initialised here.
            name: worker index, used for the scope name and summary directory.
            s_size: state size passed to the network constructor.
            a_size: number of discrete actions.
            trainer: optimizer passed to the network constructor.
            model_path: directory that checkpoints are written to.
            global_episodes: TF variable counting episodes across workers.
        """
        self.name = "worker_" + str(name)
        self.number = name
        self.model_path = model_path
        self.trainer = trainer
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        self.summary_writer = tf.summary.FileWriter("train_"+str(self.number))
        # Currently unused. NOTE(review): presumably meant to be passed to
        # SAC_Network, whose first parameter is `hiddens` - confirm.
        hiddens=[128,128]
        #Create the local copy of the network and the tensorflow op to copy global parameters to local network
        # NOTE(review): SAC_Network.__init__ is declared as
        # (hiddens, inpt, num_actions, ...); these arguments do not match it.
        self.local_SAC = SAC_Network(s_size,a_size,self.name,trainer)
        self.update_local_ops = update_target_graph('global',self.name)

        #The Below code is related to setting up the Doom environment
        game.set_doom_scenario_path("scenarios/health_gathering_supreme.wad") #This corresponds to the simple task we will pose our agent
        game.set_doom_map("map01")
        game.set_screen_resolution(ScreenResolution.RES_160X120)
        game.set_screen_format(ScreenFormat.GRAY8)
        game.set_render_hud(False)
        game.set_render_crosshair(False)
        game.set_render_weapon(True)
        game.set_render_decals(False)
        game.set_render_particles(False)
        game.add_available_button(Button.MOVE_LEFT)
        game.add_available_button(Button.MOVE_RIGHT)
        game.add_available_button(Button.ATTACK)
        game.add_available_game_variable(GameVariable.AMMO2)
        game.add_available_game_variable(GameVariable.POSITION_X)
        game.add_available_game_variable(GameVariable.POSITION_Y)
        game.set_episode_timeout(300)
        game.set_episode_start_time(10)
        game.set_window_visible(False)
        game.set_sound_enabled(False)
        game.set_living_reward(-1)
        game.set_mode(Mode.PLAYER)
        game.init()
        # One-hot button vectors, one per discrete action.
        self.actions = np.identity(a_size,dtype=bool).tolist()
        #End Doom set-up
        self.env = game

    def train(self,rollout,sess,gamma,bootstrap_value):
        """Run one update from a rollout and return loss statistics.

        Args:
            rollout: list of [s, a, r, s1, d, v] entries as built in `work`.
            sess: TensorFlow session used to run the update.
            gamma: discount factor.
            bootstrap_value: value estimate appended after the last step
                (0.0 when the episode has ended).

        Returns:
            (value_loss, policy_loss, entropy) each divided by the rollout
            length, followed by the gradient norm and variable norm.
        """
        rollout = np.array(rollout)
        observations = rollout[:,0]
        actions = rollout[:,1]
        rewards = rollout[:,2]
        next_observations = rollout[:,3]
        values = rollout[:,5]

        # Here we take the rewards and values from the rollout, and use them to
        # generate the advantage and discounted returns.
        # The advantage function uses "Generalized Advantage Estimation"
        self.rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        discounted_rewards = discount(self.rewards_plus,gamma)[:-1]
        self.value_plus = np.asarray(values.tolist() + [bootstrap_value])
        # one-step TD residuals, then discounted-summed into advantages
        advantages = rewards + gamma * self.value_plus[1:] - self.value_plus[:-1]
        advantages = discount(advantages,gamma)

        # Update the global network using gradients from loss
        # Generate network statistics to periodically save
        feed_dict = {self.local_SAC.target_v:discounted_rewards,
            self.local_SAC.inputs:np.vstack(observations),
            self.local_SAC.actions:actions,
            self.local_SAC.advantages:advantages,
            self.local_SAC.state_in[0]:self.batch_rnn_state[0],
            self.local_SAC.state_in[1]:self.batch_rnn_state[1]}
        v_l,p_l,e_l,g_n,v_n, self.batch_rnn_state,_ = sess.run([self.local_SAC.value_loss,
            self.local_SAC.policy_loss,
            self.local_SAC.entropy,
            self.local_SAC.grad_norms,
            self.local_SAC.var_norms,
            self.local_SAC.state_out,
            self.local_SAC.apply_grads],
            feed_dict=feed_dict)
        return v_l / len(rollout),p_l / len(rollout),e_l / len(rollout), g_n,v_n

    def work(self,max_episode_length,gamma,sess,coord,saver):
        """Play episodes until the coordinator requests a stop.

        Args:
            max_episode_length: used only to skip the mid-episode update on
                the step just before this length.
            gamma: discount factor passed to `train`.
            sess: TensorFlow session.
            coord: coordinator polled via `should_stop()`.
            saver: saver used by worker_0 to write checkpoints.
        """
        episode_count = sess.run(self.global_episodes)
        total_steps = 0
        print ("Starting worker " + str(self.number))
        with sess.as_default(), sess.graph.as_default():
            while not coord.should_stop():
                # Start every episode from the latest global parameters.
                sess.run(self.update_local_ops)
                episode_buffer = []
                episode_values = []
                episode_frames = []
                episode_reward = 0
                episode_step_count = 0
                d = False

                self.env.new_episode()
                s = self.env.get_state().screen_buffer
                episode_frames.append(s)
                s = process_frame(s)
                rnn_state = self.local_SAC.state_init
                self.batch_rnn_state = rnn_state
                while not self.env.is_episode_finished():
                    #Take an action using probabilities from policy network output.
                    # (was `self.local_AC.inputs`, an attribute that is never defined)
                    a_dist,v,rnn_state = sess.run([self.local_SAC.policy,self.local_SAC.value,self.local_SAC.state_out],
                        feed_dict={self.local_SAC.inputs:[s],
                        self.local_SAC.state_in[0]:rnn_state[0],
                        self.local_SAC.state_in[1]:rnn_state[1]})
                    # Sample a probability value, then recover its index.
                    a = np.random.choice(a_dist[0],p=a_dist[0])
                    a = np.argmax(a_dist == a)

                    r = self.env.make_action(self.actions[a]) / 100.0
                    d = self.env.is_episode_finished()
                    if not d:
                        s1 = self.env.get_state().screen_buffer
                        episode_frames.append(s1)
                        s1 = process_frame(s1)
                    else:
                        # No new frame after the terminal step; reuse the last one.
                        s1 = s

                    episode_buffer.append([s,a,r,s1,d,v[0,0]])
                    episode_values.append(v[0,0])

                    episode_reward += r
                    s = s1
                    total_steps += 1
                    episode_step_count += 1

                    # If the episode hasn't ended, but the experience buffer is full, then we
                    # make an update step using that experience rollout.
                    if len(episode_buffer) == 30 and not d and episode_step_count != max_episode_length - 1:
                        # Since we don't know what the true final return is, we "bootstrap" from our current
                        # value estimation.
                        # (was `self.local_AC...`, an attribute that is never defined)
                        v1 = sess.run(self.local_SAC.value,
                            feed_dict={self.local_SAC.inputs:[s],
                            self.local_SAC.state_in[0]:rnn_state[0],
                            self.local_SAC.state_in[1]:rnn_state[1]})[0,0]
                        v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,v1)
                        episode_buffer = []
                        sess.run(self.update_local_ops)
                    if d:
                        break

                self.episode_rewards.append(episode_reward)
                self.episode_lengths.append(episode_step_count)
                self.episode_mean_values.append(np.mean(episode_values))

                # Update the network using the episode buffer at the end of the episode.
                if len(episode_buffer) != 0:
                    v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,0.0)


                # Periodically save gifs of episodes, model parameters, and summary statistics.
                if episode_count % 5 == 0 and episode_count != 0:
                    if self.name == 'worker_0' and episode_count % 25 == 0:
                        time_per_step = 0.05
                        images = np.array(episode_frames)
                        make_gif(images,'./frames/image'+str(episode_count)+'.gif',
                            duration=len(images)*time_per_step,true_image=True,salience=False)
                    if episode_count % 250 == 0 and self.name == 'worker_0':
                        saver.save(sess,self.model_path+'/model-'+str(episode_count)+'.cptk')
                        print ("Saved Model")

                    # Averages over the five most recent episodes.
                    mean_reward = np.mean(self.episode_rewards[-5:])
                    mean_length = np.mean(self.episode_lengths[-5:])
                    mean_value = np.mean(self.episode_mean_values[-5:])
                    summary = tf.Summary()
                    summary.value.add(tag='Perf/Reward', simple_value=float(mean_reward))
                    summary.value.add(tag='Perf/Length', simple_value=float(mean_length))
                    summary.value.add(tag='Perf/Value', simple_value=float(mean_value))
                    summary.value.add(tag='Losses/Value Loss', simple_value=float(v_l))
                    summary.value.add(tag='Losses/Policy Loss', simple_value=float(p_l))
                    summary.value.add(tag='Losses/Entropy', simple_value=float(e_l))
                    summary.value.add(tag='Losses/Grad Norm', simple_value=float(g_n))
                    summary.value.add(tag='Losses/Var Norm', simple_value=float(v_n))
                    self.summary_writer.add_summary(summary, episode_count)

                    self.summary_writer.flush()
                # Only worker_0 advances the shared episode counter.
                if self.name == 'worker_0':
                    sess.run(self.increment)
                episode_count += 1