# Simple Reinforcement Learning with Tensorflow Part 8: Asynchronus Advantage Actor-Critic (A3C)

This iPython notebook includes an implementation of the [A3C algorithm](https://arxiv.org/pdf/1602.01783.pdf). In it we use A3C to solve a simple 3D Doom challenge using the [VizDoom engine](http://vizdoom.cs.put.edu.pl/). For more information on A3C, see the accompanying [Medium post](https://medium.com/p/c88f72a5e9f2/edit).

This tutorial requires that VizDoom is installed. It can be easily obtained with:

`pip install vizdoom`

We also require `basic.wad` and `helper.py`, both of which are available from the [DeepRL-Agents github repo](https://github.com/awjuliani/DeepRL-Agents).

While training is taking place, statistics on agent performance are available from Tensorboard. To launch it use:

`tensorboard --logdir=worker_0:'./train_0',worker_1:'./train_1',worker_2:'./train_2',worker_3:'./train_3'`

In [1]:
import threading
import multiprocessing
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.contrib.slim as slim
import scipy.signal
%matplotlib inline
from helper import *
from vizdoom import *

from random import choice
from time import sleep
from time import time

### Helper Functions

In [7]:
# Copies one set of variables to another.
# Used to set worker network parameters to those of global network.
def update_target_graph(from_scope,to_scope):
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)

    
    print('from_vars',from_vars)
    print('to_vars',to_vars)
    
    op_holder = []
    for from_var,to_var in zip(from_vars,to_vars):
        op_holder.append(to_var.assign(from_var))
    return op_holder

# Processes Doom screen image to produce cropped and resized image. 
def process_frame(frame):
    s = frame[10:-10,30:-30]
    s = scipy.misc.imresize(s,[84,84])
    s = np.reshape(s,[np.prod(s.shape)]) / 255.0
    return s

# Discounting function used to calculate discounted returns.
def discount(x, gamma):
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]

#Used to initialize weights for policy and value output layers
def normalized_columns_initializer(std=1.0):
    def _initializer(shape, dtype=None, partition_info=None):
        out = np.random.randn(*shape).astype(np.float32)
        out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
        return tf.constant(out)
    return _initializer

### Actor-Critic Network

In [3]:
class AC_Network():
    def __init__(self,s_size,a_size,scope,trainer):
        with tf.variable_scope(scope):
            #Input and visual encoding layers
            self.inputs = tf.placeholder(shape=[None,s_size],dtype=tf.float32)
            self.imageIn = tf.reshape(self.inputs,shape=[-1,84,84,1])
            self.conv1 = slim.conv2d(activation_fn=tf.nn.elu,
                inputs=self.imageIn,num_outputs=16,
                kernel_size=[8,8],stride=[4,4],padding='VALID')
            self.conv2 = slim.conv2d(activation_fn=tf.nn.elu,
                inputs=self.conv1,num_outputs=32,
                kernel_size=[4,4],stride=[2,2],padding='VALID')
            hidden = slim.fully_connected(slim.flatten(self.conv2),256,activation_fn=tf.nn.elu)
            
            #Recurrent network for temporal dependencies
            lstm_cell = tf.contrib.rnn.BasicLSTMCell(256,state_is_tuple=True)
            c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
            h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
            self.state_init = [c_init, h_init]
            c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c])
            h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h])
            self.state_in = (c_in, h_in)
            rnn_in = tf.expand_dims(hidden, [0])
            step_size = tf.shape(self.imageIn)[:1]
            state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)
            lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
                lstm_cell, rnn_in, initial_state=state_in, sequence_length=step_size,
                time_major=False)
            lstm_c, lstm_h = lstm_state
            self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
            rnn_out = tf.reshape(lstm_outputs, [-1, 256])
            
            #Output layers for policy and value estimations
            self.policy = slim.fully_connected(rnn_out,a_size,
                activation_fn=tf.nn.softmax,
                weights_initializer=normalized_columns_initializer(0.01),
                biases_initializer=None)
            self.value = slim.fully_connected(rnn_out,1,
                activation_fn=None,
                weights_initializer=normalized_columns_initializer(1.0),
                biases_initializer=None)
            
            #Only the worker network need ops for loss functions and gradient updating.
            if scope != 'global':
                self.actions = tf.placeholder(shape=[None],dtype=tf.int32)
                self.actions_onehot = tf.one_hot(self.actions,a_size,dtype=tf.float32)
                self.target_v = tf.placeholder(shape=[None],dtype=tf.float32)
                self.advantages = tf.placeholder(shape=[None],dtype=tf.float32)

                self.responsible_outputs = tf.reduce_sum(self.policy * self.actions_onehot, [1])

                #Loss functions
                self.value_loss = 0.5 * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.value,[-1])))
                self.entropy = - tf.reduce_sum(self.policy * tf.log(self.policy))
                self.policy_loss = -tf.reduce_sum(tf.log(self.responsible_outputs)*self.advantages)
                self.loss = 0.5 * self.value_loss + self.policy_loss - self.entropy * 0.01

                #Get gradients from local network using local losses
                local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
                self.gradients = tf.gradients(self.loss,local_vars)
                self.var_norms = tf.global_norm(local_vars)
                grads,self.grad_norms = tf.clip_by_global_norm(self.gradients,40.0)
                
                #Apply local gradients to global network
                global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
                self.apply_grads = trainer.apply_gradients(zip(grads,global_vars))

### Worker Agent

In [11]:
class Worker():
    def __init__(self,game,name,s_size,a_size,trainer,model_path,global_episodes):
        self.name = "worker_" + str(name)
        self.number = name        
        self.model_path = model_path
        self.trainer = trainer
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        self.summary_writer = tf.summary.FileWriter("train_"+str(self.number))

        #Create the local copy of the network and the tensorflow op to copy global paramters to local network
        self.local_AC = AC_Network(s_size,a_size,self.name,trainer)
        self.update_local_ops = update_target_graph('global',self.name)        
        
        #The Below code is related to setting up the Doom environment
        game.set_doom_scenario_path("basic.wad") #This corresponds to the simple task we will pose our agent
        game.set_doom_map("map01")
        game.set_screen_resolution(ScreenResolution.RES_160X120)
        game.set_screen_format(ScreenFormat.GRAY8)
        game.set_render_hud(False)
        game.set_render_crosshair(False)
        game.set_render_weapon(True)
        game.set_render_decals(False)
        game.set_render_particles(False)
        game.add_available_button(Button.MOVE_LEFT)
        game.add_available_button(Button.MOVE_RIGHT)
        game.add_available_button(Button.ATTACK)
        game.add_available_game_variable(GameVariable.AMMO2)
        game.add_available_game_variable(GameVariable.POSITION_X)
        game.add_available_game_variable(GameVariable.POSITION_Y)
        game.set_episode_timeout(300)
        game.set_episode_start_time(10)
        game.set_window_visible(False)
        game.set_sound_enabled(False)
        game.set_living_reward(-1)
        game.set_mode(Mode.PLAYER)
        game.init()
        self.actions = self.actions = np.identity(a_size,dtype=bool).tolist()
        #End Doom set-up
        self.env = game
        
    def train(self,rollout,sess,gamma,bootstrap_value):
        rollout = np.array(rollout)
        observations = rollout[:,0]
        actions = rollout[:,1]
        rewards = rollout[:,2]
        next_observations = rollout[:,3]
        values = rollout[:,5]
        
        # Here we take the rewards and values from the rollout, and use them to 
        # generate the advantage and discounted returns. 
        # The advantage function uses "Generalized Advantage Estimation"
        self.rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        discounted_rewards = discount(self.rewards_plus,gamma)[:-1]
        self.value_plus = np.asarray(values.tolist() + [bootstrap_value])
        advantages = rewards + gamma * self.value_plus[1:] - self.value_plus[:-1]
        advantages = discount(advantages,gamma)

        # Update the global network using gradients from loss
        # Generate network statistics to periodically save
        feed_dict = {self.local_AC.target_v:discounted_rewards,
            self.local_AC.inputs:np.vstack(observations),
            self.local_AC.actions:actions,
            self.local_AC.advantages:advantages,
            self.local_AC.state_in[0]:self.batch_rnn_state[0],
            self.local_AC.state_in[1]:self.batch_rnn_state[1]}
        v_l,p_l,e_l,g_n,v_n, self.batch_rnn_state,_ = sess.run([self.local_AC.value_loss,
            self.local_AC.policy_loss,
            self.local_AC.entropy,
            self.local_AC.grad_norms,
            self.local_AC.var_norms,
            self.local_AC.state_out,
            self.local_AC.apply_grads],
            feed_dict=feed_dict)
        return v_l / len(rollout),p_l / len(rollout),e_l / len(rollout), g_n,v_n
        
    def work(self,max_episode_length,gamma,sess,coord,saver):
        episode_count = sess.run(self.global_episodes)
        total_steps = 0
        print ("Starting worker " + str(self.number))
        with sess.as_default(), sess.graph.as_default():                 
            while not coord.should_stop():
                sess.run(self.update_local_ops)
                episode_buffer = []
                episode_values = []
                episode_frames = []
                episode_reward = 0
                episode_step_count = 0
                d = False
                
                self.env.new_episode()
                s = self.env.get_state().screen_buffer
                episode_frames.append(s)
                s = process_frame(s)
                rnn_state = self.local_AC.state_init
                self.batch_rnn_state = rnn_state
                while self.env.is_episode_finished() == False:
                    #Take an action using probabilities from policy network output.
                    a_dist,v,rnn_state = sess.run([self.local_AC.policy,self.local_AC.value,self.local_AC.state_out], 
                        feed_dict={self.local_AC.inputs:[s],
                        self.local_AC.state_in[0]:rnn_state[0],
                        self.local_AC.state_in[1]:rnn_state[1]})
                   
                    print('a_dist=',a_dist)
                    print('a_dist[0]=',a_dist[0])
                    
                    a = np.random.choice(a_dist[0],p=a_dist[0])
                    print('before a=',a)
                    a = np.argmax(a_dist == a)
                    
                    
                    print('a=',a)
                    
                    
                    
                    r = self.env.make_action(self.actions[a]) / 100.0
                    d = self.env.is_episode_finished()
                    if d == False:
                        s1 = self.env.get_state().screen_buffer
                        episode_frames.append(s1)
                        s1 = process_frame(s1)
                    else:
                        s1 = s
                        
                    episode_buffer.append([s,a,r,s1,d,v[0,0]])
                    episode_values.append(v[0,0])

                    episode_reward += r
                    s = s1                    
                    total_steps += 1
                    episode_step_count += 1
                    
                    # If the episode hasn't ended, but the experience buffer is full, then we
                    # make an update step using that experience rollout.
                    if len(episode_buffer) == 30 and d != True and episode_step_count != max_episode_length - 1:
                        # Since we don't know what the true final return is, we "bootstrap" from our current
                        # value estimation.
                        v1 = sess.run(self.local_AC.value, 
                            feed_dict={self.local_AC.inputs:[s],
                            self.local_AC.state_in[0]:rnn_state[0],
                            self.local_AC.state_in[1]:rnn_state[1]})[0,0]
                        v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,v1)
                        episode_buffer = []
                        sess.run(self.update_local_ops)
                    if d == True:
                        break
                                            
                self.episode_rewards.append(episode_reward)
                self.episode_lengths.append(episode_step_count)
                self.episode_mean_values.append(np.mean(episode_values))
                
                # Update the network using the episode buffer at the end of the episode.
                if len(episode_buffer) != 0:
                    v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,0.0)
                                
                    
                # Periodically save gifs of episodes, model parameters, and summary statistics.
                if episode_count % 5 == 0 and episode_count != 0:
                    if self.name == 'worker_0' and episode_count % 25 == 0:
                        time_per_step = 0.05
                        images = np.array(episode_frames)
                        make_gif(images,'./frames/image'+str(episode_count)+'.gif',
                            duration=len(images)*time_per_step,true_image=True,salience=False)
                    if episode_count % 250 == 0 and self.name == 'worker_0':
                        saver.save(sess,self.model_path+'/model-'+str(episode_count)+'.cptk')
                        print ("Saved Model")

                    mean_reward = np.mean(self.episode_rewards[-5:])
                    mean_length = np.mean(self.episode_lengths[-5:])
                    mean_value = np.mean(self.episode_mean_values[-5:])
                    summary = tf.Summary()
                    summary.value.add(tag='Perf/Reward', simple_value=float(mean_reward))
                    summary.value.add(tag='Perf/Length', simple_value=float(mean_length))
                    summary.value.add(tag='Perf/Value', simple_value=float(mean_value))
                    summary.value.add(tag='Losses/Value Loss', simple_value=float(v_l))
                    summary.value.add(tag='Losses/Policy Loss', simple_value=float(p_l))
                    summary.value.add(tag='Losses/Entropy', simple_value=float(e_l))
                    summary.value.add(tag='Losses/Grad Norm', simple_value=float(g_n))
                    summary.value.add(tag='Losses/Var Norm', simple_value=float(v_n))
                    self.summary_writer.add_summary(summary, episode_count)

                    self.summary_writer.flush()
                if self.name == 'worker_0':
                    sess.run(self.increment)
                episode_count += 1

In [5]:
max_episode_length = 300
gamma = .99 # discount rate for advantage estimation and reward discounting
s_size = 7056 # Observations are greyscale frames of 84 * 84 * 1
a_size = 3 # Agent can move Left, Right, or Fire
load_model = False
model_path = './model'

In [12]:
tf.reset_default_graph()

if not os.path.exists(model_path):
    os.makedirs(model_path)
    
#Create a directory to save episode playback gifs to
if not os.path.exists('./frames'):
    os.makedirs('./frames')

with tf.device("/cpu:0"): 
    global_episodes = tf.Variable(0,dtype=tf.int32,name='global_episodes',trainable=False)
    trainer = tf.train.AdamOptimizer(learning_rate=1e-4)
    master_network = AC_Network(s_size,a_size,'global',None) # Generate global network
    num_workers = multiprocessing.cpu_count() # Set workers ot number of available CPU threads
    num_workers = 2
    workers = []
    # Create worker classes
    for i in range(num_workers):
        workers.append(Worker(DoomGame(),i,s_size,a_size,trainer,model_path,global_episodes))
    saver = tf.train.Saver(max_to_keep=5)

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    if load_model == True:
        print ('Loading Model...')
        ckpt = tf.train.get_checkpoint_state(model_path)
        saver.restore(sess,ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())
        
    # This is where the asynchronous magic happens.
    # Start the "work" process for each worker in a separate threat.
    worker_threads = []
    for worker in workers:
        worker_work = lambda: worker.work(max_episode_length,gamma,sess,coord,saver)
        t = threading.Thread(target=(worker_work))
        t.start()
        sleep(0.5)
        worker_threads.append(t)
    coord.join(worker_threads)

('from_vars', [<tf.Variable 'global/Conv/weights:0' shape=(8, 8, 1, 16) dtype=float32_ref>, <tf.Variable 'global/Conv/biases:0' shape=(16,) dtype=float32_ref>, <tf.Variable 'global/Conv_1/weights:0' shape=(4, 4, 16, 32) dtype=float32_ref>, <tf.Variable 'global/Conv_1/biases:0' shape=(32,) dtype=float32_ref>, <tf.Variable 'global/fully_connected/weights:0' shape=(2592, 256) dtype=float32_ref>, <tf.Variable 'global/fully_connected/biases:0' shape=(256,) dtype=float32_ref>, <tf.Variable 'global/rnn/basic_lstm_cell/kernel:0' shape=(512, 1024) dtype=float32_ref>, <tf.Variable 'global/rnn/basic_lstm_cell/bias:0' shape=(1024,) dtype=float32_ref>, <tf.Variable 'global/fully_connected_1/weights:0' shape=(256, 3) dtype=float32_ref>, <tf.Variable 'global/fully_connected_2/weights:0' shape=(256, 1) dtype=float32_ref>])
('to_vars', [<tf.Variable 'worker_0/Conv/weights:0' shape=(8, 8, 1, 16) dtype=float32_ref>, <tf.Variable 'worker_0/Conv/biases:0' shape=(16,) dtype=float32_ref>, <tf.Variable 'worke

('a_dist=', array([[ 0.33321658,  0.33321103,  0.33357239]], dtype=float32))
('a_dist[0]=', array([ 0.33321658,  0.33321103,  0.33357239], dtype=float32))
('before a=', 0.33321658)
('a=', 0)
('a_dist=', array([[ 0.3332113 ,  0.33321843,  0.33357033]], dtype=float32))
('a_dist[0]=', array([ 0.3332113 ,  0.33321843,  0.33357033], dtype=float32))
('before a=', 0.3332113)
('a=', 0)
('a_dist=', array([[ 0.33322242,  0.33322358,  0.33355403]], dtype=float32))
('a_dist[0]=', array([ 0.33322242,  0.33322358,  0.33355403], dtype=float32))
('before a=', 0.33355403)
('a=', 2)
('a_dist=', array([[ 0.33322588,  0.33323345,  0.33354062]], dtype=float32))
('a_dist[0]=', array([ 0.33322588,  0.33323345,  0.33354062], dtype=float32))
('before a=', 0.33322588)
('a=', 0)
('a_dist=', array([[ 0.33323902,  0.3332431 ,  0.33351788]], dtype=float32))
('a_dist[0]=', array([ 0.33323902,  0.3332431 ,  0.33351788], dtype=float32))
('before a=', 0.3332431)
('a=', 1)
('a_dist=', array([[ 0.3332392 ,  0.33324257,  

('a_dist=', array([[ 0.3332915 ,  0.33317611,  0.33353236]], dtype=float32))
('a_dist[0]=', array([ 0.3332915 ,  0.33317611,  0.33353236], dtype=float32))
('before a=', 0.33317611)
('a=', 1)
('a_dist=', array([[ 0.33329573,  0.33317456,  0.33352968]], dtype=float32))
('a_dist[0]=', array([ 0.33329573,  0.33317456,  0.33352968], dtype=float32))
('before a=', 0.33317456)
('a=', 1)
('a_dist=', array([[ 0.33328459,  0.33317745,  0.33353794]], dtype=float32))
('a_dist[0]=', array([ 0.33328459,  0.33317745,  0.33353794], dtype=float32))
('before a=', 0.33328459)
('a=', 0)
('a_dist=', array([[ 0.33326635,  0.33318654,  0.33354706]], dtype=float32))
('a_dist[0]=', array([ 0.33326635,  0.33318654,  0.33354706], dtype=float32))
('before a=', 0.33318654)
('a=', 1)
('a_dist=', array([[ 0.33325067,  0.33319679,  0.3335526 ]], dtype=float32))
('a_dist[0]=', array([ 0.33325067,  0.33319679,  0.3335526 ], dtype=float32))
('before a=', 0.33319679)
('a=', 1)
('a_dist=', array([[ 0.33323264,  0.33321363,

('before a=', 0.33366171)('a_dist=', array([[ 0.3329685 ,  0.33326504,  0.33376646]], dtype=float32))
('a=', 2)

('a_dist[0]=', array([ 0.3329685 ,  0.33326504,  0.33376646], dtype=float32))('a_dist=', array([[ 0.33301932,  0.33331293,  0.33366773]], dtype=float32))

('a_dist[0]=', array([ 0.33301932,  0.33331293,  0.33366773], dtype=float32))
('before a=', 0.33366773)
('before a=', 0.33376646)('a=', 2)

('a=', 2)
('a_dist=', array([[ 0.33301586,  0.33331376,  0.33367038]], dtype=float32))
('a_dist[0]=', array([ 0.33301586,  0.33331376,  0.33367038], dtype=float32))
('before a=', 0.33301586)
('a=', 0) 
('a_dist=', array([[ 0.33298552,  0.33324829,  0.33376622]], dtype=float32))
('a_dist[0]=', array([ 0.33298552,  0.33324829,  0.33376622], dtype=float32))
('before a=', 0.33324829)
('a=', 1)
('a_dist=', array([[ 0.33300787,  0.33331683,  0.33367527]], dtype=float32))
('a_dist[0]=', array([ 0.33300787,  0.33331683,  0.33367527], dtype=float32))
('before a=', 0.33331683)
('a=', 1)
('a_dist

('a_dist=', array([[ 0.33322924,  0.33306783,  0.33370298]], dtype=float32))
('a_dist[0]=', array([ 0.33322924,  0.33306783,  0.33370298], dtype=float32))
('before a=', 0.33306783)
('a=', 1)
('a_dist=', array([[ 0.33321518,  0.33307171,  0.33371311]], dtype=float32))
('a_dist[0]=', array([ 0.33321518,  0.33307171,  0.33371311], dtype=float32))
('before a=', 0.33371311)
('a=', 2)
('a_dist=', array([[ 0.33321792,  0.33310351,  0.3336786 ]], dtype=float32))
('a_dist[0]=', array([ 0.33321792,  0.33310351,  0.3336786 ], dtype=float32))
('before a=', 0.33321792)
('a=', 0)
('a_dist=', array([[ 0.33320311,  0.33307576,  0.33372119]], dtype=float32))
('a_dist[0]=', array([ 0.33320311,  0.33307576,  0.33372119], dtype=float32)) 
('a_dist=', array([[ 0.33324474,  0.33308733,  0.33366793]], dtype=float32))('before a=', 0.33307576)

('a_dist[0]=', array([ 0.33324474,  0.33308733,  0.33366793], dtype=float32))('a=', 1)

('before a=', 0.33308733)
('a=', 1)
('a_dist=', array([[ 0.33321032,  0.33308387

('a_dist=', array([[ 0.3335731 ,  0.33291346,  0.3335135 ]], dtype=float32))
('a_dist[0]=', array([ 0.3335731 ,  0.33291346,  0.3335135 ], dtype=float32))
('before a=', 0.3335135)
('a=', 2)
('a_dist=', array([[ 0.33355585,  0.33291492,  0.33352917]], dtype=float32))
('a_dist[0]=', array([ 0.33355585,  0.33291492,  0.33352917], dtype=float32))
('before a=', 0.33291492)
('a=', 1)
('a_dist=', array([[ 0.3335394,  0.3329224,  0.3335382]], dtype=float32))
('a_dist[0]=', array([ 0.3335394,  0.3329224,  0.3335382], dtype=float32))
('before a=', 0.3335394)
('a=', 0)
('a_dist=', array([[ 0.33368859,  0.33289036,  0.33342099]], dtype=float32))
('a_dist=', array([[ 0.33352801,  0.33293015,  0.33354178]], dtype=float32))('a_dist[0]=', array([ 0.33368859,  0.33289036,  0.33342099], dtype=float32))

('a_dist[0]=', array([ 0.33352801,  0.33293015,  0.33354178], dtype=float32))('before a=', 0.33342099)

('before a=', 0.33354178)('a=', 2)

('a=', 2)('a_dist=', array([[ 0.33370638,  0.33289096,  0.33340

('a_dist=', array([[ 0.33386511,  0.33258745,  0.33354741]], dtype=float32))('a_dist=', array([[ 0.33390352,  0.33263138,  0.3334651 ]], dtype=float32))

('a_dist[0]=', array([ 0.33390352,  0.33263138,  0.3334651 ], dtype=float32))('a_dist[0]=', array([ 0.33386511,  0.33258745,  0.33354741], dtype=float32))

('before a=', 0.33263138)('before a=', 0.33386511)
('a=', 1)

('a=', 0)
('a_dist=', array([[ 0.33387947,  0.33264756,  0.33347306]], dtype=float32))
('a_dist[0]=', array([ 0.33387947,  0.33264756,  0.33347306], dtype=float32))
('a_dist=', array([[ 0.33387876,  0.33257988,  0.33354139]], dtype=float32))('before a=', 0.33347306)

('a_dist[0]=', array([ 0.33387876,  0.33257988,  0.33354139], dtype=float32))('a=', 2)

('before a=', 0.33257988)
('a=', 1)
('a_dist=', array([[ 0.33385637,  0.33266139,  0.33348224]], dtype=float32))
('a_dist[0]=', array([ 0.33385637,  0.33266139,  0.33348224], dtype=float32))
('before a=', 0.33348224)
('a=', 2)
('a_dist=', array([[ 0.33394775,  0.33253068,

('a_dist=', array([[ 0.33409262,  0.33180451,  0.3341029 ]], dtype=float32))('a_dist=', array([[ 0.33406028,  0.33174387,  0.33419582]], dtype=float32))
('a_dist[0]=', array([ 0.33409262,  0.33180451,  0.3341029 ], dtype=float32))

('a_dist[0]=', array([ 0.33406028,  0.33174387,  0.33419582], dtype=float32))('before a=', 0.33409262)

('before a=', 0.33174387)('a=', 0)

('a=', 1)
('a_dist=', array([[ 0.33405513,  0.33172983,  0.33421499]], dtype=float32))('a_dist=', array([[ 0.33413428,  0.33174211,  0.33412364]], dtype=float32))

('a_dist[0]=', array([ 0.33405513,  0.33172983,  0.33421499], dtype=float32))('a_dist[0]=', array([ 0.33413428,  0.33174211,  0.33412364], dtype=float32))

('before a=', 0.33405513)('before a=', 0.33413428)

('a=', 0)('a=', 0)

('a_dist=', array([[ 0.33416325,  0.33169597,  0.33414069]], dtype=float32))('a_dist=', array([[ 0.33400705,  0.3317683 ,  0.33422464]], dtype=float32))
('a_dist[0]=', array([ 0.33416325,  0.33169597,  0.33414069], dtype=float32))

('a_

('a_dist=', array([[ 0.33419245,  0.33108437,  0.3347232 ]], dtype=float32))
('a_dist[0]=', array([ 0.33419245,  0.33108437,  0.3347232 ], dtype=float32))
('before a=', 0.3347232)
('a=', 2)
('a_dist=', array([[ 0.33422685,  0.33104327,  0.33472985]], dtype=float32))
('a_dist[0]=', array([ 0.33422685,  0.33104327,  0.33472985], dtype=float32))
('before a=', 0.33472985)
('a=', 2)
('a_dist=', array([[ 0.33424866,  0.33101156,  0.33473977]], dtype=float32))
('a_dist[0]=', array([ 0.33424866,  0.33101156,  0.33473977], dtype=float32))
('before a=', 0.33473977)
('a=', 2)
('a_dist=', array([[ 0.33425921,  0.33099151,  0.33474934]], dtype=float32))
('a_dist[0]=', array([ 0.33425921,  0.33099151,  0.33474934], dtype=float32))
('before a=', 0.33425921)
('a=', 0)
('a_dist=', array([[ 0.33427176,  0.33097097,  0.3347573 ]], dtype=float32))
('a_dist[0]=', array([ 0.33427176,  0.33097097,  0.3347573 ], dtype=float32))
('before a=', 0.33427176)
('a=', 0)
('a_dist=', array([[ 0.33426917,  0.33095625, 

('a_dist=', array([[ 0.33424342,  0.33020967,  0.33554691]], dtype=float32))
('a_dist=', array([[ 0.33427247,  0.3306137 ,  0.33511379]], dtype=float32))
('a_dist[0]=', array([ 0.33427247,  0.3306137 ,  0.33511379], dtype=float32))
('a_dist[0]=', array([ 0.33424342,  0.33020967,  0.33554691], dtype=float32))('before a=', 0.33427247)
('a=', 0)

('before a=', 0.33554691)
('a=', 2)
('a_dist=', array([[ 0.33427858,  0.33058116,  0.33514029]], dtype=float32))
('a_dist[0]=', array([ 0.33427858,  0.33058116,  0.33514029], dtype=float32))
('before a=', 0.33514029)
('a=', 2)
('a_dist=', array([[ 0.33421323,  0.33025295,  0.3355338 ]], dtype=float32))
('a_dist[0]=', array([ 0.33421323,  0.33025295,  0.3355338 ], dtype=float32))
('before a=', 0.33025295)
('a=', 1)
('a_dist=', array([[ 0.33427152,  0.33055726,  0.33517119]], dtype=float32))
('a_dist[0]=', array([ 0.33427152,  0.33055726,  0.33517119], dtype=float32))
('before a=', 0.33517119)
('a=', 2)
('a_dist=', array([[ 0.33422077,  0.33026111,

('a_dist[0]=', array([ 0.33427617,  0.32970107,  0.33602273], dtype=float32))
('a_dist=', array([[ 0.33380234,  0.33048177,  0.33571589]], dtype=float32))('before a=', 0.33427617)

('a=', 0)('a_dist[0]=', array([ 0.33380234,  0.33048177,  0.33571589], dtype=float32))

('before a=', 0.33380234)
('a=', 0)
('a_dist=', array([[ 0.33428115,  0.32972899,  0.33598986]], dtype=float32))
('a_dist[0]=', array([ 0.33428115,  0.32972899,  0.33598986], dtype=float32))
('before a=', 0.32972899)
('a=', 1)
('a_dist=', array([[ 0.33382139,  0.33031383,  0.33586478]], dtype=float32))
('a_dist[0]=', array([ 0.33382139,  0.33031383,  0.33586478], dtype=float32))
('before a=', 0.33031383)
('a=', 1)
('a_dist=', array([[ 0.3342849 ,  0.32974967,  0.33596545]], dtype=float32))
('a_dist[0]=', array([ 0.3342849 ,  0.32974967,  0.33596545], dtype=float32))
('before a=', 0.3342849)
('a=', 0)
('a_dist=', array([[ 0.33383164,  0.3301709 ,  0.3359974 ]], dtype=float32))
('a_dist[0]=', array([ 0.33383164,  0.3301709 

('a_dist[0]=', array([ 0.33334181,  0.32991213,  0.33674598], dtype=float32))
('before a=', 0.33334181)
('a=', 0)
('a_dist=', array([[ 0.33307555,  0.32975644,  0.33716807]], dtype=float32))
('a_dist[0]=', array([ 0.33307555,  0.32975644,  0.33716807], dtype=float32))
('before a=', 0.33307555)
('a=', 0)
('a_dist=', array([[ 0.33332554,  0.32994732,  0.33672705]], dtype=float32))
('a_dist[0]=', array([ 0.33332554,  0.32994732,  0.33672705], dtype=float32))
('before a=', 0.33672705)
('a=', 2)
('a_dist=', array([[ 0.3333084 ,  0.32997146,  0.33672008]], dtype=float32))
('a_dist[0]=', array([ 0.3333084 ,  0.32997146,  0.33672008], dtype=float32))
('before a=', 0.33672008)
('a=', 2)('a_dist=', array([[ 0.33305669,  0.32979056,  0.33715266]], dtype=float32))

('a_dist[0]=', array([ 0.33305669,  0.32979056,  0.33715266], dtype=float32))
('before a=', 0.32979056)
('a=', 1)
('a_dist=', array([[ 0.33329168,  0.33000532,  0.336703  ]], dtype=float32))
('a_dist[0]=', array([ 0.33329168,  0.3300053

('a_dist=', array([[ 0.33308601,  0.33072063,  0.33619329]], dtype=float32))
('a_dist[0]=', array([ 0.33308601,  0.33072063,  0.33619329], dtype=float32))
('before a=', 0.33619329)
('a=', 2)
('a_dist=', array([[ 0.33309379,  0.33051518,  0.33639112]], dtype=float32))
('a_dist[0]=', array([ 0.33309379,  0.33051518,  0.33639112], dtype=float32))
('before a=', 0.33309379)
('a=', 0)
('a_dist=', array([[ 0.33258635,  0.32992741,  0.33748621]], dtype=float32))
('a_dist[0]=', array([ 0.33258635,  0.32992741,  0.33748621], dtype=float32))
('before a=', 0.33258635)
('a=', 0)
('a_dist=', array([[ 0.3330873 ,  0.33035782,  0.33655491]], dtype=float32))
('a_dist[0]=', array([ 0.3330873 ,  0.33035782,  0.33655491], dtype=float32))
('before a=', 0.3330873)
('a=', 0)
('a_dist=', array([[ 0.33258525,  0.32998344,  0.33743134]], dtype=float32))
('a_dist[0]=', array([ 0.33258525,  0.32998344,  0.33743134], dtype=float32))
('before a=', 0.32998344)
('a=', 1)
('a_dist=', array([[ 0.33307531,  0.33023295, 

('a_dist=', array([[ 0.33242485,  0.3297728 ,  0.33780232]], dtype=float32))
('a_dist[0]=', array([ 0.33242485,  0.3297728 ,  0.33780232], dtype=float32))
('before a=', 0.3297728)
('a=', 1)
('a_dist=', array([[ 0.33253637,  0.32991517,  0.33754843]], dtype=float32))
('a_dist=', array([[ 0.33242023,  0.32981226,  0.33776757]], dtype=float32))('a_dist[0]=', array([ 0.33253637,  0.32991517,  0.33754843], dtype=float32))
('a_dist[0]=', array([ 0.33242023,  0.32981226,  0.33776757], dtype=float32))

('before a=', 0.33776757)('before a=', 0.33754843)

('a=', 2)('a=', 2)

('a_dist=', array([[ 0.33241436,  0.32986608,  0.33771962]], dtype=float32))
('a_dist[0]=', array([ 0.33241436,  0.32986608,  0.33771962], dtype=float32))
('before a=', 0.32986608)
('a=', 1)
('a_dist=', array([[ 0.33242217,  0.32993025,  0.33764759]], dtype=float32))
('a_dist[0]=', array([ 0.33242217,  0.32993025,  0.33764759], dtype=float32))
('a_dist=', array([[ 0.33251932,  0.3299678 ,  0.33751291]], dtype=float32))
('bef

('a_dist=', array([[ 0.33243328,  0.32976571,  0.33780098]], dtype=float32))
('a_dist[0]=', array([ 0.33243328,  0.32976571,  0.33780098], dtype=float32))
('before a=', 0.33243328)
('a=', 0)
('a_dist=', array([[ 0.33241281,  0.32981202,  0.3377752 ]], dtype=float32))
('a_dist[0]=', array([ 0.33241281,  0.32981202,  0.3377752 ], dtype=float32))
('before a=', 0.32981202)
('a=', 1)
('a_dist=', array([[ 0.33243054,  0.3298322 ,  0.33773729]], dtype=float32))
('a_dist[0]=', array([ 0.33243054,  0.3298322 ,  0.33773729], dtype=float32))
('before a=', 0.3298322)
('a=', 1)
('a_dist=', array([[ 0.33244362,  0.32984689,  0.33770946]], dtype=float32))
('a_dist[0]=', array([ 0.33244362,  0.32984689,  0.33770946], dtype=float32))
('before a=', 0.33244362)
('a=', 0)
('a_dist=', array([[ 0.33245564,  0.32985458,  0.33768976]], dtype=float32))
('a_dist[0]=', array([ 0.33245564,  0.32985458,  0.33768976], dtype=float32))
('before a=', 0.33768976)
('a=', 2)
('a_dist=', array([[ 0.33246487,  0.32962814, 

('a_dist=', array([[ 0.33242321,  0.32972807,  0.33784875]], dtype=float32))
('a_dist[0]=', array([ 0.33242321,  0.32972807,  0.33784875], dtype=float32))
('before a=', 0.33242321)
('a=', 0)
('a_dist=', array([[ 0.33242854,  0.32973439,  0.3378371 ]], dtype=float32))
('a_dist[0]=', array([ 0.33242854,  0.32973439,  0.3378371 ], dtype=float32))
('before a=', 0.33242854)
('a=', 0)
('a_dist=', array([[ 0.33243239,  0.32963151,  0.33793613]], dtype=float32))
('a_dist[0]=', array([ 0.33243239,  0.32963151,  0.33793613], dtype=float32))
('before a=', 0.33243239)
('a=', 0)
('a_dist=', array([[ 0.33246279,  0.32951766,  0.33801958]], dtype=float32))
('a_dist[0]=', array([ 0.33246279,  0.32951766,  0.33801958], dtype=float32))
('before a=', 0.32951766)
('a=', 1)
('a_dist=', array([[ 0.33247906,  0.32942447,  0.33809653]], dtype=float32))
('a_dist[0]=', array([ 0.33247906,  0.32942447,  0.33809653], dtype=float32))
('before a=', 0.33809653)
('a=', 2)
('a_dist=', array([[ 0.33248883,  0.32934734,

('a_dist[0]=', array([ 0.33239961,  0.32934162,  0.33825883], dtype=float32))
('before a=', 0.33825883)
('a=', 2)
('a_dist=', array([[ 0.3324146 ,  0.32926914,  0.33831626]], dtype=float32))
('a_dist[0]=', array([ 0.3324146 ,  0.32926914,  0.33831626], dtype=float32))
('before a=', 0.32926914)
('a=', 1)
('a_dist=', array([[ 0.33244413,  0.32911682,  0.33843905]], dtype=float32))
('a_dist[0]=', array([ 0.33244413,  0.32911682,  0.33843905], dtype=float32))
('before a=', 0.32911682)
('a=', 1)
('a_dist=', array([[ 0.33244768,  0.32903621,  0.33851615]], dtype=float32))
('a_dist[0]=', array([ 0.33244768,  0.32903621,  0.33851615], dtype=float32))
('a_dist=', array([[ 0.33241919,  0.32922295,  0.33835793]], dtype=float32))('before a=', 0.33244768)
('a_dist[0]=', array([ 0.33241919,  0.32922295,  0.33835793], dtype=float32))
('before a=', 0.32922295)
('a=', 1)

('a=', 0)
('a_dist=', array([[ 0.33242449,  0.32918757,  0.33838794]], dtype=float32))
('a_dist[0]=', array([ 0.33242449,  0.3291875

('a_dist[0]=', array([ 0.33241144,  0.32891276,  0.33867577], dtype=float32))
('before a=', 0.33241144)
('a=', 0)
('a_dist=', array([[ 0.33242026,  0.3289206 ,  0.33865914]], dtype=float32))
('a_dist[0]=', array([ 0.33242026,  0.3289206 ,  0.33865914], dtype=float32))
('before a=', 0.33242026)
('a=', 0)
('a_dist=', array([[ 0.33242911,  0.32892233,  0.33864853]], dtype=float32))
('a_dist[0]=', array([ 0.33242911,  0.32892233,  0.33864853], dtype=float32))
('before a=', 0.32892233)
('a=', 1)
('a_dist=', array([[ 0.33243424,  0.32892406,  0.33864167]], dtype=float32))
('a_dist[0]=', array([ 0.33243424,  0.32892406,  0.33864167], dtype=float32))
('before a=', 0.33864167)
('a=', 2)
('a_dist=', array([[ 0.33241484,  0.32895726,  0.33862793]], dtype=float32))
('a_dist[0]=', array([ 0.33241484,  0.32895726,  0.33862793], dtype=float32))
('before a=', 0.33241484)
('a=', 0)
('a_dist=', array([[ 0.33237305,  0.32890624,  0.33872068]], dtype=float32))
('a_dist=', array([[ 0.33240858,  0.32898018,

('a_dist=', array([[ 0.3323527 ,  0.32849407,  0.33915323]], dtype=float32))
('a_dist[0]=', array([ 0.3323527 ,  0.32849407,  0.33915323], dtype=float32))
('before a=', 0.32849407)
('a=', 1)
('a_dist=', array([[ 0.33236441,  0.32843652,  0.33919904]], dtype=float32))
('a_dist[0]=', array([ 0.33236441,  0.32843652,  0.33919904], dtype=float32))
('before a=', 0.32843652)
('a=', 1)
('a_dist=', array([[ 0.33237296,  0.32838446,  0.33924255]], dtype=float32))
('a_dist[0]=', array([ 0.33237296,  0.32838446,  0.33924255], dtype=float32))
('before a=', 0.33924255)
('a=', 2)
('a_dist=', array([[ 0.33236405,  0.32836241,  0.33927357]], dtype=float32))
('a_dist[0]=', array([ 0.33236405,  0.32836241,  0.33927357], dtype=float32))
('before a=', 0.33236405)
('a=', 0)
('a_dist=', array([[ 0.33233348,  0.32844216,  0.3392244 ]], dtype=float32))
('a_dist[0]=', array([ 0.33233348,  0.32844216,  0.3392244 ], dtype=float32))
('before a=', 0.33233348)
('a=', 0)
('a_dist=', array([[ 0.33231413,  0.3285141 ,

('a_dist=', array([[ 0.33257791,  0.32854176,  0.3388803 ]], dtype=float32))
('a_dist[0]=', array([ 0.33257791,  0.32854176,  0.3388803 ], dtype=float32))
('before a=', 0.32854176)
('a=', 1)
('a_dist=', array([[ 0.33218774,  0.32830781,  0.33950442]], dtype=float32))
('a_dist[0]=', array([ 0.33218774,  0.32830781,  0.33950442], dtype=float32))
('before a=', 0.33218774)
('a=', 0)
('a_dist=', array([[ 0.3325232 ,  0.32846209,  0.33901474]], dtype=float32))
('a_dist[0]=', array([ 0.3325232 ,  0.32846209,  0.33901474], dtype=float32))
('before a=', 0.3325232)
('a=', 0)
('a_dist=', array([[ 0.33219579,  0.32832614,  0.33947805]], dtype=float32))
('a_dist[0]=', array([ 0.33219579,  0.32832614,  0.33947805], dtype=float32))
('before a=', 0.33219579)
('a=', 0)
('a_dist=', array([[ 0.33247092,  0.32839033,  0.33913875]], dtype=float32))
('a_dist[0]=', array([ 0.33247092,  0.32839033,  0.33913875], dtype=float32))
('before a=', 0.33247092)
('a=', 0)
('a_dist=', array([[ 0.33219171,  0.32837802, 

('a_dist=', array([[ 0.3323395 ,  0.32822308,  0.33943745]], dtype=float32))
('a_dist[0]=', array([ 0.3323395 ,  0.32822308,  0.33943745], dtype=float32))
('before a=', 0.32822308)
('a=', 1)
('a_dist=', array([[ 0.33232349,  0.32834235,  0.33933416]], dtype=float32))
('a_dist[0]=', array([ 0.33232349,  0.32834235,  0.33933416], dtype=float32))
('before a=', 0.33933416)
('a=', 2)
('a_dist=', array([[ 0.33233604,  0.32818022,  0.33948368]], dtype=float32))
('a_dist[0]=', array([ 0.33233604,  0.32818022,  0.33948368], dtype=float32))
('before a=', 0.33233604)
('a=', 0)
('a_dist=', array([[ 0.33234099,  0.32833585,  0.33932316]], dtype=float32))
('a_dist[0]=', array([ 0.33234099,  0.32833585,  0.33932316], dtype=float32))
('before a=', 0.32833585)
('a=', 1)
('a_dist=', array([[ 0.33235547,  0.3283315 ,  0.33931309]], dtype=float32))
('a_dist[0]=', array([ 0.33235547,  0.3283315 ,  0.33931309], dtype=float32))
('before a=', 0.3283315)
('a=', 1)
('a_dist=', array([[ 0.33232042,  0.32814711, 

('a=', 0)
('a_dist=', array([[ 0.3335574 ,  0.33247647,  0.33396614]], dtype=float32))('a_dist=', array([[ 0.33311969,  0.3284924 ,  0.33838785]], dtype=float32))

('a_dist[0]=', array([ 0.3335574 ,  0.33247647,  0.33396614], dtype=float32))('a_dist[0]=', array([ 0.33311969,  0.3284924 ,  0.33838785], dtype=float32))

('before a=', 0.3335574)('before a=', 0.33311969)

('a=', 0)('a=', 0)

('a_dist=', array([[ 0.33312061,  0.32853687,  0.33834249]], dtype=float32))
('a_dist=', array([[ 0.33369946,  0.33177438,  0.33452618]], dtype=float32))('a_dist[0]=', array([ 0.33312061,  0.32853687,  0.33834249], dtype=float32))

('a_dist[0]=', array([ 0.33369946,  0.33177438,  0.33452618], dtype=float32))('before a=', 0.33834249)

('before a=', 0.33369946)('a=', 2)

('a=', 0)
('a_dist=', array([[ 0.33312348,  0.32855663,  0.33831996]], dtype=float32))
('a_dist=', array([[ 0.33375922,  0.33120897,  0.33503184]], dtype=float32))('a_dist[0]=', array([ 0.33312348,  0.32855663,  0.33831996], dtype=float3

Exception in thread Thread-30:
Traceback (most recent call last):
  File "/usr/local/Cellar/python/2.7.13_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/threading.py", line 801, in __bootstrap_inner
    self.run()
  File "/usr/local/Cellar/python/2.7.13_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/threading.py", line 754, in run
    self.__target(*self.__args, **self.__kwargs)
  File "<ipython-input-12-a9a3173b411d>", line 35, in <lambda>
    worker_work = lambda: worker.work(max_episode_length,gamma,sess,coord,saver)
  File "<ipython-input-11-5d2bfd54e466>", line 119, in work
    r = self.env.make_action(self.actions[a]) / 100.0
SignalException: Signal SIGINT received. ViZDoom instance has been closed.

Exception in thread Thread-31:
Traceback (most recent call last):
  File "/usr/local/Cellar/python/2.7.13_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/threading.py", line 801, in __bootstrap_inner
    self.run()
  File "/usr/local/Cellar/python/2.7.13_