# Actor Critic Model

## Initial setup

In [2]:
import sys
sys.path.insert(0, "../python")

In [3]:
from vizdoom import *
from helper import create_agent
import tensorflow as tf

In [4]:
# If running other experiments on GPUs
# Expose only the GPU with index 1 to this kernel via CUDA_VISIBLE_DEVICES.
# NOTE(review): the index "1" is machine-specific — adjust or remove on other hosts.
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [5]:
%load_ext autoreload
%autoreload 2

## Create agent

In [6]:
# Initializes DoomGame from config file
def initialize_vizdoom(config_file):
    """Create a ``DoomGame``, apply ``config_file`` to it and start it.

    Args:
        config_file: path handed unchanged to ``DoomGame.load_config``.

    Returns:
        The ``DoomGame`` instance after ``init()`` has been called on it.
    """
    doom = DoomGame()
    # Configuration has to be loaded before the game is initialised.
    doom.load_config(config_file)
    doom.init()
    return doom

In [7]:
# Initialize agent and TensorFlow graph
def make_new_agent(agent_file_path="./actor_critic/ac.json",
                   config_file_path="./actor_critic/ac.cfg",
                   results_dir="./actor_critic/results_dir",
                   action_set="basic_three"):
    """Reset the default TensorFlow graph and build a fresh agent with its own game.

    Called without arguments this behaves exactly as before; the keyword
    arguments only exist so that other agent/config files can be tried
    without editing this cell.

    Args:
        agent_file_path: agent description file passed to ``create_agent``.
        config_file_path: config file passed to ``initialize_vizdoom``.
        results_dir: passed to ``create_agent`` as ``output_directory``.
        action_set: passed to ``create_agent`` as ``action_set``.

    Returns:
        Whatever ``create_agent`` returns for these settings.
    """
    # Drop the graph of any previously created agent before building a new one.
    tf.reset_default_graph()
    game = initialize_vizdoom(config_file_path)
    return create_agent(agent_file_path,
                        game=game,
                        action_set=action_set,
                        output_directory=results_dir)

## N-step learning
First, we will walk through the n-step learning process.

In [167]:
# Prints status of memory buffers
def print_agent_status():
    """Print the n-step buffers and stored returns of the global ``agent``.

    ``s1_buffer`` and ``s2_buffer`` are shown only through the slice
    ``[:, :5, 0, -1]``; ``a_buffer``, ``r_buffer``, ``gamma_buffer`` and
    ``agent.memory.r`` are printed as they are.
    """
    # Values are fetched lazily so each attribute is read right before it
    # is printed, in the order listed here.
    sections = (
        ("s1_buffer:    \n", lambda: agent.s1_buffer[:, :5, 0, -1]),
        ("a_buffer:     \n", lambda: agent.a_buffer),
        ("s2_buffer:    \n", lambda: agent.s2_buffer[:, :5, 0, -1]),
        ("r_buffer:     \n", lambda: agent.r_buffer),
        ("gamma_buffer: \n", lambda: agent.gamma_buffer),
        ("memory r:     \n", lambda: agent.memory.r),
    )
    for heading, fetch in sections:
        print(heading, fetch())

In [137]:
# View memory storage
# Build a fresh agent, start an episode and take the first five learning
# steps, dumping the n-step buffers after each one.
agent = make_new_agent()
agent.initialize_new_episode()
for i in range(5):
    print("Step %d: " % (i+1))
    # NOTE(review): the meaning of the (1, 1) arguments is not visible in this notebook.
    agent.perform_learning_step(1, 1)
    print_agent_status()
    print()
# Value estimate for the entry in the last slot of s2_buffer.
print("V(s): ", agent.network.get_value_output(agent.s2_buffer[-1]))

Step 1: 
s1_buffer:    
 [[ 0.47058824  0.46568626  0.47058824  0.47058824  0.41549021]
 [ 0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.        ]]
a_buffer:     
 [2 0 0 0 0]
s2_buffer:    
 [[ 0.47058824  0.46568626  0.47058824  0.47058824  0.41549021]
 [ 0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.        ]]
r_buffer:     
 [-0.04  0.    0.    0.    0.  ]
gamma_buffer: 
 [ 1.          0.99        0.9801      0.970299    0.96059601]
memory r:     
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]

Step 2: 
s1_buffer:    
 [[ 0.47058824  0.46568626  0.47058824  0.47058824  0.41549021]
 [ 0.47058824  0.46568626 

Initially, during the first $n$ steps, the agent stores transitions consisting of $s1, a, s2, r$. After the $n$th step, the return $R$ can be calculated for time $t-n$. The first $Q(s,a) \approx \mathbb{E}[R_t]$ is given by:

$\sum_{i=0}^{n-1}\gamma^i r_i+\gamma^n V(s_n)$

which in this case is vectorized:

$
\begin{bmatrix} 
1.0
\\ 0.99
\\ 0.9801
\\ 0.970299
\\ 0.96059601
\end{bmatrix}
\cdot
\begin{bmatrix} 
-0.04
\\ -0.04
\\ -0.04
\\ -0.04
\\ -0.04
\end{bmatrix}
+ 0.99^5V(s_5) = -0.196 + (0.951)(0.228338) = 0.0211$

which matches the initial return placed in memory. The $s1, s2, a, r$ buffers roll to the beginning, placing the next transition at slot 0, while the $\gamma$ buffer rolls forward by one slot to match the new configuration. Now for the next five steps, we should see a transition added after each learning step, updating the $r$ array in memory as calculated above.

In [138]:
# Now view adding transition to memory
# Continues with the `agent` left in the kernel by the previous cell (steps 6-10).
for i in range(5):
    print("Step %d: " % (i+6))
    agent.perform_learning_step(1, 1)
    print_agent_status()
    # NOTE(review): indexing s2_buffer with i assumes the newest transition
    # was written to slot i (slot 0 on step 6, then 1, 2, ...) — confirm.
    print("V(s): ", agent.network.get_value_output(agent.s2_buffer[i]), "\n")

Step 6: 
s1_buffer:    
 [[ 0.4509804   0.37450981  0.41274509  0.62352943  0.54509807]
 [ 0.47058824  0.46568626  0.47058824  0.47058824  0.41549021]
 [ 0.47058824  0.46568626  0.47058824  0.47058824  0.41549021]
 [ 0.49990195  0.4509804   0.47058824  0.44549018  0.40000001]
 [ 0.49225491  0.43235293  0.41127452  0.49117646  0.59019607]]
a_buffer:     
 [1 2 2 1 1]
s2_buffer:    
 [[ 0.49225491  0.43235293  0.41127452  0.49117646  0.59019607]
 [ 0.47058824  0.46568626  0.47058824  0.47058824  0.41549021]
 [ 0.49990195  0.4509804   0.47058824  0.44549018  0.40000001]
 [ 0.49225491  0.43235293  0.41127452  0.49117646  0.59019607]
 [ 0.4509804   0.37450981  0.41274509  0.62352943  0.54509807]]
r_buffer:     
 [-0.04 -0.04 -0.04 -0.04 -0.04]
gamma_buffer: 
 [ 0.970299    0.96059601  1.          0.99        0.9801    ]
memory r:     
 [ 0.02110773  0.03489357  0.          0.          0.          0.          0.
  0.          0.          0.        ]
V(s):  [[ 0.24283469]] 

Step 7: 
s1_buffe

For the next four transitions, the returns are calculated as above. For example, in step 6, the return is:

$R_2 = -0.196 + (0.951)(0.242835) = 0.0349$

Note that the $\gamma$ buffer is displayed after it has been updated; thus, the buffer shown for the previous step represents the buffer used in the current calculation. Additionally, because the episode times out after 10 actions (40 tics), during step 10, the agent encounters a terminal state in s2; thus after calculating the last $n$ transitions, the buffers are reset, which is displayed in step 10. The returns for the last $n$ states are given as:

$R_{T-i}=\sum_{j=0}^{i} \gamma^j r_{T-i+j}$

or recursively:

$R_t \leftarrow r_t + \gamma R_{t+1}$

which in this case, because all rewards were $-0.04$, is simply:

$R_{T-i}=\sum_{j=0}^{i} \gamma^j (-0.04) = (-0.04)\sum_{j=0}^{i} \gamma^j$

Due to the recursive formula, the transitions are added in reverse order, as seen in the last 5 slots of the memory matrix $r$ above.

## Storing transitions into memory
Now that we know the agent is properly calculating the n-step return of states, we need to make sure that the other variables ($s1, a, s2, isterminal$) are being properly associated with these returns in memory.

In [157]:
def print_memory_status():
    """Print the contents of the global ``agent``'s replay memory.

    ``s1`` is shown through the slice ``[:, :5, 0, -1]`` and ``s2`` through
    ``[:, :5, 0, 0]``; ``a``, ``r`` and ``isterminal`` are printed in full.
    """
    fields = (
        # s1 is indexed at -1 on the last axis due to storage of overlapping states.
        ("s1: ", lambda mem: mem.s1[:, :5, 0, -1]),
        ("a:  ", lambda mem: mem.a),
        ("s2: ", lambda mem: mem.s2[:, :5, 0, 0]),
        ("R:  ", lambda mem: mem.r),
        ("isterminal: ", lambda mem: mem.isterminal),
    )
    for label, pick in fields:
        print(label, pick(agent.memory))

In [168]:
# Load memory with transitions
# Fresh agent and episode; the shapes of the replay-memory state arrays are
# printed first, then five learning steps are taken.
agent = make_new_agent()
agent.initialize_new_episode()
print(agent.memory.s1.shape)
print(agent.memory.s2.shape)
for i in range(5):
    agent.perform_learning_step(1, 1)

# Compare current status with replay memory
print_agent_status()
print_memory_status()

(10, 30, 45, 4)
(10, 30, 45, 1)
s1_buffer:    
 [[ 0.46557733  0.48518518  0.50230938  0.47058824  0.42763618]
 [ 0.4559913   0.47058824  0.47058824  0.45254901  0.43845317]
 [ 0.40000001  0.40000001  0.40000001  0.40000001  0.40000001]
 [ 0.48007625  0.51218957  0.52156866  0.47058824  0.34901962]
 [ 0.39523965  0.41459695  0.40293029  0.36862746  0.39198259]]
a_buffer:     
 [0 1 2 1 2]
s2_buffer:    
 [[ 0.4559913   0.47058824  0.47058824  0.45254901  0.43845317]
 [ 0.40000001  0.40000001  0.40000001  0.40000001  0.40000001]
 [ 0.48007625  0.51218957  0.52156866  0.47058824  0.34901962]
 [ 0.39523965  0.41459695  0.40293029  0.36862746  0.39198259]
 [ 0.52156866  0.55385619  0.47503269  0.4509804   0.40000001]]
r_buffer:     
 [-0.04 -0.04 -0.04 -0.04 -0.04]
gamma_buffer: 
 [ 0.96059601  1.          0.99        0.9801      0.970299  ]
memory r:     
 [-0.17756607  0.          0.          0.          0.          0.          0.
  0.          0.          0.        ]
s1:  [[ 0.46557733 

Let's observe the next transitions $(t,...,T-2)$.

In [169]:
# Four more learning steps with the `agent` left in the kernel by the previous cell.
for i in range(4):
    agent.perform_learning_step(1, 1)

# Compare current status with replay memory
print_agent_status()
print_memory_status()

s1_buffer:    
 [[ 0.52156866  0.55385619  0.47503269  0.4509804   0.40000001]
 [ 0.45473856  0.51831156  0.4509804   0.4509804   0.34901962]
 [ 0.47058824  0.48962963  0.48007625  0.4516122   0.3685621 ]
 [ 0.5260784   0.52156866  0.40000001  0.52156866  0.54509807]
 [ 0.39523965  0.41459695  0.40293029  0.36862746  0.39198259]]
a_buffer:     
 [0 0 0 2 2]
s2_buffer:    
 [[ 0.45473856  0.51831156  0.4509804   0.4509804   0.34901962]
 [ 0.47058824  0.48962963  0.48007625  0.4516122   0.3685621 ]
 [ 0.5260784   0.52156866  0.40000001  0.52156866  0.54509807]
 [ 0.44149238  0.40000001  0.69411767  0.54509807  0.47058824]
 [ 0.52156866  0.55385619  0.47503269  0.4509804   0.40000001]]
r_buffer:     
 [-0.04 -0.04 -0.04 -0.04 -0.04]
gamma_buffer: 
 [ 1.          0.99        0.9801      0.970299    0.96059601]
memory r:     
 [-0.17756607 -0.16151434 -0.11614615 -0.15517615 -0.1869307   0.          0.
  0.          0.          0.        ]
s1:  [[ 0.46557733  0.48518518  0.50230938  0.47058

And finally to the terminal step:

In [170]:
# Perform terminal learning step
# One more step on the same `agent`; per the surrounding text this is the
# step on which the episode ends.
agent.perform_learning_step(1, 1)
print_agent_status()
print_memory_status()

s1_buffer:    
 [[ 0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]]
a_buffer:     
 [0 0 0 0 0]
s2_buffer:    
 [[ 0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]]
r_buffer:     
 [ 0.  0.  0.  0.  0.]
gamma_buffer: 
 [ 1.          0.99        0.9801      0.970299    0.96059601]
memory r:     
 [-0.17756607 -0.16151434 -0.11614615 -0.15517615 -0.1869307  -0.04       -0.0796
 -0.118804   -0.15761596 -0.1960398 ]
s1:  [[ 0.46557733  0.48518518  0.50230938  0.47058824  0.42763618]
 [ 0.4559913   0.47058824  0.47058824  0.45254901  0.43845317]
 [ 0.40000001  0.40000001  0.40000001  0.40000001  0.40000001]
 [ 0.48007625  0.51218957  0.52156866  0.47058824  0.34901962]
 [ 0.39523965  0.41459695  0.40293029  0.36862746  0.39198259]
 [ 0.44149238  0.40000001  0.69411767  0.54509807  0.47058824]
 [ 0.5260784   0.52156866  0.40000001  0.52156866  0.54509807]
 [ 0.470588

## Policy and value functions
Next, we will look into the policy and value functions, analyzing both their output and loss functions, as well as the gradients derived from them.

In [187]:
def print_learning_step():
    """Sample one transition, print it with the network outputs, then learn from it.

    Works on the global ``agent``: draws one sample from ``agent.memory``,
    prints the sampled values together with the value and policy outputs for
    the sampled state, calls ``agent.network.learn`` on the sample and prints
    the two losses it returns.
    """
    state, action, _next_state, _terminal, ret, is_weight = agent.memory.get_sample(1)
    net = agent.network

    sampled = (
        ("s1: ", state[:, :3, 1, 1]),
        ("a:  ", action),
        ("R:  ", ret),
        ("w:  ", is_weight),
    )
    for tag, shown in sampled:
        print(tag, shown)

    # Network outputs are queried before learn() is called.
    print("V:  ", net.get_value_output(state))
    print("pi: ", net.get_policy_output(state))

    losses = net.learn(state, action, ret, weights=is_weight)
    loss_pi, loss_v = losses
    print("loss_pi: ", loss_pi, " loss_v: ", loss_v)

In [188]:
# Load memory with transitions
# Fresh agent and episode, ten learning steps, then one printed learning step
# on a sample drawn from memory.
agent = make_new_agent()
agent.initialize_new_episode()
for i in range(10):
    agent.perform_learning_step(1, 1)
print_learning_step()

s1:  [[ 0.47058824  0.46568626  0.47058824]]
a:   [1]
R:   [-0.31398159]
w:   [ 1.]
V:   [[-0.11301116]]
pi:  [[ 0.23030756  0.27955541  0.49013704]]
loss_pi:  [[-0.26658764]]  loss_v:  0.0403891


Let's go through the loss calculations. The loss of the policy is given by:

$L_{\pi} = -log(\pi(a_t|s_t))(R_t-V(s_t)) - \beta H(\pi(s_t))$

where the entropy is $\sum_{i}-\pi(a_i|s_t)log(\pi(a_i|s_t))$. For this transition, this becomes:

$L_{\pi} = -log(0.2796)(-0.3140-(-0.1130))+(0.01)((0.2303)log(0.2303) + (0.2796)log(0.2796) + (0.4901)log(0.4901)) = -0.2666$

The loss of the value function is simpler, as it just calculates the (mean) squared error:

$L_{V} = \sum_{t} (R_t - V(s_t))^2$

which in this case becomes:

$L_{V} = (-0.3140 - (-0.1130))^2 = 0.0404$

Let's redefine the previous function to now include gradients.

In [43]:
def print_learning_step():
    """Sample one transition, print its values, gradients and losses, then learn from it.

    Works on the global ``agent``. One sample is drawn from ``agent.memory``
    and passed through the network's ``_check_state`` / ``_check_actions``
    helpers. The gradients of ``loss_pi`` and ``loss_v`` are evaluated
    *before* ``agent.network.learn`` is called, so they are computed on the
    same sample and before the learning call.

    Each call invokes ``optimizer.compute_gradients`` again for both losses.
    """
    # Local import: in notebook order numpy is only imported in a later cell,
    # so relying on a global ``np`` would fail on a fresh top-to-bottom run.
    import numpy as np

    # Get batch from memory
    s1, a, s2, isterminal, R, w = agent.memory.get_sample(1)
    s1 = agent.network._check_state(s1)
    a = agent.network._check_actions(a)
    if w is None:
        # No weights supplied by the memory: weight every sample equally.
        w = np.ones(a.shape[0])

    # Print values of batch
    print("s1: ", s1[:, :3, 1, 1])
    print("a:  ", a)
    print("R:  ", R)
    print("w:  ", w)
    print("V:  ", agent.network.get_value_output(s1))
    print("pi: ", agent.network.get_policy_output(s1))

    # Get gradients prior to learning step (gradient descent)
    opt = agent.network.optimizer
    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 scope=agent.network.scope)
    fd = {agent.network.state: s1, agent.network.actions: a,
          agent.network.q_sa: R, agent.network.IS_weights: w}
    sess = agent.network.sess

    def _evaluate_gradients(loss):
        """Return ([(grad, var), ...], [grad values]) for ``loss``, skipping None grads."""
        # compute_gradients yields None for variables the loss does not
        # depend on; those cannot be fetched with sess.run.
        pairs = [(g, v)
                 for g, v in opt.compute_gradients(loss, var_list=var_list)
                 if g is not None]
        return pairs, sess.run([g for g, _ in pairs], feed_dict=fd)

    gvs_pi, grads_pi = _evaluate_gradients(agent.network.loss_pi)
    gvs_v, grads_v = _evaluate_gradients(agent.network.loss_v)

    # Print fully-connected hidden layer output
    fc = tf.get_default_graph().get_tensor_by_name("global_network/FC_1/Relu:0")
    print("FC_output: ", sess.run(fc[0, :5], feed_dict=fd))

    # Print losses and gradients
    # Entries are addressed from the end of each (grad, var) list; the printed
    # variable name shows which variable each line actually refers to.
    loss_pi, loss_v = agent.network.learn(s1, a, R, weights=w)
    print("loss_pi: ", loss_pi, " loss_v: ", loss_v)
    print("d(loss_pi)/d(b_pi): %s" % gvs_pi[-1][1].name[:-2], grads_pi[-1])
    print("d(loss_pi)/d(w_pi): %s" % gvs_pi[-2][1].name[:-2], grads_pi[-2][:5, :])
    print("d(loss_pi)/d(b_V): %s" % gvs_pi[-3][1].name[:-2], grads_pi[-3])
    print("d(loss_V)/d(b_V): %s" % gvs_v[-1][1].name[:-2], grads_v[-1])

In [44]:
# Load memory with transitions
# Same setup as the earlier learning-step cell; print_learning_step here is
# the redefined version that also prints gradients.
agent = make_new_agent()
agent.initialize_new_episode()
for i in range(10):
    agent.perform_learning_step(1, 1)
print_learning_step()

s1:  [[ 0.41960785  0.45869282  0.41960785]]
a:   [[0 1]]
R:   [-0.1960398]
w:   [ 1.]
V:   [[ 0.63908535]]
pi:  [[ 0.36095497  0.30637842  0.33266661]]
FC_output:  [ 0.          0.14301239  0.11612199  0.04790126  0.        ]
loss_pi:  [[-0.99886197]]  loss_v:  0.697434
d(loss_pi)/d(b_pi): global_network/pi/biases [-0.30116326  0.57899565 -0.27783233]
d(loss_pi)/d(w_pi): global_network/pi/weights [[ 0.          0.          0.        ]
 [-0.04307008  0.08280355 -0.03973347]
 [-0.03497168  0.06723412 -0.03226244]
 [-0.0144261   0.02773462 -0.01330852]
 [ 0.          0.          0.        ]]
d(loss_pi)/d(b_V): global_network/V/biases [-1.18293428]
d(loss_V)/d(b_V): global_network/V/biases [ 1.6702503]


The derivatives of the loss functions are as follows:

$\dfrac{\partial L_\pi}{\partial \pi(a|s_t)} = 
\left\{\begin{matrix}
   -\dfrac{R_t - V(s_t)}{\pi(a|s_t)} + \beta (1 + log\pi(a|s_t)) \text{ if } a = a_t \\ 
   \beta (1 + log\pi(a|s_t)) \text{ if } a \neq a_t
\end{matrix}\right.$

$\dfrac{\partial L_\pi}{\partial V(s_t)} = log\pi(a_t|s_t)$

$\dfrac{\partial L_V}{\partial \pi(a|s_t)} = 0$

$\dfrac{\partial L_V}{\partial V(s_t)} = -2(R_t - V(s_t))$

Additionally, due to the softmax activation in the $\pi$ output layer, the derivatives for its biases and weights must backpropagate through the softmax function. Further, because of the summation term ($\sum_i e^{f_i}$) in softmax, the derivatives $\frac{\partial \pi(a_i|s_t)}{\partial w_{\pi(a_j|s_t)}}$, where $i \neq j$, are nonzero. Therefore:

$\dfrac{\partial L_\pi}{\partial w_{\pi(a_j|s_t)}}
= \dfrac{\partial L_\pi}{\partial \pi(s_t)} \dfrac{\partial \pi(s_t)}{\partial w_{\pi(a_j|s_t)}}
= \sum_i \left ( \dfrac{\partial L_\pi}{\partial \pi(a_i|s_t)} \dfrac{\partial \pi(a_i|s_t)}{\partial w_{\pi(a_j|s_t)}} \right )$,

where, for $f_k = b_{\pi(a_k|s_t)} + w_{\pi(a_k|s_t)}^{(1)}x^{(1)} + w_{\pi(a_k|s_t)}^{(2)}x^{(2)} + \cdots + w_{\pi(a_k|s_t)}^{(n)}x^{(n)}$:

$ \dfrac{\partial \pi(a_i|s_t)}{\partial b_{\pi(a_j|s_t)}}
= \dfrac{\partial}{\partial b_{\pi(a_j|s_t)}} \left ( \dfrac{e^{f_i}}{\sum_k e^{f_k}} \right )
= \dfrac{e^{f_i} \dfrac{\partial f_i}{\partial b_{\pi(a_j|s_t)}} \left ( \sum_k e^{f_k} \right ) - e^{f_i} \dfrac{\partial}{\partial b_{\pi(a_j|s_t)}} \left ( \sum_k e^{f_k} \right )}{\left ( \sum_k e^{f_k} \right )^2} = 
\left\{\begin{matrix}
    \dfrac{e^{f_i} \sum_k e^{f_k} - e^{2f_i}}{\left ( \sum_k e^{f_k} \right )^2} 
    = \dfrac{e^{f_i}}{\sum_k e^{f_k}} - \left ( \dfrac{e^{f_i}}{\sum_k e^{f_k}} \right )^2
    = \pi(a_i|s_t) - (\pi(a_i|s_t))^2
    = \pi(a_i|s_t)(1 - \pi(a_i|s_t)) \text{ if } i = j \\
    \dfrac{(0) \sum_k e^{f_k} - e^{f_i}e^{f_j}}{\left ( \sum_k e^{f_k} \right )^2}
    = -\left ( \dfrac{e^{f_i}}{\sum_k e^{f_k}} \right ) \left ( \dfrac{e^{f_j}}{\sum_k e^{f_k}} \right )
    = -\pi(a_i|s_t)\pi(a_j|s_t) \text{ if } i \neq j
\end{matrix}\right.$

Similarly, the expressions for $\frac{\partial \pi(a_i|s_t)}{\partial w_{\pi(a_j|s_t)}^{(m)}}$ are the same as above except multiplied by $\frac{\partial f_j}{\partial w_{\pi(a_j|s_t)}^{(m)}} = x^{(m)}$:

$\dfrac{\partial \pi(a_i|s_t)}{\partial w_{\pi(a_j|s_t)}^{(m)}} =
\left\{\begin{matrix}
    x^{(m)}\pi(a_i|s_t)(1 - \pi(a_i|s_t)) \text{ if } i = j \\
    -x^{(m)}\pi(a_i|s_t)\pi(a_j|s_t) \text{ if } i \neq j
\end{matrix}\right.$

First, let's check our loss functions again:

$L_{\pi} = -log(0.3064)(-0.1960 - 0.6391)+(0.01)((0.3610)log(0.3610) + (0.3064)log(0.3064) + (0.3327)log(0.3327)) = -0.9988$

$L_{V} = (-0.1960 - (0.6391))^2 = 0.6974$

We can plug in the numbers above to check these values. The derivatives of the loss function with respect to softmax outputs are:

$\dfrac{\partial L_\pi}{\partial \pi(a_0|s_t)} = (0.01)(1 + log(0.3610)) = -0.00019$

$\dfrac{\partial L_\pi}{\partial \pi(a_1|s_t)} = -\dfrac{-0.1960 - 0.6391}{0.3064} + (0.01)(1 + log(0.3064)) = 2.7237$

$\dfrac{\partial L_\pi}{\partial \pi(a_2|s_t)} = (0.01)(1 + log(0.3327)) = -0.0010$

and, in turn, the derivatives of the softmax outputs with respect to the output layer weights and biases are (for index 0):

$\dfrac{\partial \pi(a_0|s_t)}{\partial b_{\pi(a_0|s_t)}} = (0.3610)(1 - 0.3610) = 0.2307$

$\dfrac{\partial \pi(a_1|s_t)}{\partial b_{\pi(a_0|s_t)}} = -(0.3064)(0.3610) = -0.1106$

$\dfrac{\partial \pi(a_2|s_t)}{\partial b_{\pi(a_0|s_t)}} = -(0.3327)(0.3610) = -0.1201$

$\dfrac{\partial \pi(a_0|s_t)}{\partial w_{\pi(a_0|s_t)}^{(1)}} = (0.1430)(0.3610)(1 - 0.3610) = 0.03299$

$\dfrac{\partial \pi(a_1|s_t)}{\partial w_{\pi(a_0|s_t)}^{(1)}} = -(0.1430)(0.3064)(0.3610) = -0.01582$

$\dfrac{\partial \pi(a_2|s_t)}{\partial w_{\pi(a_0|s_t)}^{(1)}} = -(0.1430)(0.3327)(0.3610) = -0.01717$

Finally, we sum the derivatives to get the total derivatives for each variable:

$\dfrac{\partial L_\pi}{\partial b_{\pi(a_0|s_t)}} 
= \sum_i \left ( \dfrac{\partial L_\pi}{\partial \pi(a_i|s_t)} \dfrac{\partial \pi(a_i|s_t)}{\partial b_{\pi(a_0|s_t)}} \right )
= (-0.00019)(0.2307) + (2.7237)(-0.1106) + (-0.0010)(-0.1201) = -0.3012$

$\dfrac{\partial L_\pi}{\partial w_{\pi(a_0|s_t)}^{(1)}}
= \sum_i \left ( \dfrac{\partial L_\pi}{\partial \pi(a_i|s_t)} \dfrac{\partial \pi(a_i|s_t)}{\partial w_{\pi(a_0|s_t)}^{(1)}} \right )
= (-0.00019)(0.03299) + (2.7237)(-0.01582) + (-0.0010)(-0.01717) = -0.04308$

Great, everything matches up!

## Troubleshooting backprop
After only a few hundred learning steps, the network weights diverge to `nan` almost all at once. I presume it must be due to some component of the loss function becoming infinity.

In [8]:
import numpy as np
import warnings

# Have numpy report every floating-point error category as a warning.
np.seterr(all='warn')
# Set TensorFlow's log verbosity to WARN.
tf.logging.set_verbosity(tf.logging.WARN)

In [9]:
def print_values_after_nan(s1):
    """Print network outputs for ``s1`` and every trainable variable containing NaN.

    Args:
        s1: state batch passed to the global ``agent``'s value and policy outputs.

    For each trainable variable in the network's scope whose current value
    contains at least one NaN, the variable name (without its last two
    characters) and the full value are printed. Nothing is returned.
    """
    net = agent.network
    print("V:  ", net.get_value_output(s1))
    print("pi: ", net.get_policy_output(s1))

    trainables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope=net.scope)
    session = net.sess
    for variable in trainables:
        snapshot = session.run(variable)
        # Skip variables that are still free of NaN.
        if not np.isnan(snapshot).any():
            continue
        print(variable.name[:-2])
        print(snapshot)

In [58]:
# Fresh agent; this cell deliberately runs until a NaN gradient makes the
# summary op raise, leaving the offending batch in the kernel globals.
agent = make_new_agent()
agent.initialize_new_episode()
sess = agent.sess

# Store memories
# Keep stepping until the replay memory holds rm_start_size transitions.
i = 0
while agent.memory.size < agent.rm_start_size:
    i += 1
    print(i, end="\r")
    agent.perform_learning_step(1, 1)

# Learn from memories until Nan gradient created
# The batch variables are top-level names so the next cell can inspect the
# batch that was in use when the exception was raised.
s1, a, s2, isterminal, q_sa, w = None, None, None, None, None, None
while True:
    s1, a, s2, isterminal, q_sa, w = agent.memory.get_sample(agent.batch_size)
    s1 = agent.network._check_state(s1)
    a = agent.network._check_actions(a)
    # NOTE(review): learn() is called without weights=w here, while the
    # feed_dict below does pass w — confirm this is intended.
    _ = agent.network.learn(s1, a, q_sa)
    feed_dict={agent.network.state: s1,
               agent.network.actions: a, 
               agent.network.q_sa: q_sa,
               agent.network.IS_weights: w}
    # grad_sum is evaluated after the learn() call above, on the same batch.
    grad_sum_ = sess.run(agent.network.grad_sum,
                         feed_dict=feed_dict)
    agent.network.writer.add_summary(grad_sum_) # throws error when grad is Nan

220

InvalidArgumentError: Nan in summary histogram for: global_network/summaries/gradients/global_network/pi/biases/grads_0
	 [[Node: global_network/summaries/gradients/global_network/pi/biases/grads_0 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](global_network/summaries/gradients/global_network/pi/biases/grads_0/tag, global_network/summaries/gradients/gradients/global_network/pi/BiasAdd_grad/tuple/control_dependency_1/_105)]]

Caused by op 'global_network/summaries/gradients/global_network/pi/biases/grads_0', defined at:
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2698, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2802, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2862, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-58-4f3bf23bcbd0>", line 1, in <module>
    agent = make_new_agent()
  File "<ipython-input-7-01f1c4964e8d>", line 12, in make_new_agent
    output_directory=results_dir)
  File "../python/helper.py", line 17, in create_agent
    return agent_types[agent_type](agent_file=agent_filename, **kwargs)
  File "../python/agent/ACERAgent.py", line 25, in __init__
    **kwargs)
  File "../python/agent/Agent.py", line 82, in __init__
    scope=self.MAIN_SCOPE)
  File "../python/helper.py", line 25, in create_network
    return network_types[net_type](network_file=network_filename, **kwargs)
  File "../python/network/ACNetwork.py", line 20, in __init__
    scope=scope)
  File "../python/network/Network.py", line 77, in __init__
    var_sum, neur_sum, grad_sum = builder.add_summaries()
  File "../python/network/NetworkBuilder.py", line 273, in add_summaries
    grad_sum.append(tf.summary.histogram("grads_%d" % i, g))
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/tensorflow/python/summary/summary.py", line 209, in histogram
    tag=scope.rstrip('/'), values=values, name=scope)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/tensorflow/python/ops/gen_logging_ops.py", line 139, in _histogram_summary
    name=name)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 768, in apply_op
    op_def=op_def)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2336, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1228, in __init__
    self._traceback = _extract_stack()

InvalidArgumentError (see above for traceback): Nan in summary histogram for: global_network/summaries/gradients/global_network/pi/biases/grads_0
	 [[Node: global_network/summaries/gradients/global_network/pi/biases/grads_0 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](global_network/summaries/gradients/global_network/pi/biases/grads_0/tag, global_network/summaries/gradients/gradients/global_network/pi/BiasAdd_grad/tuple/control_dependency_1/_105)]]


In [59]:
# Print values
# Uses s1, a, s2, isterminal, q_sa and w left in the kernel by the previous
# cell, i.e. the last batch sampled before the NaN error stopped the loop.
print("s1:   ", s1[:, :3, 1, 1])
print("a:    ", a)
print("s2:   ", s2[:, :3, 1, 1])
print("isterminal: ", isterminal)
print("q_sa: ", q_sa)
print("w:    ", w)
print("V:  ", agent.network.get_value_output(s1))
print("pi: ", agent.network.get_policy_output(s1))

# Get gradients
# Same procedure as in print_learning_step: gradients of each loss with
# respect to the trainable variables in the network's scope.
opt = agent.network.optimizer
var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 
                             scope=agent.network.scope)
fd = {agent.network.state: s1, agent.network.actions: a, 
      agent.network.q_sa: q_sa, agent.network.IS_weights: w}
sess = agent.network.sess
gvs_pi = opt.compute_gradients(agent.network.loss_pi, var_list=var_list)
grads_pi = sess.run([g for g, v in gvs_pi], feed_dict=fd)
gvs_v = opt.compute_gradients(agent.network.loss_v, var_list=var_list)
# Drop variables for which loss_v has no gradient (None cannot be fetched).
gvs_v = [[g, v] for g, v in gvs_v if g is not None]
grads_v = sess.run([g for g, v in gvs_v], feed_dict=fd)

# Print intermediate values and variables
# Tensors are looked up by their graph names; pre_softmax is the output of
# the pi layer's BiasAdd op.
pre_softmax = tf.get_default_graph().get_tensor_by_name("global_network/pi/BiasAdd:0")
pi_bias = tf.get_default_graph().get_tensor_by_name("global_network/pi/biases:0")
pi_weights = tf.get_default_graph().get_tensor_by_name("global_network/pi/weights:0")
print("pre_softmax: ", sess.run(pre_softmax, feed_dict=fd))
print("pi_bias: ", sess.run(pi_bias))
print("pi_weights", sess.run(pi_weights)[:5, :])

# Print losses and gradients
# Losses are only evaluated here (no learn() call), so this cell does not
# itself apply a further update.
loss_pi, loss_v = sess.run([agent.network.loss_pi, agent.network.loss_v],
                           feed_dict=fd)
print("loss_pi: ", loss_pi, " loss_v: ", loss_v)
print("d(loss_pi)/d(b_pi): %s" % gvs_pi[-1][1].name[:-2], grads_pi[-1])
print("d(loss_pi)/d(w_pi): %s" % gvs_pi[-2][1].name[:-2], grads_pi[-2][:5, :])
print("d(loss_pi)/d(b_V): %s" % gvs_pi[-3][1].name[:-2], grads_pi[-3])
print("d(loss_V)/d(b_V): %s" % gvs_v[-1][1].name[:-2], grads_v[-1])

s1:    [[ 0.52156866  0.48248366  0.40000001]]
a:     [[0 1]]
s2:    [[ 0.52156866  0.48248366  0.40000001]]
isterminal:  [ 0.]
q_sa:  [-0.20740508]
w:     [ 1.]
V:   [[ 23.00398064]]
pi:  [[  0.00000000e+00   8.93349003e-24   1.00000000e+00]]
pre_softmax:  [[-35.95811081   0.93464893  54.00688553]]
pi_bias:  [ 0.02588907  0.10460023  0.17608647]
pi_weights [[-0.01443296 -0.12811744 -0.01737083]
 [-0.07300077  0.06540819 -0.11601797]
 [-0.1181846   0.04768491  0.18741176]
 [-0.06038205 -0.00111259  0.19380943]
 [-0.02992778 -0.08499466  0.16891941]]
loss_pi:  [[ nan]]  loss_v:  538.768
d(loss_pi)/d(b_pi): global_network/pi/biases [ nan  nan  nan]
d(loss_pi)/d(w_pi): global_network/pi/weights [[ nan  nan  nan]
 [ nan  nan  nan]
 [ nan  nan  nan]
 [ nan  nan  nan]
 [ nan  nan  nan]]
d(loss_pi)/d(b_V): global_network/V/biases [-53.07223511]
d(loss_V)/d(b_V): global_network/V/biases [ 46.42277145]


In [45]:
# NOTE(review): this cell's execution count (45) is lower than that of the two
# cells above (58, 59), so its saved output appears to come from an earlier run.
print_values_after_nan(s1)

V:   [[ 19.40326881]]
pi:  [[  0.00000000e+00   1.00000000e+00   4.64825156e-08]]


It appears that early on, the output layer diverges quickly, pushing some softmax outputs below floating-point precision so that they underflow to exactly 0. This, in turn, leads to $log(0)=-\infty$ and possibly division by 0 as well, both of which backpropagate `nan` throughout the network. This [Stack Overflow question](https://stackoverflow.com/questions/37448557/why-are-my-tensorflow-network-weights-and-costs-nan-when-i-use-relu-activations) suggests using smaller initial weights for the final output layer. So now we will edit the `ac_basic.json` file to initialize weights in the `pi` layer with a stddev=1e-4.

In [64]:
# Same experiment as the earlier NaN-hunting cell, but the learning loop is
# capped at 10000 iterations and prints its iteration counter.
agent = make_new_agent()
agent.initialize_new_episode()
sess = agent.sess

# Store memories
# Keep stepping until the replay memory holds rm_start_size transitions.
i = 0
while agent.memory.size < agent.rm_start_size:
    i += 1
    print(i, end="\r")
    agent.perform_learning_step(1, 1)

# Learn from memories until Nan gradient created
# The batch variables are top-level names so a later cell can inspect the
# batch that was in use if the summary op raises.
s1, a, s2, isterminal, q_sa, w = None, None, None, None, None, None
for i in range(10000):
    s1, a, s2, isterminal, q_sa, w = agent.memory.get_sample(agent.batch_size)
    s1 = agent.network._check_state(s1)
    a = agent.network._check_actions(a)
    # NOTE(review): learn() is called without weights=w here, while the
    # feed_dict below does pass w — confirm this is intended.
    _ = agent.network.learn(s1, a, q_sa)
    feed_dict={agent.network.state: s1,
               agent.network.actions: a, 
               agent.network.q_sa: q_sa,
               agent.network.IS_weights: w}
    # grad_sum is evaluated after the learn() call above, on the same batch.
    grad_sum_ = sess.run(agent.network.grad_sum,
                         feed_dict=feed_dict)
    agent.network.writer.add_summary(grad_sum_) # throws error when grad is Nan
    print(i, end="\r")

435

InvalidArgumentError: Nan in summary histogram for: global_network/summaries/gradients/global_network/pi/biases/grads_0
	 [[Node: global_network/summaries/gradients/global_network/pi/biases/grads_0 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](global_network/summaries/gradients/global_network/pi/biases/grads_0/tag, global_network/summaries/gradients/gradients/global_network/pi/BiasAdd_grad/tuple/control_dependency_1/_105)]]

Caused by op 'global_network/summaries/gradients/global_network/pi/biases/grads_0', defined at:
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2698, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2802, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2862, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-64-a3dab79cf69b>", line 1, in <module>
    agent = make_new_agent()
  File "<ipython-input-7-01f1c4964e8d>", line 12, in make_new_agent
    output_directory=results_dir)
  File "../python/helper.py", line 17, in create_agent
    return agent_types[agent_type](agent_file=agent_filename, **kwargs)
  File "../python/agent/ACERAgent.py", line 25, in __init__
    **kwargs)
  File "../python/agent/Agent.py", line 82, in __init__
    scope=self.MAIN_SCOPE)
  File "../python/helper.py", line 25, in create_network
    return network_types[net_type](network_file=network_filename, **kwargs)
  File "../python/network/ACNetwork.py", line 20, in __init__
    scope=scope)
  File "../python/network/Network.py", line 77, in __init__
    var_sum, neur_sum, grad_sum = builder.add_summaries()
  File "../python/network/NetworkBuilder.py", line 273, in add_summaries
    grad_sum.append(tf.summary.histogram("grads_%d" % i, g))
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/tensorflow/python/summary/summary.py", line 209, in histogram
    tag=scope.rstrip('/'), values=values, name=scope)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/tensorflow/python/ops/gen_logging_ops.py", line 139, in _histogram_summary
    name=name)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 768, in apply_op
    op_def=op_def)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2336, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/home/james/anaconda3/envs/vizdoom/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1228, in __init__
    self._traceback = _extract_stack()

InvalidArgumentError (see above for traceback): Nan in summary histogram for: global_network/summaries/gradients/global_network/pi/biases/grads_0
	 [[Node: global_network/summaries/gradients/global_network/pi/biases/grads_0 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](global_network/summaries/gradients/global_network/pi/biases/grads_0/tag, global_network/summaries/gradients/gradients/global_network/pi/BiasAdd_grad/tuple/control_dependency_1/_105)]]


In [50]:
# List the policy and value loss tensors with their index so their static
# shapes can be compared.
loss = [agent.network.loss_pi, agent.network.loss_v]
for i, l in enumerate(loss):
    # Same text as print(i, " ", l): index, three spaces, tensor repr.
    print("%d   %s" % (i, l))

0   Tensor("global_network/loss/policy_loss:0", shape=(?, ?), dtype=float32)
1   Tensor("global_network/loss/Mean:0", shape=(), dtype=float32)
