# REINFORCE in FNN.
This example uses Bayesflow, which ships with the TensorFlow distribution (`tf.contrib.bayesflow`), for convenience.

## Bayesflow.

In [2]:
import tensorflow as tf
from tensorflow.contrib.distributions import Categorical
# REINFORCE on a one-layer "generator" that picks one of `hidden_dim`
# categories; a fixed linear "discriminator" scores the pick.  The sampling
# node and the surrogate loss both come from tf.contrib.bayesflow.
tf.reset_default_graph()

# Hyper-parameters.
hidden_dim = 4      # number of categories the generator chooses between
output_dim = 2      # not referenced below (W_dis is hard-coded with 2 columns)
num_trials = 100    # number of training iterations
print_every = 10    # report once every this many iterations
batch_size = 1
lr = 0.1            # learning rate handed to Adam
decay = 0.9         # decay of the moving-average baseline

with tf.name_scope('model'):
    # Generator: one trainable logit per category, normalised to
    # log-probabilities.
    W_gen = tf.Variable(tf.random_uniform([batch_size, hidden_dim]), name='W_gen')
    logits_gen = W_gen
    log_probs = tf.nn.log_softmax(logits_gen)

    # Stochastic node: a category index drawn from the generator's distribution.
    index = tf.contrib.bayesflow.stochastic_graph.DistributionTensor(
        tf.contrib.distributions.Categorical,
        logits=log_probs)

    # Despite its name, `logits` is the one-hot encoding of the sampled index.
    logits = tf.one_hot(index, hidden_dim, dtype=tf.float32)

    # Discriminator: a constant linear map followed by a softmax.
    W_dis = tf.constant([[0.2, 0.], [0.5, 0.9], [0.9, 0.1], [0.1, 1.0]], name='W_dis')
    pred = tf.nn.softmax(tf.matmul(logits, W_dis))


with tf.name_scope('reinforce'):
    # Reward: the probability the discriminator assigns to class 1.
    reward = pred[:, 1]

    # Baseline: exponential moving average of the batch-mean reward.
    ema = tf.train.ExponentialMovingAverage(decay=decay)
    reduced_reward = tf.reduce_mean(reward)
    maintain_avg_op = ema.apply([reduced_reward])
    baseline = ema.average(reduced_reward)

    # Advantage; its negation is the quantity to minimise.
    advantage = reward - baseline
    loss = -advantage

    # Optimizer.  `reward` depends on W_gen only through the sampled index, so
    # the loss is handed to Bayesflow's surrogate_loss before minimising
    # (NOTE(review): presumably this adds the score-function term for
    # `index` -- confirm against the Bayesflow documentation).
    optimizer = tf.train.AdamOptimizer(lr)
    final_loss = tf.contrib.bayesflow.stochastic_graph.surrogate_loss([loss])
    min_op = optimizer.minimize(final_loss, var_list=[W_gen])

    # One op that both takes a gradient step and refreshes the baseline.
    train_op = tf.group(min_op, maintain_avg_op)


with tf.Session() as sess:
    init_op = tf.initialize_all_variables()
    sess.run(init_op)

    # `range` and single-argument `print(...)` behave the same on Python 2
    # and Python 3.
    for t in range(num_trials):
        sess.run(train_op)
        if t % print_every == 0:
            # These tensors are evaluated in a separate run call, so the
            # values shown need not be those of the training step just taken.
            print(sess.run([reward, baseline, advantage]))



[array([ 0.31002551], dtype=float32), 0.031002559, array([ 0.27902296], dtype=float32)]
[array([ 0.71094948], dtype=float32), 0.32283053, array([ 0.38811895], dtype=float32)]
[array([ 0.59868765], dtype=float32), 0.40917, array([ 0.18951765], dtype=float32)]
[array([ 0.71094948], dtype=float32), 0.48557228, array([ 0.2253772], dtype=float32)]
[array([ 0.71094948], dtype=float32), 0.5823589, array([ 0.12859058], dtype=float32)]
[array([ 0.71094948], dtype=float32), 0.63889945, array([ 0.07205003], dtype=float32)]
[array([ 0.71094948], dtype=float32), 0.64649659, array([ 0.06445289], dtype=float32)]
[array([ 0.71094948], dtype=float32), 0.66480201, array([ 0.04614747], dtype=float32)]
[array([ 0.71094948], dtype=float32), 0.67138833, array([ 0.03956115], dtype=float32)]
[array([ 0.71094948], dtype=float32), 0.69715536, array([ 0.01379412], dtype=float32)]


## Non-Bayesflow Implementation.

In [None]:
import tensorflow as tf
from tensorflow.contrib.distributions import Categorical
# Same toy problem as the Bayesflow cell, but with the REINFORCE objective
# written out by hand: -log p(index) * stop_gradient(advantage).
tf.reset_default_graph()

# Hyper-parameters.
hidden_dim = 4      # number of categories the generator chooses between
output_dim = 2      # not referenced below (W_dis is hard-coded with 2 columns)
num_trials = 100    # number of training iterations
print_every = 10    # report once every this many iterations
batch_size = 1
lr = 0.1            # learning rate handed to Adam
decay = 0.9         # decay of the moving-average baseline

with tf.name_scope('model'):
    # Generator: one trainable logit per category, normalised to
    # log-probabilities.
    W_gen = tf.Variable(tf.random_uniform([batch_size, hidden_dim]), name='W_gen')
    logits_gen = W_gen
    log_probs = tf.nn.log_softmax(logits_gen)

    # Sample a category and keep its log-probability for the gradient estimate.
    categorical = Categorical(log_probs)
    index = categorical.sample()
    log_prob = categorical.log_prob(index)

    # Despite its name, `logits` is the one-hot encoding of the sampled index.
    logits = tf.one_hot(index, hidden_dim, dtype=tf.float32)

    # Discriminator: a constant linear map followed by a softmax.
    W_dis = tf.constant([[0.2, 0.], [0.5, 0.9], [0.9, 0.1], [0.1, 1.0]], name='W_dis')
    pred = tf.nn.softmax(tf.matmul(logits, W_dis))


with tf.name_scope('reinforce'):
    # Reward: the probability the discriminator assigns to class 1.
    reward = pred[:, 1]

    # Baseline: exponential moving average of the batch-mean reward.
    ema = tf.train.ExponentialMovingAverage(decay=decay)
    reduced_reward = tf.reduce_mean(reward)
    maintain_avg_op = ema.apply([reduced_reward])
    baseline = ema.average(reduced_reward)

    # Advantage.
    advantage = reward - baseline

    # Optimizer.  stop_gradient makes the advantage a constant weight, so the
    # gradient reaches W_gen only through log_prob.
    optimizer = tf.train.AdamOptimizer(lr)
    min_op = optimizer.minimize(-log_prob * tf.stop_gradient(advantage), var_list=[W_gen])

    # One op that both takes a gradient step and refreshes the baseline.
    train_op = tf.group(min_op, maintain_avg_op)


with tf.Session() as sess:
    init_op = tf.initialize_all_variables()
    sess.run(init_op)

    # `range` and single-argument `print(...)` behave the same on Python 2
    # and Python 3.
    for t in range(num_trials):
        sess.run(train_op)
        if t % print_every == 0:
            # These tensors are evaluated in a separate run call, so the
            # values shown need not be those of the training step just taken.
            print(sess.run([reward, baseline, advantage]))


## Maximum Achievable Reward for this Discriminator

In [None]:
# Feed each of the four possible one-hot choices through the same constant
# discriminator used above; the class-1 probabilities printed below are the
# rewards each choice earns, and the largest is the best achievable reward.
W_dis = tf.constant([[0.2, 0.], [0.5, 0.9], [0.9, 0.1], [0.1, 1.0]], name='W_dis')

# One row per category: row i is the one-hot vector for category i.
x = tf.constant([[1, 0, 0, 0],
                 [0, 1, 0, 0],
                 [0, 0, 1, 0],
                 [0, 0, 0, 1]], dtype=tf.float32)

z = tf.nn.softmax(tf.matmul(x, W_dis))

with tf.Session() as sess:
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print(sess.run(z[:, 1]))

# REINFORCE in RNN.
Here we employ REINFORCE in an RNN.  As before, the reward to the Generator $G$ is simply $p_t$, the probability of being real that the Discriminator $D$ assigns at each time step $t$.

In [3]:
import tensorflow as tf
from tensorflow.contrib.distributions import Categorical

# Shared settings for the two RNN cells below.

# Model sizes: units in the generator / discriminator RNN cells.  The
# generator size is also the depth of the one-hot token fed back each step.
rnn_gen_size, rnn_dis_size = 3, 2
time_steps = 5          # number of unrolled RNN steps
batch_size = 1

# Optimisation.
lr = 0.1                # learning rate handed to Adam
decay = 0.9             # decay of the moving-average baseline
num_trials = 1000       # number of training iterations
print_every = 100       # report once every this many iterations

## Bayesflow.

In [4]:
# REINFORCE on an unrolled generator RNN whose sampled tokens are scored, step
# by step, by a discriminator RNN.  The sampling nodes and the surrogate loss
# come from tf.contrib.bayesflow.
tf.reset_default_graph()

# Initial indices: every sequence starts from token 0, one-hot encoded.
# Built from batch_size / rnn_gen_size so the start token keeps matching the
# tokens fed back inside the loop if those settings are changed.
batch_indices = tf.one_hot([0] * batch_size, rnn_gen_size, dtype=tf.float32)

with tf.name_scope('model'):
    cell_gen = tf.nn.rnn_cell.BasicRNNCell(rnn_gen_size)
    cell_dis = tf.nn.rnn_cell.BasicRNNCell(rnn_dis_size)
    state_gen = cell_gen.zero_state(batch_size, tf.float32)
    state_dis = cell_dis.zero_state(batch_size, tf.float32)

    with tf.name_scope('rnn'):
        predictions = []

        for t in range(time_steps):
            # Share the cell weights across time steps.
            if t > 0:
                tf.get_variable_scope().reuse_variables()

            with tf.variable_scope('gen'):
                # The previous token (or the start token) drives the generator.
                rnn_in = batch_indices
                rnn_out, state_gen = cell_gen(rnn_in, state_gen)
                log_probs = tf.nn.log_softmax(rnn_out)

                # Stochastic node: the next token index.
                index = tf.contrib.bayesflow.stochastic_graph.DistributionTensor(
                    tf.contrib.distributions.Categorical,
                    logits=log_probs)
                batch_indices = tf.one_hot(index, rnn_gen_size, dtype=tf.float32)

            with tf.variable_scope('dis'):
                rnn_in = batch_indices

                # The discriminator keeps its initial weights: only the 'gen'
                # variables are passed to the optimizer below, so it acts as a
                # fixed scoring function, which is convenient for debugging.
                rnn_out, state_dis = cell_dis(rnn_in, state_dis)
                pred = tf.nn.softmax(rnn_out)
                predictions.append(pred)


with tf.name_scope('reinforce'):
    # Per-step reward: the probability the discriminator assigns to class 1.
    rewards = []
    for pred in predictions:
        rewards.append(pred[:, 1])

    # Baseline: exponential moving average of the reward averaged over all
    # time steps and batch entries.
    ema = tf.train.ExponentialMovingAverage(decay=decay)
    rewards_tf = tf.pack(rewards)
    reduced_reward = tf.reduce_mean(rewards_tf)
    maintain_avg_op = ema.apply([reduced_reward])
    baseline = ema.average(reduced_reward)

    # One negated advantage per time step.  After the loop, `reward` and
    # `advantage` still refer to the LAST time step; the report below relies
    # on that.
    loss = []
    for reward in rewards:
        advantage = reward - baseline
        loss.append(-advantage)

    # Optimizer.  Only variables created under the 'gen' scope are trained.
    # (NOTE(review): surrogate_loss presumably adds the score-function terms
    # for the sampled indices -- confirm against the Bayesflow documentation.)
    optimizer = tf.train.AdamOptimizer(lr)
    final_loss = tf.contrib.bayesflow.stochastic_graph.surrogate_loss(loss)
    gen_vars = [v for v in tf.trainable_variables() if v.op.name.startswith('gen')]
    min_op = optimizer.minimize(final_loss, var_list=gen_vars)

    # One op that both takes a gradient step and refreshes the baseline.
    train_op = tf.group(min_op, maintain_avg_op)


with tf.Session() as sess:
    init_op = tf.initialize_all_variables()
    sess.run(init_op)

    # `range` and single-argument `print(...)` behave the same on Python 2
    # and Python 3.
    for t in range(num_trials):
        sess.run(train_op)
        if t % print_every == 0:
            # Final-step reward and advantage, evaluated in a separate run
            # call from the training step above.
            print(sess.run([reward, baseline, advantage]))

[array([ 0.39902741], dtype=float32), 0.063332945, array([ 0.33569446], dtype=float32)]
[array([ 0.55243063], dtype=float32), 0.66985744, array([-0.11742681], dtype=float32)]
[array([ 0.72882092], dtype=float32), 0.67034191, array([ 0.05847901], dtype=float32)]
[array([ 0.72882092], dtype=float32), 0.68640757, array([ 0.04241335], dtype=float32)]
[array([ 0.61128277], dtype=float32), 0.67541271, array([-0.06412995], dtype=float32)]
[array([ 0.72882092], dtype=float32), 0.68771875, array([ 0.04110217], dtype=float32)]
[array([ 0.45748341], dtype=float32), 0.67063922, array([-0.21315581], dtype=float32)]
[array([ 0.54443336], dtype=float32), 0.66973627, array([-0.12530291], dtype=float32)]
[array([ 0.72882092], dtype=float32), 0.68279296, array([ 0.04602796], dtype=float32)]
[array([ 0.72882092], dtype=float32), 0.6936931, array([ 0.03512782], dtype=float32)]


## Non-Bayesflow Implementation.

In [32]:
# Same RNN set-up as the Bayesflow cell, but with the REINFORCE objective
# written out by hand as a sum over time steps of
# -log p(index_t) * stop_gradient(advantage_t).
tf.reset_default_graph()

# Initial indices: every sequence starts from token 0, one-hot encoded.
# Built from batch_size / rnn_gen_size so the start token keeps matching the
# tokens fed back inside the loop if those settings are changed.
batch_indices = tf.one_hot([0] * batch_size, rnn_gen_size, dtype=tf.float32)

with tf.name_scope('model'):
    cell_gen = tf.nn.rnn_cell.BasicRNNCell(rnn_gen_size)
    state_gen = cell_gen.zero_state(batch_size, tf.float32)
    cell_dis = tf.nn.rnn_cell.BasicRNNCell(rnn_dis_size)
    state_dis = cell_dis.zero_state(batch_size, tf.float32)

    with tf.name_scope('rnn'):
        predictions, log_probs = [], []

        for t in range(time_steps):
            # Share the cell weights across time steps.
            if t > 0:
                tf.get_variable_scope().reuse_variables()

            with tf.variable_scope('gen'):
                # The previous token (or the start token) drives the generator.
                rnn_in = batch_indices
                rnn_out, state_gen = cell_gen(rnn_in, state_gen)
                log_probs_gen = tf.nn.log_softmax(rnn_out)

                # Sample the next token and remember its log-probability.
                categorical = Categorical(log_probs_gen)
                index = categorical.sample()
                log_prob = categorical.log_prob(index)
                log_probs.append(log_prob)

                batch_indices = tf.one_hot(index, rnn_gen_size, dtype=tf.float32)

            with tf.variable_scope('dis'):
                rnn_in = batch_indices

                # The discriminator keeps its initial weights: only the 'gen'
                # variables are passed to the optimizer below, so it acts as a
                # fixed scoring function, which is convenient for debugging.
                rnn_out, state_dis = cell_dis(rnn_in, state_dis)
                pred = tf.nn.softmax(rnn_out)
                predictions.append(pred)


with tf.name_scope('reinforce'):
    # Per-step reward: the probability the discriminator assigns to class 1.
    rewards = []
    for pred in predictions:
        rewards.append(pred[:, 1])

    # Baseline: exponential moving average of the reward averaged over all
    # time steps and batch entries.
    ema = tf.train.ExponentialMovingAverage(decay=decay)
    rewards_tf = tf.pack(rewards)
    reduced_reward = tf.reduce_mean(rewards_tf)
    maintain_avg_op = ema.apply([reduced_reward])
    baseline = ema.average(reduced_reward)

    # Each step's log-probability is weighted by that same step's advantage;
    # stop_gradient keeps the advantage a constant weight.  After the loop,
    # `reward` and `advantage` still refer to the LAST time step; the report
    # below relies on that.
    final_loss = 0.
    for reward, log_prob in zip(rewards, log_probs):
        advantage = reward - baseline
        final_loss += -log_prob * tf.stop_gradient(advantage)

    # Optimizer.  Only variables created under the 'gen' scope are trained.
    optimizer = tf.train.AdamOptimizer(lr)
    gen_vars = [v for v in tf.trainable_variables() if v.op.name.startswith('gen')]
    min_op = optimizer.minimize(final_loss, var_list=gen_vars)

    # One op that both takes a gradient step and refreshes the baseline.
    train_op = tf.group(min_op, maintain_avg_op)


with tf.Session() as sess:
    init_op = tf.initialize_all_variables()
    sess.run(init_op)

    # `range` and single-argument `print(...)` behave the same on Python 2
    # and Python 3.
    for t in range(num_trials):
        sess.run(train_op)
        if t % print_every == 0:
            # Final-step reward and advantage, evaluated in a separate run
            # call from the training step above.
            print(sess.run([reward, baseline, advantage]))

[array([ 0.4540458], dtype=float32), 0.056483652, array([ 0.39756215], dtype=float32)]
[array([ 0.63507384], dtype=float32), 0.6145277, array([ 0.02054614], dtype=float32)]
[array([ 0.46550792], dtype=float32), 0.62330419, array([-0.15779626], dtype=float32)]
[array([ 0.61772895], dtype=float32), 0.62210727, array([-0.00437832], dtype=float32)]
[array([ 0.61772895], dtype=float32), 0.62233084, array([-0.0046019], dtype=float32)]
[array([ 0.61772895], dtype=float32), 0.62022167, array([-0.00249273], dtype=float32)]
[array([ 0.48888925], dtype=float32), 0.61998826, array([-0.13109902], dtype=float32)]
[array([ 0.5428347], dtype=float32), 0.62464905, array([-0.08181435], dtype=float32)]
[array([ 0.68850166], dtype=float32), 0.61919284, array([ 0.06930882], dtype=float32)]
[array([ 0.61772895], dtype=float32), 0.62098986, array([-0.00326091], dtype=float32)]
