# REINFORCE in FNN.
For convenience, this uses Bayesflow, which ships with the TensorFlow distribution (`tf.contrib.bayesflow`).

## Bayesflow.

In [None]:
import tensorflow as tf
from tensorflow.contrib.distributions import Categorical
tf.reset_default_graph()

# Hyperparameters of the toy problem.
hidden_dim = 4     # number of categories the generator samples from
output_dim = 2     # NOTE(review): not referenced in this cell; presumably the width of W_dis
num_trials = 100   # number of training steps
print_every = 10   # report diagnostics every this many steps
batch_size = 1
lr = 0.1           # Adam learning rate
decay = 0.9        # decay of the moving-average reward baseline

with tf.name_scope('model'):
    # Generator: one trainable row of logits over the categories.
    W_gen = tf.Variable(tf.random_uniform([batch_size, hidden_dim]), name='W_gen')
    logits_gen = W_gen
    log_probs = tf.nn.log_softmax(logits_gen)

    # Stochastic node: a Categorical draw wrapped in a DistributionTensor so that
    # surrogate_loss() below can account for it when building the training loss.
    index = tf.contrib.bayesflow.stochastic_graph.DistributionTensor(
        tf.contrib.distributions.Categorical,
        logits=log_probs)

    # One-hot encoding of the sampled index (despite the name, not logits).
    logits = tf.one_hot(index, hidden_dim, dtype=tf.float32)

    # Discriminator: constant weights, so it is never trained.
    W_dis = tf.constant([[0.2, 0.], [0.5, 0.9], [0.9, 0.1], [0.1, 1.0]], name='W_dis')
    pred = tf.nn.softmax(tf.matmul(logits, W_dis))


with tf.name_scope('reinforce'):
    # Reward is the discriminator's softmax output in column 1.
    reward = pred[:, 1]

    # Exponential moving-average baseline of the mean reward.
    ema = tf.train.ExponentialMovingAverage(decay=decay)
    reduced_reward = tf.reduce_mean(reward)
    maintain_avg_op = ema.apply([reduced_reward])
    baseline = ema.average(reduced_reward)

    # Advantage; minimizing its negation maximizes the baseline-corrected reward.
    advantage = reward - baseline
    loss = -advantage

    # Optimizer: surrogate_loss() combines the sample loss with the loss terms
    # contributed by the stochastic tensors it depends on. Only W_gen is updated.
    optimizer = tf.train.AdamOptimizer(lr)
    final_loss = tf.contrib.bayesflow.stochastic_graph.surrogate_loss([loss])
    min_op = optimizer.minimize(final_loss, var_list=[W_gen])

    # One training step = parameter update + baseline update.
    train_op = tf.group(min_op, maintain_avg_op)


with tf.Session() as sess:
    init_op = tf.initialize_all_variables()
    sess.run(init_op)

    # range / print(...) are used so the cell runs on both Python 2 and Python 3.
    for t in range(num_trials):
        sess.run(train_op)
        if t % print_every == 0:
            # This is a separate sess.run call, so the graph is re-evaluated:
            # the values shown come from a fresh sample, not from the training
            # step that was just executed.
            print(sess.run([reward, baseline, advantage]))



## Non-Bayesflow Implementation.

In [None]:
import tensorflow as tf
from tensorflow.contrib.distributions import Categorical
tf.reset_default_graph()

# Hyperparameters of the toy problem.
hidden_dim = 4     # number of categories the generator samples from
output_dim = 2     # NOTE(review): not referenced in this cell; presumably the width of W_dis
num_trials = 100   # number of training steps
print_every = 10   # report diagnostics every this many steps
batch_size = 1
lr = 0.1           # Adam learning rate
decay = 0.9        # decay of the moving-average reward baseline

with tf.name_scope('model'):
    # Generator: one trainable row of logits over the categories.
    W_gen = tf.Variable(tf.random_uniform([batch_size, hidden_dim]), name='W_gen')
    logits_gen = W_gen
    log_probs = tf.nn.log_softmax(logits_gen)

    # Sample a category and keep the log-probability of that sample; the latter
    # is the only path through which gradients reach W_gen.
    categorical = Categorical(log_probs)
    index = categorical.sample()
    log_prob = categorical.log_prob(index)

    # One-hot encoding of the sampled index (despite the name, not logits).
    logits = tf.one_hot(index, hidden_dim, dtype=tf.float32)

    # Discriminator: constant weights, so it is never trained.
    W_dis = tf.constant([[0.2, 0.], [0.5, 0.9], [0.9, 0.1], [0.1, 1.0]], name='W_dis')
    pred = tf.nn.softmax(tf.matmul(logits, W_dis))


with tf.name_scope('reinforce'):
    # Reward is the discriminator's softmax output in column 1.
    reward = pred[:, 1]

    # Exponential moving-average baseline of the mean reward.
    ema = tf.train.ExponentialMovingAverage(decay=decay)
    reduced_reward = tf.reduce_mean(reward)
    maintain_avg_op = ema.apply([reduced_reward])
    baseline = ema.average(reduced_reward)

    # Advantage.
    advantage = reward - baseline

    # Optimizer: hand-written REINFORCE objective. stop_gradient makes the
    # advantage a constant weight on the log-probability of the sample.
    optimizer = tf.train.AdamOptimizer(lr)
    min_op = optimizer.minimize(-log_prob * tf.stop_gradient(advantage), var_list=[W_gen])

    # One training step = parameter update + baseline update.
    train_op = tf.group(min_op, maintain_avg_op)


with tf.Session() as sess:
    init_op = tf.initialize_all_variables()
    sess.run(init_op)

    # range / print(...) are used so the cell runs on both Python 2 and Python 3.
    for t in range(num_trials):
        sess.run(train_op)
        if t % print_every == 0:
            # This is a separate sess.run call, so the graph is re-evaluated:
            # the values shown come from a fresh sample, not from the training
            # step that was just executed.
            print(sess.run([reward, baseline, advantage]))


## Maximum Achievable Reward for this Discriminator

In [None]:
# Same fixed discriminator weights as in the cells above.
W_dis = tf.constant([[0.2, 0.], [0.5, 0.9], [0.9, 0.1], [0.1, 1.0]], name='W_dis')
# Each row is the one-hot encoding of one of the four possible generator choices.
x = tf.constant([[1,0,0,0],[0,1,0,0],[0,0,1,0],[0,0,0,1]], dtype=tf.float32)

# Discriminator output for every possible choice at once.
z = tf.nn.softmax(tf.matmul(x, W_dis))

with tf.Session() as sess:
    # Column 1 is the reward for each choice; the largest entry is the best
    # reward the generator can obtain. print(...) works on Python 2 and 3.
    print(sess.run(z[:,1]))

# REINFORCE in RNN.
Here we employ REINFORCE in an RNN. As before, the reward to the Generator $G$ at each time step $t$ is simply $p_t$, the probability that the Discriminator $D$ assigns to the sample being real.

In [25]:
import tensorflow as tf
from tensorflow.contrib.distributions import Categorical

# Shared settings for the RNN REINFORCE cells that follow.

# Model sizes.
rnn_gen_size, rnn_dis_size = 3, 2   # units in the generator / discriminator RNN cells
batch_size = 1
time_steps = 5                      # number of steps the RNN loop is unrolled for

# Optimisation.
lr = 0.1                            # learning rate handed to the Adam optimizer
decay = 0.9                         # decay of the moving-average reward baseline

# Training loop.
num_trials = 1000                   # number of training steps
print_every = 100                   # report diagnostics every this many steps

## Bayesflow.

In [26]:
tf.reset_default_graph()

# Initial one-hot input fed to the generator at the first time step.
batch_indices = tf.constant([[1., 0., 0.]], dtype=tf.float32)

with tf.name_scope('model'):
    cell_gen = tf.nn.rnn_cell.BasicRNNCell(rnn_gen_size)
    cell_dis = tf.nn.rnn_cell.BasicRNNCell(rnn_dis_size)
    state_gen = cell_gen.zero_state(batch_size, tf.float32)
    state_dis = cell_dis.zero_state(batch_size, tf.float32)

    with tf.name_scope('rnn'):
        # Discriminator outputs, one entry per time step.
        predictions = []

        # range is used (instead of xrange) so the cell runs on Python 2 and 3.
        for t in range(time_steps):
            # Share the RNN weights across all unrolled time steps.
            if t > 0:
                tf.get_variable_scope().reuse_variables()

            with tf.variable_scope('gen'):
                rnn_in = batch_indices
                rnn_out, state_gen = cell_gen(rnn_in, state_gen)
                log_probs = tf.nn.log_softmax(rnn_out)

                # Stochastic node: a Categorical draw wrapped in a
                # DistributionTensor so surrogate_loss() below can account for it.
                index = tf.contrib.bayesflow.stochastic_graph.DistributionTensor(
                    tf.contrib.distributions.Categorical,
                    logits=log_probs)

                # One-hot encoding of the sampled index (despite the name, not
                # logits); it is fed back as the generator's next input.
                logits = tf.one_hot(index, rnn_gen_size, dtype=tf.float32)
                batch_indices = logits

            with tf.variable_scope('dis'):
                rnn_in = logits

                # The discriminator is never updated: only gen_vars are passed
                # to the optimizer below, so its initial weights stay fixed,
                # which keeps the reward function constant for debugging.
                rnn_out, state_dis = cell_dis(rnn_in, state_dis)
                pred = tf.nn.softmax(rnn_out)
                predictions.append(pred)


with tf.name_scope('reinforce'):
    # Per-step reward: the discriminator's softmax output in column 1.
    rewards = [step_pred[:, 1] for step_pred in predictions]

    # Exponential moving-average baseline of the mean reward over all steps.
    ema = tf.train.ExponentialMovingAverage(decay=decay)
    rewards_tf = tf.pack(rewards)
    reduced_reward = tf.reduce_mean(rewards_tf)
    maintain_avg_op = ema.apply([reduced_reward])
    baseline = ema.average(reduced_reward)

    # Per-step advantage and loss. The advantages are kept in a list so the
    # diagnostics below can name the step they report explicitly.
    advantages = []
    loss = []
    for reward in rewards:
        advantage = reward - baseline
        advantages.append(advantage)
        loss.append(-advantage)

    # Optimizer: surrogate_loss() combines the per-step losses with the loss
    # terms contributed by the stochastic tensors they depend on.
    optimizer = tf.train.AdamOptimizer(lr)
    final_loss = tf.contrib.bayesflow.stochastic_graph.surrogate_loss(loss)
    # Only variables whose name starts with 'gen' (the generator) are trained.
    gen_vars = [v for v in tf.trainable_variables() if v.op.name.startswith('gen')]
    min_op = optimizer.minimize(final_loss, var_list=gen_vars)

    # One training step = parameter update + baseline update.
    train_op = tf.group(min_op, maintain_avg_op)


with tf.Session() as sess:
    init_op = tf.initialize_all_variables()
    sess.run(init_op)

    for t in range(num_trials):
        sess.run(train_op)
        if t % print_every == 0:
            # Report the LAST time step's reward and advantage (previously this
            # relied on the loop variables left over from the loop above).
            # This is a separate sess.run call, so the values come from a fresh
            # sample rather than from the training step just executed.
            print(sess.run([rewards[-1], baseline, advantages[-1]]))

[array([ 0.44025958], dtype=float32), 0.051253248, array([ 0.38900632], dtype=float32)]
[array([ 0.54677969], dtype=float32), 0.61269772, array([-0.06591803], dtype=float32)]
[array([ 0.66815019], dtype=float32), 0.61670935, array([ 0.05144083], dtype=float32)]
[array([ 0.6800552], dtype=float32), 0.61474013, array([ 0.06531507], dtype=float32)]
[array([ 0.64466918], dtype=float32), 0.62256682, array([ 0.02210236], dtype=float32)]
[array([ 0.67510086], dtype=float32), 0.61862373, array([ 0.05647713], dtype=float32)]
[array([ 0.54748303], dtype=float32), 0.63039941, array([-0.08291638], dtype=float32)]
[array([ 0.6800552], dtype=float32), 0.61282003, array([ 0.06723517], dtype=float32)]
[array([ 0.67204022], dtype=float32), 0.6046049, array([ 0.06743532], dtype=float32)]
[array([ 0.53704149], dtype=float32), 0.61734879, array([-0.0803073], dtype=float32)]


## Non-Bayesflow Implementation.

In [29]:
tf.reset_default_graph()

# Initial one-hot input fed to the generator at the first time step.
batch_indices = tf.constant([[1., 0., 0.]], dtype=tf.float32)

with tf.name_scope('model'):
    cell_gen = tf.nn.rnn_cell.BasicRNNCell(rnn_gen_size)
    state_gen = cell_gen.zero_state(batch_size, tf.float32)
    cell_dis = tf.nn.rnn_cell.BasicRNNCell(rnn_dis_size)
    state_dis = cell_dis.zero_state(batch_size, tf.float32)

    with tf.name_scope('rnn'):
        # Per-step discriminator outputs and log-probabilities of the samples.
        predictions, log_probs = [], []

        # range is used (instead of xrange) so the cell runs on Python 2 and 3.
        for t in range(time_steps):
            # Share the RNN weights across all unrolled time steps.
            if t > 0:
                tf.get_variable_scope().reuse_variables()

            with tf.variable_scope('gen'):
                rnn_in = batch_indices
                rnn_out, state_gen = cell_gen(rnn_in, state_gen)
                log_probs_gen = tf.nn.log_softmax(rnn_out)

                # Sample a category and record the log-probability of that
                # sample; gradients reach the generator through log_prob.
                categorical = Categorical(log_probs_gen)
                index = categorical.sample()
                log_prob = categorical.log_prob(index)
                log_probs.append(log_prob)

                # One-hot encoding of the sampled index (despite the name, not
                # logits); it is fed back as the generator's next input.
                logits = tf.one_hot(index, rnn_gen_size, dtype=tf.float32)
                batch_indices = logits

            with tf.variable_scope('dis'):
                rnn_in = logits

                # The discriminator is never updated: only gen_vars are passed
                # to the optimizer below, so its initial weights stay fixed,
                # which keeps the reward function constant for debugging.
                rnn_out, state_dis = cell_dis(rnn_in, state_dis)
                pred = tf.nn.softmax(rnn_out)
                predictions.append(pred)


with tf.name_scope('reinforce'):
    # Per-step reward: the discriminator's softmax output in column 1.
    rewards = [step_pred[:, 1] for step_pred in predictions]

    # Exponential moving-average baseline of the mean reward over all steps.
    ema = tf.train.ExponentialMovingAverage(decay=decay)
    rewards_tf = tf.pack(rewards)
    reduced_reward = tf.reduce_mean(rewards_tf)
    maintain_avg_op = ema.apply([reduced_reward])
    baseline = ema.average(reduced_reward)

    # Hand-written REINFORCE objective: each step's log-probability is weighted
    # by that same step's advantage, which stop_gradient treats as a constant.
    # The advantages are kept in a list so the diagnostics below can name the
    # step they report explicitly.
    advantages = []
    final_loss = 0.
    for reward, log_prob in zip(rewards, log_probs):
        advantage = reward - baseline
        advantages.append(advantage)
        final_loss += -log_prob * tf.stop_gradient(advantage)

    # Optimizer: only variables whose name starts with 'gen' are trained.
    optimizer = tf.train.AdamOptimizer(lr)
    gen_vars = [v for v in tf.trainable_variables() if v.op.name.startswith('gen')]
    min_op = optimizer.minimize(final_loss, var_list=gen_vars)

    # One training step = parameter update + baseline update.
    train_op = tf.group(min_op, maintain_avg_op)


with tf.Session() as sess:
    init_op = tf.initialize_all_variables()
    sess.run(init_op)

    for t in range(num_trials):
        sess.run(train_op)
        if t % print_every == 0:
            # Report the LAST time step's reward and advantage (previously this
            # relied on the loop variables left over from the loop above).
            # This is a separate sess.run call, so the values come from a fresh
            # sample rather than from the training step just executed.
            print(sess.run([rewards[-1], baseline, advantages[-1]]))

[array([ 0.37493035], dtype=float32), 0.048649948, array([ 0.32628042], dtype=float32)]
[array([ 0.59426779], dtype=float32), 0.56523156, array([ 0.02903622], dtype=float32)]
[array([ 0.59582311], dtype=float32), 0.55584794, array([ 0.03997517], dtype=float32)]
[array([ 0.59528738], dtype=float32), 0.55456829, array([ 0.04071909], dtype=float32)]
[array([ 0.62032169], dtype=float32), 0.57161206, array([ 0.04870963], dtype=float32)]
[array([ 0.5996086], dtype=float32), 0.55554104, array([ 0.04406756], dtype=float32)]
[array([ 0.63542444], dtype=float32), 0.55216593, array([ 0.08325851], dtype=float32)]
[array([ 0.35103628], dtype=float32), 0.55592847, array([-0.20489219], dtype=float32)]
[array([ 0.5957855], dtype=float32), 0.56596732, array([ 0.02981818], dtype=float32)]
[array([ 0.61855721], dtype=float32), 0.56022847, array([ 0.05832875], dtype=float32)]
