# REINFORCE in FNN.
This version uses Bayesflow (`tf.contrib.bayesflow`), which ships with the TensorFlow distribution, for convenience.

In [None]:
import tensorflow as tf
from tensorflow.contrib.distributions import Categorical
# Start from an empty default graph so that re-running this cell does not
# pile new ops on top of the ones built by a previous run.
tf.reset_default_graph()

# Hyper-parameters.
hidden_dim = 4      # number of categories the generator samples from
output_dim = 2      # declared for reference; not used below in this cell
num_trials = 100    # number of training steps
print_every = 10    # report every this many steps
batch_size = 1
lr = 0.1            # Adam learning rate
decay = 0.9         # decay of the moving-average reward baseline

with tf.name_scope('model'):
    # Generator: a single trainable row of logits over `hidden_dim` choices.
    initial_logits = tf.random_uniform([batch_size, hidden_dim])
    W_gen = tf.Variable(initial_logits, name='W_gen')
    log_probs = tf.nn.log_softmax(W_gen)

    # Sample one category through Bayesflow's stochastic-graph wrapper so that
    # `surrogate_loss` below can find this sampling node.
    sampled_index = tf.contrib.bayesflow.stochastic_graph.DistributionTensor(
        Categorical,
        logits=log_probs)

    # One-hot encoding of the sampled category; it selects a row of W_dis.
    sampled_one_hot = tf.one_hot(sampled_index, hidden_dim, dtype=tf.float32)

    # Discriminator: constant weights, softmax over its two output columns.
    W_dis = tf.constant(
        [[0.2, 0.], [0.5, 0.9], [0.9, 0.1], [0.1, 1.0]],
        name='W_dis')
    pred = tf.nn.softmax(tf.matmul(sampled_one_hot, W_dis))


with tf.name_scope('reinforce'):
    # Reward is the discriminator's probability for column 1.
    reward = pred[:, 1]

    # Baseline: exponential moving average of the batch-mean reward. The
    # average only advances when `maintain_avg_op` is run.
    reward_tracker = tf.train.ExponentialMovingAverage(decay=decay)
    mean_reward = tf.reduce_mean(reward)
    maintain_avg_op = reward_tracker.apply([mean_reward])
    baseline = reward_tracker.average(mean_reward)

    # Advantage of the sampled choice over the running baseline.
    advantage = reward - baseline

    # Minimise the surrogate of the negated advantage with respect to the
    # generator logits only.
    adam = tf.train.AdamOptimizer(lr)
    surrogate = tf.contrib.bayesflow.stochastic_graph.surrogate_loss(
        [-advantage])
    update_gen_op = adam.minimize(surrogate, var_list=[W_gen])

    # A single op that performs the parameter update and the baseline update.
    train_op = tf.group(update_gen_op, maintain_avg_op)
    

# Train the generator for `num_trials` steps and report progress.
with tf.Session() as sess:
    # Every variable in the graph must be initialised before the first step.
    init_op = tf.initialize_all_variables()
    sess.run(init_op)

    # `range` and the call form of `print` behave the same on Python 2 and
    # Python 3; `xrange` and the `print` statement only work on Python 2.
    for t in range(num_trials):
        # One step: update the generator and refresh the reward baseline.
        sess.run(train_op)
        if t % print_every == 0:
            # NOTE: this is a separate run of the graph, so the category is
            # sampled again; the printed reward/advantage belong to a fresh
            # sample, not to the one used by the training step above.
            print(sess.run([reward, baseline, advantage]))



## Non-Bayesflow Implementation.

In [None]:
import tensorflow as tf
from tensorflow.contrib.distributions import Categorical
# Start from an empty default graph so that re-running this cell does not
# pile new ops on top of the ones built by a previous run.
tf.reset_default_graph()

# Hyper-parameters.
hidden_dim = 4      # number of categories the generator samples from
output_dim = 2      # declared for reference; not used below in this cell
num_trials = 100    # number of training steps
print_every = 10    # report every this many steps
batch_size = 1
lr = 0.1            # Adam learning rate
decay = 0.9         # decay of the moving-average reward baseline

with tf.name_scope('model'):
    # Generator: a single trainable row of logits over `hidden_dim` choices.
    initial_logits = tf.random_uniform([batch_size, hidden_dim])
    W_gen = tf.Variable(initial_logits, name='W_gen')
    log_probs = tf.nn.log_softmax(W_gen)

    # Sample a category directly and keep its log-probability, which is the
    # term the REINFORCE loss differentiates.
    policy = Categorical(log_probs)
    index = policy.sample()
    log_prob = policy.log_prob(index)
    sampled_one_hot = tf.one_hot(index, hidden_dim, dtype=tf.float32)

    # Discriminator: constant weights, softmax over its two output columns.
    W_dis = tf.constant(
        [[0.2, 0.], [0.5, 0.9], [0.9, 0.1], [0.1, 1.0]],
        name='W_dis')
    pred = tf.nn.softmax(tf.matmul(sampled_one_hot, W_dis))


with tf.name_scope('reinforce'):
    # Reward is the discriminator's probability for column 1.
    reward = pred[:, 1]

    # Baseline: exponential moving average of the batch-mean reward. The
    # average only advances when `maintain_avg_op` is run.
    reward_tracker = tf.train.ExponentialMovingAverage(decay=decay)
    mean_reward = tf.reduce_mean(reward)
    maintain_avg_op = reward_tracker.apply([mean_reward])
    baseline = reward_tracker.average(mean_reward)

    # Advantage of the sampled choice over the running baseline.
    advantage = reward - baseline

    # Hand-written REINFORCE objective: -log p(sample) * advantage. The
    # advantage is wrapped in stop_gradient so it acts as a constant weight
    # and gradients flow only through the log-probability.
    reinforce_loss = -log_prob * tf.stop_gradient(advantage)
    adam = tf.train.AdamOptimizer(lr)
    update_gen_op = adam.minimize(reinforce_loss, var_list=[W_gen])

    # A single op that performs the parameter update and the baseline update.
    train_op = tf.group(update_gen_op, maintain_avg_op)
    

# Train the generator for `num_trials` steps and report progress.
with tf.Session() as sess:
    # Every variable in the graph must be initialised before the first step.
    init_op = tf.initialize_all_variables()
    sess.run(init_op)

    # `range` and the call form of `print` behave the same on Python 2 and
    # Python 3; `xrange` and the `print` statement only work on Python 2.
    for t in range(num_trials):
        # One step: update the generator and refresh the reward baseline.
        sess.run(train_op)
        if t % print_every == 0:
            # NOTE: this is a separate run of the graph, so the category is
            # sampled again; the printed reward/advantage belong to a fresh
            # sample, not to the one used by the training step above.
            print(sess.run([reward, baseline, advantage]))


## Maximum Achievable Reward for this Discriminator

In [None]:
# Evaluate the fixed discriminator on every possible generator choice to see
# which choice earns the largest reward.

# Discriminator weights: one row per generator choice, one column per class.
W_dis = tf.constant([[0.2, 0.], [0.5, 0.9], [0.9, 0.1], [0.1, 1.0]], name='W_dis')
# Identity matrix: row i is the one-hot encoding of choice i.
x = tf.constant([[1,0,0,0],[0,1,0,0],[0,0,1,0],[0,0,0,1]], dtype=tf.float32)

# Row i of z is the discriminator's softmax output for choice i.
z = tf.nn.softmax(tf.matmul(x, W_dis))

with tf.Session() as sess:
    # Column 1 is the reward used by the cells above. For these weights it is
    # approximately [0.450, 0.599, 0.310, 0.711], so the best reachable reward
    # is about 0.711 (choice index 3).
    # The call form of `print` works on both Python 2 and Python 3.
    print(sess.run(z[:, 1]))

# REINFORCE in RNN.
Here we apply REINFORCE to an RNN. As before, the reward to the Generator $G$ at each time step $t$ is simply the probability $p_t$ that the Discriminator $D$ assigns to the sample being real.

In [None]:
import tensorflow as tf
from tensorflow.contrib.distributions import Categorical
# Start from an empty default graph so that re-running this cell does not
# pile new ops on top of the ones built by a previous run.
tf.reset_default_graph()


# Hyper-parameters.
rnn_gen_size = 3    # generator RNN units == number of categories it samples
rnn_dis_size = 2    # discriminator RNN units == number of output classes
time_steps = 5      # length of the unrolled sequence
batch_size = 1
num_trials = 1000   # number of training steps
print_every = 10    # report every this many steps
lr = 0.1            # Adam learning rate
decay = 0.9         # decay of the moving-average reward baseline


# Initial indices: one-hot input fed to the generator at the first time step.
batch_indices = tf.constant([[1., 0., 0.]], dtype=tf.float32)


with tf.name_scope('model'):
    with tf.name_scope('rnn'):
        # Discriminator softmax output for every time step, in order.
        predictions = []

        # The cells and their initial states are created ONCE, before the
        # unrolling loop. Re-creating the zero state inside the loop resets
        # the recurrence at every step, which makes all time steps identical
        # and, with a constant one-hot input, sends gradient to only one row
        # of the generator weight matrix.
        cell_gen = tf.nn.rnn_cell.BasicRNNCell(rnn_gen_size)
        cell_dis = tf.nn.rnn_cell.BasicRNNCell(rnn_dis_size)
        state_gen = cell_gen.zero_state(batch_size, tf.float32)
        state_dis = cell_dis.zero_state(batch_size, tf.float32)

        # Generator input for the current step; replaced by the one-hot
        # sample after each step so the generator conditions on what it
        # emitted previously.
        gen_in = batch_indices

        for t in range(time_steps):
            if t > 0:
                # Share the weights created at t == 0 across all time steps.
                tf.get_variable_scope().reuse_variables()

            with tf.variable_scope('gen'):
                # Advance the generator by one step, carrying its state.
                rnn_out, state_gen = cell_gen(gen_in, state_gen)
                log_probs = tf.nn.log_softmax(rnn_out)

                # Sample this step's category through Bayesflow so that
                # surrogate_loss can locate the sampling node.
                index = tf.contrib.bayesflow.stochastic_graph.DistributionTensor(
                    tf.contrib.distributions.Categorical,
                    logits=log_probs)

                logits = tf.one_hot(index, rnn_gen_size, dtype=tf.float32)
                # The sample is the discriminator's input at this step ...
                rnn_in = logits
                # ... and the generator's input at the next step.
                gen_in = logits

            with tf.variable_scope('dis'):
                # The discriminator weights keep their initial values: the
                # reinforce section only optimises variables whose name
                # starts with 'gen'.
                rnn_out, state_dis = cell_dis(rnn_in, state_dis)
                pred = tf.nn.softmax(rnn_out)
                predictions.append(pred)
    
    
with tf.name_scope('reinforce'):
    # Per-step reward: the discriminator's probability for column 1.
    rewards = [step_pred[:, 1] for step_pred in predictions]

    # Baseline: exponential moving average of the reward averaged over every
    # time step. The average only advances when `maintain_avg_op` is run.
    # TODO: Baseline over all the time points?
    reward_tracker = tf.train.ExponentialMovingAverage(decay=decay)
    stacked_rewards = tf.pack(rewards)
    mean_reward = tf.reduce_mean(stacked_rewards)
    maintain_avg_op = reward_tracker.apply([mean_reward])
    baseline = reward_tracker.average(mean_reward)

    # One loss per time step: the negated advantage over the shared baseline.
    # `reward` and `advantage` are deliberately left bound to the final time
    # step after the loop; the training loop prints them.
    step_losses = []
    for reward in rewards:
        advantage = reward - baseline
        step_losses.append(-advantage)

    # Minimise the surrogate of the per-step losses, updating only the
    # variables whose name starts with 'gen'.
    adam = tf.train.AdamOptimizer(lr)
    surrogate = tf.contrib.bayesflow.stochastic_graph.surrogate_loss(
        step_losses)
    gen_vars = [
        var for var in tf.trainable_variables()
        if var.op.name.startswith('gen')
    ]
    update_gen_op = adam.minimize(surrogate, var_list=gen_vars)

    # A single op that performs the parameter update and the baseline update.
    train_op = tf.group(update_gen_op, maintain_avg_op)
    
    
# Train the generator RNN for `num_trials` steps and report progress.
#
# Debugging hint: if only a single row of the generator weight matrix ever
# changes, check that the generator's input and state really vary across time
# steps. The gradient of a weight row is proportional to the matching entry of
# the concatenated [input, state] vector, so a constant one-hot input combined
# with an all-zero state updates exactly one row.
with tf.Session() as sess:
    # Every variable in the graph must be initialised before the first step.
    init_op = tf.initialize_all_variables()
    sess.run(init_op)

    # `range` and the call form of `print` behave the same on Python 2 and
    # Python 3; `xrange` and the `print` statement only work on Python 2.
    for t in range(num_trials):
        # Uncomment to dump the generator weights before each step:
#         for v in tf.trainable_variables():
#             if v.op.name.startswith('gen'):
#                 print(v.op.name)
#                 print(sess.run(v))

        # One step: update the generator and refresh the reward baseline.
        sess.run(train_op)
        if t % print_every == 0:
            # `reward` and `advantage` refer to the final time step only.
            # NOTE: this is a separate run of the graph, so the sequence is
            # sampled again; the printed values belong to a fresh sample, not
            # to the one used by the training step above.
            print(sess.run([reward, baseline, advantage]))