# Reinforcement Learning GAN
Employ REINFORCE for training the Generator in a GAN formuluation.  As before, the reward to the Generator $G$ is simply the probability of real at each time point, $p_t$, assigned by the Discriminator $D$.

Classic GAN minimization objective,

$$\text{max}_{G_{\theta}} \text{min}_{D_{\theta}} \left[ \text{log}(D(x)) + \text{log}(1. - D(G(z)) \right]$$

In [33]:
import tensorflow as tf
import numpy as np
from tensorflow.contrib.distributions import Categorical

rnn_gen_size = 16
rnn_dis_size = 16
vocab_dim = 3
time_steps = 6
batch_size = 2
num_iterations = 500
gen_iterations = 1
dis_iterations = 1
print_every = 50
lr = 0.01
decay = 0.99

## Generator and Discriminator.
Advantage function without a baseline may perform better for this task.  The hypothesis is that the pace of $D$'s learning can outpace the $G$'s learning.  So for instance, if the predictions from $D$ at a given time step, which we equate to reward $r_t$, decreases rapidly to 0, then the exponential moving average baseline will renormalize and leave little residual reward for a signal.

In [15]:
def score_function_with_mean_baseline(dt, val, rewards, decay = 0.99):
    '''Create a specific score function to pass to DistributionTensors'''
    reward = tf.reduce_mean(tf.add_n(rewards))
    ema = tf.train.ExponentialMovingAverage(decay)
    update_op = ema.apply([reward])
    baseline = ema.average(reward)
#     advantage = tf.stop_gradient(reward - baseline)
    advantage = tf.stop_gradient(reward)
    with tf.control_dependencies([update_op]):
        return dt.distribution.log_prob(val) * advantage


def generator(reuse=False):
    '''Define the Generator graph.'''
    # TODO: Generalize for random input.
    # batch_indices = gen_z
    init_input = tf.zeros([batch_size, vocab_dim])
    
    with tf.variable_scope('gen', reuse=reuse):
        cell_gen = tf.nn.rnn_cell.BasicRNNCell(rnn_gen_size)
        state_gen = cell_gen.zero_state(batch_size, tf.float32)

        with tf.variable_scope('rnn') as vs:
            sequence = []
        
            one_hot = init_input
            for t in xrange(time_steps):        
                if t > 0:
                    tf.get_variable_scope().reuse_variables()
                    
                rnn_in = one_hot
                rnn_out, state_gen = cell_gen(rnn_in, state_gen)
                
                log_probs = tf.nn.log_softmax(
                    tf.contrib.layers.linear(rnn_out, vocab_dim, scope=vs))

                index = tf.contrib.bayesflow.stochastic_graph.DistributionTensor(
                    tf.contrib.distributions.Categorical,
                    logits=log_probs,
                    loss_fn=score_function_with_mean_baseline)
                one_hot = tf.one_hot(index, vocab_dim, dtype=tf.float32)
                sequence.append(one_hot)
    return tf.pack(sequence, axis=1)


def discriminator(sequence, reuse=False):
    '''Define the Discriminator graph.'''
    sequence = tf.unpack(sequence, axis=1)
    with tf.variable_scope('dis', reuse=reuse):    
        cell_dis = tf.nn.rnn_cell.BasicRNNCell(rnn_dis_size)
        state_dis = cell_dis.zero_state(batch_size, tf.float32)

        with tf.variable_scope('rnn') as vs:
            predictions = []

            for t, inp in enumerate(sequence):
                rnn_in = inp
                if t > 0:
                    tf.get_variable_scope().reuse_variables()

                rnn_out, state_dis = cell_dis(rnn_in, state_dis)
                pred = tf.nn.sigmoid(tf.contrib.layers.linear(rnn_out, 1, scope=vs))
                predictions.append(pred)
    return tf.pack(predictions, axis=1)

## Losses for Models.
The reward, which is the taken from the probability of real at each time step, is rescaled to a positive and negative range.

Specifically, the probability of real at each time step is $p_t \in [0,1]$, which we could equate to the reward at each time step $r_t$. We remap this to have rewards in the positive to negative range,

$$r_t \mapsto \left(r_t - 0.5\right) \times f$$

where $f$ is some scale factor. 

In [31]:
def generator_rewards(predictions):
    '''Generator rewards based on Discriminator predictions.'''
    predictions = tf.unpack(tf.squeeze(predictions, [2]), axis=1)
    
    # Modify predictions to have a larger dynamic range.
    predictions = [(p - 0.5) * 10. for p in predictions]
    final_reward = tf.contrib.bayesflow.stochastic_graph.surrogate_loss(predictions)
    return final_reward
    
        
def discriminator_loss(predictions, labels):
    '''Discriminator loss based on predictions and labels'''
    loss = tf.contrib.losses.log_loss(tf.squeeze(predictions, [2]), labels)
    final_loss = tf.reduce_sum(loss) / batch_size / time_steps
    return final_loss

## Generative Adversarial Networks Training.

In [36]:
tf.reset_default_graph()

# Training data.
real_sequence = tf.constant([[[1.,0.,0.],
                              [0.,1.,0.],
                              [0.,0.,1.],
                              [1.,0.,0.],
                              [0.,1.,0.],
                              [0.,0.,1.]],
                             [[1.,0.,0.],
                              [0.,1.,0.],
                              [0.,0.,1.],
                              [1.,0.,0.],
                              [0.,1.,0.],
                              [0.,0.,1.]]], dtype=tf.float32)

real_labels = tf.ones((batch_size, time_steps))
fake_labels = tf.zeros((batch_size, time_steps))

# Generator loss.
fake_sequence = generator()
fake_predictions = discriminator(fake_sequence)
gen_reward = generator_rewards(fake_predictions)

# Discriminator loss.
real_predictions = discriminator(real_sequence, reuse=True)
dis_loss_real = discriminator_loss(real_predictions, real_labels)
dis_loss_fake = discriminator_loss(fake_predictions, fake_labels)
dis_loss = dis_loss_real + dis_loss_fake


with tf.name_scope('train_generator'):
    gen_optimizer = tf.train.AdamOptimizer(lr)
    gen_vars = [v for v in tf.trainable_variables() if v.op.name.startswith('gen')]
    gen_train_op = gen_optimizer.minimize(-gen_reward, var_list = gen_vars)

with tf.name_scope('train_discriminator'):
    dis_optimizer = tf.train.AdamOptimizer(lr)
    dis_vars = [v for v in tf.trainable_variables() if v.op.name.startswith('dis')]
    dis_train_op = dis_optimizer.minimize(dis_loss, var_list = dis_vars)


with tf.Session() as sess:
    init_op = tf.initialize_all_variables()
    sess.run(init_op)
    
    print 'num_iterations', num_iterations
    for t in xrange(num_iterations):
        for i in xrange(dis_iterations):
            _, dis_loss_eval = sess.run([dis_train_op, dis_loss])

        for i in xrange(gen_iterations):
            _, gen_loss_eval = sess.run([gen_train_op, gen_reward])

        if t % print_every == 0:
            print 'Dis loss:', dis_loss_eval
            print 'Gen loss:', gen_loss_eval[0]
                      
    print 'fake:', sess.run([fake_sequence, fake_predictions]), '\n'
    print 'real:', sess.run([real_sequence, real_predictions]), '\n'
#     print sess.run(dis_vars)



num_iterations 500
Dis loss: 0.117277
Gen loss: -9.60725
Dis loss: 0.093221
Gen loss: -13.1313
Dis loss: 0.109244
Gen loss: 1.08446
Dis loss: 0.0611211
Gen loss: 109.48
Dis loss: 0.0436844
Gen loss: -13.6265
Dis loss: 0.0313581
Gen loss: 80.8814
Dis loss: 0.0840758
Gen loss: 192.522
Dis loss: 0.055773
Gen loss: 6.08514
Dis loss: 0.0736771
Gen loss: 0.727464
Dis loss: 0.0969725
Gen loss: -8.6589
fake: [array([[[ 1.,  0.,  0.],
        [ 0.,  1.,  0.],
        [ 0.,  0.,  1.],
        [ 1.,  0.,  0.],
        [ 0.,  1.,  0.],
        [ 0.,  0.,  1.]],

       [[ 0.,  1.,  0.],
        [ 0.,  1.,  0.],
        [ 0.,  0.,  1.],
        [ 1.,  0.,  0.],
        [ 0.,  1.,  0.],
        [ 0.,  0.,  1.]]], dtype=float32), array([[[  4.68854666e-01],
        [  5.11118889e-01],
        [  5.49790025e-01],
        [  5.29724658e-01],
        [  5.52442193e-01],
        [  5.09709775e-01]],

       [[  2.63878014e-02],
        [  1.90954233e-04],
        [  2.97354301e-04],
        [  9.56246164

# TODO
* Generalize for dictionary feed.
* Generalize for random input $z_{gen}$.

### Check Loss Calculation

In [None]:
logits = tf.constant([[[0.5, 0.5, 0.5, 0.5], [0.5, 0.5, 0.5, 0.5]]], dtype=tf.float32)
labels = tf.constant([[0, 0, 0, 0], [0, 0, 0, 0]])

loss = tf.contrib.losses.log_loss(logits, labels)
with tf.Session() as sess:
    print sess.run(loss)


## Real Data.
Create a simple data distribution with sequential structure for testing the GAN.

In [None]:
num_examples = 1000

# Real Sequence (num_examples, time_steps, inp_dim)
real_sequences = []
real_seq = np.array([[1.,0.,0.],[0.,1.,0.],[0.,0.,1.],[1.,1.,1.]])

for i in xrange(num_examples):
    real_sequences.append(real_seq)
real_sequences = np.asarray(real_sequences)

# Real Labels (num_examples, time_steps)
real_labels = np.ones([num_examples, time_steps], dtype=np.int)