# Backpropagation Through Single *Sampled* Logit.
We test the ability to backpropagate through a single logit.  If gradients are allowed to flow through only one logit, then at a given training iteration only one column of the weight matrix receives a non-zero gradient: the column of weights that contributes to the value of that logit.

In [None]:
import tensorflow as tf
from tensorflow.contrib.distributions import Categorical
tf.reset_default_graph()

# Problem sizes.  `hidden_dim` is the number of logits (categories) that the
# generator produces and from which a single one is sampled.
input_dim, hidden_dim, output_dim = 3, 5, 3

# Optimisation / reporting settings.
lr = 1e-1
num_iterations = 50
print_every = 1

# Constant all-ones input row ([1, input_dim]) and all-ones regression
# target ([1, output_dim]).
x = tf.fill([1, input_dim], 1.)
y = tf.fill([1, output_dim], 1.)


with tf.name_scope('Model'):
    # Generator: a single linear map from the input to `hidden_dim` logits.
    W_gen = tf.Variable(tf.random_uniform([input_dim, hidden_dim]), name='W_gen')
    raw_logits = tf.matmul(x, W_gen)

    # Draw one category from the logits.  The draw itself is wrapped in
    # stop_gradient, so the only differentiable path is the masking below.
    # (For a deterministic run, `index` can be replaced by
    # tf.constant(0, dtype=tf.int32).)
    sample_op = tf.stop_gradient(Categorical(raw_logits).sample(n=1))
    index = tf.squeeze(sample_op)

    # Zero every logit except the sampled one; the zeroed entries contribute
    # no gradient to W_gen.
    one_hot = tf.one_hot(index, hidden_dim, dtype=tf.float32)
    logits = raw_logits * one_hot

    # Discriminator: a linear map from the masked logits to the output.
    W_dis = tf.Variable(tf.random_uniform([hidden_dim, output_dim]), name='W_dis')
    output = tf.matmul(logits, W_dis)


with tf.name_scope('Loss'):
    # Mean squared error between the output and the all-ones target.
    squared_error = tf.squared_difference(output, y)
    loss_op = tf.reduce_mean(squared_error)

with tf.name_scope('Train'):
    # Only the generator weights are trained; W_dis stays at its initial value.
    # NOTE: Adam keeps running moment estimates per weight, so a column that
    # received gradient on an earlier step can keep moving on later steps
    # where its current gradient is zero.
    train_vars = [W_gen]
    optimizer = tf.train.AdamOptimizer(lr)
    train_op = optimizer.minimize(loss_op, var_list=train_vars)
    
    
with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())

    for i in range(num_iterations):
        report = i % print_every == 0

        # W_gen does not depend on the sampling op, so reading it in its own
        # run call is safe.  This is the weight matrix *before* the update.
        weights_before = sess.run(W_gen) if report else None

        # Every sess.run call re-executes the sampling op and therefore draws
        # a new category.  The training step, the loss and the sampled index
        # must be fetched in ONE call so that the reported loss and index are
        # the ones the weight update was actually computed from.
        _, loss, sampled = sess.run([train_op, loss_op, index])

        if report:
            print('Loss at iteration %d: %f' % (i, loss))
            print('Sample: [%d]' % sampled)
            print(weights_before)

    # One more forward pass; this draws a fresh sample of its own.
    print(sess.run(output))


### Analysis
Training, even for the most trivial task imaginable, is very volatile.  This does not appear to be a plausible way to proceed.

### TensorFlow Bug
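This looks mathematically impossible: for a drawn sample $i$, columns $\neq i$ of the generator weight matrix $W_{gen}$ are being updated! It only occurs when the index comes from `Categorical`.  This is filed as [Issue 4074](https://github.com/tensorflow/tensorflow/issues/4074). Note, however, that the code above evaluates the loss, the printed sample, and the training step in separate `sess.run` calls, and each call draws a fresh sample, so the printed index need not be the one the update was computed from; in addition, Adam's moment estimates can keep moving a column that received gradient on an earlier iteration. Either effect can produce this observation without a TensorFlow bug.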


# Distribution vs. One-Hot


## Forward Propagation.

In [76]:
import tensorflow as tf
import numpy as np
from tensorflow.contrib.distributions import Categorical
tf.reset_default_graph()

input_dim = 3
output_dim = 1
num_samples = 250

# Category probabilities.  Used directly as the "soft" input below and as the
# distribution the one-hot inputs are sampled from.
x = tf.constant([[0.2, 0.3, 0.5]], dtype = tf.float32)

with tf.name_scope('dist_input'):
    # Output when the whole probability vector is fed through the weights.
    W_dis = tf.Variable(tf.random_uniform([input_dim, output_dim]), name = 'W_dis')
    output = tf.matmul(x, W_dis)


with tf.name_scope('one_hot_input'):
    # Categorical is parameterised by logits, so the probabilities are passed
    # as log-probabilities.  The distribution is built once from the original
    # `x`; `x` is never rebound, so every draw comes from the same distribution.
    # W_dis is shared simply by reusing the Python variable above.
    sampler = Categorical(tf.log(x))
    outputs = []

    for _ in range(num_samples):
        # Each sample() call adds its own sampling op, giving an independent draw.
        sample_op = sampler.sample(n=1)
        index = tf.squeeze(sample_op)
        one_hot_x = tf.one_hot(index, input_dim, dtype = tf.float32)
        one_hot_x = tf.reshape(one_hot_x, [1, input_dim])
        outputs.append(tf.matmul(one_hot_x, W_dis))


with tf.Session() as sess:
    init_op = tf.initialize_all_variables()
    sess.run(init_op)
    print('Distribution: %f' %  sess.run(tf.squeeze(output)))
    # Average of the one-hot outputs over all draws (evaluated in one run call).
    print('One-hot: %f' % np.average(sess.run(outputs)))


Distribution: 0.800210
One-hot: 0.799524
