In [1]:
import tensorflow as tf
import tensorflow.contrib.eager as tfe
import numpy as np

tfe.enable_eager_execution()

In [2]:
def temporal_softmax_loss(x, y, mask, verbose=False):
    N, T, V = x.get_shape().as_list()

    x_flat = tf.reshape(x, [N * T, V]) # x.reshape(N * T, V)
    y_flat = tf.reshape(y, [N * T]) # y.reshape(N * T)
    mask_flat = tf.reshape(mask, [N * T]) # mask.reshape(N * T)
    mask_flat = tf.cast(mask_flat, dtype=tf.float32)

    probs = tf.exp(x_flat - tf.reduce_max(x_flat, axis=1, keepdims=True))
    probs /= tf.reduce_sum(probs, axis=1, keepdims=True)
    probs = tf.gather_nd(probs, tf.stack((tf.range(N * T), y_flat), -1))
    loss = -tf.reduce_sum(mask_flat * tf.log(probs)) / N

    return loss


class CaptioningRNN(object):
    def __init__(self, word_to_idx, hidden_dim=128, wordvec_dim=128):
        self.word_to_idx = word_to_idx
        self.idx_to_word = {i: w for w, i in word_to_idx.items()}
        vocab_size = len(word_to_idx)
        
        self._null = word_to_idx['<NULL>']
        self._start = word_to_idx.get('<START>', None)
        self._end = word_to_idx.get('<END>', None)
        
        # initialize layers
        initializer = tf.variance_scaling_initializer(scale=2.0)
        self.w_embed = tfe.Variable(np.random.randn(vocab_size, wordvec_dim), dtype=tf.float32)
        self.proj_layer = tf.layers.Dense(units=hidden_dim, kernel_initializer=initializer)
        
        self.hidden_dim = hidden_dim
        self.encoder_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=hidden_dim)
#         self.encoder_cell = tf.keras.layers.LSTMCell(units=hidden_dim)
        
        self.vocab_layer = tf.layers.Dense(units=vocab_size, kernel_initializer=initializer)
    
    
    def get_loss(self, features, captions):
        features = tf.convert_to_tensor(features, dtype=tf.float32)
        captions_in = tf.convert_to_tensor(captions[:, :-1], dtype=tf.int32)
        captions_out = tf.convert_to_tensor(captions[:, 1:], dtype=tf.int32)
        mask = tf.not_equal(captions_out, self._null)
        
        h0 = self.proj_layer(features)
        x = tf.nn.embedding_lookup(self.w_embed, captions_in)
        
        state = tf.nn.rnn_cell.LSTMStateTuple(c=np.zeros((features.shape[0].value, self.hidden_dim)), h=h0)
        timestep_x = tf.unstack(x, axis=1)
        outputs, cell_states = [], []
        
#         print("features shape: {}".format(features.shape))
#         print("captions_in shape: {}".format(captions_in.shape))
#         print("h0 shape: {}".format(h0.shape))
#         print("timestep_x shape: {}".format(timestep_x[0].shape))
#         print("x shape: {}".format(x.shape))
        
        for input_step in timestep_x:
#             print('input_step shape is {}'.format(input_step.shape))
            output, state = self.encoder_cell(input_step, state)
            
            outputs.append(output)
            cell_states.append(state[0])
        
        outputs = tf.stack(outputs, axis=1)
        cell_states = tf.stack(cell_states, axis=1)
        
        scores = self.vocab_layer(outputs)
        loss = temporal_softmax_loss(scores, captions_out, mask)

        return loss
    
    
    def sample(self, features, max_length=30):
        N = features.shape[0]
        captions = self._null * np.ones((N, max_length))
        captions = captions.astype(np.int32)
        
        
        h0 = self.proj_layer(features)
        x = tf.nn.embedding_lookup(self.w_embed, self._start * np.ones(N, dtype=np.int32))
        state = tf.nn.rnn_cell.LSTMStateTuple(c=np.zeros((N, self.hidden_dim)), h=h0)
        
#         print("caption type: {}".format(captions.dtype))
#         print("features shape: {}".format(features.shape))
#         print("h0 shape: {}".format(h0.shape))
#         print("x shape: {}".format(x.shape))
        
        for t in range(max_length):
            output, state = self.encoder_cell(x, state)
            scores = self.vocab_layer(output)
            
#             print("argmax type: {}".format(tf.argmax(scores, axis=1).dtype))
#             print("caption type: {}".format(captions[:, t].dtype))
            
            captions[:, t] += tf.cast(tf.argmax(scores, axis=1), tf.int32)
            x = tf.nn.embedding_lookup(self.w_embed, captions[:, t])
        
        return captions

In [3]:
N, D, W, H = 10, 20, 30, 40
word_to_idx = {'<NULL>': 0, 'cat': 2, 'dog': 3, '<START>': 4, '<END>': 5}
V = len(word_to_idx)
T = 13

features = np.linspace(-0.5, 1.7, num=N*D).reshape(N, D)
captions = (np.arange(N * T) % V).reshape(N, T).astype(np.int32)

model = CaptioningRNN(word_to_idx, hidden_dim=H, wordvec_dim=W)

optimizer = tf.train.GradientDescentOptimizer(5e-3)
# encoder.loss(features, captions)
for e in range(500):
    optimizer.minimize(lambda: model.get_loss(features, captions))
    
    if e % 50 == 0:
        l = model.get_loss(features, captions)
        print("loss {}".format(l))

Instructions for updating:
This class is deprecated, please use tf.nn.rnn_cell.LSTMCell, which supports all the feature this cell currently has. Please replace the existing code with tf.nn.rnn_cell.LSTMCell(name='basic_lstm_cell').
loss 14.411775588989258
loss 6.395439147949219
loss 2.9432342052459717
loss 1.5759485960006714
loss 0.9916241765022278
loss 0.6958798766136169
loss 0.5248154401779175
loss 0.41589289903640747
loss 0.3415193557739258
loss 0.2880096733570099


In [5]:
model.sample(features).shape

(10, 30)