In [1]:
import tensorflow as tf
import tensorflow.contrib.eager as tfe
import numpy as np

# Switch TensorFlow to eager execution so the ops in the cells below run
# immediately and return concrete tensors (the demo cell's output shows
# tensors carrying NumPy values).
# NOTE(review): presumably this must run before any other TensorFlow op is
# created, so keep it directly after the imports — confirm against TF docs.
tfe.enable_eager_execution()

In [4]:
class EncoderRNN(object):
    """LSTM encoder over embedded caption tokens, seeded by projected features.

    The constructor builds three pieces: a word-embedding table, a dense layer
    that projects the input features to the initial hidden state, and a
    ``BasicLSTMCell``. ``forward`` unrolls that cell step by step along the
    time axis (axis 1) of the captions.
    """

    def __init__(self, word_to_idx, input_dim=512, hidden_dim=128, wordvec_dim=128):
        """Create the embedding table, the feature projection and the LSTM cell.

        Args:
            word_to_idx: dict mapping each word to its integer id. Must contain
                the key ``'<NULL>'``; ``'<START>'`` and ``'<END>'`` are optional.
            input_dim: kept for interface compatibility; it is not referenced
                anywhere in this class.
            hidden_dim: number of units of the projection layer and of the LSTM cell.
            wordvec_dim: width of each word-embedding row.

        Raises:
            KeyError: if ``word_to_idx`` has no ``'<NULL>'`` entry.
        """
        self.word_to_idx = word_to_idx
        self.idx_to_word = {i: w for w, i in word_to_idx.items()}

        # Ids of the special tokens; <START>/<END> are None when absent.
        self._null = word_to_idx['<NULL>']
        self._start = word_to_idx.get('<START>', None)
        self._end = word_to_idx.get('<END>', None)

        # Size the table by the largest id, not just by the number of words:
        # a vocabulary with gaps (e.g. ids 0, 2, 3) would otherwise contain ids
        # that fall outside the embedding table. For a dense 0..V-1 vocabulary
        # this is identical to len(word_to_idx).
        vocab_size = max(len(word_to_idx), max(word_to_idx.values()) + 1)

        initializer = tf.variance_scaling_initializer(scale=2.0)
        self.w_embed = tfe.Variable(np.random.randn(vocab_size, wordvec_dim), dtype=tf.float32)
        self.proj_layer = tf.layers.Dense(units=hidden_dim, kernel_initializer=initializer)

        self.hidden_dim = hidden_dim
        self.encoder_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=hidden_dim)

    def forward(self, features, captions_in):
        """Run the LSTM over every time step of ``captions_in``.

        Args:
            features: array-like converted to a float32 tensor and projected by
                the dense layer to form the initial hidden state (the demo cell
                passes an ``(N, D)`` array).
            captions_in: array-like of integer token ids, converted to int32,
                with time along axis 1.

        Returns:
            Tuple ``(outputs, cell_states)``: the per-step cell outputs and the
            per-step LSTM cell states ``c``, each stacked along axis 1.
        """
        features = tf.convert_to_tensor(features, dtype=tf.float32)
        captions_in = tf.convert_to_tensor(captions_in, dtype=tf.int32)

        h0 = self.proj_layer(features)
        x = tf.nn.embedding_lookup(self.w_embed, captions_in)

        # Zero cell state with exactly the dtype and shape of h0. This avoids
        # building a float64 NumPy array from a TensorFlow dimension object and
        # relying on implicit casts inside the cell.
        state = tf.nn.rnn_cell.LSTMStateTuple(c=tf.zeros_like(h0), h=h0)

        outputs, cell_states = [], []
        for input_step in tf.unstack(x, axis=1):
            output, state = self.encoder_cell(input_step, state)
            outputs.append(output)
            cell_states.append(state.c)

        outputs = tf.stack(outputs, axis=1)
        cell_states = tf.stack(cell_states, axis=1)
        return outputs, cell_states

In [5]:
# Smoke test for EncoderRNN on small synthetic inputs.
# Seed both NumPy (embedding table) and TensorFlow (dense-kernel initializer)
# so the printed tensors are the same on every Restart & Run All.
np.random.seed(0)
tf.set_random_seed(0)

# N: batch size, D: feature width, W: word-vector width, H: hidden units.
N, D, W, H = 10, 20, 30, 40
word_to_idx = {'<NULL>': 0, 'cat': 2, 'dog': 3}
V = len(word_to_idx)
T = 13  # caption length, including the final token dropped below

# Deterministic inputs: evenly spaced features and token ids cycling through 0..V-1.
features = np.linspace(-0.5, 1.7, num=N*D).reshape(N, D)
captions = (np.arange(N * T) % V).reshape(N, T)

encoder = EncoderRNN(word_to_idx, input_dim=D, hidden_dim=H, wordvec_dim=W)
# Feed every token except the last one; the cell displays (outputs, cell_states).
encoder.forward(features, captions[:, :-1])



(<tf.Tensor: id=709, shape=(10, 12, 40), dtype=float32, numpy=
 array([[[-9.89928544e-02,  3.12763080e-02, -1.22419268e-01, ...,
          -6.30025715e-02, -2.45698527e-01,  1.43239737e-01],
         [-5.51665984e-02,  8.39709193e-02, -1.21819086e-01, ...,
          -1.59578621e-01, -1.67258337e-01,  2.37064317e-01],
         [-4.24535461e-02,  7.19536915e-02,  5.79123907e-02, ...,
          -1.57771453e-01,  9.91187692e-02,  2.49051481e-01],
         ...,
         [-5.05314488e-03,  5.51270135e-02, -2.35493891e-02, ...,
          -2.53058434e-01, -1.49394721e-01,  2.14212447e-01],
         [ 4.51324582e-02,  8.68560523e-02, -8.95369127e-02, ...,
          -3.59478086e-01, -5.55806272e-02,  2.86216408e-01],
         [ 1.22316934e-01,  3.97510380e-02,  1.08376659e-01, ...,
          -3.34810019e-01,  1.80044085e-01,  3.09474766e-01]],
 
        [[ 3.28874104e-02,  7.13383183e-02, -8.38517994e-02, ...,
          -4.87979576e-02,  5.47803938e-02,  1.39418930e-01],
         [ 6.63946345e-0