# In [0]:
import tensorflow as tf
from tensorflow.contrib import rnn

class Seq2SeqModel(object):
  """Attention-based sequence-to-sequence graph (TensorFlow 1.x, tf.contrib).

  The whole graph is assembled in ``__init__``: a stacked bidirectional LSTM
  encoder feeds a single LSTM decoder wrapped with Bahdanau attention.

  * ``train=True`` builds the teacher-forced decoder and exposes
    ``training_logits``, ``loss`` and ``train_update``.
  * ``train=False`` builds a beam-search decoder and exposes
    ``decoder_outputs`` (the first beam of every example).
  """

  # Token ids handed to the beam-search decoder as start / end markers.
  # NOTE(review): presumably these are the vocabulary's start-of-sequence and
  # end-of-sequence ids -- confirm against the vocabulary builder.
  START_TOKEN_ID = 2
  END_TOKEN_ID = 3

  @staticmethod
  def _effective_keep_prob(keep_prob, train):
    """Return the dropout keep probability to use when building the graph.

    Dropout is a training-time regulariser, so the configured ``keep_prob``
    is only honoured when ``train`` is true; in inference mode 1.0 is
    returned, which makes the dropout wrappers a no-op.
    """
    return keep_prob if train else 1.0

  def __init__(self,vocab_size, word_embedding, input_len, output_len, params, train=True):
    """Build the model graph.

    Args:
      vocab_size: number of units of the output projection layer.
      word_embedding: initial value of the embedding matrix; it is converted
        to a float32 constant and used as the variable initializer.
        # NOTE(review): presumably shaped [vocab_size, embedding_dim] -- confirm.
      input_len: width of the ``inputSeq`` placeholder.
      output_len: width of the decoder placeholders; also the maximum number
        of decoding iterations.
      params: mapping providing 'num_layers', 'num_hiddens', 'learning_rate',
        'keep_prob' and 'beam_width'.
      train: build the training graph when true, the beam-search inference
        graph otherwise.
    """
    # Get the vocab size
    self.vocab_size = vocab_size

    # Get hyper-parameters from params
    self.num_layers = params['num_layers']
    self.num_hiddens = params['num_hiddens']
    self.learning_rate = params['learning_rate']
    self.keep_prob = params['keep_prob']
    self.beam_width = params['beam_width']

    # Dropout must only be active while training; self.keep_prob keeps the
    # configured value, the graph uses this effective one.
    encoder_keep_prob = self._effective_keep_prob(self.keep_prob, train)

    # LSTMCell is the recurrent unit used by both encoder and decoder
    self.cell = tf.nn.rnn_cell.LSTMCell

    # Define placeholders for the model
    self.batch_size = tf.placeholder(tf.int32, (), name="batch_size")
    # trainable=False keeps the step counter out of the trainable-variables
    # collection, so the optimizer never computes a gradient for it
    self.global_step = tf.Variable(0, trainable=False)

    # Placeholders for the encoder (batch major token ids and their lengths)
    self.inputSeq = tf.placeholder(tf.int32, [None, input_len])
    # The rank-1 shape is declared explicitly because the tensor is later
    # passed to tf.contrib.seq2seq.tile_batch
    self.inputSeq_len = tf.placeholder(tf.int32, [None])

    # Placeholders for the decoder
    self.decoder_input = tf.placeholder(tf.int32, [None, output_len])
    self.decoder_len = tf.placeholder(tf.int32, [None])
    self.decoder_target = tf.placeholder(tf.int32, [None, output_len])

    # Projection from decoder outputs to vocabulary logits
    self.projection_layer = tf.layers.Dense(self.vocab_size, use_bias=False)

    # Define the Embedding layer
    with tf.name_scope("embedding"):
      self.embeddings = tf.get_variable(
          "embeddings", initializer=tf.constant(word_embedding, dtype=tf.float32))

      # Map every token id to its embedding vector
      input_emb = tf.nn.embedding_lookup(self.embeddings, self.inputSeq)
      decoder_input_emb = tf.nn.embedding_lookup(self.embeddings, self.decoder_input)

      # Convert batch_size*seq_len*embedding to seq_len*batch_size*embedding;
      # every consumer below is therefore called with time_major=True
      self.encoder_inputEmb = tf.transpose(input_emb, perm=[1, 0, 2])
      self.decoder_inputEmb = tf.transpose(decoder_input_emb, perm=[1, 0, 2])

    # Define the Encoder
    with tf.name_scope("encoder"):
      # One forward and one backward cell per layer, each wrapped with dropout
      fw_cells = []
      bw_cells = []
      for _ in range(self.num_layers):
        fw_cell = self.cell(self.num_hiddens)
        bw_cell = self.cell(self.num_hiddens)

        fw_cell = rnn.DropoutWrapper(fw_cell, output_keep_prob=encoder_keep_prob)
        bw_cell = rnn.DropoutWrapper(bw_cell, output_keep_prob=encoder_keep_prob)

        fw_cells.append(fw_cell)
        bw_cells.append(bw_cell)

      # Build the stacked bidirectional encoder from fw_cells and bw_cells
      outputs, encoder_state_fw, encoder_state_bw = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
          cells_fw=fw_cells,
          cells_bw=bw_cells,
          inputs=self.encoder_inputEmb,
          time_major=True,
          sequence_length=self.inputSeq_len,
          dtype=tf.float32)

      # The output of the encoder (time major)
      self.encoder_outputs = outputs

      # Concatenate the last layer's forward and backward states to form the
      # encoder final state
      encoder_state_c = tf.concat((encoder_state_fw[-1].c, encoder_state_bw[-1].c), 1)
      encoder_state_h = tf.concat((encoder_state_fw[-1].h, encoder_state_bw[-1].h), 1)
      self.encoder_final_state = rnn.LSTMStateTuple(c=encoder_state_c, h=encoder_state_h)

    # Define the Decoder
    with tf.name_scope("decoder"):
      # The decoder is twice as wide as one encoder direction because its
      # initial state is the concatenated forward/backward encoder state
      decoder_num_hiddens = self.num_hiddens * 2
      decoder_cell = self.cell(decoder_num_hiddens)

      # Training mode
      if train:
        # Attention memory must be batch major
        attention_states = tf.transpose(self.encoder_outputs, [1, 0, 2])

        # Decoder with attention
        attention = tf.contrib.seq2seq.BahdanauAttention(
            num_units=decoder_num_hiddens,
            memory=attention_states,
            memory_sequence_length=self.inputSeq_len,
            normalize=True)
        attention_decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
            cell=decoder_cell,
            attention_mechanism=attention,
            attention_layer_size=decoder_num_hiddens)

        # Use the final state of the encoder as the initial decoder cell state
        decoder_initial_state = attention_decoder_cell.zero_state(
            dtype=tf.float32, batch_size=self.batch_size)
        decoder_initial_state = decoder_initial_state.clone(cell_state=self.encoder_final_state)

        # Teacher forcing: TrainingHelper feeds the embedded decoder inputs
        training_helper = tf.contrib.seq2seq.TrainingHelper(
            inputs=self.decoder_inputEmb, sequence_length=self.decoder_len, time_major=True)
        decoder = tf.contrib.seq2seq.BasicDecoder(
            cell=attention_decoder_cell,
            helper=training_helper,
            initial_state=decoder_initial_state,
            output_layer=self.projection_layer)
        logits, _, _ = tf.contrib.seq2seq.dynamic_decode(
            decoder, output_time_major=True, maximum_iterations=output_len)

        # Convert from time major to batch major
        batch_major_logits = tf.transpose(logits.rnn_output, perm=[1, 0, 2])

        # Pad with zeros along the time axis so the logits always have
        # output_len steps and line up with decoder_target in the loss
        time_padding = tf.zeros(
            [self.batch_size, output_len - tf.shape(batch_major_logits)[1], self.vocab_size])
        self.training_logits = tf.concat([batch_major_logits, time_padding], axis=1)

      # Inference mode
      else:
        # Beam search needs every per-example tensor repeated beam_width times
        tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(
            tf.transpose(self.encoder_outputs, perm=[1, 0, 2]), multiplier=self.beam_width)
        tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(
            self.encoder_final_state, multiplier=self.beam_width)
        tiled_inputSeq_len = tf.contrib.seq2seq.tile_batch(
            self.inputSeq_len, multiplier=self.beam_width)

        # Decoder with attention over the tiled encoder outputs
        attention = tf.contrib.seq2seq.BahdanauAttention(
            num_units=decoder_num_hiddens,
            memory=tiled_encoder_outputs,
            memory_sequence_length=tiled_inputSeq_len,
            normalize=True)
        attention_decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
            cell=decoder_cell,
            attention_mechanism=attention,
            attention_layer_size=decoder_num_hiddens)

        # Use the (tiled) final state of the encoder as the initial decoder state
        decoder_initial_state = attention_decoder_cell.zero_state(
            dtype=tf.float32, batch_size=self.batch_size * self.beam_width)
        decoder_initial_state = decoder_initial_state.clone(cell_state=tiled_encoder_final_state)

        # Build a decoder with beam search
        beamSearch_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
            cell=attention_decoder_cell,
            embedding=self.embeddings,
            start_tokens=tf.fill([self.batch_size], tf.constant(self.START_TOKEN_ID)),
            end_token=tf.constant(self.END_TOKEN_ID),
            initial_state=decoder_initial_state,
            beam_width=self.beam_width,
            output_layer=self.projection_layer
        )

        # Perform dynamic decoding with beamSearch_decoder
        outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
            decoder=beamSearch_decoder, maximum_iterations=output_len, output_time_major=True)

        # Convert from seq_len*batch_size*beam_width to batch_size*beam_width*seq_len
        outputs = tf.transpose(outputs.predicted_ids, perm=[1, 2, 0])

        # Take the first beam as the decoder output
        self.decoder_outputs = outputs[:, 0, :]

    with tf.name_scope("optimization"):
      # Used for training mode only
      if train:
        # Mask out positions beyond each target's length when averaging the loss
        masks = tf.sequence_mask(lengths=self.decoder_len, maxlen=output_len, dtype=tf.float32)
        self.loss = tf.contrib.seq2seq.sequence_loss(
            logits=self.training_logits, targets=self.decoder_target, weights=masks)

        # Using AdamOptimizer
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        # Compute gradients
        gradients = optimizer.compute_gradients(self.loss)
        # Clip every gradient element to [-5, 5]; variables without a
        # gradient are skipped
        gradients_clipping = [
            (tf.clip_by_value(grad, clip_value_min=-5., clip_value_max=5.), var)
            for grad, var in gradients if grad is not None]

        # Apply gradients to variables and advance global_step
        self.train_update = optimizer.apply_gradients(gradients_clipping, global_step=self.global_step)