In [0]:
%tensorflow_version 1.x
import tensorflow as tf
print(tf.__version__)

TensorFlow 1.x selected.
1.15.2


In [0]:
import os
os.chdir('/content/drive/My Drive/')
# Set paths of train and validation data
train_article_path = "Data/gigaword/train/train.article.txt"
train_title_path   = "Data/gigaword/train/train.title.txt"
valid_article_path = "Data/gigaword/train/valid.article.filter.txt"
valid_title_path   = "Data/gigaword/train/valid.title.filter.txt"

# **LOAD DATA**

In [0]:
import re
import string


def getSentList(path, n_sents):
  sentList=[]
  with open(path) as f:
    # using 100 to test the code only
    for line in f.readlines()[:n_sents]:
    #for line in f.readlines():
      # Remove begining and ending space
      sent=line.strip()
      # Replace ## or ##.#, #,## .. by #
      sent=re.sub("#(\W)*","# ",sent) 
      # Remove !#"$%&'()*+,-./:;<=>?@[\]^_`{|}~
      #sent=sent.translate(str.maketrans(' ', ' ', string.punctuation))
      # Add sent to the list
      sentList.append(sent)
  
  return sentList

# Get train_article and train_title
train_article=getSentList(train_article_path,100000)
train_title=getSentList(train_title_path,100000)

# Print some data
print('Length of train data: ',len(train_article),len(train_title))

print('\nThe first five train articles and titles: ')
for i in range(5):
  print('Train article: ',train_article[i])
  print('Train title: ',train_title[i])
  print()


Length of train data:  100000 100000

The first five train articles and titles: 
Train article:  australia 's current account deficit shrunk by a record # billion dollars -lrb- # billion us -rrb- in the june quarter due to soaring commodity prices , figures released monday showed .
Train title:  australian current account deficit narrows sharply

Train article:  at least two people were killed in a suspected bomb attack on a passenger bus in the strife-torn southern philippines on monday , the military said .
Train title:  at least two dead in southern philippines blast

Train article:  australian shares closed down # percent monday following a weak lead from the united states and lower commodity prices , dealers said .
Train title:  australian stocks close down # percent

Train article:  south korea 's nuclear envoy kim sook urged north korea monday to restart work to disable its nuclear plants and stop its `` typical '' brinkmanship in negotiations .
Train title:  envoy urges north k

# **BUILD VOCAB**

In [0]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from nltk import word_tokenize

words=[]
# Need to modify this later
for sent in train_article+train_title:
  for word in word_tokenize(sent):
    words.append(word)
# Print first 15 words
print("\nThe first 15 words: \n",words[:15])
# Need to modify this later
#for sent in train_article+train_title:
#  for word in word_tokenize(sent):
#    if not word in stopwords.words():
#      words.append(word)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.

The first 15 words: 
 ['australia', "'s", 'current', 'account', 'deficit', 'shrunk', 'by', 'a', 'record', '#', 'billion', 'dollars', '-lrb-', '#', 'billion']


In [0]:
import collections
import pickle
from pickle import dump
# Create word2int and int2word dictionary
word_counter = collections.Counter(words).most_common()

word2int = dict()
word2int["<pad>"] = 0
word2int["<unk>"] = 1
word2int["<s>"] = 2
word2int["</s>"] = 3
for word, _ in word_counter:
    word2int[word] = len(word2int)
    
int2word = dict(zip(word2int.values(), word2int.keys()))

print("The length of word2index: " , len(word2int))
print("\nThe word2index: \n" , word2int)
print("\nThe int2word: \n" , int2word)

print("\nsize:", len(int2word))

 
# Save word2int and int2word into pickle file

dump(word2int, open('vocab_w2i_100000_glove.pkl', 'wb'))
dump(int2word, open('vocab_i2w_100000_glove.pkl', 'wb'))

The length of word2index:  50409

The word2index: 

The int2word: 

size: 50409


# **CONVERT SEQ TO INT**

In [0]:
import numpy as np
from tensorflow import keras

# Convert input data from text to int
def get_intSeq(data_list,max_length,padding=False):
  seq_list=list()
  for sent in data_list:
    # Get tokens in each sent
    words=word_tokenize(sent)    
    
    # Use this for train_article
    if(padding):
      # Make all sent to have the same length as max_length
      if(len(words)<max_length):
        words=words + (max_length-len(words))*["<pad>"] 
      else:
        words=words[:max_length]
        
    # Use this for train_title
    else:
      words=words[:(max_length-1)]
    
    # Convert word to its corresponding int value
    # If the word doesnt exist, use the value of "<unk>" by default
    int_seq=[word2int.get(word,word2int["<unk>"]) for word in words]
    
    # Add int_seq to seq_list
    seq_list.append(int_seq)      
  
  return seq_list

# Define the max length of article and title
article_max_len = 45
title_max_len = 15

# Get the sequence of int value
train_article_intSeq=get_intSeq(train_article,article_max_len,padding=True)
train_title_intSeq=get_intSeq(train_title,title_max_len)

print('Train article: ',train_article[0])
print('Train article(int seq): ',train_article_intSeq[0])
print('\nTrain title: ',train_title[0])
print('Train title(int seq): ',train_title_intSeq[0])


Train article:  australia 's current account deficit shrunk by a record # billion dollars -lrb- # billion us -rrb- in the june quarter due to soaring commodity prices , figures released monday showed .
Train article(int seq):  [170, 13, 901, 2477, 771, 18804, 28, 10, 261, 11, 155, 73, 50, 11, 155, 18, 48, 5, 4, 752, 384, 264, 6, 3067, 6239, 61, 8, 710, 465, 32, 534, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Train title:  australian current account deficit narrows sharply
Train title(int seq):  [158, 901, 2477, 771, 7557, 852]


# **GLOVE**

In [0]:
glove_path   = "Data/Glove/glove.6B.300d.txt"
 
# Get the word embedding from Glove
word_emb_glove=dict()
with open(glove_path,encoding="utf8") as f:
  for line in f:
    el=line.split()
    word=el[0]
    emb=[float(val) for val in el[1:]]
    word_emb_glove[word]=emb

In [0]:
import numpy as np
# Sort the int2word
int2word_sorted=sorted(int2word.items())
 
# Get the list of word embedding corresponding to int value in ascending order
word_emb_list=list()
embedding_size=len(word_emb_glove['the'])
for int_val, word in int2word_sorted:
  # Add Glove embedding if it exists
  if(word in word_emb_glove):
    word_emb_list.append(word_emb_glove[word])
  
  # Otherwise, the value of word embedding is 0
  else:
    word_emb_list.append(np.zeros([embedding_size], dtype=np.float32))
    
# Assign random vector to <s>, </s> token
word_emb_list[2] = np.random.normal(0, 1, embedding_size)
word_emb_list[3] = np.random.normal(0, 1, embedding_size)
 
# the final word embedding
word_emb=np.array(word_emb_list)
print(len(word_emb)) 
print("The first 5 word embedding: ")
for i in range(5):
  print(word_emb[i])

50409
The first 5 word embedding: 
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

# **Model Seq2seq**

In [0]:
import tensorflow as tf
from tensorflow.contrib import rnn

class Seq2SeqModel(object):
    def __init__(self,vocab_size, word_embedding, input_len, output_len, params, train=True):
        # Get the vocab size
        self.vocab_size=vocab_size #-> mục đích là gì?

        # Get hyper-parameters from params       
        self.num_layers=params['num_layers']
        self.num_hiddens=params['num_hiddens']    
        self.learning_rate = params['learning_rate']
        self.keep_prob = params['keep_prob']
        self.beam_width = params['beam_width']

        #CNN----------------------------------#
        #self.filters = params['num_filters']
        #self.kernel_size = params['size_filters']
        #CNN----------------------------------#

        # Using BasicLSTMCell as a cell unit
        self.cell=tf.nn.rnn_cell.LSTMCell

        # Define Place holders for the model
        self.batch_size=tf.placeholder(tf.int32,(),name="batch_size")
        self.global_step = tf.Variable(0, trainable=False) # False means not adding the variable to the graph collection 

        # place holders for encoder
        self.inputSeq=tf.placeholder(tf.int32,[None,input_len])
        self.inputSeq_len=tf.placeholder(tf.int32, [None]) # Need to define the Shape as required in tf.contrib.seq2seq.tile_batch

        # place holders for decoder
        self.decoder_input=tf.placeholder(tf.int32,[None,output_len])
        self.decoder_len=tf.placeholder(tf.int32, [None])
        self.decoder_target=tf.placeholder(tf.int32,[None,output_len])

        # Define projection_layer
        self.projection_layer = tf.layers.Dense(self.vocab_size, use_bias=False)

        # Define the Embedding layer
        with tf.name_scope("embedding"):
            self.embeddings=tf.get_variable("embeddings",initializer=tf.constant(word_embedding,dtype=tf.float32))

            # map the int value with its embeddings
            input_emb=tf.nn.embedding_lookup(self.embeddings,self.inputSeq)
            decoder_input_emb=tf.nn.embedding_lookup(self.embeddings,self.decoder_input)

            # Convert from batch_size*seq_len*embedding to seq_len*batch_size*embedding to feed data with timestep      
            # But, we need to set time_major=True during Training
            self.encoder_inputEmb = tf.transpose(input_emb, perm=[1, 0, 2])
            self.decoder_inputEmb = tf.transpose(decoder_input_emb, perm=[1, 0, 2])
            print("Input encoder cnn: ",self.encoder_inputEmb)
            print("Input decoder cnn: ",self.decoder_inputEmb)
            
        # Define the Encoder
        with tf.name_scope("encoder"):      
            # Create RNN Cell for forward and backward direction
            fw_cells=list()
            bw_cells=list()
            for i in range(self.num_layers):
                fw_cell= self.cell(self.num_hiddens)
                bw_cell= self.cell(self.num_hiddens)

                # Add Dropout
                fw_cell=rnn.DropoutWrapper(fw_cell,output_keep_prob=self.keep_prob)
                bw_cell=rnn.DropoutWrapper(bw_cell,output_keep_prob=self.keep_prob)

                # Add cell to the list
                fw_cells.append(fw_cell)
                bw_cells.append(bw_cell)


            # Build a multi bi-directional model from fw_cells and bw_cells
            outputs, encoder_state_fw, encoder_state_bw = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
              cells_fw=fw_cells, cells_bw=bw_cells,inputs=self.encoder_inputEmb,time_major=True, sequence_length=self.inputSeq_len, dtype=tf.float32)

            # The ouput of Encoder (time major)
            self.encoder_outputs=outputs

            # Use the final state of the last layer as encoder_final_state 
            encoder_state_c = tf.concat((encoder_state_fw[-1].c, encoder_state_bw[-1].c), 1)
            encoder_state_h = tf.concat((encoder_state_fw[-1].h, encoder_state_bw[-1].h), 1)
            self.encoder_final_state = rnn.LSTMStateTuple(c=encoder_state_c, h=encoder_state_h)
      
        # Define the Decoder for training
        with tf.name_scope("decoder"):
            # Define Decoder cell
            decoder_num_hiddens =self.num_hiddens * 2 # As we use bi-directional RNN
            decoder_cell=self.cell(decoder_num_hiddens)
      
            # Training mode 
            if(train):
                # Convert from time major to batch major 
                attention_states = tf.transpose(self.encoder_outputs, [1, 0, 2])

                # Decoder with attention      
                attention=tf.contrib.seq2seq.BahdanauAttention(num_units=decoder_num_hiddens, memory=attention_states, memory_sequence_length=self.inputSeq_len,normalize=True)
                attention_decoder_cell= tf.contrib.seq2seq.AttentionWrapper(cell=decoder_cell,attention_mechanism=attention,attention_layer_size=decoder_num_hiddens)

                # Use the final state of encoder as the initial state of the decoder
                decoder_initial_state = attention_decoder_cell.zero_state(dtype=tf.float32, batch_size=self.batch_size)
                decoder_initial_state = decoder_initial_state.clone(cell_state=self.encoder_final_state )

                # Use TrainingHelper to train the Model 
                training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=self.decoder_inputEmb,sequence_length=self.decoder_len, time_major=True)
                decoder = tf.contrib.seq2seq.BasicDecoder(cell=attention_decoder_cell,helper=training_helper,initial_state=decoder_initial_state,output_layer=self.projection_layer)
                logits, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, output_time_major=True,maximum_iterations=output_len)


                # Convert from time major to batch major 
                self.training_logits = tf.transpose(logits.rnn_output, perm=[1, 0, 2])

                # Adding zero to make sure training_logits has shape: [batch_size, sequence_length, num_decoder_symbols]
                self.training_logits = tf.concat([self.training_logits, tf.zeros([self.batch_size, output_len - tf.shape(self.training_logits)[1], self.vocab_size])], axis=1)

            # Inference mode 
            else:
                # Using Beam search
                tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(tf.transpose(self.encoder_outputs, perm=[1, 0, 2]), multiplier=self.beam_width)
                tiled_encoder_final_state=tf.contrib.seq2seq.tile_batch(self.encoder_final_state, multiplier=self.beam_width)
                tiled_inputSeq_len=tf.contrib.seq2seq.tile_batch(self.inputSeq_len, multiplier=self.beam_width)

                # Decoder with attention with Beam search
                attention=tf.contrib.seq2seq.BahdanauAttention(num_units=decoder_num_hiddens, memory=tiled_encoder_outputs, memory_sequence_length=tiled_inputSeq_len,normalize=True)
                attention_decoder_cell= tf.contrib.seq2seq.AttentionWrapper(cell=decoder_cell,attention_mechanism=attention,attention_layer_size=decoder_num_hiddens)

                # Use the final state of encoder as the initial state of the decoder
                decoder_initial_state = attention_decoder_cell.zero_state(dtype=tf.float32, batch_size=self.batch_size * self.beam_width)
                decoder_initial_state = decoder_initial_state.clone(cell_state=tiled_encoder_final_state)

                # Build a Decoder with Beam Search
                beamSearch_decoder=tf.contrib.seq2seq.BeamSearchDecoder(          
                    cell=attention_decoder_cell,
                    embedding=self.embeddings,
                    start_tokens=tf.fill([self.batch_size],tf.constant(2)),
                    end_token=tf.constant(3),
                    initial_state=decoder_initial_state,
                    beam_width=self.beam_width,
                    output_layer=self.projection_layer  
                )

                # Perform dynamic decoding with beamSearch_decoder
                outputs, _ , _ =tf.contrib.seq2seq.dynamic_decode(decoder=beamSearch_decoder,maximum_iterations= output_len,output_time_major=True)

                # Convert from seq_len*batch_size*beam_width to batch_size*beam_width*seq_len
                outputs=tf.transpose(outputs.predicted_ids, perm=[1, 2, 0])

                # Take the first beam (best result) as Decoder ouput 
                self.decoder_outputs=outputs[:,0,:]

        with tf.name_scope("optimization"):
            # Used for Training mode only 
            if(train):
                # Caculate loss value 
                masks = tf.sequence_mask(lengths=self.decoder_len,maxlen=output_len, dtype=tf.float32)         
                self.loss = tf.contrib.seq2seq.sequence_loss(logits=self.training_logits,targets=self.decoder_target,weights=masks)

                # Using AdamOptimizer
                optimizer = tf.train.AdamOptimizer(self.learning_rate)
                # Compute gradient 
                gradients = optimizer.compute_gradients(self.loss)
                # Apply Gradient Clipping 
                gradients_clipping = [(tf.clip_by_value(grad, clip_value_min=-5., clip_value_max=5.), var) for grad, var in gradients if grad is not None]

                # Apply gradients to variables
                self.train_update = optimizer.apply_gradients(gradients_clipping, global_step=self.global_step)
        
print("done")

done


In [0]:
import math
def get_batches(input_data, output_data, batch_size):
    #Convert input and output data from list to numpy array
    input_data=np.array(input_data)
    output_data=np.array(output_data)

    # Number of batches per epoch 
    num_batches_epoch = math.ceil(len(input_data)/batch_size)
    for batch_num in range(num_batches_epoch):
        start_index = batch_num * batch_size
        end_index = min((batch_num + 1) * batch_size, len(input_data))
        yield input_data[start_index:end_index], output_data[start_index:end_index]

# **TRAIN MODEL**

In [0]:
import tensorflow as tf
BATCH_SIZE = 512
# Run the Seq2seq model written in another ipynb file 

# Define hyper-parameters for the Model
params=dict()
params['num_layers']=2
params['num_hiddens']=400
params['learning_rate']=0.001
params['keep_prob']=0.85
params['beam_width']=5

num_epochs=15
early_stop=5 # Stop if there is no improvement after 5 epochese

# Set paths to save the model
checkpoint = "Data/gigaword/train/TSModeltest.ckpt"
import time
start_time=time.time()


tf.reset_default_graph()
with tf.Session() as sess:
    # Create a Seq2seq model
    model=Seq2SeqModel(len(int2word), word_emb, article_max_len, title_max_len, params)
    
    # Initialize all variables
    sess.run(tf.global_variables_initializer())
    min_loss=1000 # To find the minimum loss during training   
    no_impove_count=0 # Count the number of consecutive epoch having no improvement
    
    for epoch in range (num_epochs):  
        # Get batches from training data 
        batches=get_batches(train_article_intSeq,train_title_intSeq,batch_size=BATCH_SIZE) 
    
        # Reset epoch_loss after each epoch 
        epoch_loss=0
        # Interate over batches
        for batch_i,(batch_x, batch_y) in enumerate(batches):
            # The actual length of each sequence in the batch (excluding "<pad>")
            batch_x_len = list(map(lambda seq: len([word_int for word_int in seq if word_int != 0]), batch_x))

            # Decoder input is created by adding <s> to the begining of each output sentence 
            batch_decoder_input = list(map(lambda seq: [word2int["<s>"]] + list(seq), batch_y))

            # The actual length of each Decoder input (excluding "<pad>")
            batch_decoder_len = list(map(lambda seq: len([word_int for word_int in seq if word_int != 0]), batch_decoder_input))

            # The actual ouput of Decoder is created by adding </s> to the begining of each output sentence
            batch_decoder_output = list(map(lambda seq: list(seq) + [word2int["</s>"]], batch_y))

            # Add <pad> to make all input and ouput of Decoder have same length
            batch_decoder_input = list(
                map(lambda seq: seq + (title_max_len - len(seq)) * [word2int["<pad>"]], batch_decoder_input))
            batch_decoder_output = list(
                map(lambda seq: seq + (title_max_len - len(seq)) * [word2int["<pad>"]], batch_decoder_output))

            # Create a train_feed_dict
            train_feed_dict = {
                model.batch_size: len(batch_x),
                model.inputSeq: batch_x,
                model.inputSeq_len: batch_x_len,

                model.decoder_input: batch_decoder_input,
                model.decoder_len: batch_decoder_len,
                model.decoder_target: batch_decoder_output
            }
      
            # Start training the model
            _, step, loss,encoder_outputs = sess.run([model.train_update, model.global_step, model.loss,model.encoder_outputs], feed_dict=train_feed_dict)
            epoch_loss+=loss
      
            # Display loss value of each step
            print("step {0}: loss = {1}".format(step, loss))      
   
        print("Finish epoch",epoch+1 )
        # Averaging the epoch_loss
        epoch_loss=epoch_loss/(batch_i+1)
    
        # Save the model if the epoch_loss is at a new minimum,
        if epoch_loss <= min_loss:
            # Set new minimum loss
            min_loss=epoch_loss
            # Reset the no_impove_count
            no_impove_count=0 
      
            # Save the new model
            saver = tf.train.Saver(tf.global_variables()) 
            saver.save(sess, checkpoint)
      
            print('New model saved, minimum loss:',min_loss,'\n') 
      
        # Early stopping
        else:
            print("No Improvement!",'\n')
            no_impove_count+=1
            if(no_impove_count==early_stop):
                print("Early stopping... Finish training")
                break

end_time=time.time()
training_time=(end_time-start_time)/60
print("\nTraining time (mins): ",training_time )

Input encoder cnn:  Tensor("embedding/transpose:0", shape=(45, ?, 300), dtype=float32)
Input decoder cnn:  Tensor("embedding/transpose_1:0", shape=(15, ?, 300), dtype=float32)
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initia

# **Test Model**

In [0]:
VALID_SENT = 2000
# The hyper-parameters of the Model (should be the same as in Training)
params=dict()
params['num_layers']=2
params['num_hiddens']=400
params['learning_rate']=0.001
params['keep_prob']=0.85
params['beam_width']=5
 
# Path of the saved model
checkpoint = "Data/gigaword/train/TSModeltest.ckpt"
 
# Reset the default graph
tf.reset_default_graph()
 
# Get the first 10 validation articles and validation titles
valid_article=getSentList(valid_article_path,VALID_SENT)
valid_title=getSentList(valid_title_path,VALID_SENT)
print('Length of test data: ',len(valid_article),len(valid_title))

#article_max_len = 45

# Get the sequence of int value
valid_article_intSeq=get_intSeq(valid_article,article_max_len,padding=True)
valid_title_intSeq=get_intSeq(valid_title,title_max_len)
print("len inseq: ",len(valid_article_intSeq),len(valid_title_intSeq)) 

with tf.Session() as sess:  
  # Load saved model 
  # Use Seq2SeqModel to create the same graph as saved model
  loaded_model=Seq2SeqModel(len(int2word), word_emb, article_max_len, title_max_len, params, train=False)
  
  # Load the value of variables in saved model
  saver = tf.train.Saver(tf.global_variables())
  saver.restore(sess, checkpoint)
  
  # Get batches from validation data 
  batches_valid=get_batches(valid_article_intSeq,valid_title_intSeq,batch_size=VALID_SENT) 
  
  # Interate over batches
  for batch_valid_i,(batch_x_valid, batch_y_valid) in enumerate(batches_valid):
    # The actual length of each sequence in the batch (excluding "<pad>")
    batch_x_len_valid = list(map(lambda seq: len([word_int for word_int in seq if word_int != 0]), batch_x_valid))
    
    # Create a feed_dict for validation data
    valid_feed_dict = {
            loaded_model.batch_size: len(batch_x_valid),
            loaded_model.inputSeq: batch_x_valid,
            loaded_model.inputSeq_len: batch_x_len_valid,
        }
    
    # Get the decoder output by Inference
    decoder_outputs=sess.run(loaded_model.decoder_outputs,feed_dict=valid_feed_dict)
    
    # Convert from sequence of int to actual sentence
    output_titles=[]    
    # Loop through each seq in decoder_outputs
    for out_seq in decoder_outputs:  
      out_sent=list()
      for word_int in out_seq:    
        # Convert int to word 
        word=int2word[word_int]  
        # Stop converting when it reach to the end of ouput sentence
        if word == "</s>":
          break
        else:
          out_sent.append(word)
      # Combine list of word to sentence and add this sentence to output_titles
      output_titles.append(" ".join(out_sent))
print(len(decoder_outputs))
      
# Display the results      
for i in range(5):
  print("Article: ",valid_article[i])
  print("Actual title: ",valid_title[i])
  print("Generated title: ",output_titles[i],'\n')

Length of test data:  2000 2000
len inseq:  2000 2000
Input encoder cnn:  Tensor("embedding/transpose:0", shape=(45, ?, 300), dtype=float32)
Input decoder cnn:  Tensor("embedding/transpose_1:0", shape=(15, ?, 300), dtype=float32)
Instructions for updating:
Use `tf.cast` instead.
INFO:tensorflow:Restoring parameters from Data/gigaword/train/TSModeltest.ckpt
2000
Article:  five-time world champion michelle kwan withdrew from the # us figure skating championships on wednesday , but will petition us skating officials for the chance to compete at the # turin olympics .
Actual title:  injury leaves kwan 's olympic hopes in limbo
Generated title:  kwan withdraws from us team 

Article:  us business leaders lashed out wednesday at legislation that would penalize companies for employing illegal immigrants .
Actual title:  us business attacks tough immigration law
Generated title:  us business leaders condemn illegal immigrants 

Article:  general motors corp. said wednesday its us sales fell # 

# **EVALUATION BY ROUGE-N**

In [0]:
!pip install sumeval
from sumeval.metrics.rouge import RougeCalculator
# from sumeval.metrics.bleu import BLEUCalculator

def eval_rouges(refrence_summary,model_summary):
    #refrence_summary = "tokyo shares close up #.## percent"
    #model_summary = "tokyo stocks close up # percent to fresh record high"

    rouge = RougeCalculator(stopwords=True, lang="en")

    rouge_1 = rouge.rouge_n(
                summary=model_summary,
                references=refrence_summary,
                n=1)

    rouge_2 = rouge.rouge_n(
                summary=model_summary,
                references=[refrence_summary],
                n=2)
    
    rouge_l = rouge.rouge_l(
                summary=model_summary,
                references=[refrence_summary])
    
    # You need spaCy to calculate ROUGE-BE
    
    rouge_be = rouge.rouge_be(
                summary=model_summary,
                references=[refrence_summary])

    # bleu = BLEUCalculator()
    # bleu_score = bleu.bleu( summary=model_summary,
    #                     references=[refrence_summary])

    print("ROUGE-1: {}, ROUGE-2: {}, ROUGE-L: {}, ROUGE-BE: {}".format(
       rouge_1, rouge_2, rouge_l, rouge_be
    ).replace(", ", "\n"))
    
    return rouge_1, rouge_2,rouge_l,rouge_be

rouge_1_avg = 0
rouge_2_avg = 0
rouge_l_avg = 0
rouge_be_avg = 0
for i in range(VALID_SENT) :

  # print("Article: ",valid_article[i])
  print("Actual title: ",valid_title[i])
  print("Generated title: ",output_titles[i],'\n')

  rouge_1, rouge_2,rouge_l,rouge_be = eval_rouges(valid_title[i], output_titles[i])
  print("\n")
  rouge_1_avg = rouge_1_avg + rouge_1
  rouge_2_avg = rouge_2_avg + rouge_2
  rouge_l_avg = rouge_l_avg + rouge_l
  rouge_be_avg = rouge_be_avg + rouge_be


print("AVG ROUGE 1 :", rouge_1_avg/VALID_SENT)
print("AVG ROUGE 2 :", rouge_2_avg/VALID_SENT)
print("AVG ROUGE L :", rouge_l_avg/VALID_SENT)
print("AVG ROUGE BE :", rouge_be_avg/VALID_SENT)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
a.cargo=(dobj)=>accept
<BasicElement: cargo-[dobj]->accept>
b.unwanted=(amod)=>sludge
a.sludge=(dobj)=>export
<BasicElement: sludge-[dobj]->export>
b.local=(amod)=>protests
ROUGE-1: 0.18181818181818182
ROUGE-2: 0
ROUGE-L: 0.18181818181818182
ROUGE-BE: 0


Actual title:  russia <unk> iranian leader 's remarks about sharon
Generated title:  russia 's lavrov says he is dead 

b.iranian=(amod)=>leader
ROUGE-1: 0.2222222222222222
ROUGE-2: 0
ROUGE-L: 0.2222222222222222
ROUGE-BE: 0


Actual title:  china 's top women distance runner banned for doping
Generated title:  china 's leading fan banned for failure 

b.leading=(amod)=>fan
b.banned=(acl)=>fan
ROUGE-1: 0.3333333333333333
ROUGE-2: 0
ROUGE-L: 0.3333333333333333
ROUGE-BE: 0


Actual title:  icc decides champions trophy venues discusses zimbabwe crisis
Generated title:  icc to host champions trophy in india 

a.trophy=(dobj)=>host
<BasicElement: trophy-[dobj]->host>
a.venues=