In [1]:
# Mount to Google drive
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
# Change the working directory
import os
import re
import nltk
nltk.download('punkt')

from nltk import word_tokenize
import collections
import numpy as np

os.chdir('/content/drive/My Drive/Deep learning with Colab')

# Set paths of train and validation data
train_article_path = "Data/Harvardnlp_sent_summary/train.article.txt"
train_title_path   = "Data/Harvardnlp_sent_summary/train.title.txt"
valid_article_path = "Data/Harvardnlp_sent_summary/valid.article.filter.txt"
valid_title_path   = "Data/Harvardnlp_sent_summary/valid.title.filter.txt"


def getSentList(path, n_sents):
  sentList=[]
  with open(path) as f:
    # using 100 to test the code only
    for line in f.readlines()[:n_sents]:
    #for line in f.readlines():
      # Revove begining and ending space
      sent=line.strip()

      # Replace ## or ##.#, #,## .. by #
      sent=re.sub("#(\W)*","# ",sent)   
      
      # Add sent to the list
      sentList.append(sent)
  
  return sentList

# Get train_article and train_title
train_article=getSentList(train_article_path,100000)
train_title=getSentList(train_title_path,100000)

# Get all words in train_article and train_title
words=[]
# Need to modify this later
for sent in train_article+train_title:
  for word in word_tokenize(sent):
    words.append(word)
    
# Create word2int and int2word dictionary
word_counter = collections.Counter(words).most_common()

word2int = dict()
word2int["<pad>"] = 0
word2int["<unk>"] = 1
word2int["<s>"] = 2
word2int["</s>"] = 3
for word, _ in word_counter:
    word2int[word] = len(word2int)
    
int2word = dict(zip(word2int.values(), word2int.keys()))

# Get the word embedding from Glove
glove_path = "Data/Word Embedding/glove.6B.50d.txt"
word_emb_glove=dict()
with open(glove_path) as f:
  for line in f:
    el=line.split()
    word=el[0]
    emb=[float(val) for val in el[1:]]
    word_emb_glove[word]=emb 
    
# Get the list of word embedding corresponding to int value in ascending order
word_emb_list=list()
embedding_size=len(word_emb_glove['the'])
for i in int2word:
  word=int2word[i]
  # Add Glove embedding if it exists
  if(word in word_emb_glove):
    word_emb_list.append(word_emb_glove[word])
  
  # Otherwise, the value of word embedding is 0
  else:
    word_emb_list.append(np.zeros([embedding_size], dtype=np.float32))
    
# Assign random vector to <s>, </s> token
word_emb_list[2] = np.random.normal(0, 1, embedding_size)
word_emb_list[3] = np.random.normal(0, 1, embedding_size)

# the final word embedding
word_emb=np.array(word_emb_list)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [0]:
def get_trainDatasets(src_data,tgt_data,src_maxlen,tgt_maxlen,word2int,batch_size,num_epochs):
  
  # Create the Dataset
  train_dataset = tf.data.Dataset.from_tensor_slices((src_data,tgt_data))

  # Split sents into words
  train_dataset = train_dataset.map(lambda src, tgt: (tf.string_split([src]).values,tf.string_split([tgt]).values))

  # Truncate src_data and tgt_data to max length
  if src_maxlen:
    train_dataset = train_dataset.map(lambda src, tgt: (src[:src_maxlen],tgt))
  if tgt_maxlen:
    train_dataset = train_dataset.map(lambda src, tgt: (src,tgt[:tgt_maxlen]))

  # Convert sequence of words to sequence of int
  vocab_list =list(word2int.keys())
  mapping_strings_tensor=tf.constant(vocab_list)
  vocab_table=tf.contrib.lookup.index_table_from_tensor(mapping=mapping_strings_tensor, default_value=word2int["<unk>"])
  train_dataset = train_dataset.map(lambda src, tgt: (tf.cast(vocab_table.lookup(src), tf.int32),tf.cast(vocab_table.lookup(tgt), tf.int32)))

  # Create input and output for decoder
  # Decoder input: adding "<s>" to the start of tgt_data
  # Decoder output: adding "</s>" to the end of tgt_data
  train_dataset = train_dataset.map(lambda src, tgt: (src,tf.concat(([word2int["<s>"]],tgt), 0),tf.concat((tgt,[word2int["</s>"]]), 0)))

  # Adding the length of encoder's input and decoder's input
  train_dataset = train_dataset.map(lambda src, tgt_in, tgt_out: (src,tgt_in,tgt_out,tf.size(src),tf.size(tgt_in)))

  # Shuffle the dataset
  train_dataset = train_dataset.shuffle(len(src_data))
  
  # Using padded_batch to create batches
  train_dataset = train_dataset.padded_batch(batch_size,
                                             padded_shapes=(  tf.TensorShape([None]),  # src
                                                              tf.TensorShape([None]),  # tgt_input
                                                              tf.TensorShape([None]),  # tgt_output
                                                              tf.TensorShape([]),  # src_len
                                                              tf.TensorShape([])),  # tgt_len)
                                             padding_values=( word2int["<pad>"],  # src
                                                              word2int["<pad>"],  # tgt_input
                                                              word2int["<pad>"],  # tgt_output
                                                              0,  # src_len -- unused
                                                              0),  # tgt_len -- unused
                                             drop_remainder= True
                                            )
  
  # Repeat the dataset with num_epochs
  train_dataset = train_dataset.repeat(num_epochs)
  
  # Prefetch 10 batches 
  train_dataset = train_dataset.prefetch(10)
  
  return train_dataset
  

In [0]:
def get_testDatasets(src_data,src_maxlen,word2int,batch_size):
  
  # Create the Dataset
  test_dataset = tf.data.Dataset.from_tensor_slices(src_data)

  # Split sents into words
  test_dataset = test_dataset.map(lambda src: tf.string_split([src]).values)

  # Truncate src_data and tgt_data to max length
  if src_maxlen:
    test_dataset = test_dataset.map(lambda src: src[:src_maxlen])

  # Convert sequence of words to sequence of int
  vocab_list =list(word2int.keys())
  mapping_strings_tensor=tf.constant(vocab_list)
  vocab_table=tf.contrib.lookup.index_table_from_tensor(mapping=mapping_strings_tensor, default_value=word2int["<unk>"])
  test_dataset = test_dataset.map(lambda src: tf.cast(vocab_table.lookup(src), tf.int32)) 

  # Adding the length of encoder's input 
  test_dataset = test_dataset.map(lambda src: (src,tf.size(src)))

  # Shuffle the dataset
  test_dataset = test_dataset.shuffle(len(src_data))
  
  # Using padded_batch to create batches
  test_dataset = test_dataset.padded_batch(  batch_size,
                                             padded_shapes=(  tf.TensorShape([None]),  # src                                                   
                                                              tf.TensorShape([])),  # src_len                                                              
                                             padding_values=( word2int["<pad>"],  # src
                                                              0),  # src_len -- unused
                                             drop_remainder= True
                                            )
    
  # Prefetch 10 batches 
  test_dataset = test_dataset.prefetch(10)
  
  return test_dataset

In [0]:
import tensorflow as tf
from tensorflow.contrib import rnn

class Seq2SeqModel(object):
  def __init__(self,iterator, params, batch_size, word_embedding,train=True):
    # Get the vocab size and batch-size
    self.vocab_size=len(word_embedding)
    self.batch_size=batch_size    
    
    # Get hyper-parameters from params       
    self.num_layers=params['num_layers']
    self.num_hiddens=params['num_hiddens']    
    self.learning_rate = params['learning_rate']
    self.keep_prob = params['keep_prob']
    self.beam_width = params['beam_width']
    
    # Using BasicLSTMCell as a cell unit
    self.cell=tf.nn.rnn_cell.LSTMCell  
    self.global_step = tf.Variable(0, trainable=False) # False means not adding the variable to the graph collection 
    
    # Get value from iterator
    if(train):
      encoder_input,decoder_input,decoder_output,inputSeq_len,decoder_len= iterator.get_next()
      
      # Decoder variable
      self.decoder_input=decoder_input
      self.decoder_len=decoder_len
      self.decoder_target=decoder_output    

      # The length of output sequence 
      output_len=tf.to_int32(tf.shape(decoder_output)[1])
    else:
      encoder_input,inputSeq_len= iterator.get_next()
       
    # Encoder variables     
    self.inputSeq=encoder_input
    self.inputSeq_len=inputSeq_len         
    
    # Define projection_layer
    self.projection_layer = tf.layers.Dense(self.vocab_size, use_bias=False)
    
    # Define the Embedding layer
    with tf.name_scope("embedding"):
      self.embeddings=tf.get_variable("embeddings",initializer=tf.constant(word_embedding,dtype=tf.float32))
      
      # map the int value with its embeddings
      input_emb=tf.nn.embedding_lookup(self.embeddings,self.inputSeq)
      
      # Convert from batch_size*seq_len*embedding to seq_len*batch_size*embedding to feed data with timestep      
      # But, we need to set time_major=True during Training      
      self.encoder_inputEmb = tf.transpose(input_emb, perm=[1, 0, 2])
      
      if(train):
        decoder_input_emb=tf.nn.embedding_lookup(self.embeddings,self.decoder_input)
        self.decoder_inputEmb = tf.transpose(decoder_input_emb, perm=[1, 0, 2])
      
    # Define the Encoder
    with tf.name_scope("encoder"):      
      # Create RNN Cell for forward and backward direction
      fw_cells=list()
      bw_cells=list()
      for i in range(self.num_layers):
        fw_cell= self.cell(self.num_hiddens)
        bw_cell= self.cell(self.num_hiddens)
        
        # Add Dropout
        fw_cell=rnn.DropoutWrapper(fw_cell,output_keep_prob=self.keep_prob)
        bw_cell=rnn.DropoutWrapper(bw_cell,output_keep_prob=self.keep_prob)
        
        # Add cell to the list
        fw_cells.append(fw_cell)
        bw_cells.append(bw_cell)
        
        
      # Build a multi bi-directional model from fw_cells and bw_cells
      outputs, encoder_state_fw, encoder_state_bw = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
          cells_fw=fw_cells, cells_bw=bw_cells,inputs=self.encoder_inputEmb,time_major=True, sequence_length=self.inputSeq_len, dtype=tf.float32)
      
      # The ouput of Encoder (time major)
      self.encoder_outputs=outputs
      
      # Use the final state of the last layer as encoder_final_state 
      encoder_state_c = tf.concat((encoder_state_fw[-1].c, encoder_state_bw[-1].c), 1)
      encoder_state_h = tf.concat((encoder_state_fw[-1].h, encoder_state_bw[-1].h), 1)
      self.encoder_final_state = rnn.LSTMStateTuple(c=encoder_state_c, h=encoder_state_h)
      
    # Define the Decoder for training
    with tf.name_scope("decoder"):
      # Define Decoder cell
      decoder_num_hiddens =self.num_hiddens * 2 # As we use bi-directional RNN
      decoder_cell=self.cell(decoder_num_hiddens)
      
      # Training mode 
      if(train):
        # Convert from time major to batch major 
        attention_states = tf.transpose(self.encoder_outputs, [1, 0, 2])
        
         # Decoder with attention      
        attention=tf.contrib.seq2seq.BahdanauAttention(num_units=decoder_num_hiddens, memory=attention_states, memory_sequence_length=self.inputSeq_len,normalize=True)
        attention_decoder_cell= tf.contrib.seq2seq.AttentionWrapper(cell=decoder_cell,attention_mechanism=attention,attention_layer_size=decoder_num_hiddens)

        # Use the final state of encoder as the initial state of the decoder
        decoder_initial_state = attention_decoder_cell.zero_state(dtype=tf.float32, batch_size=self.batch_size)
        decoder_initial_state = decoder_initial_state.clone(cell_state=self.encoder_final_state )

        # Use TrainingHelper to train the Model 
        training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=self.decoder_inputEmb,sequence_length=self.decoder_len, time_major=True)
        decoder = tf.contrib.seq2seq.BasicDecoder(cell=attention_decoder_cell,helper=training_helper,initial_state=decoder_initial_state,output_layer=self.projection_layer)
        logits, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, output_time_major=True,maximum_iterations=output_len)
        
        
        # Convert from time major to batch major 
        self.training_logits = tf.transpose(logits.rnn_output, perm=[1, 0, 2])
        
        # Adding zero to make sure training_logits has shape: [batch_size, sequence_length, num_decoder_symbols]
        self.training_logits = tf.concat([self.training_logits, tf.zeros([self.batch_size, output_len - tf.shape(self.training_logits)[1], self.vocab_size])], axis=1)
     
      # Inference mode 
      else:
        # Using Beam search
        tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(tf.transpose(self.encoder_outputs, perm=[1, 0, 2]), multiplier=self.beam_width)
        tiled_encoder_final_state=tf.contrib.seq2seq.tile_batch(self.encoder_final_state, multiplier=self.beam_width)
        tiled_inputSeq_len=tf.contrib.seq2seq.tile_batch(self.inputSeq_len, multiplier=self.beam_width)

        # Decoder with attention with Beam search
        attention=tf.contrib.seq2seq.BahdanauAttention(num_units=decoder_num_hiddens, memory=tiled_encoder_outputs, memory_sequence_length=tiled_inputSeq_len,normalize=True)
        attention_decoder_cell= tf.contrib.seq2seq.AttentionWrapper(cell=decoder_cell,attention_mechanism=attention,attention_layer_size=decoder_num_hiddens)

        # Use the final state of encoder as the initial state of the decoder
        decoder_initial_state = attention_decoder_cell.zero_state(dtype=tf.float32, batch_size=self.batch_size * self.beam_width)
        decoder_initial_state = decoder_initial_state.clone(cell_state=tiled_encoder_final_state)

        # Build a Decoder with Beam Search
        beamSearch_decoder=tf.contrib.seq2seq.BeamSearchDecoder(          
            cell=attention_decoder_cell,
            embedding=self.embeddings,
            start_tokens=tf.fill([self.batch_size],tf.constant(2)),
            end_token=tf.constant(3),
            initial_state=decoder_initial_state,
            beam_width=self.beam_width,
            output_layer=self.projection_layer  
        )

        # Perform dynamic decoding with beamSearch_decoder
        outputs, _ , _ =tf.contrib.seq2seq.dynamic_decode(decoder=beamSearch_decoder,output_time_major=True)
        
        # Convert from seq_len*batch_size*beam_width to batch_size*beam_width*seq_len
        outputs=tf.transpose(outputs.predicted_ids, perm=[1, 2, 0])
        
        # Take the first beam (best result) as Decoder ouput 
        self.decoder_outputs=outputs[:,0,:]

    with tf.name_scope("optimization"):
      # Used for Training mode only 
      if(train):
        # Caculate loss value 
        masks = tf.sequence_mask(lengths=self.decoder_len,maxlen=output_len, dtype=tf.float32)         
        self.loss = tf.contrib.seq2seq.sequence_loss(logits=self.training_logits,targets=self.decoder_target,weights=masks)

        # Using AdamOptimizer
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        # Compute gradient 
        gradients = optimizer.compute_gradients(self.loss)
        # Apply Gradient Clipping 
        gradients_clipping = [(tf.clip_by_value(grad, clip_value_min=-5., clip_value_max=5.), var) for grad, var in gradients if grad is not None]

        # Apply gradients to variables
        self.train_update = optimizer.apply_gradients(gradients_clipping, global_step=self.global_step)

### **Traing the Model**

In [5]:
# Define hyper-parameters for the Model
params=dict()
params['num_layers']=2
params['num_hiddens']=150
params['learning_rate']=0.001
params['keep_prob']=0.85
params['beam_width']=10

# Define the max length of article and title
article_max_len = 45
title_max_len = 15

num_epochs=10
batch_size=64
tf.reset_default_graph() 

train_dataset=get_trainDatasets(train_article,train_title,article_max_len,title_max_len,word2int,batch_size,num_epochs)
# Create an Initializable iterator
iterator = train_dataset.make_initializable_iterator()

model=Seq2SeqModel(iterator, params, batch_size, word_emb)

num_batches_epoch = len(train_article)//batch_size
early_stop=5 # Stop if there is no improvement after 5 epochs

# Set paths to save the model
checkpoint = "Saved Models/Text Summarization/Using Datasets/TSModel.ckpt"
import time
start_time=time.time()

with tf.Session() as sess:
  # Run initializer
  sess.run(tf.tables_initializer())
  sess.run(iterator.initializer)  
  sess.run(tf.global_variables_initializer())

  min_loss=1000 # To find the minimum loss during training   
  no_impove_count=0 # Count the number of consecutive epoch having no improvement
    
  for epoch in range (num_epochs):  
    # Reset epoch_loss after each epoch 
    epoch_loss=0
    for batch_i in range(num_batches_epoch):
      #print(sess.run(model.outputLength))
      # Start training the model
      _, step, loss,encoder_outputs = sess.run([model.train_update, model.global_step, model.loss,model.encoder_outputs])
      epoch_loss+=loss
      
      # Display loss value of each step
      print("step {0}: loss = {1}".format(step, loss))      
   
    print("Finish epoch",epoch+1 )
    # Averaging the epoch_loss
    epoch_loss=epoch_loss/(batch_i+1)
    
    # Save the model if the epoch_loss is at a new minimum,
    if epoch_loss <= min_loss:
      # Set new minimum loss
      min_loss=epoch_loss
      # Reset the no_impove_count
      no_impove_count=0 
      
      # Save the new model
      saver = tf.train.Saver(tf.global_variables()) 
      saver.save(sess, checkpoint)
      
      print('New model saved, minimum loss:',min_loss,'\n') 
      
    # Early stopping
    else:
      print("No Improvement!",'\n')
      no_impove_count+=1
      if(no_impove_count==early_stop):
        print("Early stopping... Finish training")
        break

end_time=time.time()
training_time=(end_time-start_time)/60
print("\nTraining time (mins): ",training_time )

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
step 1: loss = 10.828153610229492
step 2: loss = 10.78711223602295
step 3: loss = 10.670001983642578
step 4: loss = 10.364364624023438
step 5: loss = 9.87707233428955
step 6: loss = 9.137256622314453
step 7: loss = 8.413293838500977
step 8: loss = 8.200343132019043
step 9: loss = 8.542448997497559
step 10: loss = 8.741886138916016
step 11: loss = 8.175909996032715
s

### **Test the model**

In [19]:
# The hyper-parameters of the Model (should be the same as in Training)
params=dict()
params['num_layers']=2
params['num_hiddens']=150
params['learning_rate']=0.001
params['keep_prob']=0.85
params['beam_width']=10

# Path of the saved model
checkpoint = "Saved Models/Text Summarization/Using Datasets/TSModel.ckpt"

# Reset the default graph
tf.reset_default_graph()

# Get the first 10 validation articles and validation titles
valid_article=getSentList(valid_article_path,100)
valid_title=getSentList(valid_title_path,100)

test_dataset=get_testDatasets(valid_article,title_max_len,word2int,batch_size)

# Create an Initializable iterator
test_iterator = test_dataset.make_initializable_iterator()

# Set paths to the saved model
checkpoint = "Saved Models/Text Summarization/Using Datasets/TSModel.ckpt"

num_batches_epoch = len(valid_article)//batch_size
with tf.Session() as sess:
  # Run initializer
  sess.run(tf.tables_initializer())
  sess.run(test_iterator.initializer)  
  sess.run(tf.global_variables_initializer())
  
  # Load saved model 
  # Use Seq2SeqModel to create the same graph as saved model
  loaded_model=Seq2SeqModel(test_iterator, params, batch_size, word_emb,train=False)
  
  # Load the value of variables in saved model
  saver = tf.train.Saver(tf.global_variables())
  saver.restore(sess, checkpoint)
  
  # Interate over batches
  for batch_i in range(num_batches_epoch):
    
    # Get the decoder output by Inference
    decoder_outputs=sess.run(loaded_model.decoder_outputs)    
    
    # Convert from sequence of int to actual sentence
    output_titles=[]    
    # Loop through each seq in decoder_outputs
    for out_seq in decoder_outputs:  
      out_sent=list()
      for word_int in out_seq:    
        # Convert int to word 
        word=int2word[word_int]  
        # Stop converting when it reach to the end of ouput sentence
        if word == "</s>":
          break
        else:
          out_sent.append(word)
      # Combine list of word to sentence and add this sentence to output_titles
      output_titles.append(" ".join(out_sent))
      
# Display the results      
for i in range(10):
  print("Article: ",valid_article[i])
  print("Actual title: ",valid_title[i])
  print("Generated title: ",output_titles[i],'\n')

INFO:tensorflow:Restoring parameters from Saved Models/Text Summarization/Using Datasets/TSModel.ckpt
Article:  five-time world champion michelle kwan withdrew from the # us figure skating championships on wednesday , but will petition us skating officials for the chance to compete at the # turin olympics .
Actual title:  injury leaves kwan 's olympic hopes in limbo
Generated title:  new zealand and sri lanka look on target 

Article:  us business leaders lashed out wednesday at legislation that would penalize companies for employing illegal immigrants .
Actual title:  us business attacks tough immigration law
Generated title:  final results of key iraqi general elections 

Article:  general motors corp. said wednesday its us sales fell # percent in december and four percent in # with the biggest losses coming from passenger car sales .
Actual title:  gm december sales fall # percent
Generated title:  <unk> of burkina faso 

Article:  several thousand people gathered on wednesday eveni