In [1]:
# Mount Google Drive into the Colab file system at /content/drive.
# The call is interactive: it asks for an authorization code before mounting.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Move into the project folder on the mounted Drive; every path below is relative to it
import os
os.chdir('/content/drive/My Drive/Deep learning with Colab')

# Folder holding the sentence-summarization corpus and the vocabulary pickles
DATA_DIR = "Data/Harvardnlp_sent_summary"

# Training pairs (one article / one title per line)
train_article_path = DATA_DIR + "/train.article.txt"
train_title_path   = DATA_DIR + "/train.title.txt"

# Validation pairs
valid_article_path = DATA_DIR + "/valid.article.filter.txt"
valid_title_path   = DATA_DIR + "/valid.title.filter.txt"

# Pickled vocabulary lookups (word -> id and id -> word)
word2int_path   = DATA_DIR + "/word2int.pickle"
int2word_path   = DATA_DIR + "/int2word.pickle"

In [3]:
import re

def getSentList(path, n_sents):
  """Read the first `n_sents` lines of a text file and normalize number masks.

  Args:
    path: Path of a text file with one sentence per line.
    n_sents: Maximum number of lines to read from the start of the file.
      None reads the whole file. A negative value keeps its list-slicing
      meaning (all lines except the last `-n_sents`).

  Returns:
    A list of str, one per line read. Each sentence has its surrounding
    whitespace removed, and every '#' together with the run of non-word
    characters that follows it is replaced by '# '.
  """
  from itertools import islice

  # Raw string: "\W" in a normal string literal is an invalid escape sequence
  mask_pattern=re.compile(r"#(\W)*")

  sentList=[]
  with open(path) as f:
    if n_sents is not None and n_sents<0:
      # Negative counts need the total number of lines, so the file must be read fully
      lines=f.readlines()[:n_sents]
    else:
      # Stream the file lazily so that only the requested prefix is ever read
      lines=islice(f,n_sents)

    for line in lines:
      # Remove beginning and ending whitespace
      sent=line.strip()

      # Replace ## or ##.#, #,## .. by #
      sent=mask_pattern.sub("# ",sent)

      # Add sent to the list
      sentList.append(sent)

  return sentList

# Number of lines read from the start of each training file
n_train_sents=100000

# Get train_article and train_title
train_article=getSentList(train_article_path,n_train_sents)
train_title=getSentList(train_title_path,n_train_sents)

# Articles and titles are paired by position, so both lists must have the same length
assert len(train_article)==len(train_title), "train article/title files are not aligned"

# Print some data
print('Length of train data: ',len(train_article),len(train_title))

print('\nThe first five train articles and titles: ')
# zip over slices also works when fewer than five pairs were loaded
for article,title in zip(train_article[:5],train_title[:5]):
  print('Train article: ',article)
  print('Train title: ',title)
  print()

Length of train data:  100000 100000

The first five train articles and titles: 
Train article:  australia 's current account deficit shrunk by a record # billion dollars -lrb- # billion us -rrb- in the june quarter due to soaring commodity prices , figures released monday showed .
Train title:  australian current account deficit narrows sharply

Train article:  at least two people were killed in a suspected bomb attack on a passenger bus in the strife-torn southern philippines on monday , the military said .
Train title:  at least two dead in southern philippines blast

Train article:  australian shares closed down # percent monday following a weak lead from the united states and lower commodity prices , dealers said .
Train title:  australian stocks close down # percent

Train article:  south korea 's nuclear envoy kim sook urged north korea monday to restart work to disable its nuclear plants and stop its `` typical '' brinkmanship in negotiations .
Train title:  envoy urges north k

In [4]:
import nltk
nltk.download('punkt')

from nltk import word_tokenize

# Flatten the tokens of every article followed by every title into one list
# TODO: Need to modify this later
words=[token for text in train_article+train_title for token in word_tokenize(text)]

# Print first 15 words
print("\nThe first 15 words: \n",words[:15])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.

The first 15 words: 
 ['australia', "'s", 'current', 'account', 'deficit', 'shrunk', 'by', 'a', 'record', '#', 'billion', 'dollars', '-lrb-', '#', 'billion']


In [5]:
import collections
import pickle

# Create word2int and int2word dictionary
# most_common() lists words by descending count, so frequent words get the smaller ids
word_counter = collections.Counter(words).most_common()

# Ids 0-3 are reserved for the special tokens
word2int = dict()
word2int["<pad>"] = 0
word2int["<unk>"] = 1
word2int["<s>"] = 2
word2int["</s>"] = 3
for word, _ in word_counter:
    # setdefault never overwrites an existing entry: a corpus token that happens to equal
    # a special token cannot move it off its reserved id or leave a gap in the id range
    word2int.setdefault(word, len(word2int))

# Inverse lookup: id -> word
int2word = {idx: word for word, idx in word2int.items()}

print("The length of word2index: " , len(word2int))

# Only show a small sample: printing the full dictionaries floods the cell output
preview_size = 10
print("\nThe word2index: \n" , dict(list(word2int.items())[:preview_size]))
print("\nThe int2word: \n" , dict(list(int2word.items())[:preview_size]))

# Save word2int and int2word into pickle file
with open(word2int_path,'wb') as f:
  pickle.dump(word2int,f)

with open(int2word_path,'wb') as f:
  pickle.dump(int2word,f)

The length of word2index:  50409

The word2index: 

The int2word: 


In [6]:
# Convert input data from text to int
def get_intSeq(data_list,max_length,padding=False,vocab=None,tokenize=None):
  """Convert sentences into lists of vocabulary ids.

  Args:
    data_list: Iterable of sentences (str).
    max_length: Target sequence length.
    padding: If True, every sequence is truncated or right-padded with
      "<pad>" to exactly `max_length` tokens. If False, sequences are only
      truncated, to at most `max_length - 1` tokens.
    vocab: Mapping from token to id; must contain "<unk>". Defaults to the
      notebook-level `word2int`.
    tokenize: Callable turning a sentence into a list of tokens. Defaults to
      the notebook-level `word_tokenize`.

  Returns:
    A list with one list of int ids per input sentence. Tokens missing from
    `vocab` are mapped to the id of "<unk>".
  """
  # Fall back to the notebook globals so that existing calls keep working
  if vocab is None:
    vocab=word2int
  if tokenize is None:
    tokenize=word_tokenize

  # Look the fallback id up once instead of once per token
  unk_id=vocab["<unk>"]

  seq_list=list()
  for sent in data_list:
    # Get tokens in each sent
    words=tokenize(sent)

    # Use this for train_article
    if padding:
      # Make all sent to have the same length as max_length:
      # cut long sentences first, then fill short ones with "<pad>"
      words=words[:max_length]
      words=words + (max_length-len(words))*["<pad>"]

    # Use this for train_title
    else:
      words=words[:(max_length-1)]

    # Convert word to its corresponding int value
    # If the word doesnt exist, use the value of "<unk>" by default
    seq_list.append([vocab.get(word,unk_id) for word in words])

  return seq_list

# Sequence lengths used for the articles and the titles
article_max_len = 45
title_max_len = 15

# Encode the training data: articles are padded to a fixed length, titles are only truncated
train_article_intSeq=get_intSeq(train_article,article_max_len,padding=True)
train_title_intSeq=get_intSeq(train_title,title_max_len)

# Show the first pair both as text and as int sequence
for label,value in (('Train article: ',train_article[0]),
                    ('Train article(int seq): ',train_article_intSeq[0]),
                    ('\nTrain title: ',train_title[0]),
                    ('Train title(int seq): ',train_title_intSeq[0])):
  print(label,value)

Train article:  australia 's current account deficit shrunk by a record # billion dollars -lrb- # billion us -rrb- in the june quarter due to soaring commodity prices , figures released monday showed .
Train article(int seq):  [170, 13, 901, 2477, 771, 18804, 28, 10, 261, 11, 155, 73, 50, 11, 155, 18, 48, 5, 4, 752, 384, 264, 6, 3067, 6239, 61, 8, 710, 465, 32, 534, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Train title:  australian current account deficit narrows sharply
Train title(int seq):  [158, 901, 2477, 771, 7557, 852]


In [0]:
glove_path = "Data/Word Embedding/glove.6B.50d.txt"

# Get the word embedding from Glove
# Each line is parsed as: token, then the components of its vector, separated by whitespace
word_emb_glove=dict()
# The encoding is given explicitly so that non-ASCII tokens are decoded the same way
# regardless of the platform's default encoding
with open(glove_path, encoding="utf-8") as f:
  for line in f:
    el=line.split()
    # Skip blank lines, which would otherwise fail on el[0]
    if not el:
      continue
    word=el[0]
    emb=[float(val) for val in el[1:]]
    word_emb_glove[word]=emb

In [8]:
import numpy as np
# (id, word) pairs in ascending id order
int2word_sorted=sorted(int2word.items())

# Dimension of the GloVe vectors, read off a word that is known to be present
embedding_size=len(word_emb_glove['the'])

# One embedding per vocabulary id: the GloVe vector when the word has one,
# otherwise an all-zero vector
word_emb_list=[
    word_emb_glove[word] if word in word_emb_glove
    else np.zeros([embedding_size], dtype=np.float32)
    for _, word in int2word_sorted
]

# Assign random vector to <s>, </s> token (ids 2 and 3)
for special_id in (2, 3):
  word_emb_list[special_id] = np.random.normal(0, 1, embedding_size)

# the final word embedding
word_emb=np.array(word_emb_list)

print("The first 5 word embedding: ")
for row_idx in range(5):
  print(word_emb[row_idx])

The first 5 word embedding: 
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0.]
[-0.19950797  1.21899043 -0.07793702 -1.68556162  0.27735388  2.04192758
  3.08245291 -0.46025775 -1.53399205  0.4981773   1.25398646  1.41315859
  0.94553658 -0.7891086   0.47136215 -0.18185557  1.25337172 -0.6111245
  0.88583587  0.43536218  0.21490001  0.18328599  0.12801997 -0.86321295
  1.01573016 -0.78184354  0.82736298  0.88025789 -0.91771294  0.53146997
  0.08238099  1.29622152  0.4149311   1.20241565 -0.47031893 -0.30917514
  0.62004252 -0.14399287  0.06057103 -0.80855648  0.266399    0.18692125
  0.7264479  -0.01846257 -1.07630822 -0.27466848 -1.00979732  0.24081977
  0.38535452  0.89981619]
[ 0.60799937  0.72641368  0.32671662  2.36591841  0.55

In [0]:
import math
def _to_batchable_array(data):
  """Convert a list of sequences to a numpy array that can be sliced along the first axis."""
  try:
    return np.array(data)
  except ValueError:
    # Recent NumPy versions refuse to build an array from sequences of different
    # lengths unless it is an explicit object array; keep one Python object per row
    ragged=np.empty(len(data), dtype=object)
    for idx,item in enumerate(data):
      ragged[idx]=item
    return ragged


def get_batches(input_data, output_data, batch_size):
  """Yield consecutive (input, output) batches covering the data once, in order.

  Args:
    input_data: List of input sequences.
    output_data: List of output sequences, paired with `input_data` by position.
      The sequences may have different lengths.
    batch_size: Maximum number of samples per batch.

  Yields:
    Tuples (input_batch, output_batch) of numpy arrays sliced along the first
    axis. The last batch is smaller when the number of samples is not a
    multiple of `batch_size`. Nothing is yielded for empty data.
  """
  #Convert input and output data from list to numpy array
  input_data=_to_batchable_array(input_data)
  output_data=_to_batchable_array(output_data)

  # Number of batches per epoch
  num_batches_epoch = math.ceil(len(input_data)/batch_size)
  for batch_num in range(num_batches_epoch):
      start_index = batch_num * batch_size
      end_index = min((batch_num + 1) * batch_size, len(input_data))
      yield input_data[start_index:end_index], output_data[start_index:end_index]

### **Training the Model**

In [10]:
# Run the Seq2seq model written in another ipynb file 
# http://itechseeker.com/tutorials/xu-ly-ngon-ngu-tu-nhien-bang-deep-learning/thuc-hanh-viet-ung-dung/xay-dung-mo-hinh-seq2seq-voi-attention-mechanism/
%run 'Seq2seq with Attention.ipynb'

# Define hyper-parameters for the Model
params=dict()
params['num_layers']=2
params['num_hiddens']=150
params['learning_rate']=0.001
params['keep_prob']=0.85
params['beam_width']=10

num_epochs=10
early_stop=5 # Stop if there is no improvement after 5 epochese

# Set paths to save the model
checkpoint = "Saved Models/Text Summarization/TSModel.ckpt"
import time
start_time=time.time()


tf.reset_default_graph()
with tf.Session() as sess:
  # Create a Seq2seq model
  model=Seq2SeqModel(len(int2word), word_emb, article_max_len, title_max_len, params)
  
  # Initialize all variables
  sess.run(tf.global_variables_initializer())
  min_loss=1000 # To find the minimum loss during training   
  no_impove_count=0 # Count the number of consecutive epoch having no improvement
    
  for epoch in range (num_epochs):  
    # Get batches from training data 
    batches=get_batches(train_article_intSeq,train_title_intSeq,batch_size=64) 
    
    # Reset epoch_loss after each epoch 
    epoch_loss=0
    # Interate over batches
    for batch_i,(batch_x, batch_y) in enumerate(batches):
      # The actual length of each sequence in the batch (excluding "<pad>")
      batch_x_len = list(map(lambda seq: len([word_int for word_int in seq if word_int != 0]), batch_x))

      # Decoder input is created by adding <s> to the begining of each output sentence 
      batch_decoder_input = list(map(lambda seq: [word2int["<s>"]] + list(seq), batch_y))

      # The actual length of each Decoder input (excluding "<pad>")
      batch_decoder_len = list(map(lambda seq: len([word_int for word_int in seq if word_int != 0]), batch_decoder_input))

      # The actual ouput of Decoder is created by adding </s> to the begining of each output sentence
      batch_decoder_output = list(map(lambda seq: list(seq) + [word2int["</s>"]], batch_y))

      # Add <pad> to make all input and ouput of Decoder have same length
      batch_decoder_input = list(
          map(lambda seq: seq + (title_max_len - len(seq)) * [word2int["<pad>"]], batch_decoder_input))
      batch_decoder_output = list(
          map(lambda seq: seq + (title_max_len - len(seq)) * [word2int["<pad>"]], batch_decoder_output))

      # Create a train_feed_dict
      train_feed_dict = {
                model.batch_size: len(batch_x),
                model.inputSeq: batch_x,
                model.inputSeq_len: batch_x_len,

                model.decoder_input: batch_decoder_input,
                model.decoder_len: batch_decoder_len,
                model.decoder_target: batch_decoder_output
            }
      
      # Start training the model
      _, step, loss,encoder_outputs = sess.run([model.train_update, model.global_step, model.loss,model.encoder_outputs], feed_dict=train_feed_dict)
      epoch_loss+=loss
      
      # Display loss value of each step
      print("step {0}: loss = {1}".format(step, loss))      
   
    print("Finish epoch",epoch+1 )
    # Averaging the epoch_loss
    epoch_loss=epoch_loss/(batch_i+1)
    
    # Save the model if the epoch_loss is at a new minimum,
    if epoch_loss <= min_loss:
      # Set new minimum loss
      min_loss=epoch_loss
      # Reset the no_impove_count
      no_impove_count=0 
      
      # Save the new model
      saver = tf.train.Saver(tf.global_variables()) 
      saver.save(sess, checkpoint)
      
      print('New model saved, minimum loss:',min_loss,'\n') 
      
    # Early stopping
    else:
      print("No Improvement!",'\n')
      no_impove_count+=1
      if(no_impove_count==early_stop):
        print("Early stopping... Finish training")
        break

end_time=time.time()
training_time=(end_time-start_time)/60
print("\nTraining time (mins): ",training_time )

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
step 1: loss = 10.828413963317871
step 2: loss = 10.778870582580566
step 3: loss = 10.641901016235352
step 4: loss = 10.28805923461914
step 5: loss = 9.692378997802734
step 6: loss = 8.83515453338623
step 7: loss = 7.762962341308594
step 8: loss = 7.404228687286377
step 9: loss = 8.829028129577637
step 10: loss = 8.49134635925293
step 11: loss = 7.970305442810059
step 12: loss = 7.550246715545654
step 13: loss = 

### **Test the model**

In [27]:
# Run the Seq2seq model written in another ipynb file 
# http://itechseeker.com/tutorials/xu-ly-ngon-ngu-tu-nhien-bang-deep-learning/thuc-hanh-viet-ung-dung/xay-dung-mo-hinh-seq2seq-voi-attention-mechanism/
%run 'Seq2seq with Attention.ipynb'

# The hyper-parameters of the Model (should be the same as in Training)
params=dict()
params['num_layers']=2
params['num_hiddens']=150
params['learning_rate']=0.001
params['keep_prob']=0.85
params['beam_width']=10

# Path of the saved model
checkpoint = "Saved Models/Text Summarization/TSModel.ckpt"

# Reset the default graph
tf.reset_default_graph()

# Get the first 10 validation articles and validation titles
valid_article=getSentList(valid_article_path,10)
valid_title=getSentList(valid_title_path,10)

# Get the sequence of int value
valid_article_intSeq=get_intSeq(valid_article,article_max_len,padding=True)
valid_title_intSeq=get_intSeq(valid_title,title_max_len)


with tf.Session() as sess:  
  # Load saved model 
  # Use Seq2SeqModel to create the same graph as saved model
  loaded_model=Seq2SeqModel(len(int2word), word_emb, article_max_len, title_max_len, params, train=False)
  
  # Load the value of variables in saved model
  saver = tf.train.Saver(tf.global_variables())
  saver.restore(sess, checkpoint)
  
  # Get batches from validation data 
  batches_valid=get_batches(valid_article_intSeq,valid_title_intSeq,batch_size=64) 
  
  # Interate over batches
  for batch_valid_i,(batch_x_valid, batch_y_valid) in enumerate(batches_valid):
    # The actual length of each sequence in the batch (excluding "<pad>")
    batch_x_len_valid = list(map(lambda seq: len([word_int for word_int in seq if word_int != 0]), batch_x_valid))
    
    # Create a feed_dict for validation data
    valid_feed_dict = {
            loaded_model.batch_size: len(batch_x_valid),
            loaded_model.inputSeq: batch_x_valid,
            loaded_model.inputSeq_len: batch_x_len_valid,
        }
    
    # Get the decoder output by Inference
    decoder_outputs=sess.run(loaded_model.decoder_outputs,feed_dict=valid_feed_dict)
    
    
    # Convert from sequence of int to actual sentence
    output_titles=[]    
    # Loop through each seq in decoder_outputs
    for out_seq in decoder_outputs:  
      out_sent=list()
      for word_int in out_seq:    
        # Convert int to word 
        word=int2word[word_int]  
        # Stop converting when it reach to the end of ouput sentence
        if word == "</s>":
          break
        else:
          out_sent.append(word)
      # Combine list of word to sentence and add this sentence to output_titles
      output_titles.append(" ".join(out_sent))
      
# Display the results      
for i in range(10):
  print("Article: ",valid_article[i])
  print("Actual title: ",valid_title[i])
  print("Generated title: ",output_titles[i],'\n')

INFO:tensorflow:Restoring parameters from Saved Models/Text Summarization/TSModel.ckpt
1
Article:  five-time world champion michelle kwan withdrew from the # us figure skating championships on wednesday , but will petition us skating officials for the chance to compete at the # turin olympics .
Actual title:  injury leaves kwan 's olympic hopes in limbo
Generated title:  kwan backs us olympic championships 

Article:  us business leaders lashed out wednesday at legislation that would penalize companies for employing illegal immigrants .
Actual title:  us business attacks tough immigration law
Generated title:  us business leaders slam plans for illegal immigrants 

Article:  general motors corp. said wednesday its us sales fell # percent in december and four percent in # with the biggest losses coming from passenger car sales .
Actual title:  gm december sales fall # percent
Generated title:  gm sales down # percent in october 

Article:  several thousand people gathered on wednesday e