<a href="https://colab.research.google.com/github/jayanwana/article-summarizer/blob/master/abstract_summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Abstract Article Summarizer
Generates a summary of any article fed into it.

This summarizer uses a TensorFlow RNN to generate an abstractive summary of any article fed into it.
To test the model, change `test_article_path` to the path of any `.txt` article you wish to summarize.
Known issue: the summarizer currently crashes if the article you wish to summarize contains empty lines. Because the data is cleaned line by line, a workaround has not been found yet.




**Mounting Google Drive**


In [17]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install wget


Collecting wget
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-cp36-none-any.whl size=9681 sha256=2dda0cc1efc7f59847b62f6a544438b281007dcd4a6072641dc963f2445dcee1
  Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


**Importing dependencies**

In [0]:
import tensorflow as tf
from tensorflow.contrib import rnn
import time
import re
import wget
import os
import zipfile
import nltk
from nltk.tokenize import word_tokenize
import collections
import pickle
import numpy as np
from gensim.models.keyedvectors import KeyedVectors
from gensim.test.utils import get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec


In [4]:
# Download NLTK's 'punkt' tokenizer data, needed by word_tokenize.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

**Creating Path Variables**

In [0]:
#default path for the folder inside google drive
# Root folder (inside the mounted Google Drive) that holds all data and outputs.
# NOTE(review): Colab usually exposes a user's files under "/content/drive/My Drive/";
# confirm that this folder really exists once the drive is mounted.
default_path = "/content/drive/Data/"

# Folder, relative to the root, that contains the training and validation files.
_train_dir = default_path + "sumdata/train/"

# Training inputs (articles) and training targets (headlines).
train_article_path = _train_dir + "train.article.txt"
train_title_path = _train_dir + "train.title.txt"

# Article to summarise when running the model in test mode.
test_article_path = default_path + "test/test2.txt"

# Validation inputs (articles) and validation targets (headlines).
valid_article_path = _train_dir + "valid.article.filter.txt"
valid_title_path = _train_dir + "valid.title.filter.txt"


**Download training data**


In [0]:
# Make sure the data folder exists (parent folders are created when missing).
os.makedirs(default_path, exist_ok=True)

# The URL must be a string literal; left unquoted it is a syntax error.
data_url = "https://github.com/dongjun-Lee/text-summarization-tensorflow/raw/master/sample_data.zip"
data_zip_path = os.path.join(default_path, "sample_data.zip")

# Download the training data only when the archive is not already present,
# so that re-running this cell does not fetch it again.
if not os.path.isfile(data_zip_path):
    wget.download(data_url, out=default_path)

# Extract the training data into the data folder.
with zipfile.ZipFile(data_zip_path, "r") as z:
    z.extractall(default_path)

Creating functions to get, clean, and prepare data for the model

In [0]:
def clean_data(sentence):
    """
    Normalise one line of text before it is tokenised.

    The substitutions run in this order:
      1. every run of '#' and/or '.' characters becomes a single '#';
      2. every character that is neither a word character nor whitespace is removed;
      3. underscores are removed (they are word characters, so step 2 keeps them);
      4. every run of whitespace becomes a single space.

    :param sentence: str; the raw line
    :return: str; the cleaned line (edge whitespace is collapsed, not stripped)
    """
    # Raw strings keep regex escapes such as \w and \s away from Python's own
    # string-escape handling, which otherwise flags them as invalid escapes.
    sentence = re.sub(r"[#.]+", "#", sentence)
    # NOTE(review): '#' is neither a word character nor whitespace, so the '#'
    # produced above is removed again here; confirm that this is intended.
    sentence = re.sub(r"[^\w\s\n]", "", sentence)
    sentence = re.sub(r"_", "", sentence)
    sentence = re.sub(r"\s+", " ", sentence)
    return sentence

In [0]:
def get_clean_data_list(data_path, full_data):
    """
    Read a UTF-8 text file and return its non-blank lines, cleaned.

    Each line is stripped first and only then tested for emptiness, so blank
    lines (which still contain their newline character when read) are skipped
    instead of being turned into empty sentences.

    :param data_path: str; path of the text file to read
    :param full_data: Bool; True keeps up to 200000 lines, False keeps up to 50
    :return: list of str; cleaned lines in file order
    """
    max_lines = 200000 if full_data else 50
    cleaned_lines = []
    with open(data_path, "r", encoding="utf-8") as f:
        for line in f:
            text = line.strip()
            if not text:
                # Blank or whitespace-only line: nothing to summarise.
                continue
            cleaned_lines.append(clean_data(text))
            if len(cleaned_lines) >= max_lines:
                # Enough lines collected; no need to read the rest of the file.
                break
    return cleaned_lines

Function that builds a dictionary mapping words to numeric ids, and a reverse dictionary mapping the ids back to words

In [0]:
def build_dict(mode, full_data=True):
    """
    Build (or reload) the vocabulary used by the model.

    In "train" mode the vocabulary is built from the training articles and
    titles and written to ``default_path + "word_dict.pickle"``.
    In "test" mode that pickle file is loaded instead.

    :param mode: str; "train" - build from the training data
                      "test"  - reload the dictionary saved by a training run
    :param full_data: Bool; full data or subset (only used in "train" mode)
    :return:    word_dict (word -> id), reversed_dict (id -> word),
                article_max_len, summary_max_len
    :raises NotImplementedError: if mode is neither "train" nor "test"
    """
    # Reject unknown modes up front; otherwise word_dict would never be assigned
    # and the function would fail later with an unrelated-looking error.
    if mode not in ("train", "test"):
        raise NotImplementedError(
            "unknown mode {!r}; expected 'train' or 'test'".format(mode))

    if mode == "train":
        # load training data
        train_article_list = get_clean_data_list(train_article_path, full_data)
        train_title_list = get_clean_data_list(train_title_path, full_data)

        # count every token of every article and title
        word_counter = collections.Counter()
        for sentence in [*train_article_list, *train_title_list]:
            word_counter.update(word_tokenize(sentence))

        # ids 0-3 are reserved for the special tokens
        word_dict = {
            "<padding>": 0,  # padding for same length sequences
            "<unk>": 1,      # unknown words not in dict
            "<s>": 2,        # beginning of sentence
            "</s>": 3,       # end of sentence
        }
        # remaining ids are handed out from the most to the least frequent word
        for word, _ in word_counter.most_common():
            word_dict[word] = len(word_dict)

        # store the word dict so that test mode can reuse exactly the same ids
        with open(default_path + "word_dict.pickle", "wb") as f:
            pickle.dump(word_dict, f)

    else:
        # NOTE: pickle.load can execute arbitrary code; only load a dictionary
        # file that was written by this notebook.
        with open(default_path + "word_dict.pickle", "rb") as f:
            word_dict = pickle.load(f)

    # switching keys and values in word dict
    reversed_dict = dict(zip(word_dict.values(), word_dict.keys()))

    article_max_len = 100
    summary_max_len = 30

    return word_dict, reversed_dict, article_max_len, summary_max_len


In [0]:
def build_dataset(mode, word_dict, article_max_len, summary_max_len,
                  full_data=True):
    """
    Turn the cleaned text files into lists of word ids for the model.

    :param mode: str;   "train" - read the training articles and titles
                        "test"  - read the test article only
    :param word_dict:   dict; word -> id, must contain "<unk>" and "<padding>"
    :param article_max_len: int; every article is cut / padded to this length
    :param summary_max_len: int; every title is cut to summary_max_len - 1 ids
    :param full_data:   Bool; full data or subset
    :return:    "test":  list of id lists (one per article line)
                "train": (article id lists, title id lists); titles are not padded
    :raises NotImplementedError: if mode is neither "train" nor "test"
    """
    if mode not in ("train", "test"):
        raise NotImplementedError

    training = mode == "train"
    article_path = train_article_path if training else test_article_path
    article_list = get_clean_data_list(article_path, full_data)
    if training:
        title_list = get_clean_data_list(train_title_path, full_data)

    def to_ids(sentence, limit):
        # Tokenise, map unknown words to the <unk> id, keep at most `limit` ids.
        ids = [word_dict.get(token, word_dict["<unk>"]) for token in word_tokenize(sentence)]
        return ids[:limit]

    articles = []
    for sentence in article_list:
        ids = to_ids(sentence, article_max_len)
        # Right-pad short articles with the <padding> id up to article_max_len.
        articles.append(ids + (article_max_len - len(ids)) * [word_dict["<padding>"]])

    if not training:
        return articles

    # One slot is left free in each title for the <s> / </s> marker added later.
    titles = [to_ids(sentence, summary_max_len - 1) for sentence in title_list]
    return articles, titles

In [0]:
def _as_batch_array(sequences):
    """Convert sequences to an ndarray; ragged rows become a 1-D array of objects."""
    try:
        return np.array(sequences)
    except ValueError:
        # Recent NumPy versions refuse to build an array from rows of different
        # lengths (e.g. un-padded titles). Store each row as one object instead.
        ragged = np.empty(len(sequences), dtype=object)
        for index, row in enumerate(sequences):
            ragged[index] = row
        return ragged


def batch_iter(inputs, outputs, batch_size, num_epochs):
    """
    Yield aligned (inputs, outputs) mini-batches, epoch after epoch.

    The data is walked in its given order (no shuffling). The last batch of an
    epoch is smaller when the data size is not a multiple of batch_size.

    :param inputs: sequence of model inputs
    :param outputs: sequence of targets, same length as inputs; rows may differ in length
    :param batch_size: int; maximum number of examples per batch
    :param num_epochs: int; number of passes over the data
    :return: generator of (inputs_batch, outputs_batch) array slices
    """
    inputs = _as_batch_array(inputs)
    outputs = _as_batch_array(outputs)

    # Ceiling division; evaluates to 0 when there is no data at all.
    num_batches_per_epoch = (len(inputs) - 1) // batch_size + 1
    for epoch in range(num_epochs):
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, len(inputs))
            yield inputs[start_index:end_index], outputs[start_index:end_index]

In [0]:
def get_init_embedding(glove, reversed_dict, embedding_size):
    """
    Build the initial embedding matrix from the GloVe 42B / 300d vectors.

    When the GloVe text file is not yet in ``default_path + 'glove/'`` the
    archive is downloaded and extracted there. The text file is then converted
    to word2vec format in a temporary file and loaded.

    :param glove: not used inside this function
    :param reversed_dict: dict(); id -> word
    :param embedding_size: int; length of the zero / random vectors created here
    :return: np.array with one vector per entry of reversed_dict, ordered by id.
             Words without a GloVe vector get zeros; the entries at index 2 and 3
             (<s> and </s>) get random normal vectors.
    """
    archive_url = "https://nlp.stanford.edu/data/wordvecs/glove.42B.300d.zip"
    glove_dir = default_path + 'glove/'
    glove_txt = glove_dir + "glove.42B.300d.txt"

    if not os.path.exists(glove_dir):
        os.mkdir(glove_dir)

    if not os.path.isfile(glove_txt):
        # Download glove vector
        wget.download(archive_url, out=glove_dir)

        # Extract glove file
        archive_path = os.path.join(glove_dir, "glove.42B.300d.zip")
        with zipfile.ZipFile(archive_path, "r") as archive:
            archive.extractall(glove_dir)

    # gensim reads word2vec format, so convert the GloVe file first.
    converted_path = get_tmpfile("word2vec_format.vec")
    glove2word2vec(glove_txt, converted_path)
    print("Loading Glove vectors...")
    vectors = KeyedVectors.load_word2vec_format(converted_path)

    def lookup(token):
        # Words that GloVe does not know start from an all-zero vector.
        try:
            return vectors.word_vec(token)
        except KeyError:
            return np.zeros([embedding_size], dtype=np.float32)

    rows = [lookup(token) for _, token in sorted(reversed_dict.items())]

    # Assign random vector to <s>, </s> token
    rows[2] = np.random.normal(0, 1, embedding_size)
    rows[3] = np.random.normal(0, 1, embedding_size)

    return np.array(rows)

In [0]:
class Model(object):
    """
    Encoder-decoder summarisation graph (TensorFlow 1.x).

    The encoder is a stacked bidirectional LSTM; the decoder is a single LSTM
    cell wrapped with Bahdanau attention. With ``forward_only=False`` the graph
    exposes ``logits``, ``loss`` and ``update`` for training; with
    ``forward_only=True`` it exposes ``prediction`` produced by beam search.
    """

    def __init__(self, reversed_dict, article_max_len, summary_max_len, args, forward_only=False):
        """
        Build the whole graph in the current default graph.

        :param reversed_dict: dict; id -> word, its size is the vocabulary size
        :param article_max_len: int; width of the article placeholder
        :param summary_max_len: int; width of the decoder placeholders and the
                                maximum number of beam-search steps
        :param args: object with embedding_size, num_hidden, num_layers,
                     learning_rate, beam_width, glove and (when training) keep_prob
        :param forward_only: Bool; True builds the inference graph, False the training graph
        """
        # Initialization Block
        self.vocabulary_size = len(reversed_dict)
        self.embedding_size = args.embedding_size
        self.num_hidden = args.num_hidden
        self.num_layers = args.num_layers
        self.learning_rate = args.learning_rate
        self.beam_width = args.beam_width
        # Dropout is only wanted while training; inference keeps every unit.
        if not forward_only:
            self.keep_prob = args.keep_prob
        else:
            self.keep_prob = 1.0
        self.cell = tf.nn.rnn_cell.BasicLSTMCell
        with tf.variable_scope("decoder/projection"):
            self.projection_layer = tf.layers.Dense(self.vocabulary_size, use_bias=False)

        self.batch_size = tf.placeholder(tf.int32, (), name="batch_size")
        self.X = tf.placeholder(tf.int32, [None, article_max_len])
        self.X_len = tf.placeholder(tf.int32, [None])
        self.decoder_input = tf.placeholder(tf.int32, [None, summary_max_len])
        self.decoder_len = tf.placeholder(tf.int32, [None])
        self.decoder_target = tf.placeholder(tf.int32, [None, summary_max_len])
        self.global_step = tf.Variable(0, trainable=False)

        # Embedding Block
        with tf.name_scope("embedding"):
            # Training with GloVe enabled: start from the pre-trained vectors
            if not forward_only and args.glove:
                init_embeddings = tf.constant(get_init_embedding(args.glove, reversed_dict, self.embedding_size), dtype=tf.float32)
            # Otherwise (inference, or GloVe disabled): random initial values
            else:
                init_embeddings = tf.random_uniform([self.vocabulary_size, self.embedding_size], -1.0, 1.0)
            self.embeddings = tf.get_variable("embeddings", initializer=init_embeddings)
            # The transposes put the time axis first (the RNN calls below use time_major=True).
            self.encoder_emb_inp = tf.transpose(tf.nn.embedding_lookup(self.embeddings, self.X), perm=[1, 0, 2])
            self.decoder_emb_inp = tf.transpose(tf.nn.embedding_lookup(self.embeddings, self.decoder_input), perm=[1, 0, 2])

        # Encoding Block
        with tf.name_scope("encoder"):
            fw_cells = [self.cell(self.num_hidden) for _ in range(self.num_layers)]
            bw_cells = [self.cell(self.num_hidden) for _ in range(self.num_layers)]
            # Pass keep_prob explicitly: DropoutWrapper keeps everything by default,
            # so without it the configured dropout would never be applied.
            fw_cells = [rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob) for cell in fw_cells]
            bw_cells = [rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob) for cell in bw_cells]

            encoder_outputs, encoder_state_fw, encoder_state_bw = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
                fw_cells, bw_cells, self.encoder_emb_inp,
                sequence_length=self.X_len, time_major=True, dtype=tf.float32)
            self.encoder_output = tf.concat(encoder_outputs, 2)
            # NOTE(review): index 0 selects the first entry of the returned state
            # lists; confirm that the first (not the last) layer is intended.
            encoder_state_c = tf.concat((encoder_state_fw[0].c, encoder_state_bw[0].c), 1)
            encoder_state_h = tf.concat((encoder_state_fw[0].h, encoder_state_bw[0].h), 1)
            self.encoder_state = rnn.LSTMStateTuple(c=encoder_state_c, h=encoder_state_h)

        # Decoding Block
        with tf.name_scope("decoder"), tf.variable_scope("decoder") as decoder_scope:
            # Twice the encoder size because forward and backward states are concatenated.
            decoder_cell = self.cell(self.num_hidden * 2)

            if not forward_only:
                # Training: feed the reference summary to the decoder.
                attention_states = tf.transpose(self.encoder_output, [1, 0, 2])
                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                    self.num_hidden * 2, attention_states, memory_sequence_length=self.X_len, normalize=True)
                decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism,
                                                                    attention_layer_size=self.num_hidden * 2)
                initial_state = decoder_cell.zero_state(dtype=tf.float32, batch_size=self.batch_size)
                initial_state = initial_state.clone(cell_state=self.encoder_state)
                helper = tf.contrib.seq2seq.TrainingHelper(self.decoder_emb_inp, self.decoder_len, time_major=True)
                decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, helper, initial_state)
                outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, output_time_major=True, scope=decoder_scope)
                self.decoder_output = outputs.rnn_output
                self.logits = tf.transpose(
                    self.projection_layer(self.decoder_output), perm=[1, 0, 2])
                # Pad the time axis with zeros up to summary_max_len so that the
                # logits line up with the decoder_target placeholder.
                self.logits_reshape = tf.concat(
                    [self.logits, tf.zeros([self.batch_size, summary_max_len - tf.shape(self.logits)[1], self.vocabulary_size])], axis=1)
            else:
                # Inference: beam search, so every encoder tensor is tiled beam_width times.
                tiled_encoder_output = tf.contrib.seq2seq.tile_batch(
                    tf.transpose(self.encoder_output, perm=[1, 0, 2]), multiplier=self.beam_width)
                tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(self.encoder_state, multiplier=self.beam_width)
                tiled_seq_len = tf.contrib.seq2seq.tile_batch(self.X_len, multiplier=self.beam_width)
                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                    self.num_hidden * 2, tiled_encoder_output, memory_sequence_length=tiled_seq_len, normalize=True)
                decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism,
                                                                    attention_layer_size=self.num_hidden * 2)
                initial_state = decoder_cell.zero_state(dtype=tf.float32, batch_size=self.batch_size * self.beam_width)
                initial_state = initial_state.clone(cell_state=tiled_encoder_final_state)
                # Token ids 2 and 3 are the <s> and </s> entries of the word dictionary.
                decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                    cell=decoder_cell,
                    embedding=self.embeddings,
                    start_tokens=tf.fill([self.batch_size], tf.constant(2)),
                    end_token=tf.constant(3),
                    initial_state=initial_state,
                    beam_width=self.beam_width,
                    output_layer=self.projection_layer
                )
                outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder, output_time_major=True, maximum_iterations=summary_max_len, scope=decoder_scope)
                self.prediction = tf.transpose(outputs.predicted_ids, perm=[1, 2, 0])

        # Loss Block
        with tf.name_scope("loss"):
            if not forward_only:
                crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.logits_reshape, labels=self.decoder_target)
                # Mask out the positions beyond each target's real length.
                weights = tf.sequence_mask(self.decoder_len, summary_max_len, dtype=tf.float32)
                self.loss = tf.reduce_sum(crossent * weights / tf.to_float(self.batch_size))

                params = tf.trainable_variables()
                gradients = tf.gradients(self.loss, params)
                # Clip by global norm before the Adam update.
                clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
                optimizer = tf.train.AdamOptimizer(self.learning_rate)
                self.update = optimizer.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step)

In [0]:
def train_model():
    """
    Train the summariser and save a checkpoint at the end of every epoch.

    Builds the vocabulary and the training set, builds the training graph and
    runs ``Args.num_epochs`` passes over the data. Checkpoints are written to
    ``default_path + "saved_model/"``. If a global variable named
    ``old_model_checkpoint_path`` exists, training resumes from that checkpoint.
    """
    class Args:
        num_hidden = 150
        num_layers = 2
        beam_width = 10
        glove = True
        embedding_size = 300

        learning_rate = 1e-2
        batch_size = 64
        num_epochs = 10
        keep_prob = 0.8

        full_data = True

        with_model = True

    args = Args()
    os.makedirs(default_path + "saved_model", exist_ok=True)

    print("Building dictionary...")
    # full_data is an attribute of Args, not a local or global name, so it has
    # to be read through `args` (a bare `full_data` fails on a fresh kernel).
    word_dict, reversed_dict, article_max_len, summary_max_len = build_dict("train", args.full_data)
    print("Loading training dataset...")
    train_x, train_y = build_dataset("train", word_dict, article_max_len, summary_max_len, args.full_data)
    print('Done...')
    start = time.perf_counter()
    tf.reset_default_graph()

    with tf.Session() as sess:
        model = Model(reversed_dict, article_max_len, summary_max_len, args)
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables())
        # Optional warm start from a checkpoint path defined in another cell.
        if 'old_model_checkpoint_path' in globals():
            print("Continuing from previous trained model:" , old_model_checkpoint_path , "...")
            saver.restore(sess, old_model_checkpoint_path )

        batches = batch_iter(train_x, train_y, args.batch_size, args.num_epochs)
        num_batches_per_epoch = (len(train_x) - 1) // args.batch_size + 1

        print("\nIteration starts.")
        print("Number of batches per epoch :", num_batches_per_epoch)
        for batch_x, batch_y in batches:
            # Real lengths = number of ids that are not the padding id (0).
            batch_x_len = [len([y for y in x if y != 0]) for x in batch_x]
            # Decoder input is "<s> title", decoder target is "title </s>".
            batch_decoder_input = [[word_dict["<s>"]] + list(x) for x in batch_y]
            batch_decoder_len = [len([y for y in x if y != 0]) for x in batch_decoder_input]
            batch_decoder_output = [list(x) + [word_dict["</s>"]] for x in batch_y]

            # Right-pad both decoder sequences up to summary_max_len.
            batch_decoder_input = [
                d + (summary_max_len - len(d)) * [word_dict["<padding>"]] for d in batch_decoder_input]
            batch_decoder_output = [
                d + (summary_max_len - len(d)) * [word_dict["<padding>"]] for d in batch_decoder_output]

            train_feed_dict = {
                model.batch_size: len(batch_x),
                model.X: batch_x,
                model.X_len: batch_x_len,
                model.decoder_input: batch_decoder_input,
                model.decoder_len: batch_decoder_len,
                model.decoder_target: batch_decoder_output
            }

            _, step, loss = sess.run([model.update, model.global_step, model.loss], feed_dict=train_feed_dict)

            if step % 1000 == 0:
                print("step {0}: loss = {1}".format(step, loss))

            # A whole number of epochs has been completed: save a checkpoint.
            if step % num_batches_per_epoch == 0:
                hours, rem = divmod(time.perf_counter() - start, 3600)
                minutes, seconds = divmod(rem, 60)
                saver.save(sess, default_path + "saved_model/model.ckpt", global_step=step)
                print(" Epoch {0}: Model is saved.".format(step // num_batches_per_epoch),
                      "Elapsed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds) , "\n")


  


In [79]:
# Run the training loop defined above; it reads the training files under default_path.
train_model()

FileNotFoundError: ignored

In [0]:
def test_model():
    """
    Summarise the test article with the latest saved checkpoint.

    Loads the saved word dictionary and the article at ``test_article_path``,
    restores the newest checkpoint from ``default_path + "saved_model/"`` and
    appends one summary per article line to ``default_path + "result.txt"``.

    :raises FileNotFoundError: if no checkpoint has been saved yet
    """
    class Args:
        num_hidden=150
        num_layers=2
        beam_width=10
        glove=True
        embedding_size=300

        learning_rate=1e-2
        batch_size=64
        num_epochs=10
        keep_prob = 0.8

        full_data=True

        with_model=True


    args = Args()
    tf.reset_default_graph()

    print("Loading dictionary...")
    word_dict, reversed_dict, article_max_len, summary_max_len = build_dict("test", args.full_data)
    print("Loading test dataset...")
    valid_x = build_dataset("test", word_dict, article_max_len, summary_max_len, args.full_data)

    with tf.Session() as sess:
        print("Loading saved model...")
        model = Model(reversed_dict, article_max_len, summary_max_len, args, forward_only=True)
        saver = tf.train.Saver(tf.global_variables())
        checkpoint_dir = default_path + "saved_model/"
        ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
        # get_checkpoint_state returns None when the folder holds no checkpoint;
        # fail with a clear message instead of an attribute error on None.
        if ckpt is None or not ckpt.model_checkpoint_path:
            raise FileNotFoundError(
                "No saved model checkpoint found in " + checkpoint_dir + "; run train_model() first.")
        saver.restore(sess, ckpt.model_checkpoint_path)

        # The second argument is only a placeholder: test data has no targets.
        batches = batch_iter(valid_x, [0] * len(valid_x), args.batch_size, 1)

        print("Writing summaries to 'result.txt'...")
        for batch_x, _ in batches:
            # Real lengths = number of ids that are not the padding id (0).
            batch_x_len = [len([y for y in x if y != 0]) for x in batch_x]

            valid_feed_dict = {
                model.batch_size: len(batch_x),
                model.X: batch_x,
                model.X_len: batch_x_len,
            }

            prediction = sess.run(model.prediction, feed_dict=valid_feed_dict)
            # Index 0 on the second axis picks the first beam of every example.
            prediction_output = [[reversed_dict[y] for y in x] for x in prediction[:, 0, :]]
            # Append mode: summaries from earlier batches (and earlier runs) are kept.
            with open(default_path + "result.txt", "a") as f:
                for line in prediction_output:
                    summary = list()
                    for word in line:
                        # Stop at the end-of-sentence marker.
                        if word == "</s>":
                            break
                        # Every word is written at most once per summary.
                        if word not in summary:
                            summary.append(word)
                    print(" ".join(summary), file=f)

        print('Summaries have been generated')



In [80]:
# Generate summaries for the article at test_article_path; needs the saved dictionary and a checkpoint.
test_model()

Loading dictionary...


FileNotFoundError: ignored