<a href="https://colab.research.google.com/github/katrina906/CS6120-Summarization-Project/blob/main/extractive_summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# TODO: other extractive methods
# LSA: https://www.researchgate.net/publication/220195824_Text_summarization_using_Latent_Semantic_Analysis
  # choose the most important sentence from the most important concept, then the most important sentence from the second most important concept (and other methods in article)
    # ensures summary is capturing different aspects of the article 
# score each sentence according to some generated features: https://github.com/xiaoxu193/PyTeaser/blob/master/pyteaser.py
  # other ideas for features: https://medium.com/@umerfarooq_26378/text-summarization-in-python-76c0a41f0dc4
  # not sure we want to go down this rabbit hole - lots of tuning, feature creation, etc. Could just mention it as an alternative method, or compare against the package version as a baseline.
# try Jaccard similarity instead of cosine for text rank

# https://medium.com/@umerfarooq_26378/text-summarization-in-python-76c0a41f0dc4 - also includes abstractive methods
# https://www.machinecurve.com/index.php/2020/12/21/easy-text-summarization-with-huggingface-transformers-and-machine-learning/ - BART auto-regressive for summarization

In [2]:
%%capture
!pip install rouge-score
!pip install fasttext
!pip install compress-fasttext
!pip install gensim==3.8.3
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!unzip glove*.zip

In [3]:
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!unzip glove*.zip
# TODO: read into drive so don't have to wget every time? - in downloads. need to cmd unzip and upload to drive!
  # uploaded as zip in drive. see how fast !unzip is or if need to unzip in drive

In [4]:
import os
import pandas as pd
import numpy as np
import pickle
import string
import re
import sys
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import itertools
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from collections import Counter, OrderedDict
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from rouge_score import rouge_scorer
import gensim
import fasttext
from gensim.models import FastText
import compress_fasttext
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords  
import nltk

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))  
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# Colab-only: mount Google Drive at /content/drive; the dataset and fasttext model
# paths used later in this notebook live under /content/drive/MyDrive/data.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# clean sentences for similarity comparisons; not for final display
# always do this function
# note that some sentence tokenization is messy. Ex: if numbered list, list becomes own sentence.
  # but shouldn't show up as important as summary sentence anyways
def text_cleaning(df):
  """Normalise every article's sentences for similarity comparisons.

  Adds/overwrites columns on `df` in place and returns the same frame:
    - sentences_cleaned: lower-cased sentences with punctuation removed and
      whitespace runs collapsed to a single space
    - drop_sentences: indexes (before filtering) of sentences that are left
      with at most one word, e.g. the "1." of a numbered list
  The dropped indexes are removed from both `sentences_cleaned` and
  `sentences` so the two lists stay index-aligned for summary extraction.

  df: DataFrame with a `sentences` column holding a list of strings per row.
  """
  def normalise(sentence):
    sentence = sentence.lower()
    # raw strings: "\w" / "\s" in a normal literal are invalid escape sequences
    sentence = re.sub(r"[^\w\s]", '', sentence)
    return re.sub(r"\s+", ' ', sentence)

  def keep(column):
    # filter one list-valued column by that row's drop indexes (set lookup, not list scan)
    kept = []
    for items, dropped in zip(df[column], df.drop_sentences):
      dropped = set(dropped)
      kept.append([item for i, item in enumerate(items) if i not in dropped])
    return pd.Series(kept, index = df.index, dtype = object)

  df['sentences_cleaned'] = df.sentences.apply(lambda text: [normalise(sentence) for sentence in text])

  # sentences that only have one word (usually because numbered list etc.)
  df['drop_sentences'] = df.sentences_cleaned.apply(
      lambda text: [i for i, sentence in enumerate(text) if len(sentence.split()) <= 1])

  # drop from both columns: indexes must match for final summary extraction
  for column in ('sentences_cleaned', 'sentences'):
    df[column] = keep(column)

  return df

In [7]:
# cleaning depending on configuration
def text_cleaning_config(df_config, config, stop_words):
  """Apply the configuration-dependent cleaning steps to `sentences_cleaned`.

  Steps are selected by membership in `config`:
    - 'stopwords': remove every token found in `stop_words`
    - 'stem': Porter-stem every token
    - 'lemma': WordNet-lemmatize every token
  Sentences left with at most one token are then dropped from `words`,
  `sentences_cleaned` and `sentences` alike, so all three columns stay
  index-aligned. `drop_sentences` records the dropped indexes.

  df_config: DataFrame with list-valued `sentences` and `sentences_cleaned`
    columns; it is modified in place and returned.
  config: collection of option names.
  stop_words: collection of tokens to remove when 'stopwords' is selected.
  """
  words = df_config.sentences_cleaned.apply(lambda row: [sentence.split() for sentence in row])
  if 'stopwords' in config:
    words = words.apply(lambda row: [[w for w in sentence if w not in stop_words] for sentence in row])
  if 'stem' in config:
    stemmer = PorterStemmer()
    words = words.apply(lambda row: [[stemmer.stem(w) for w in sentence] for sentence in row])
  if 'lemma' in config:
    lemmatizer = WordNetLemmatizer()
    words = words.apply(lambda row: [[lemmatizer.lemmatize(w) for w in sentence] for sentence in row])

  df_config['words'] = words
  # recombine words into sentences
  df_config['sentences_cleaned'] = words.apply(lambda row: [' '.join(sentence) for sentence in row])

  # sentences that only have one word (or none) after these exclusions
  df_config['drop_sentences'] = df_config.sentences_cleaned.apply(
      lambda text: [i for i, sentence in enumerate(text) if len(sentence.split()) <= 1])

  def keep(column):
    kept = []
    for items, dropped in zip(df_config[column], df_config.drop_sentences):
      dropped = set(dropped)
      kept.append([item for i, item in enumerate(items) if i not in dropped])
    return pd.Series(kept, index = df_config.index, dtype = object)

  # `words` is filtered too; previously it kept the dropped sentences and so
  # no longer lined up with the other two columns
  for column in ('words', 'sentences_cleaned', 'sentences'):
    df_config[column] = keep(column)

  return df_config

### Train TFIDF in Corpus
Used in baseline model to sum tfidf scores within each sentence in each document 

In [8]:
def corpus_tfidf(df):
  """Fit TF-IDF on the whole corpus, treating each article (row) as one document.

  df: DataFrame with a `sentences_cleaned` column (list of cleaned sentences per row).

  Returns (tfidf, feature_array):
    tfidf: sparse articles x vocabulary matrix of TF-IDF scores
    feature_array: list of vocabulary words; position i is column i of `tfidf`
  """
  # list of words in each article. split() rather than split(' ') so that stray
  # leading/trailing spaces do not add an empty-string token to the vocabulary
  corpus = [' '.join(article).split() for article in df.sentences_cleaned.to_list()]

  # tfidf trained on entire corpus: document = article
  # already did preprocessing, so using identity functions for tokenizer and preprocessor
  tfidf_vec = TfidfVectorizer(analyzer = 'word',
                              tokenizer = lambda doc: doc, preprocessor = lambda doc: doc, token_pattern = None)
  tfidf = tfidf_vec.fit_transform(corpus) # sparse arrays of scores for each word in each article. articles x words

  # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
  # and fall back only on older installs that do not have it yet
  if hasattr(tfidf_vec, 'get_feature_names_out'):
    feature_array = tfidf_vec.get_feature_names_out().tolist()
  else:
    feature_array = list(tfidf_vec.get_feature_names())

  return tfidf, feature_array

### Vector Representation 
Default: unigram bag of words with counts
Options: 
1. Bow
  - binary: bag of words with binary indicators rather than counts (don't use with tfidf)
  - tf: term frequency normalization 
    - Same as default if cosine similarity. Cosine similarity does the normalization (double check this!!)
  - idf: inverse document normalization 
  - include_bigrams/include_trigrams: include bigrams and/or trigrams of words in addition to unigrams as distinct tokens in bag of words
    - Gives sense of order in sentence, capture _concepts_ rather than just individual words
2. Embeddings (pre-trained)
  - GloVe
  - Fasttext
    - Advantage: generate embeddings for out of vocabulary words based on their parts
    - But memory issues

In [9]:
# vector representation of words in each sentence in document 
def vector_representation(df, configuration, embeddings):
  """Build a vector representation of every sentence in every document.

  Adds a `vector_rep` column to `df` (modified in place and returned). The
  representation is selected by membership in `configuration`:
    - 'bow': per-document bag of words (# sentences x # unique tokens), with
        'bigram' / 'trigram' / 'all' adding space-joined n-grams to the unigrams,
        'binary' using presence indicators instead of counts,
        'tf' / 'tfidf' applying TfidfTransformer without / with idf.
    - 'embedding' with 'glove' or 'fasttext': list of per-sentence vectors, each the
        mean of the sentence's word vectors looked up in embeddings['glove'] /
        embeddings['fasttext'].

  df: DataFrame with a `sentences_cleaned` column (list of strings per row).
  configuration: collection of option names.
  embeddings: dict of word-vector lookups; only read for the 'embedding' options.
  """
  def ngram_tokens(sentence, n):
    # every contiguous n-word window, space-joined; empty if the sentence is shorter than n
    return [' '.join(sentence[i:i + n]) for i in range(len(sentence) - n + 1)]

  # list of words in each sentence
  df['words'] = df.sentences_cleaned.apply(lambda row: [sentence.split() for sentence in row])

  if 'bow' in configuration:

    # n-grams are always derived from the plain unigrams. The previous code built
    # trigrams from the bigram column, which failed for 'trigram' and produced
    # character-level garbage for 'all'.
    unigrams = df.words
    orders = [1]
    if 'bigram' in configuration or 'all' in configuration:
      df['bigrams'] = unigrams.apply(lambda row: [ngram_tokens(sentence, 2) for sentence in row])
      orders.append(2)
    if 'trigram' in configuration or 'all' in configuration:
      df['trigrams'] = unigrams.apply(lambda row: [ngram_tokens(sentence, 3) for sentence in row])
      orders.append(3)
    # tokens per sentence: unigrams followed by the requested n-grams
    df['words'] = unigrams.apply(
        lambda row: [[token for n in orders for token in ngram_tokens(sentence, n)] for sentence in row])

    # bag of words with binary indicators for words/n-grams rather than counts
    if 'binary' in configuration:
      df.words = df.words.apply(lambda row: [set(sentence) for sentence in row])

    # bag of words: # sentences x # unique words (vectorizer is refit for every document)
    vec = DictVectorizer()
    df['vector_rep'] = df.words.apply(lambda row: vec.fit_transform(Counter(f) for f in row))

    # term frequency normalization
    if 'tf' in configuration:
      tfidf_transformer = TfidfTransformer(use_idf = False)
      df['vector_rep'] = df.vector_rep.apply(lambda row: tfidf_transformer.fit_transform(row))
    # term frequency-inverse document frequency normalization
    if 'tfidf' in configuration:
      tfidf_transformer = TfidfTransformer(use_idf = True)
      df['vector_rep'] = df.vector_rep.apply(lambda row: tfidf_transformer.fit_transform(row))

  # possible extension: continued training on specific corpus. Probably unnecessary since wikipedia and news article words should be similar
  if 'embedding' in configuration:

    if 'glove' in configuration:
      word_embeddings = embeddings['glove']
      # unknown words contribute a zero vector; its size follows the loaded
      # embeddings instead of being hard-coded (100 kept as the fallback)
      dim = len(next(iter(word_embeddings.values()))) if word_embeddings else 100
      unknown = np.zeros(dim)

      def glove_mean(words):
        if not words:
          return unknown.copy()
        return sum([word_embeddings.get(word, unknown) for word in words]) / len(words)

      # average of word embeddings for each sentence
      df['vector_rep'] = df.words.apply(lambda row: [glove_mean(sentence) for sentence in row])

    # fasttext.
    if 'fasttext' in configuration:
      word_embeddings = embeddings['fasttext']
      # average of word embeddings for each sentence; every word is looked up directly,
      # so this relies on the model returning a vector for any word - TODO confirm
      df['vector_rep'] = df.words.apply(
          lambda row: [sum([word_embeddings[word] for word in sentence]) / len(sentence) for sentence in row])

  return df

### PageRank Model

In [10]:
# TODO: other similarity metrics?
# TODO: other algorithms
def pagerank(df):
  """Rank each document's sentences with PageRank over a sentence-similarity graph.

  Reads the `vector_rep` column and adds, per row:
    - sim: cosine-similarity matrix between the document's sentences
    - graph: networkx graph built from that matrix (node = sentence index,
      edge weight = similarity score)
    - pr: dict of node -> PageRank score
    - bestkeys: sentence indexes sorted from highest to lowest score
  `df` is modified in place and returned.
  """
  # nx.pagerank_numpy was deprecated in networkx 2.6 and removed in 3.0;
  # use it when present (keeps old results), otherwise fall back to nx.pagerank
  rank = getattr(nx, 'pagerank_numpy', nx.pagerank)

  # similarity matrix between sentences
  df['sim'] = df.vector_rep.apply(lambda row: cosine_similarity(row))
  # graph where node = sentence, edge weight = similarity score
  df['graph'] = df.sim.apply(lambda row: nx.from_numpy_array(row))
  # page rank
  df['pr'] = df.graph.apply(lambda row: rank(row))
  # sort keys in order of page rank
  df['bestkeys'] = df.pr.apply(lambda row: sorted(row, key = row.get, reverse = True))

  return df

### Baseline Model
- Train TF-IDF on entire corpus where document = article. Get a score for each word in each document
- Sum scores for all words in each sentence 
- Produce sentences with highest total TF-IDF score 

Idea: Sentences that are indicative of the specifics of the article. High frequency in the article, but specific to the article

Could also try straight term frequencies within the article. (or weighted like above so fractional of most frequent rather than diff. magnitudes). Would need to drop stop words first (https://stackabuse.com/text-summarization-with-nltk-in-python/)

In [11]:
def tfidf_sum(df, feature_array, tfidf):
  """Baseline ranking: score each sentence by its mean corpus-level TF-IDF weight.

  df: DataFrame with a `sentences_cleaned` column; row position i must
    correspond to row i of `tfidf`.
  feature_array: vocabulary list; position i is column i of `tfidf`.
  tfidf: 2-D documents x vocabulary score matrix, indexed as tfidf[doc, col].

  Adds `doc_num`, `sentence_words`, `sentence_scores` and `bestkeys` (sentence
  indexes sorted from highest to lowest score) to `df` in place and returns it.
  Words missing from `feature_array` contribute a score of 0.
  """
  # one dict build instead of a linear feature_array.index() scan per word;
  # setdefault keeps the first position, matching list.index on duplicates
  column_of = {}
  for col, word in enumerate(feature_array):
    column_of.setdefault(word, col)

  df['doc_num'] = np.arange(len(df))
  df['sentence_words'] = df.sentences_cleaned.apply(lambda row: [sentence.split() for sentence in row])

  def sentence_score(doc_num, words):
    if not words:
      return 0.0
    total = sum(tfidf[doc_num, column_of[word]] for word in words if word in column_of)
    # Normalize by length of sentence. Otherwise recommend longest sentences
    return total / len(words)

  scores = [[sentence_score(doc_num, words) for words in sentences]
            for doc_num, sentences in enumerate(df.sentence_words)]
  df['sentence_scores'] = pd.Series(scores, index = df.index, dtype = object)

  # sort keys in order of summed tfidf score
  df['bestkeys'] = df.sentence_scores.apply(lambda row: np.argsort(row)[::-1])

  return df

### Extract Summary
Grab best sentences based on ranking mechanism     
Length of summary (Number of sentences)?
- Number of sentences: generate 1 summary sentence per 6 text sentences (on average)
  - Problem: text sentences are much longer than summary sentences, and since we are producing text sentences as our predicted summary, predicted summary is much longer than label summary
- Number of words: generate 1 summary word per 20 text words (on average)
  - Strict version: words in summary must be less than the threshold
  - Less strict version: can go over limit by 1 sentence if reach threshold within the sentence

In [12]:
def extract_summary_num_words(row, config):
  """Pick the top-ranked sentences of one document up to a word budget.

  row: object exposing `bestkeys` (sentence indexes, best first), `sentences`
    (list of strings) and `max_words` (word budget).
  config: collection of option names:
    - 'num_words_lt' (strict): stop before the sentence that would reach or pass
      the budget, but always return at least one sentence
    - 'num_words_gt' (less strict): include the sentence that reaches the budget,
      then stop

  Returns the list of selected sentences. If the ranked sentences run out before
  the budget is reached, everything collected so far is returned (previously the
  function fell off the end and returned None).
  """
  strict = 'num_words_lt' in config
  lenient = 'num_words_gt' in config

  summary = []
  num_words = 0
  for i in row.bestkeys:
    sentence = row.sentences[i]
    num_words += len(sentence.split())
    if strict:
      # `summary` non-empty guarantees at least one sentence in the result
      if num_words >= row.max_words and summary:
        return summary
      summary.append(sentence)
    if lenient:
      summary.append(sentence)
      if num_words >= row.max_words:
        return summary

  return summary

In [13]:
def extract_summary(df, config, sentence_ratio = 6, word_ratio = 20):
  """Build `predicted_summary` from each document's ranked sentences.

  df: DataFrame with `sentences` (list of strings) and `bestkeys` (sentence
    indexes, best first) columns; modified in place and returned.
  config: collection of option names selecting the length rule:
    - 'num_sentences': one summary sentence per `sentence_ratio` text sentences,
      with a minimum of one sentence
    - 'num_words_lt' / 'num_words_gt': word budget of one summary word per
      `word_ratio` text words, applied by extract_summary_num_words
  sentence_ratio, word_ratio: text-to-summary length ratios (defaults keep the
    previously hard-coded 6 and 20).
  """
  # summary based on number of sentences
  if 'num_sentences' in config:
    # at least one sentence, so short documents do not get an empty summary
    df['max_sentences'] = df.sentences.apply(lambda row: max(1, len(row) // sentence_ratio))
    picked = [[sentences[i] for i in keys[0:limit]]
              for sentences, keys, limit in zip(df.sentences, df.bestkeys, df.max_sentences)]
    df['predicted_summary'] = pd.Series(picked, index = df.index, dtype = object)
  # summary based on number of words
  if 'num_words_gt' in config or 'num_words_lt' in config:
    # join with a space: ''.join fused the last word of one sentence with the
    # first word of the next and so under-counted the words
    df['max_words'] = df.sentences.apply(lambda row: np.floor(len(' '.join(row).split()) / word_ratio))
    df['predicted_summary'] = df.apply(lambda row: extract_summary_num_words(row, config), axis = 1)

  return df

### Evaluation 
ROUGE metric:
https://kavita-ganesan.com/what-is-rouge-and-how-it-works-for-evaluation-of-summaries/#.YEKJyI5KiUl   
- Precision = # overlapping ngrams / # total ngrams in produced summary 
  - Measure of junk. Did we produce a lot in the generated summary that is not in the actual summary?
  - Important if we don't manually set the length. The generated summary could be very long which causes good recall
- Recall = # overlapping ngrams / # total ngrams in label summary  
  - Did we get all the words in the actual summary?
- F1 = harmonic mean
- Look at both a purely unigram measure and an average of the unigram and bigram measures
  - Don't care about order of words (captured by bigram) as much as in other settings where worried about fluency, syntax of text. Here we know the produced sentences are real English. But still bigrams can capture phrases. 

Cons: 
- Doesn't look at sentence structure --> doesn't apply here because using correct sentences
- Doesn't consider meaning -- same words could have different meaning   
  
Also considered BLEU, but only gives precision.     
https://towardsdatascience.com/evaluating-text-output-in-nlp-bleu-at-your-own-risk-e8609665a213

In [14]:
def evaluate(df):
  """Score each predicted summary against its reference summary with ROUGE.

  df: DataFrame with `predicted_summary` and `summary` columns; modified in
    place and returned with a new `rouge` column holding the dict returned by
    RougeScorer.score (keys 'rouge1', 'rouge2', 'rougeL').
  """
  scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer = False)

  def as_text(sentences):
    # join sentence lists with a space so words are not fused across sentence
    # boundaries; a plain string is passed through unchanged
    # NOTE(review): `summary` is presumably a list of sentences - confirm against the pickle
    return sentences if isinstance(sentences, str) else ' '.join(sentences)

  # RougeScorer.score takes (target, prediction): the reference goes first.
  # Passing the prediction first swapped precision and recall.
  df['rouge'] = df.apply(lambda row: scorer.score(as_text(row.summary), as_text(row.predicted_summary)), axis = 1)

  return df

In [15]:
# TODO compare evaluations between models with paired bootstrap test to test significance? separate config loop for each model type (baseline, page rank etc.)

In [16]:
# Each CONFIGURATIONS_* list holds one list of alternatives per option group;
# the cartesian product of the groups gives the configurations to evaluate.

# TextRank on bag-of-words sentence vectors
CONFIGURATIONS_BOW = [
    ['pagerank'],                                       # model
    ['nostop', 'stopwords'],                            # stop word removal
    ['no_stemlemma', 'lemma', 'stem'],                  # token normalization
    ['bow'],                                            # representation
    ['counts', 'binary'],                               # counts vs. presence indicators
    ['no_normalization', 'tf', 'tfidf'],                # vector normalization
    ['unigram', 'bigram', 'trigram', 'all'],            # n-grams included as tokens
    ['num_sentences', 'num_words_lt', 'num_words_gt'],  # summary length rule
]

# TextRank on averaged pre-trained word embeddings
CONFIGURATIONS_EMBEDDINGS = [
    ['pagerank'],                                       # model
    ['nostop', 'stopwords'],                            # stop word removal
    ['no_stemlemma', 'lemma', 'stem'],                  # token normalization
    ['embedding'],                                      # representation
    ['glove', 'fasttext'],                              # embedding source
    ['num_sentences', 'num_words_lt', 'num_words_gt'],  # summary length rule
]

# no custom text cleaning options: true baseline. Stop word removal unnecessary with tfidf.
CONFIGURATIONS_BASELINE = [
    ['baseline'],                                       # model
    ['num_sentences', 'num_words_lt', 'num_words_gt'],  # summary length rule
]

In [17]:
def data_setup(data_path = "/content/drive/MyDrive/data/cleaned_df.pkl", n_articles = 100):
  """Load the article DataFrame and apply the always-on text cleaning.

  data_path: pickle file to load (default: the original Drive location).
  n_articles: keep only the first `n_articles` rows; None keeps every row.

  Returns the frame produced by text_cleaning.
  """
  # load data
  # NOTE: read_pickle can execute arbitrary code - only load files you trust
  df = pd.read_pickle(data_path)
  if n_articles is not None:
    # copy so the columns added during cleaning are written to an independent
    # frame rather than to a slice of the loaded one (SettingWithCopyWarning)
    df = df.head(n_articles).copy()

  # text cleaning
  df = text_cleaning(df)

  return df

In [18]:
def feature_setup(df, glove_path = 'glove.6B.100d.txt',
                  fasttext_path = "/content/drive/MyDrive/data/shrunk_fasttext_svd.model"):
  """Prepare the corpus-level features shared by every configuration.

  df: cleaned article DataFrame (passed to corpus_tfidf).
  glove_path: GloVe text file, one "word v1 v2 ..." entry per line.
  fasttext_path: saved gensim KeyedVectors model.

  Returns (tfidf, feature_array, embeddings), where `embeddings` is a dict with
  keys 'glove' (dict of word -> float32 vector) and 'fasttext' (the loaded model).
  """
  # 1. train TF-IDF on entire corpus for baseline model
  tfidf, feature_array = corpus_tfidf(df)

  # 2. load embeddings
  embeddings = {}

  # load glove embeddings - code from https://www.analyticsvidhya.com/blog/2018/11/introduction-text-summarization-textrank-python/
  glove_wv = {}
  # context manager: the file is closed even if a line fails to parse
  with open(glove_path, encoding='utf-8') as f:
    for line in f:
      values = line.split()
      if not values:
        continue  # skip blank lines
      glove_wv[values[0]] = np.asarray(values[1:], dtype='float32')
  embeddings['glove'] = glove_wv

  # load fasttext embeddings
  embeddings['fasttext'] = gensim.models.KeyedVectors.load(fasttext_path)

  return tfidf, feature_array, embeddings

In [19]:
# TODO think about memory - run in batches and save config results in an append fashion?
# number of configurations quickly ballooning -- currently 468 configurations...
  # maybe evaluate on smaller subsample and then pick top x configurations. Then train on full sample. 

In [20]:
def train_loop(df, tfidf, feature_array, embeddings, config_list):
  """Run every configuration and collect its corpus-average ROUGE scores.

  df: cleaned article DataFrame; each configuration works on its own copy.
  tfidf, feature_array: corpus TF-IDF matrix and vocabulary (baseline model).
  embeddings: dict of word-vector lookups (embedding configurations).
  config_list: iterable of configurations, each a collection of option names
    containing either 'baseline' or 'pagerank'.

  Returns a dict mapping str(config) to a dict keyed by (metric_type, avg):
    metric_type is 'fmeasure', 'precision' or 'recall';
    avg=False is the mean ROUGE-1 value, avg=True the mean of the
    per-document average of ROUGE-1 and ROUGE-2.

  Raises ValueError for a configuration naming neither model.
  """
  config_results = {}

  # loop through configurations
  for config in config_list:
    print(config)
    # Always start from the untouched input frame. `df` is never rebound here:
    # previously it was overwritten with the processed frame, so each
    # configuration inherited the cleaning/stemming/dropped sentences of the
    # one before it.
    df_config = df.copy()

    if 'stopwords' in config or 'stem' in config or 'lemma' in config:
      df_config = text_cleaning_config(df_config, config, stop_words)

    if 'baseline' in config:
      df_config = tfidf_sum(df_config, feature_array, tfidf)
    elif 'pagerank' in config:
      df_config = vector_representation(df_config, config, embeddings)
      df_config = pagerank(df_config)
    else:
      raise ValueError("configuration must contain 'baseline' or 'pagerank': " + str(config))

    df_config = extract_summary(df_config, config)
    df_config = evaluate(df_config)

    # evaluate along each metric and average across documents for config stats
    eval_dict = {}
    for metric_type in ['fmeasure', 'precision', 'recall']:
      unigram = df_config.rouge.apply(lambda scores: getattr(scores['rouge1'], metric_type))
      bigram = df_config.rouge.apply(lambda scores: getattr(scores['rouge2'], metric_type))
      eval_dict[(metric_type, True)] = ((unigram + bigram) / 2).mean()
      eval_dict[(metric_type, False)] = unigram.mean()

    config_results[str(config)] = eval_dict

  return config_results

### Find Best Configurations
- Find configuration with best ROUGE (1) Precision (2) Recall and (3) F-Measure.   
- Unigram vs average of unigram and bigram metrics



In [21]:
def best_configs(config_results):
  """Print the best-scoring configuration for every evaluation metric.

  config_results: dict mapping a configuration name to a dict of scores keyed
    by (metric_type, avg) tuples, as produced by train_loop.

  One line is printed per metric: the metric key, 'Gram-Average' when avg is
  True, then the winning configuration and its score. A configuration only wins
  with a score strictly above 0, and on ties the earliest one is kept.
  """
  metric_keys = [(name, averaged)
                 for name in ('fmeasure', 'precision', 'recall')
                 for averaged in (False, True)]

  for metric in metric_keys:
    top_score, top_config = 0, ''
    for config_name, scores in config_results.items():
      candidate = scores[metric]
      if candidate > top_score:
        top_score, top_config = candidate, config_name

    label = 'Gram-Average' if metric[1] == True else ''
    print(metric, label, ':', top_config, top_score)

In [22]:
def main():
  """Run the baseline and TextRank experiments end to end.

  Loads the data and shared features, evaluates every baseline configuration and
  every TextRank configuration (bag-of-words followed by embeddings), prints the
  best configuration per metric for each model, and returns
  (config_results_baseline, config_results_pr).
  """
  df = data_setup()
  tfidf, feature_array, embeddings = feature_setup(df)

  # expand the option groups into concrete configurations
  baseline_configs = list(itertools.product(*CONFIGURATIONS_BASELINE))
  pagerank_configs = list(itertools.product(*CONFIGURATIONS_BOW))
  pagerank_configs = pagerank_configs + list(itertools.product(*CONFIGURATIONS_EMBEDDINGS))

  config_results_baseline = train_loop(df, tfidf, feature_array, embeddings, baseline_configs)
  config_results_pr = train_loop(df, tfidf, feature_array, embeddings, pagerank_configs)

  # report the winners, baseline first
  reports = (
      ('------- Baseline -------', config_results_baseline),
      ('------- Text Rank -------', config_results_pr),
  )
  for banner, config_results in reports:
    print(banner)
    best_configs(config_results)

  return config_results_baseline, config_results_pr

In [23]:
# Run the full experiment; each result maps str(config) to its averaged ROUGE
# scores (baseline model and TextRank model respectively).
config_results_baseline, config_results_pr = main()

('baseline', 'num_sentences')
('baseline', 'num_words_lt')
('baseline', 'num_words_gt')
('pagerank', 'nostop', 'no_stemlemma', 'bow', 'counts', 'no_normalization', 'unigram', 'num_sentences')
('pagerank', 'nostop', 'no_stemlemma', 'bow', 'counts', 'no_normalization', 'unigram', 'num_words_lt')
('pagerank', 'nostop', 'no_stemlemma', 'bow', 'counts', 'no_normalization', 'unigram', 'num_words_gt')
('pagerank', 'nostop', 'no_stemlemma', 'bow', 'counts', 'no_normalization', 'bigram', 'num_sentences')
('pagerank', 'nostop', 'no_stemlemma', 'bow', 'counts', 'no_normalization', 'bigram', 'num_words_lt')
('pagerank', 'nostop', 'no_stemlemma', 'bow', 'counts', 'no_normalization', 'bigram', 'num_words_gt')
('pagerank', 'nostop', 'no_stemlemma', 'bow', 'counts', 'no_normalization', 'trigram', 'num_sentences')
('pagerank', 'nostop', 'no_stemlemma', 'bow', 'counts', 'no_normalization', 'trigram', 'num_words_lt')
('pagerank', 'nostop', 'no_stemlemma', 'bow', 'counts', 'no_normalization', 'trigram', '

In [23]:
# TODO: save distribution of scores; not just averages for each config. 

In [24]:
#config_results["('pagerank', 'nostop', 'no_stemlemma', 'bow', 'counts', 'tfidf', 'unigram', 'num_words_lt')"]

In [25]:
#config_results["('baseline', 'num_words_lt')"]