<a href="https://colab.research.google.com/github/katrina906/CS6120-Summarization-Project/blob/main/extractive_summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# this project is focusing on unsupervised approaches with less human assumptions about what makes a good summary. Allow content owners to ultimately decide 
# but there are some other interesting, more supervised-ish (kinda?) approaches that exist: 
  # Possible extension/further research: score each sentence according to some generated features: https://github.com/xiaoxu193/PyTeaser/blob/master/pyteaser.py
    # other ideas for features: https://medium.com/@umerfarooq_26378/text-summarization-in-python-76c0a41f0dc4
    # not sure want to go down this rabbit hole - lots of tuning, feature creation etc
  # also some people do binary classification for if each sentence should be in the summary. 

# https://medium.com/@umerfarooq_26378/text-summarization-in-python-76c0a41f0dc4 - also includes abstractive methods
# https://www.machinecurve.com/index.php/2020/12/21/easy-text-summarization-with-huggingface-transformers-and-machine-learning/ - BART auto-regressive for summarization

In [None]:
%%capture
!pip install rouge-score
!pip install fasttext
!pip install compress-fasttext
!pip install gensim==3.8.3

In [None]:
import os
import pandas as pd
import numpy as np
import pickle
import string
import re
import sys
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from collections import Counter, OrderedDict
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
import networkx as nx
from rouge_score import rouge_scorer
import gensim
import fasttext
from gensim.models import FastText
import compress_fasttext
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords  

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# clean sentences for similarity comparisons; not for final display
# always do this function
# note that some sentence tokenization is messy. Ex: if numbered list, list becomes own sentence.
  # but shouldn't show up as important as summary sentence anyways
def text_cleaning(df):
  """Normalise every sentence for similarity comparisons (not for display).

  Reads `df.sentences` (one list of sentence strings per row) and, in place:
    - adds `sentences_cleaned`: lower-cased, punctuation stripped, runs of
      whitespace collapsed to a single space;
    - adds `drop_sentences`: indexes of sentences that are left with at most
      one word after cleaning (typically numbered-list fragments);
    - removes those indexes from BOTH `sentences_cleaned` and `sentences`, so
      the two lists stay index-aligned for the final summary extraction.

  Returns the same DataFrame object.
  """
  # raw strings: "\w" / "\s" in a normal string literal are invalid escape sequences
  punctuation = re.compile(r"[^\w\s]")
  whitespace = re.compile(r"\s+")

  def clean(sentence):
    # downcase -> remove punctuation -> drop extra spaces
    return whitespace.sub(' ', punctuation.sub('', sentence.lower()))

  def keep(items, dropped):
    # set membership keeps the filter linear in the number of sentences
    dropped = set(dropped)
    return [item for i, item in enumerate(items) if i not in dropped]

  df['sentences_cleaned'] = df.sentences.apply(lambda text: [clean(sentence) for sentence in text])

  # sentences with a single word (or none) carry no usable signal
  df['drop_sentences'] = df.sentences_cleaned.apply(
      lambda text: [i for i, sentence in enumerate(text) if len(sentence.split()) <= 1])
  df['sentences_cleaned'] = df.apply(lambda row: keep(row.sentences_cleaned, row.drop_sentences), axis = 1)
  df['sentences'] = df.apply(lambda row: keep(row.sentences, row.drop_sentences), axis = 1)

  return df

In [None]:
# cleaning depending on configuration
def text_cleaning_config(df_config, config, stop_words):
  """Apply the configuration-dependent cleaning steps to `sentences_cleaned`.

  Steps are applied in this order, each only if its flag is in `config`:
  'stopwords' (drop tokens found in `stop_words`), 'stem' (Porter stemmer),
  'lemma' (WordNet lemmatizer). The per-sentence token lists are stored in
  `words`, re-joined into `sentences_cleaned`, and sentences left with at most
  one word are removed from both `sentences_cleaned` and `sentences`.
  """

  def per_sentence(transform):
    # run `transform` on the token list of every sentence of every document
    return df_config['words'].apply(lambda doc: [transform(tokens) for tokens in doc])

  df_config['words'] = df_config['sentences_cleaned'].apply(lambda doc: [text.split() for text in doc])

  if 'stopwords' in config:
    df_config['words'] = per_sentence(lambda tokens: [tok for tok in tokens if tok not in stop_words])
  if 'stem' in config:
    stemmer = PorterStemmer()
    df_config['words'] = per_sentence(lambda tokens: [stemmer.stem(tok) for tok in tokens])
  if 'lemma' in config:
    lemmatizer = WordNetLemmatizer()
    df_config['words'] = per_sentence(lambda tokens: [lemmatizer.lemmatize(tok) for tok in tokens])

  # back from token lists to sentence strings
  df_config['sentences_cleaned'] = df_config['words'].apply(lambda doc: [' '.join(tokens) for tokens in doc])

  # the exclusions above may leave some sentences with a single word (or none):
  # remove them from the cleaned AND the original sentences so indexes still match
  df_config['drop_sentences'] = df_config['sentences_cleaned'].apply(
      lambda doc: [idx for idx, text in enumerate(doc) if len(text.split()) <= 1])
  df_config['sentences_cleaned'] = df_config.apply(
      lambda row: [text for idx, text in enumerate(row.sentences_cleaned) if idx not in row.drop_sentences], axis = 1)
  df_config['sentences'] = df_config.apply(
      lambda row: [text for idx, text in enumerate(row.sentences) if idx not in row.drop_sentences], axis = 1)

  return df_config

### Train TFIDF in Corpus
Used in baseline model to sum tfidf scores within each sentence in each document 

In [None]:
def corpus_tfidf(df):
  """Fit TF-IDF on the whole corpus, treating each article (row) as one document.

  Reads `df.sentences_cleaned` (list of cleaned sentence strings per row).

  Returns:
    tfidf: the matrix returned by `TfidfVectorizer.fit_transform`
      (articles x vocabulary).
    feature_array: list of vocabulary words; position i names column i of `tfidf`.
  """

  # one flat token list per article. `split()` (rather than `split(' ')`) never
  # yields empty-string tokens, matching how tfidf_sum tokenises sentences.
  corpus = [[word for sentence in article for word in sentence.split()]
            for article in df.sentences_cleaned.to_list()]

  # tfidf trained on entire corpus: document = article
  # already did preprocessing, so using identity functions for tokenizer and preprocessor
  tfidf_vec = TfidfVectorizer(analyzer = 'word',
                              tokenizer = lambda doc: doc, preprocessor = lambda doc: doc, token_pattern = None)
  tfidf = tfidf_vec.fit_transform(corpus) # scores for each word in each article. articles x words

  # get_feature_names() was removed from recent scikit-learn releases in favour of
  # get_feature_names_out(); support both so the notebook runs on either.
  if hasattr(tfidf_vec, 'get_feature_names_out'):
    feature_array = list(tfidf_vec.get_feature_names_out())
  else:
    feature_array = list(tfidf_vec.get_feature_names())

  return tfidf, feature_array

### Vector Representation 
Default: unigram bag of words with counts
Options: 
1. Bow
  - binary: bag of words with binary indicators rather than counts (don't use with tfidf)
  - tf: term frequency normalization 
    - Same as default if cosine similarity. Cosine similarity does the normalization (double check this!!)
  - idf: inverse document normalization 
  - include_bigrams/include_trigrams: include bigrams and/or trigrams of words in addition to unigrams as distinct tokens in bag of words
    - Gives sense of order in sentence, capture _concepts_ rather than just individual words
2. Embeddings (pre-trained)
  - GloVe
  - Fasttext
    - Advantage: generate embeddings for out of vocabulary words based on their parts
    - But memory issues

In [None]:
# vector representation of words in each sentence in document 
def vector_representation(df, configuration, embeddings):
  """Build a vector representation of every sentence of every document.

  Args:
    df: DataFrame with `sentences_cleaned` (list of sentence strings per row).
    configuration: collection of option flags. Recognised here:
      'bow' with optional 'bigram' / 'trigram' / 'all', 'binary', 'tf', 'tfidf';
      'embedding' with 'glove' and/or 'fasttext'.
    embeddings: mapping with keys 'glove' (object with `.get(word, default)`)
      and 'fasttext' (object indexable by word); only read for embedding configs.

  Adds to `df` (in place) and returns it:
    words: per-sentence token lists (n-grams first, then unigrams; sets if 'binary').
    bigrams / trigrams: per-sentence n-gram lists, when requested.
    vector_rep: for 'bow', the DictVectorizer output per document (sentences x
      tokens), optionally TF / TF-IDF normalised; for 'embedding', a list with
      the mean word vector of each sentence.
  """

  def ngrams(tokens, n):
    # space-joined n-grams of consecutive tokens; empty if the sentence has fewer than n tokens
    return [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

  def mean_vectors(lookup):
    # average of the word vectors of each sentence
    return df.sentences_cleaned.apply(
        lambda row: [sum([lookup(word) for word in sentence.split()]) / len(sentence.split()) for sentence in row])

  # list of words in each sentence
  df['words'] = df.sentences_cleaned.apply(lambda row: [sentence.split() for sentence in row])

  if 'bow' in configuration:

    # include bigrams and/or trigrams (in addition to unigrams) in bow.
    # Both are derived from the plain unigram tokens: building trigrams from a
    # token list that already contains bigrams would produce meaningless tokens.
    unigrams = df['words']
    use_bigrams = 'bigram' in configuration or 'all' in configuration
    use_trigrams = 'trigram' in configuration or 'all' in configuration
    if use_bigrams:
      df['bigrams'] = unigrams.apply(lambda row: [ngrams(sentence, 2) for sentence in row])
    if use_trigrams:
      df['trigrams'] = unigrams.apply(lambda row: [ngrams(sentence, 3) for sentence in row])
    if use_bigrams or use_trigrams:
      df['words'] = unigrams.apply(lambda row: [
          (ngrams(sentence, 3) if use_trigrams else [])
          + (ngrams(sentence, 2) if use_bigrams else [])
          + sentence
          for sentence in row])

    # bag of words with binary indicators for words/n-grams rather than counts
    if 'binary' in configuration:
      df['words'] = df['words'].apply(lambda row: [set(sentence) for sentence in row])

    # bag of words: # sentences x # unique words (vocabulary is refit per document)
    vec = DictVectorizer()
    df['vector_rep'] = df['words'].apply(lambda row: vec.fit_transform(Counter(f) for f in row))

    # term frequency normalization
    if 'tf' in configuration:
      tfidf_transformer = TfidfTransformer(use_idf = False)
      df['vector_rep'] = df.vector_rep.apply(lambda row: tfidf_transformer.fit_transform(row))
    # term frequency-inverse document frequency normalization (document = sentence)
    if 'tfidf' in configuration:
      tfidf_transformer = TfidfTransformer(use_idf = True)
      df['vector_rep'] = df.vector_rep.apply(lambda row: tfidf_transformer.fit_transform(row))

  # possible extension: continued training on specific corpus. Probably unnecessary since wikipedia and news article words should be similar
  if 'embedding' in configuration:

    if 'glove' in configuration:
      word_embeddings = embeddings['glove']
      # if unknown word, give embedding = 0 (vectors are 100-dimensional)
      unknown = np.zeros(100,)
      df['vector_rep'] = mean_vectors(lambda word: word_embeddings.get(word, unknown))

    # fasttext: vectors are looked up by indexing, with no unknown-word fallback here
    if 'fasttext' in configuration:
      word_embeddings = embeddings['fasttext']
      df['vector_rep'] = mean_vectors(lambda word: word_embeddings[word])

  return df

### Baseline Model
- Train TF-IDF on entire corpus where document = article. Get a score for each word in each document
- Sum scores for all words in each sentence 
- Produce sentences with highest total TF-IDF score 

Idea: Sentences that are indicative of the specifics of the article. High frequency in the article, but specific to the article

Could also try straight term frequencies within the article. (or weighted like above so fractional of most frequent rather than diff. magnitudes). Would need to drop stop words first (https://stackabuse.com/text-summarization-with-nltk-in-python/)

In [None]:
def tfidf_sum(df, feature_array, tfidf):
  """Baseline ranking: mean corpus-level TF-IDF score of the words of each sentence.

  Args:
    df: DataFrame with `sentences_cleaned` (list of sentence strings per row).
      Row position k is scored against row k of `tfidf`.
    feature_array: iterable of vocabulary words; position i names column i of `tfidf`.
    tfidf: 2-D object indexable as `tfidf[doc, column]` (documents x vocabulary).

  Adds to `df` (in place) and returns it:
    doc_num: positional row number used to index `tfidf`.
    sentence_words: token list of each sentence.
    sentence_scores: summed TF-IDF of the sentence's words divided by its
      number of words (otherwise the longest sentences would always win).
      Words missing from `feature_array` contribute 0; an empty sentence scores 0.
    bestkeys: sentence indexes sorted by decreasing score.
  """

  # word -> column lookup built once; a list.index() call per word is a linear
  # scan of the vocabulary and raises on out-of-vocabulary words
  column_of = {}
  for column, word in enumerate(feature_array):
    column_of.setdefault(word, column)

  def sentence_score(doc_num, words):
    if not words:
      return 0.0
    total = np.sum([tfidf[doc_num, column_of[word]] for word in words if word in column_of])
    return total / len(words)

  df['doc_num'] = np.arange(len(df))
  df['sentence_words'] = df.sentences_cleaned.apply(lambda row: [sentence.split() for sentence in row])
  df['sentence_scores'] = df.apply(
      lambda row: [sentence_score(row.doc_num, sentence) for sentence in row.sentence_words], axis = 1)

  # sort keys in order of summed tfidf score
  df['bestkeys'] = df.sentence_scores.apply(lambda row: np.argsort(row)[::-1])

  return df

### TextRank Model
Similarity Metrics:
- Cosine
- Jaccard: proportion of elements that are the same where at least one is non-zero
  - AKA, what percent of words in the two sentences are the same?
- Hamming: proportion of elements that are not same (including zeroes)
  - AKA, out of all words in the vocabulary, what percent of words are both in or both not in two sentences

Both Jaccard and Hamming only make sense on binary representations of the data. Only applied to binary vector representations.

In [None]:
%%capture
def textrank(df, config):
  """Rank the sentences of each document with PageRank over a similarity graph.

  Args:
    df: DataFrame with `vector_rep` (one row-per-sentence matrix per document).
    config: collection of flags; the first of 'cosine', 'jaccard', 'hamming'
      found (in that order) selects the similarity metric.

  Adds to `df` (in place) and returns it:
    sim: sentence x sentence similarity matrix.
    graph: networkx graph built from `sim` (node = sentence, edge weight = similarity).
    textrank: dict node -> PageRank score.
    bestkeys: sentence indexes sorted by decreasing PageRank score.

  Raises:
    ValueError: if `config` names none of the supported similarity metrics.
  """

  def dense(matrix):
    # `.A` is not available on every scipy sparse type/version; toarray() is
    return matrix.toarray() if hasattr(matrix, 'toarray') else np.asarray(matrix)

  # similarity matrix between sentences
  if 'cosine' in config:
    df['sim'] = df.vector_rep.apply(lambda row: cosine_similarity(row))
  elif 'jaccard' in config:
    df['sim'] = df.vector_rep.apply(lambda row: 1 - pairwise_distances(dense(row), metric = 'jaccard'))
  elif 'hamming' in config:
    df['sim'] = df.vector_rep.apply(lambda row: 1 - pairwise_distances(dense(row), metric = 'hamming'))
  else:
    raise ValueError("config must contain one of 'cosine', 'jaccard' or 'hamming'")

  # graph where node = sentence, edge weight = similarity score
  df['graph'] = df.sim.apply(lambda row: nx.from_numpy_array(row))
  # page rank. pagerank_numpy only exists in older networkx releases;
  # fall back to nx.pagerank when it is missing
  pagerank = getattr(nx, 'pagerank_numpy', nx.pagerank)
  df['textrank'] = df.graph.apply(lambda graph: pagerank(graph))
  # sort keys in order of page rank (read the 'textrank' column written above)
  df['bestkeys'] = df['textrank'].apply(lambda scores: sorted(scores, key = scores.get, reverse = True))

  return df

### Latent Semantic Analysis (Singular Value Decomposition)
1. SVD to get matrix of sentences x latent concepts, ordered in importance of concept to article
2. For each concept, get the highest scored sentence for that concept. Get best sentence for each concept in decreasing order of concept importance. 
  - Ensures summary captures the most important aspects of the article and covers the span of concepts.

Other sentence extraction techniques have been explored, but they mostly deal with choosing multiple sentences from the same concept. In our application, the summaries are very short (a few sentences each) and it is more important to capture the breadth of topics discussed so that it is retrievable in search and so that the content creator can edit and specify based on an overall outline of their content. 

https://www.researchgate.net/publication/220195824_Text_summarization_using_Latent_Semantic_Analysis


Also considered using LDA, but there's no sense of dominant topic for the overall corpus (article). Rather, you can only get the probability each sentence belongs to each topic. So I could get best sentence per topic, but don't know in which order to place those sentences in the overall ranking.


In [None]:
def lsa(df):
  """Rank sentences by Latent Semantic Analysis (SVD of the word x sentence matrix).

  Reads `df.vector_rep` (one sentences x words matrix per document; sparse
  matrices are densified with `toarray()`).

  Adds to `df` (in place) and returns it:
    svd: the Vt factor of the SVD of the transposed matrix, i.e. an array of
      shape (concepts x sentences) with concepts in decreasing singular-value order.
    bestkeys: for each concept, in that order, the index of the sentence with
      the largest-magnitude weight; a sentence already picked for an earlier
      concept is not repeated, so the list may be shorter than the number of concepts.
  """

  def concept_matrix(vectors):
    dense = vectors.toarray() if hasattr(vectors, 'toarray') else np.asarray(vectors)
    # SVD: get Vt, concepts x sentences
    return np.linalg.svd(dense.T, full_matrices = False)[2]

  def best_sentences(vt):
    picked = []
    # iterate over ROWS of Vt: one row per concept, one column per sentence
    for concept in np.asarray(vt):
      # the sign of a singular vector is arbitrary, so compare magnitudes
      best = int(np.argmax(np.abs(concept)))
      if best not in picked:
        picked.append(best)
    return picked

  df['svd'] = df.vector_rep.apply(concept_matrix)
  # sentence with highest score for each concept in order of most important concept to document
  df['bestkeys'] = df.svd.apply(best_sentences)

  return df

### Extract Summary
Grab best sentences based on ranking mechanism     
Length of summary (Number of sentences)?
- Number of sentences: generate 1 summary sentence per 6 text sentences (average)
  - Problem: text sentences are much longer than summary sentences, and since we are producing text sentences as our predicted summary, predicted summary is much longer than label summary
- Number of words: generate 1 summary word per 20 text words (average)
  - Strict version: words in summary must be less than the threshold
  - Less strict version: can go over limit by 1 sentence if reach threshold within the sentence

In [None]:
def extract_summary_num_words(row, config):
  """Collect the best-ranked sentences of one document up to a word budget.

  Args:
    row: object exposing `bestkeys` (sentence indexes, best first), `sentences`
      (list of sentence strings) and `max_words` (word budget).
    config: collection of flags.
      'num_words_lt' (strict): stop BEFORE the sentence that reaches the
        budget, but always keep at least the first sentence.
      'num_words_gt' (less strict): include the sentence that reaches the
        budget, then stop.
      If both are present 'num_words_lt' wins.

  Returns:
    List of selected sentence strings. When the ranking runs out before the
    budget is reached, everything collected so far is returned (never None).
  """
  num_words = 0
  summary = []
  for position, i in enumerate(row.bestkeys):
    num_words += len(row.sentences[i].split())
    if 'num_words_lt' in config:
      # position != 0: ensure give at least one sentence in summary
      if (num_words >= row.max_words) and (position != 0):
        return summary
      summary.append(row.sentences[i])
    elif 'num_words_gt' in config:
      summary.append(row.sentences[i])
      if num_words >= row.max_words:
        return summary
  # ranking exhausted before hitting the budget
  return summary

In [None]:
def extract_summary(df, config, sentences_per_summary_sentence = 6, words_per_summary_word = 20):
  """Build `predicted_summary` from the ranked sentence indexes in `bestkeys`.

  Args:
    df: DataFrame with `sentences` (list of sentence strings per row) and
      `bestkeys` (sentence indexes, best first).
    config: collection of flags selecting the length rule:
      'num_sentences': keep floor(#sentences / sentences_per_summary_sentence)
        sentences, but never fewer than one;
      'num_words_lt' / 'num_words_gt': word budget of
        floor(#words / words_per_summary_word), applied by
        extract_summary_num_words.
    sentences_per_summary_sentence: text sentences per summary sentence (default 6).
    words_per_summary_word: text words per summary word (default 20).

  Adds `max_sentences` or `max_words`, plus `predicted_summary` (list of
  sentence strings), to `df` in place and returns it.
  """

  # summary based on number of sentences
  if 'num_sentences' in config:
    # average 1 summary sentence per 6 doc sentences; at least one so short
    # documents do not get an empty summary
    df['max_sentences'] = df.sentences.apply(
        lambda row: max(1, int(np.floor(len(row) / sentences_per_summary_sentence))))
    df['predicted_summary'] = df.apply(lambda row: [row.sentences[i] for i in row.bestkeys[0:row.max_sentences]], axis = 1)
  # summary based on number of words
  if 'num_words_gt' in config or 'num_words_lt' in config:
    # average 1 summary word per 20 text words. Join with a space: joining with
    # '' glues the last word of a sentence to the first word of the next one.
    df['max_words'] = df.sentences.apply(
        lambda row: np.floor(len(' '.join(row).split()) / words_per_summary_word))
    df['predicted_summary'] = df.apply(lambda row: extract_summary_num_words(row, config), axis = 1)

  return df

### Evaluation 
ROUGE metric:
https://kavita-ganesan.com/what-is-rouge-and-how-it-works-for-evaluation-of-summaries/#.YEKJyI5KiUl   
- Precision = # overlapping ngrams / # total ngrams in produced summary 
  - Measure of junk. Did we produce a lot in the generated summary that is not in the actual summary?
  - Important if we don't manually set the length. The generated summary could be very long which causes good recall
- Recall = # overlapping ngrams / # total ngrams in label summary  
  - Did we get all the words in the actual summary?
- F1 = harmonic mean
- Look at both purely unigram measure and an average of unigram and bigram measure
  - Don't care about order of words (captured by bigram) as much as in other settings where worried about fluency, syntax of text. Here we know the produced sentences are real English. But still bigrams can capture phrases. 

Cons: 
- Doesn't look at sentence structure --> doesn't apply here because using correct sentences
- Doesn't consider meaning -- same words could have different meaning   
  
Also considered BLEU, but only gives precision.     
https://towardsdatascience.com/evaluating-text-output-in-nlp-bleu-at-your-own-risk-e8609665a213

In [None]:
def evaluate(df):
  """Score each predicted summary against its label summary with ROUGE.

  Reads `df.summary` (label) and `df.predicted_summary` (prediction); each may
  be a single string or a list of sentence strings. A missing (None)
  prediction is scored as an empty string.

  Adds `rouge` to `df` in place (the value returned by `RougeScorer.score`
  for 'rouge1', 'rouge2' and 'rougeL') and returns `df`.
  """

  def as_text(sentences):
    if sentences is None:
      return ''
    if isinstance(sentences, str):
      return sentences
    # join with a space so words on either side of a sentence boundary stay separate
    return ' '.join(sentences)

  scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer = False)
  # RougeScorer.score takes (target, prediction): the label summary goes first.
  # Passing them the other way round swaps precision and recall.
  df['rouge'] = df.apply(
      lambda row: scorer.score(as_text(row['summary']), as_text(row['predicted_summary'])), axis = 1)

  return df

In [None]:
# get distribution of each metric across documents
def metrics_distribution(df):
  """Describe how each ROUGE metric is distributed across documents.

  Reads `df.rouge` (per row: mapping with 'rouge1' and 'rouge2' entries that
  expose `.fmeasure`, `.precision` and `.recall`).

  Returns a dict keyed by (metric_type, avg) where metric_type is one of
  'fmeasure', 'precision', 'recall'. With avg True the value describes the
  mean of the unigram and bigram scores, otherwise the unigram score alone.
  Each value is the `describe()` summary of that per-document metric.

  The working columns `metric1`, `metric2` and `metric` are left on `df`.
  """

  distributions = {}
  for metric_type in ('fmeasure', 'precision', 'recall'):
    for avg in (True, False):
      # unigram and bigram flavour of the current metric
      df['metric1'] = df.rouge.apply(lambda scores: getattr(scores['rouge1'], metric_type))
      df['metric2'] = df.rouge.apply(lambda scores: getattr(scores['rouge2'], metric_type))

      df['metric'] = (df.metric1 + df.metric2) / 2 if avg else df.metric1

      # record distribution of metric across all documents
      distributions[(metric_type, avg)] = df.metric.describe()

  return distributions

### Find Best Configurations
- Find configuration with best ROUGE (1) Precision (2) Recall and (3) F-Measure.   
- For each configuration, mean metric for all documents 
- Unigram vs average of unigram and bigram metrics



In [None]:
def find_best_configs(config_results):
  """Pick, for every metric, the configuration with the highest mean score.

  Args:
    config_results: mapping configuration -> mapping (metric_type, avg) ->
      object indexable by 'mean'.

  Returns:
    Dict (metric_type, avg) -> winning configuration key. A configuration only
    wins with a mean strictly greater than 0 and strictly greater than every
    configuration seen before it; if none qualifies the value is ''.
  """
  metrics = [(metric_type, avg)
             for metric_type in ('fmeasure', 'precision', 'recall')
             for avg in (False, True)]

  def winner(metric):
    top_mean, top_config = 0, ''
    for config, distributions in config_results.items():
      if distributions[metric]['mean'] > top_mean:
        top_mean, top_config = distributions[metric]['mean'], config
    return top_config

  return {metric: winner(metric) for metric in metrics}

### Main Run

In [None]:
# All possible configurations for each model
# Option grid per model: each inner list is one dimension, and a configuration
# is one choice from every dimension (cross product taken below).

# TextRank on bag-of-words vectors
CONFIGURATIONS_BOW = [
    ['textrank'],
    ['nostop', 'stopwords'],
    ['no_stemlemma', 'lemma', 'stem'],
    ['bow'],
    ['counts', 'binary'],
    ['no_normalization', 'tf', 'tfidf'],
    ['unigram', 'bigram', 'trigram', 'all'],
    ['cosine', 'hamming', 'jaccard'],
    ['num_sentences', 'num_words_lt', 'num_words_gt'],
]
# TextRank on pre-trained embeddings
CONFIGURATIONS_EMBEDDINGS = [
    ['textrank'],
    ['nostop', 'stopwords'],
    ['no_stemlemma', 'lemma', 'stem'],
    ['embedding'],
    ['glove', 'fasttext'],
    ['cosine'],
    ['num_sentences', 'num_words_lt', 'num_words_gt'],
]
# no custom text cleaning options: true baseline. Stop word removal unnecessary with tfidf.
CONFIGURATIONS_BASELINE = [
    ['baseline'],
    ['num_sentences', 'num_words_lt', 'num_words_gt'],
]
# LSA on bag-of-words vectors
CONFIGURATIONS_LSA = [
    ['lsa'],
    ['nostop', 'stopwords'],
    ['no_stemlemma', 'lemma', 'stem'],
    ['bow'],
    ['counts', 'binary'],
    ['no_normalization', 'tf', 'tfidf'],
    ['unigram', 'bigram', 'trigram', 'all'],
    ['num_sentences', 'num_words_lt', 'num_words_gt'],
]

# cross products of all possible combinations of configurations
model_configurations = {
    'textrank': [*itertools.product(*CONFIGURATIONS_BOW), *itertools.product(*CONFIGURATIONS_EMBEDDINGS)],
    'baseline': [*itertools.product(*CONFIGURATIONS_BASELINE)],
    'lsa': [*itertools.product(*CONFIGURATIONS_LSA)],
}

# textrank: jaccard / hamming only make sense on a binary, un-normalised vector
# representation, so drop every other configuration that uses them
model_configurations['textrank'] = [
    config for config in model_configurations['textrank']
    if not ('jaccard' in config or 'hamming' in config)
    or ('binary' in config and 'no_normalization' in config)
]

In [None]:
def data_setup(data_path = "/content/drive/MyDrive/data/cleaned_df.pkl", num_documents = 110):
  """Load the pickled corpus, keep the first documents and run the base cleaning.

  Args:
    data_path: location of the pickled DataFrame.
    num_documents: number of leading rows to keep; None keeps every row.
      TODO 10000 is this a good size to limit to? _randomize_

  Returns:
    The DataFrame returned by text_cleaning.
  """

  # load data
  # NOTE(review): read_pickle executes arbitrary code from the file; only load trusted data
  df = pd.read_pickle(data_path)
  if num_documents is not None:
    # copy: text_cleaning adds/overwrites columns, which should not be done on a slice
    df = df.head(num_documents).copy()

  # text cleaning
  df = text_cleaning(df)

  return df

In [None]:
def load_embeddings(glove_path = 'glove.6B.100d.txt',
                    fasttext_path = "/content/drive/MyDrive/data/shrunk_fasttext_svd.model"):
  """Load the pre-trained word embeddings used by the embedding configurations.

  Args:
    glove_path: GloVe text file, one `word v1 v2 ...` entry per line.
    fasttext_path: model file loaded with gensim's KeyedVectors.load.

  Returns:
    Dict with keys 'glove' (dict word -> float32 numpy vector) and 'fasttext'
    (the object returned by KeyedVectors.load).
  """
  embeddings = {}

  # load glove embeddings - code from https://www.analyticsvidhya.com/blog/2018/11/introduction-text-summarization-textrank-python/
  # 100 length vector for each word
  glove_wv = {}
  # context manager closes the file even if a line fails to parse
  with open(glove_path, encoding='utf-8') as f:
    for line in f:
      values = line.split()
      word = values[0]
      coefs = np.asarray(values[1:], dtype='float32')
      glove_wv[word] = coefs
  embeddings['glove'] = glove_wv

  # load fasttext embeddings
  embeddings['fasttext'] = gensim.models.KeyedVectors.load(fasttext_path)

  return embeddings

In [None]:
def train_config_loop(df, tfidf, feature_array, embeddings, config_list):
  """Run the full pipeline (clean -> rank -> extract -> evaluate) for every configuration.

  Args:
    df: base DataFrame; each configuration works on its own `df.copy()`.
    tfidf, feature_array: corpus TF-IDF matrix and vocabulary, used by the baseline model.
    embeddings: embeddings mapping passed to vector_representation.
    config_list: iterable of configurations (collections of option flags).

  Returns:
    eval_results: dict str(config) -> output of metrics_distribution.
    model_results: dict str(config) -> the evaluated DataFrame, including
      `predicted_summary` and `rouge`.
  """
  eval_results = {}
  model_results = {}

  # loop through configurations
  for config in config_list:
    print(config)
    df_config = df.copy()

    if 'stopwords' in config or 'stem' in config or 'lemma' in config:
      # NOTE(review): `stop_words` is not a parameter; it must exist as a
      # module-level name before this function is called - confirm where it is defined
      df_config = text_cleaning_config(df_config, config, stop_words)

    if 'baseline' in config:
      df_config = tfidf_sum(df_config, feature_array, tfidf)
    elif 'textrank' in config:
      df_config = vector_representation(df_config, config, embeddings)
      df_config = textrank(df_config, config)
    elif 'lsa' in config:
      df_config = vector_representation(df_config, config, embeddings)
      df_config = lsa(df_config)

    df_config = extract_summary(df_config, config)
    # evaluate returns the DataFrame with the `rouge` column; keep that result
    # explicitly instead of relying on in-place mutation
    df_config = evaluate(df_config)

    # distribution of metrics across all documents
    eval_results[str(config)] = metrics_distribution(df_config)
    # keep track of model results (with predicted_summaries)
    model_results[str(config)] = df_config

  return eval_results, model_results