<a href="https://colab.research.google.com/github/katrina906/CS6120-Summarization-Project/blob/main/text_rank.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# https://stackabuse.com/text-summarization-with-nltk-in-python/ -- TODO baseline method, just summing weights in sentences 

In [2]:
!pip install rouge-score



In [3]:
import os
import pandas as pd
import numpy as np
import pickle
import string
import re
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import itertools
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction import DictVectorizer
from collections import Counter, OrderedDict
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from rouge_score import rouge_scorer

In [4]:
#from google.colab import drive
#drive.mount('/content/drive')

In [5]:
# load data
# NOTE: the file lives under /content/drive, so this presumably needs Google Drive mounted in Colab first
# NOTE(review): unpickling can execute arbitrary code -- only load this file from a trusted source
df = pd.read_pickle("/content/drive/MyDrive/data/cleaned_df.pkl")

In [6]:
# clean sentences for similarity comparisons; not for final display
# TODO: stemming or lemmatization? 
# TODO: stop word exclusion? 
def text_cleaning(doc):
  """Add a `sentences_cleaned` column holding normalized copies of every sentence.

  Each sentence is lower-cased and stripped of every character that is neither
  a word character nor whitespace. The `sentences` column itself is not modified.

  Args:
    doc: DataFrame with a `sentences` column whose cells are lists of sentence strings.

  Returns:
    The same DataFrame object that was passed in, with `sentences_cleaned`
    added (or overwritten).
  """
  # work on the frame that was passed in, not on a notebook-global `df`
  doc['sentences_cleaned'] = doc['sentences'].apply(
      # raw string so that \w and \s reach the regex engine unchanged
      lambda text: [re.sub(r"[^\w\s]", '', sentence.lower()) for sentence in text])

  return doc

### Vector Representation 
Default: unigram bag of words with counts
Options: 
  - binary: bag of words with binary indicators rather than counts (don't use with tfidf)
  - tf: term frequency normalization 
    - Equivalent to the default when sentences are compared with cosine similarity, because cosine similarity already divides each vector by its length (TODO: double-check this)
  - tfidf: term frequency-inverse document frequency normalization
  - bigram/trigram/all: include bigrams, trigrams, or both in addition to unigrams as distinct tokens in the bag of words
    - Gives sense of order in sentence, capture _concepts_ rather than just individual words

In [7]:
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!unzip glove*.zip

In [8]:
#wv = fasttext.load_model("/content/drive/MyDrive/data/wiki.en.bin")

In [9]:
#!pip install fasttext
#import fasttext

In [10]:
# vector representation of words in each sentence in document 
# TODO: try embeddings - fasttext
# path -> {word: vector}; avoids re-parsing the large embedding file for every configuration
_GLOVE_CACHE = {}


def _load_glove_embeddings(path):
  """Parse a GloVe text file into a dict of word -> float32 vector (cached per path)."""
  # parsing adapted from https://www.analyticsvidhya.com/blog/2018/11/introduction-text-summarization-textrank-python/
  if path not in _GLOVE_CACHE:
    word_embeddings = {}
    # `with` closes the file even if a line fails to parse
    with open(path, encoding='utf-8') as f:
      for line in f:
        values = line.split()
        if not values:  # tolerate blank lines
          continue
        word_embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    _GLOVE_CACHE[path] = word_embeddings
  return _GLOVE_CACHE[path]


def vector_representation(doc, configuration, glove_path='glove.6B.100d.txt'):
  """Turn every sentence of one document into a numeric vector.

  Args:
    doc: list of sentence strings; words are obtained by whitespace splitting.
    configuration: collection of option names (tuple/list/set, not a single string).
      'bow' selects bag of words, optionally refined by:
        'bigram' / 'trigram' / 'all': add n-grams as extra tokens next to the unigrams
        'binary': 0/1 indicators instead of counts
        'tf': TfidfTransformer(use_idf=False); 'tfidf': TfidfTransformer(use_idf=True)
      'embedding' together with 'glove' selects averaged GloVe word vectors.
    glove_path: GloVe text file to read for the 'glove' option.

  Returns:
    'bow': the matrix produced by DictVectorizer / TfidfTransformer, one row per sentence.
    'embedding' + 'glove': numpy array with one row per sentence holding the mean
      of its word vectors (unknown words count as zero vectors, and a sentence
      without words becomes a zero vector).

  Raises:
    ValueError: if `configuration` selects no supported representation.
  """
  # list of words in each sentence
  words = [sentence.split() for sentence in doc]

  if 'bow' in configuration:

    # include bigrams and/or trigrams (in addition to unigrams) in bow
    tokens = [list(sentence) for sentence in words]
    if 'bigram' in configuration or 'all' in configuration:
      tokens = [[' '.join(gram) for gram in nltk.bigrams(sentence)] + sentence_tokens
                for sentence, sentence_tokens in zip(words, tokens)]
    if 'trigram' in configuration or 'all' in configuration:
      tokens = [[' '.join(gram) for gram in nltk.trigrams(sentence)] + sentence_tokens
                for sentence, sentence_tokens in zip(words, tokens)]

    # bag of words with binary indicators for words/n-grams rather than counts
    if 'binary' in configuration:
      tokens = [set(sentence_tokens) for sentence_tokens in tokens]

    # bag of words: # sentences x # unique words
    vec = DictVectorizer()
    bow = vec.fit_transform(Counter(sentence_tokens) for sentence_tokens in tokens)

    # term frequency normalization
    if 'tf' in configuration:
      return TfidfTransformer(use_idf = False).fit_transform(bow)
    # term frequency-inverse document frequency normalization
    if 'tfidf' in configuration:
      return TfidfTransformer(use_idf = True).fit_transform(bow)

    return bow

  if 'embedding' in configuration:

    if 'glove' in configuration:
      word_embeddings = _load_glove_embeddings(glove_path)
      # take the vector length from the file; 100 matches glove.6B.100d.txt
      dim = len(next(iter(word_embeddings.values()))) if word_embeddings else 100
      unknown = np.zeros(dim)

      # find average of word embeddings for each sentence
      # if unknown word, give embedding = 0
      sentence_vectors = []
      for sentence_words in words:  # use the `doc` argument, not a notebook global
        if not sentence_words:
          # no words to average: avoid dividing by zero
          sentence_vectors.append(np.zeros(dim))
          continue
        vectors = [word_embeddings.get(word, unknown) for word in sentence_words]
        sentence_vectors.append(sum(vectors) / len(sentence_words))

      return np.array(sentence_vectors)

    # fasttext. Advantage: generate embeddings for out of vocabulary words based on their parts
    # possible extension: continued training on specific corpus. Probably unnecessary since wikipedia and news article words should be similar

  raise ValueError('unsupported vector representation configuration: {!r}'.format(configuration))

In [11]:
# TODO: other similarity metrics?
# TODO: other algorithms
def pagerank(bow):
  """Rank the sentences of one document with PageRank.

  Builds a graph with one node per sentence (row of `bow`) whose edge weights
  are the pairwise cosine similarities between rows, then runs PageRank on it.

  Returns:
    The result of nx.pagerank for that graph (node index -> score).
  """
  # node = sentence, edge weight = similarity score
  sentence_graph = nx.from_numpy_array(cosine_similarity(bow))
  return nx.pagerank(sentence_graph)

In [70]:
def extract_summary(pr, doc, config, sentence_ratio=6, word_ratio=20):
  """Select the highest-ranked sentences of a document as its summary.

  Args:
    pr: dict mapping sentence index -> rank score.
    doc: list of sentence strings, indexed by the keys of `pr`.
    config: collection of option names; the length rule is chosen by one of
      'num_sentences': keep len(doc) // sentence_ratio sentences
      'num_words_lt': keep adding sentences while the running word count stays
        below the word budget (strict)
      'num_words_gt': keep adding sentences until the word budget is reached;
        the last sentence may overshoot it
      The word budget is (total words in doc) // word_ratio.
    sentence_ratio: document sentences per summary sentence.
    word_ratio: document words per summary word.

  Returns:
    List of selected sentences, best-ranked first (possibly empty).

  Raises:
    ValueError: if `config` contains none of the length rules.
  """
  # sort keys in order of page rank
  bestkeys = sorted(pr, key=pr.get, reverse=True)

  # summary based on number of sentences
  if 'num_sentences' in config:
    max_sentences = int(len(doc) // sentence_ratio)
    return [doc[i] for i in bestkeys[:max_sentences]]

  # summary based on number of words
  # text sentences much longer than summary sentences
  if 'num_words_gt' in config or 'num_words_lt' in config:
    # count words per sentence with split(): no words fused across sentence
    # boundaries and no empty tokens from repeated spaces
    sentence_lengths = [len(sentence.split()) for sentence in doc]
    max_words = sum(sentence_lengths) // word_ratio

    summary = []
    num_words = 0
    for i in bestkeys:
      num_words += sentence_lengths[i]
      if 'num_words_lt' in config:
        # strict version: words in summary must be less than threshold
        if num_words >= max_words:
          return summary
        summary.append(doc[i])
      else:
        # less strict version: can go over limit by 1 sentence
        summary.append(doc[i])
        if num_words >= max_words:
          return summary
    # ranking exhausted before the budget was hit (e.g. empty document)
    return summary

  raise ValueError("config must contain 'num_sentences', 'num_words_lt' or 'num_words_gt': {!r}".format(config))

### Evaluation 
ROUGE metric:
https://kavita-ganesan.com/what-is-rouge-and-how-it-works-for-evaluation-of-summaries/#.YEKJyI5KiUl   
- Precision = # overlapping ngrams / # total ngrams in produced summary 
  - Measure of junk. Did we produce a lot in the generated summary that is not in the actual summary?
  - Important if we don't manually set the length. The generated summary could be very long which causes good recall
- Recall = # overlapping ngrams / # total ngrams in label summary  
  - Did we get all the words in the actual summary?
- F1 = harmonic mean of precision and recall
- N-Gram vs. LCS. Do we care about order? Don't need it to measure fluency/proper syntax. But ordering of words can indicate phrases 

Cons: 
- Doesn't look at sentence structure --> not a concern here, because the summary is built from the article's own sentences
- Doesn't consider meaning -- same words could have different meaning   
  
Also considered BLEU, but only gives precision.     
https://towardsdatascience.com/evaluating-text-output-in-nlp-bleu-at-your-own-risk-e8609665a213

In [13]:
def evaluate(predicted_summary, actual_summary):
  """Score a predicted summary against the reference summary with ROUGE.

  Args:
    predicted_summary: list of sentence strings produced by the model.
    actual_summary: list of sentence strings of the reference summary.

  Returns:
    The result of RougeScorer.score for 'rouge1', 'rouge2' and 'rougeL'
    (stemming disabled).
  """
  # TODO: unigram, bigram etc. models for rouge? - do we care about the order of the words?
  scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer = False)
  # join with a space so the last word of one sentence is not fused with the
  # first word of the next
  target = ' '.join(actual_summary)
  prediction = ' '.join(predicted_summary)
  # score() takes (target, prediction); passing them reversed swaps precision and recall
  rouge = scorer.score(target, prediction)

  return rouge

In [14]:
# TODO create matrix of configurations to iterate through
  # report config with best precision, best recall, best fmeasure 
  # loop over all documents and average results 

In [15]:
# TODO evaluation strategy. average of rouge1, rouge2, rouge3 (like bleu with weights?). then fmeasure? precision and recall equally important? 
  # do some algorithms/configurations do better in precision and others do better in recall? 
# TODO compare evaluations between models with paired bootstrap test to test significance? 

In [71]:
# Each inner list is one axis of the search grid; one option is taken from every axis.
configurations_bow = [
  ['bow'],                                            # representation family
  ['counts', 'binary'],                               # raw counts vs. 0/1 indicators
  ['no_normalization', 'tf', 'tfidf'],                # weighting of the count matrix
  ['unigram', 'bigram', 'trigram', 'all'],            # n-grams added to the unigrams
  ['num_sentences', 'num_words_lt', 'num_words_gt'],  # how the summary length is capped
]
configurations_embeddings = [
  ['embedding'],                                      # representation family
  ['glove'], # fasttext
  ['num_sentences', 'num_words_lt', 'num_words_gt'],  # how the summary length is capped
]

In [17]:
# add the cleaned-sentence column, then take the first row as the single document to experiment on
df = text_cleaning(df)
doc_processed = df.iloc[0].sentences_cleaned # version for modeling
doc_display = df.iloc[0].sentences # version for display (original punctuation, capitalization etc.)

In [72]:
# results of every configuration on the first document, keyed by str(config)
config_results = {}
# full grid: one option from each bag-of-words axis, followed by one option from each embedding axis
config_list = list(itertools.product(*configurations_bow)) + list(itertools.product(*configurations_embeddings))
for config in config_list:
  print(config)
  local_results = {}

  # `bow` holds the sentence vectors for either representation, not only bag of words
  bow = vector_representation(doc_processed, config)
  pr = pagerank(bow)  
  # rank on the cleaned sentences, but build the summary from the display sentences at the same indices
  local_results['predicted_summary'] = extract_summary(pr, doc_display, config) 
  local_results['actual_summary'] = df.iloc[0].sentences_summary
  local_results['rouge'] = evaluate(local_results['predicted_summary'], local_results['actual_summary'])

  config_results[str(config)] = local_results

('bow', 'counts', 'no_normalization', 'unigram', 'num_sentences')
('bow', 'counts', 'no_normalization', 'unigram', 'num_words_lt')
('bow', 'counts', 'no_normalization', 'unigram', 'num_words_gt')
('bow', 'counts', 'no_normalization', 'bigram', 'num_sentences')
('bow', 'counts', 'no_normalization', 'bigram', 'num_words_lt')
('bow', 'counts', 'no_normalization', 'bigram', 'num_words_gt')
('bow', 'counts', 'no_normalization', 'trigram', 'num_sentences')
('bow', 'counts', 'no_normalization', 'trigram', 'num_words_lt')
('bow', 'counts', 'no_normalization', 'trigram', 'num_words_gt')
('bow', 'counts', 'no_normalization', 'all', 'num_sentences')
('bow', 'counts', 'no_normalization', 'all', 'num_words_lt')
('bow', 'counts', 'no_normalization', 'all', 'num_words_gt')
('bow', 'counts', 'tf', 'unigram', 'num_sentences')
('bow', 'counts', 'tf', 'unigram', 'num_words_lt')
('bow', 'counts', 'tf', 'unigram', 'num_words_gt')
('bow', 'counts', 'tf', 'bigram', 'num_sentences')
('bow', 'counts', 'tf', 'b

In [73]:
# Find the configuration with the highest ROUGE-1 F-measure.
# Ties keep the configuration seen first; if no F-measure exceeds 0, best_config stays ''.
max_rouge1_fmeasure = 0
best_config = ''
for k,v in config_results.items():
  fmeasure = v['rouge']['rouge1'].fmeasure
  if fmeasure > max_rouge1_fmeasure:
    max_rouge1_fmeasure = fmeasure
    best_config = k
# last expression of the cell: displays the winning configuration key
best_config

"('bow', 'counts', 'no_normalization', 'unigram', 'num_words_lt')"

In [24]:
# keys are str(config) of the full tuple, which includes the summary-length option
config_results["('embedding', 'glove', 'num_sentences')"]

{'actual_summary': ['Syrian official: Obama climbed to the top of the tree, "doesn\'t know how to get down"',
  'Obama sends a letter to the heads of the House and Senate',
  'Obama to seek congressional approval on military action against Syria',
  'Aim is to determine whether CW were used, not by whom, says U.N. spokesman'],
 'predicted_summary': ['Obama sent a letter to the heads of the House and Senate on Saturday night, hours after announcing that he believes military action against Syrian targets is the right step to take over the alleged use of chemical weapons.',
  "Bergen:  Syria is a problem from hell for the U.S.  Obama: 'This menace must be confronted'  Obama's senior advisers have debated the next steps to take, and the president's comments Saturday came amid mounting political pressure over the situation in Syria.",
  'Why Russia, China, Iran stand by Assad  Syria\'s government unfazed  After Obama\'s speech, a military and political analyst on Syrian state TV said Obama 

In [23]:
# keys are str(config) of the full tuple, which includes the summary-length option
config_results["('bow', 'binary', 'no_normalization', 'bigram', 'num_sentences')"]

{'actual_summary': ['Syrian official: Obama climbed to the top of the tree, "doesn\'t know how to get down"',
  'Obama sends a letter to the heads of the House and Senate',
  'Obama to seek congressional approval on military action against Syria',
  'Aim is to determine whether CW were used, not by whom, says U.N. spokesman'],
 'predicted_summary': ['Obama sent a letter to the heads of the House and Senate on Saturday night, hours after announcing that he believes military action against Syrian targets is the right step to take over the alleged use of chemical weapons.',
  'He noted that Ban has repeatedly said there is no alternative to a political solution to the crisis in Syria, and that "a military solution is not an option."',
  "5 key assertions: U.S. intelligence report on Syria  Syria: Who wants what after chemical weapons horror  Reactions mixed to Obama's speech  A spokesman for the Syrian National Coalition said that the opposition group was disappointed by Obama's announcem