<a href="https://colab.research.google.com/github/katrina906/CS6120-Summarization-Project/blob/main/text_rank.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install rouge-score



In [15]:
import os
import pandas as pd
import numpy as np
import pickle
import string
import re
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import itertools
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction import DictVectorizer
from collections import Counter, OrderedDict
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from rouge_score import rouge_scorer

In [3]:
# Colab-only: mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# load data
# The path lives under /content/drive, so the Drive mount cell must have run first.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
df = pd.read_pickle("/content/drive/MyDrive/data/cleaned_df.pkl")

In [5]:
# clean sentences for similarity comparisons; not for final display
# TODO: stemming or lemmatization? 
# TODO: stop word exclusion? 
def text_cleaning(doc):
  """Add a `sentences_cleaned` column with lower-cased, punctuation-free sentences.

  Args:
    doc: DataFrame with a `sentences` column whose cells are lists of
      sentence strings.

  Returns:
    The same DataFrame object, with `sentences_cleaned` added (or overwritten).
    The original `sentences` column is left untouched.
  """
  # Operate on the frame that was passed in rather than on the notebook-global
  # `df`, so the function works for any frame and does not rely on hidden state.
  # Lower-case each sentence, then strip everything that is neither a word
  # character nor whitespace (raw string avoids invalid-escape warnings).
  doc['sentences_cleaned'] = doc['sentences'].apply(
      lambda text: [re.sub(r"[^\w\s]", '', sentence.lower()) for sentence in text])

  return doc

### Vector Representation 
Default: unigram bag of words with counts
Options: 
  - bow_binary: bag of words with binary indicators rather than counts (don't use with tfidf)
  - tf: term frequency normalization 
    - Same as the default when the similarity metric is cosine similarity: cosine similarity already divides each sentence vector by its length, so rescaling the vectors beforehand does not change the scores (double check this!!)
  - tfidf: term frequency-inverse document frequency normalization 
  - bigram/trigram/all: include bigrams and/or trigrams of words in addition to unigrams as distinct tokens in bag of words
    - Gives a sense of word order within the sentence and captures _concepts_ rather than just individual words

In [6]:
# vector representation of words in each sentence in document 
# TODO: try embeddings
def vector_representation(doc, configuration):
  """Turn the sentences of one document into a sentence-by-token matrix.

  Args:
    doc: iterable of sentence strings; each is split on whitespace.
    configuration: collection of option names. Recognised members are
      'bigram', 'trigram', 'all' (n-gram tokens added to the unigrams),
      'bow_binary' (each token counted at most once per sentence),
      'tf' and 'tfidf' (TfidfTransformer without / with idf weighting).
      Any other member is ignored.

  Returns:
    The matrix produced by DictVectorizer.fit_transform (one row per sentence,
    one column per distinct token), passed through TfidfTransformer when 'tf'
    or 'tfidf' is requested. 'tf' takes precedence if both are present.
  """
  wants_all = 'all' in configuration

  # unigram tokens per sentence; n-grams are always derived from these
  unigrams = [sentence.split() for sentence in doc]
  features = unigrams

  # n-gram tuples are joined into single space-separated string tokens and
  # placed in front of the tokens collected so far for the same sentence
  if wants_all or 'bigram' in configuration:
    features = [[' '.join(gram) for gram in nltk.bigrams(sent)] + feats
                for sent, feats in zip(unigrams, features)]
  if wants_all or 'trigram' in configuration:
    features = [[' '.join(gram) for gram in nltk.trigrams(sent)] + feats
                for sent, feats in zip(unigrams, features)]

  # binary indicators: de-duplicate tokens so every count below is 1
  if 'bow_binary' in configuration:
    features = [set(feats) for feats in features]

  # bag of words: # sentences x # unique tokens
  counts = DictVectorizer().fit_transform(Counter(feats) for feats in features)

  # term frequency normalization
  if 'tf' in configuration:
    return TfidfTransformer(use_idf = False).fit_transform(counts)
  # term frequency-inverse document frequency normalization
  if 'tfidf' in configuration:
    return TfidfTransformer(use_idf = True).fit_transform(counts)

  return counts

In [7]:
# TODO: other similarity metrics?
# TODO: other algorithms
def pagerank(bow):
  """Rank the sentences of a document with PageRank over their similarities.

  Args:
    bow: sentence-by-token matrix (one row per sentence).

  Returns:
    The result of nx.pagerank on the similarity graph; extract_summary uses it
    as a mapping from sentence index to score.
  """
  # pairwise cosine similarity between sentence rows
  similarity = cosine_similarity(bow)
  # graph where node = sentence, edge weight = similarity score
  sentence_graph = nx.from_numpy_array(similarity)
  return nx.pagerank(sentence_graph)

In [8]:
def extract_summary(pr, doc, topn):
  """Pick the `topn` highest-scoring sentences of a document.

  Args:
    pr: mapping from sentence index to score.
    doc: indexable collection of sentences, addressed by the keys of `pr`.
    topn: number of sentences to keep.

  Returns:
    List of sentences from `doc`, ordered from highest to lowest score
    (ties keep the iteration order of `pr`).
  """
  # rank (index, score) pairs by descending score; sorted() is stable
  ranked = sorted(pr.items(), key=lambda item: item[1], reverse=True)
  summary = []
  for index, _score in ranked[0:topn]:
    summary.append(doc[index])
  return summary

### Evaluation 
ROUGE metric:
https://kavita-ganesan.com/what-is-rouge-and-how-it-works-for-evaluation-of-summaries/#.YEKJyI5KiUl   
- Precision = # overlapping ngrams / # total ngrams in produced summary 
  - Measure of junk. Did we produce a lot in the generated summary that is not in the actual summary?
  - Important if we don't manually set the length. The generated summary could be very long which causes good recall
- Recall = # overlapping ngrams / # total ngrams in label summary  
  - Did we get all the words in the actual summary?
- F1 = harmonic mean
- N-Gram vs. LCS. Do we care about order? Don't need it to measure fluency/proper syntax. But ordering of words can indicate phrases 

Cons: 
- Doesn't look at sentence structure --> doesn't apply here because using correct sentences
- Doesn't consider meaning -- same words could have different meaning   
  
Also considered BLEU, but it only measures precision.     
https://towardsdatascience.com/evaluating-text-output-in-nlp-bleu-at-your-own-risk-e8609665a213

In [9]:
def evaluate(predicted_summary, actual_summary):
  """Score a predicted summary against the reference summary with ROUGE.

  Args:
    predicted_summary: list of sentence strings produced by the summarizer.
    actual_summary: list of sentence strings forming the reference summary.

  Returns:
    The dict returned by RougeScorer.score, keyed by 'rouge1', 'rouge2' and
    'rougeL'; each value exposes precision, recall and fmeasure.
  """
  # TODO: unigram, bigram etc. models for rouge? - do we care about the order of the words?
  scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer = False)
  # Join with a space: with '' the last word of one sentence and the first word
  # of the next fuse into a single bogus token when no punctuation separates them.
  target = ' '.join(actual_summary)
  prediction = ' '.join(predicted_summary)
  # RougeScorer.score expects (target, prediction); passing them the other way
  # round swaps the reported precision and recall.
  rouge = scorer.score(target, prediction)

  return rouge

In [10]:
# TODO create matrix of configurations to iterate through
  # report config with best precision, best recall, best fmeasure 
  # loop over all documents and average results 

In [None]:
# TODO evaluation strategy. average of rouge1, rouge2, rouge3 (like bleu with weights?). then fmeasure? precision and recall equally important? 
  # do some algorithms/configurations do better in precision and others do better in recall? 
# TODO compare evaluations between models with paired bootstrap test to test significance? 

In [11]:
# Option grid: one entry is drawn from each inner list to form a configuration tuple
# (the sweep cell below takes the itertools.product of these lists).
# vector_representation only tests for 'bow_binary', 'tf', 'tfidf', 'bigram', 'trigram'
# and 'all'; 'bow', 'None' and 'unigram' match none of those checks, so they act as
# placeholders that select the default behaviour on their axis.
configurations = [['bow', 'bow_binary'],
                  ['None', 'tf', 'tfidf'],
                  ['unigram', 'bigram', 'trigram', 'all']
                  ]

In [12]:
# Add the cleaned-sentence column, then pull out the first row (document 0) only;
# the cells below run on this single document.
df = text_cleaning(df)
doc_processed = df.iloc[0].sentences_cleaned # version for modeling
doc_display = df.iloc[0].sentences # version for display (original punctuation, capitalization etc.)

In [16]:
# Run the pipeline once per configuration tuple on the first document and
# collect the results in config_results, keyed by str(config).
config_results = {}
for config in list(itertools.product(*configurations)):
  print(config)
  local_results = {}

  # vectorize the cleaned sentences, then rank them
  bow = vector_representation(doc_processed, config)
  pr = pagerank(bow)
  # the summary is built from the display sentences; indices line up with the cleaned ones
  local_results['predicted_summary'] = extract_summary(pr, doc_display, 3) # TODO: choose best number of sentences (iterate with validation; rule of thumb based on EDA)
  local_results['actual_summary'] = df.iloc[0].sentences_summary
  local_results['rouge'] = evaluate(local_results['predicted_summary'], local_results['actual_summary'])

  config_results[str(config)] = local_results

('bow', 'None', 'unigram')
('bow', 'None', 'bigram')
('bow', 'None', 'trigram')
('bow', 'None', 'all')
('bow', 'tf', 'unigram')
('bow', 'tf', 'bigram')
('bow', 'tf', 'trigram')
('bow', 'tf', 'all')
('bow', 'tfidf', 'unigram')
('bow', 'tfidf', 'bigram')
('bow', 'tfidf', 'trigram')
('bow', 'tfidf', 'all')
('bow_binary', 'None', 'unigram')
('bow_binary', 'None', 'bigram')
('bow_binary', 'None', 'trigram')
('bow_binary', 'None', 'all')
('bow_binary', 'tf', 'unigram')
('bow_binary', 'tf', 'bigram')
('bow_binary', 'tf', 'trigram')
('bow_binary', 'tf', 'all')
('bow_binary', 'tfidf', 'unigram')
('bow_binary', 'tfidf', 'bigram')
('bow_binary', 'tfidf', 'trigram')
('bow_binary', 'tfidf', 'all')


In [26]:
# Find the configuration whose summary has the highest ROUGE-1 F-measure.
# Ties keep the first configuration encountered; best_config stays '' if
# every F-measure is 0.
max_rouge1_fmeasure = 0
best_config = ''
for k,v in config_results.items():
  fmeasure = v['rouge']['rouge1'].fmeasure
  # compare the F-measure computed above; the previous code tested an undefined
  # name (`precision`) and only ran thanks to leftover kernel state
  if fmeasure > max_rouge1_fmeasure:
    max_rouge1_fmeasure = fmeasure
    best_config = k
best_config

"('bow_binary', 'tfidf', 'all')"

In [25]:
# Display the stored summaries and ROUGE scores for one configuration
# (keys of config_results are the str() of the configuration tuple).
config_results["('bow_binary', 'tfidf', 'all')"]

{'actual_summary': ['Syrian official: Obama climbed to the top of the tree, "doesn\'t know how to get down"',
  'Obama sends a letter to the heads of the House and Senate',
  'Obama to seek congressional approval on military action against Syria',
  'Aim is to determine whether CW were used, not by whom, says U.N. spokesman'],
 'predicted_summary': ['Obama sent a letter to the heads of the House and Senate on Saturday night, hours after announcing that he believes military action against Syrian targets is the right step to take over the alleged use of chemical weapons.',
  'On Saturday, Obama proposed what he said would be a limited military action against Syrian President Bashar al-Assad.',
  "5 key assertions: U.S. intelligence report on Syria  Syria: Who wants what after chemical weapons horror  Reactions mixed to Obama's speech  A spokesman for the Syrian National Coalition said that the opposition group was disappointed by Obama's announcement."],
 'rouge': {'rouge1': Score(precis