<a href="https://colab.research.google.com/github/katrina906/CS6120-Summarization-Project/blob/main/text_rank.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# load english wiki word vectors: https://fasttext.cc/docs/en/pretrained-vectors.html
# load wiki fasttext bin and save only model object: smaller 
#ft = FastText.load_fasttext_format("/content/drive/MyDrive/data/wiki.en.bin")
#ft.wv.save('/content/drive/MyDrive/data/wiki.en.model')

In [2]:
!pip install rouge-score
!pip install fasttext
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!unzip glove*.zip



In [3]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip
# TODO: read into drive so don't have to wget every time? 

--2021-03-10 00:05:07--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-03-10 00:05:07--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-03-10 00:05:07--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.1’


2021

In [4]:
import os
import pandas as pd
import numpy as np
import pickle
import string
import re
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import itertools
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from collections import Counter, OrderedDict
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from rouge_score import rouge_scorer
import fasttext
import gensim
from gensim.models import FastText
import sys
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords  
import nltk

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))  
nltk.download('wordnet')

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# load data
df = pd.read_pickle("/content/drive/MyDrive/data/cleaned_df.pkl")
df = df.head(10000)

In [7]:
# clean sentences for similarity comparisons; not for final display
# always do this function
def text_cleaning(df):
  # downcase everything
  df['sentences_cleaned'] = df.sentences.apply(lambda text: [sentence.lower() for sentence in text])
  # remove punctuation 
  df.sentences_cleaned = df.sentences_cleaned.apply(lambda text: [re.sub("[^\w\s]", '', sentence) for sentence in text])

  return df

In [76]:
# cleaning depending on configuration
def text_cleaning_config(doc, config, stop_words):
  words = [sentence.split() for sentence in doc]
  if 'stopwords' in config:
    words = [[w for w in sentence if not w in stop_words] for sentence in words]
  if 'stem' in config:
    stemmer = PorterStemmer()
    words = [[stemmer.stem(w) for w in sentence] for sentence in words]
  if 'lemma' in config:
    lemmatizer = WordNetLemmatizer()
    words = [[lemmatizer.lemmatize(w) for w in sentence] for sentence in words]

  doc = [' '.join(sentence) for sentence in words]

  return doc

### Train TFIDF in Corpus
Used in baseline model to sum tfidf scores within each sentence in each document 

In [8]:
def corpus_tfidf(df):
  # list of words in each article
  corpus = df.sentences_cleaned.to_list()
  corpus = [''.join(article) for article in corpus]  
  corpus = [article.split(' ') for article in corpus]

  # tfidf trained on entire corpus: document = article
  tfidf_vec = TfidfVectorizer(analyzer = 'word', 
                          tokenizer = lambda doc: doc, preprocessor = lambda doc: doc, token_pattern = None)
                          # already did preprocessing, so using identity functions for tokenizer and preprocessor
  tfidf = tfidf_vec.fit_transform(corpus) # sparse arrays of scores for each word in each article. articles x words
  feature_array = list(tfidf_vec.get_feature_names())
  
  return tfidf, feature_array

### Vector Representation 
Default: unigram bag of words with counts
Options: 
1. Bow
  - binary: bag of words with binary indicators rather than counts (don't use with tfidf)
  - tf: term frequency normalization 
    - Same as default if cosine similarity. Cosine similarity does the normalization (double check this!!)
  - idf: inverse document normalization 
  - include_bigrams/include_trigrams: include bigrams and/or trigrams of words in addition to unigrams as distinct tokens in bag of words
    - Gives sense of order in sentence, capture _concepts_ rather than just individual words
2. Embeddings (pre-trained)
  - GloVe
  - Fasttext
    - Advantage: generate embeddings for out of vocabulary words based on their parts
    - But memory issues

In [158]:
# vector representation of words in each sentence in document 
def vector_representation(doc, configuration, embeddings):

  # list of words in each sentence 
  words = [sentence.split() for sentence in doc]

  if 'bow' in configuration:

    # include bigrams and/or trigrams (in addition to unigrams) in bow 
    grams = []
    if 'bigram' in configuration or 'all' in configuration:
      bigrams = [list(nltk.bigrams(sentence)) if len(sentence) >= 2 else '' for sentence in words ]
      grams.append([[words[0] + ' ' + words[1] for words in sentence] for sentence in bigrams]) # combine tuples of words into string
    if 'trigram' in configuration or 'all' in configuration:
      trigrams = [list(nltk.trigrams(sentence)) if len(sentence) >= 3 else '' for sentence in words ]
      grams.append([[words[0] + ' ' + words[1] + ' ' + words[2] for words in sentence] for sentence in trigrams]) # combine tuples of words into string
    # concat with unigrams per sentence
    for i in range(len(grams)):
      words = [grams[i][j] + words[j] for j in range(len(words))] 

    # bag of words with binary indicators for words/n-grams rather than counts
    if 'binary' in configuration: 
      words = [set(sentence) for sentence in words]

    # bag of words: # sentences x # unique words
    vec = DictVectorizer()
    bow = vec.fit_transform(Counter(f) for f in words)

    # term frequency normalization
    if 'tf' in configuration: 
      tfidf_transformer = TfidfTransformer(use_idf = False)
      tfidf = tfidf_transformer.fit_transform(bow)
      return tfidf
    # term frequency-inverse document frequency normalization
    if 'tfidf' in configuration:
      tfidf_transformer = TfidfTransformer(use_idf = True)
      tfidf = tfidf_transformer.fit_transform(bow)
      return tfidf

    return bow

  # possible extension: continued training on specific corpus. Probably unnecessary since wikipedia and news article words should be similar
  if 'embedding' in configuration:

    if 'glove' in configuration:
      word_embeddings = embeddings['glove']
      # find average of word embeddings for each sentence 
      # if unknown word, give embedding = 0 
      sentence_vectors = []
      for sentence in doc_processed:
        sentence_vectors.append(sum([word_embeddings.get(word, np.zeros(100,)) for word in sentence.split()])/(len(sentence.split())))

      return np.array(sentence_vectors)

    # fasttext.
    if 'fasttext' in configuration:
      word_embeddings = embeddings['fasttext']
      # find average of word embeddings for each sentence 
      sentence_vectors = []
      for sentence in doc_processed:
        sentence_vectors.append(sum([word_embeddings[word] for word in sentence.split()])/(len(sentence.split())))

      return np.array(sentence_vectors)

### PageRank Model

In [10]:
# TODO: other similarity metrics?
# TODO: other algorithms
def pagerank(bow):
  # similarity matrix between sentences
  sim =  cosine_similarity(bow)
  # graph where node = sentence, edge weight = simialarity score
  G = nx.from_numpy_array(sim)
  # page rank
  pr = nx.pagerank(G)
  # sort keys in order of page rank
  bestkeys = sorted(pr, key=pr.get, reverse=True)

  return bestkeys

### Baseline Model
- Train TF-IDF on entire corpus where document = article. Get a score for each word in each document
- Sum scores for all words in each sentence 
- Produce sentences with highest total TF-IDF score 

Idea: Sentences that are indicative of the specifics of the article. High frequency in the article, but specific to the article

Could also try straight term frequencies within the article. (or weighted like above so fractional of most frequent rather than diff. magnitudes). Would need to drop stop words first (https://stackabuse.com/text-summarization-with-nltk-in-python/)

In [11]:
def tfidf_sum(doc, feature_array, tfidf):

  # sum tfidf score within each sentence. 
  # Normalize by length of sentence. Otherwise recommend longest sentences 
  sentence_words = [sentence.split() for sentence in doc]
  sentence_scores = [np.sum([tfidf[0,feature_array.index(word)] for word in sentence]) / len(sentence) for sentence in sentence_words]

  # sort keys in order of summed tfidf score
  bestkeys = np.argsort(sentence_scores)[::-1]

  return bestkeys

### Extract Summary
Grab best sentences based on ranking mechanism     
Length of summary (Number of sentences)?
- Number of sentences: generate 1 summary sentence per text sentence (average)
  - Problem: text sentences are much longer than summary sentences, and since we are producing text sentences as our predicted summary, predicted summary is much longer than label summary
- Number of words: generate 20 summary words per 1 text word
  - Strict version: words in summary must be less than the threshold
  - Less strict version: can go over limit by 1 sentence if reach threshold within the sentence

In [12]:
def extract_summary(bestkeys, doc, config):

  # summary based on number of sentences 
  if 'num_sentences' in config:
    max_sentences = int(np.floor(len(doc) / 6)) # average 6 summary sentences per doc sentence
    return [doc[i] for i in  bestkeys[0:max_sentences]]
  # summary based on number of words
  if 'num_words_gt' in config or 'num_words_lt' in config:
    summary = []
    num_words = 0
    max_words = np.floor(len(''.join(doc).split(' ')) / 20) # average 20 summary words per text word
    for i in bestkeys:
      num_words += len(doc[i].split(' ')) 
      # strict version: words in summary must be less than threshold
      if 'num_words_lt' in config:
        if num_words >= max_words:
          return summary
        summary.append(doc[i])
      # less strict version: can go over limit by 1 sentence 
      elif 'num_words_gt' in config:
        summary.append(doc[i])
        if num_words >= max_words:
          return summary

### Evaluation 
ROUGE metric:
https://kavita-ganesan.com/what-is-rouge-and-how-it-works-for-evaluation-of-summaries/#.YEKJyI5KiUl   
- Precision = # overlapping ngrams / # total ngrams in produced summary 
  - Measure of junk. Did we produce a lot in the generated summary that is not in the actual summary?
  - Important if we don't manually set the length. The generated summary could be very long which causes good recall
- Recall = # overlapping ngrams / # total ngrams in label summary  
  - Did we get all the words in the actual summary?
- F1 = harmonic mean
- N-Gram vs. LCS. Do we care about order? Don't need it to measure fluency/proper syntax. But ordering of words can indicate phrases 

Cons: 
- Doesn't look at sentence structure --> doesn't apply here because using correct sentences
- Doesn't consider meaning -- same words could have different meaning   
  
Also considered BLEU, but only gives precision.     
https://towardsdatascience.com/evaluating-text-output-in-nlp-bleu-at-your-own-risk-e8609665a213

In [13]:
def evaluate(predicted_summary, actual_summary):
  # TODO: unigram, bigram etc. models for rouge? - do we care about the order of the words?
  scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer = False)
  rouge = scorer.score(''.join(predicted_summary), ''.join(actual_summary))

  return rouge

In [14]:
# TODO create matrix of configurations to iterate through
  # report config with best precision, best recall, best fmeasure 
  # loop over all documents and average results 

In [15]:
# TODO evaluation strategy. average of rouge1, rouge2, rouge3 (like bleu with weights?). then fmeasure? preicision and recall equally important? 
  # do some algorithms/configurations do better in precision and others do better in recall? 
# TODO compare evaluations between models with paired bootstrap test to test significance? 

In [73]:
configurations_bow = [['pagerank'],
                      ['nostop', 'stopwords'],
                      ['no_stemlemma', 'lemma', 'stem'],
                      ['bow'],
                      ['counts', 'binary'],
                      ['no_normalization', 'tf', 'tfidf'],
                      ['unigram', 'bigram', 'trigram', 'all'],
                      ['num_sentences', 'num_words_lt', 'num_words_gt']
                      ]
configurations_embeddings = [['pagerank'],
                             ['nostop', 'stopwords'],
                             ['no_stemlemma', 'lemma', 'stem'],
                             ['embedding'],
                             ['glove'], # 'fasttext'
                             ['num_sentences', 'num_words_lt', 'num_words_gt']
                             ]
configurations_baseline = [['baseline'],
                           ['nostop', 'stopwords'],
                           ['no_stemlemma', 'lemma', 'stem'],
                           ['num_sentences', 'num_words_lt', 'num_words_gt']
                           ]

In [17]:
embeddings = {}

# load glove embeddings - code from https://www.analyticsvidhya.com/blog/2018/11/introduction-text-summarization-textrank-python/
# 100 length vector for each word 
glove_wv = {}
f = open('glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    glove_wv[word] = coefs
f.close()
embeddings['glove'] = glove_wv

# load fasttext embeddings 
#embeddings['fasttext'] = gensim.models.KeyedVectors.load("/content/drive/MyDrive/data/wiki.en.model")
# TODO memory issues! more work to limit memory? (can load in by itself - compress? https://gist.github.com/generall/68fddb87ae1845d6f54c958ed3d0addb)

In [18]:
df = text_cleaning(df)
doc_processed = df.iloc[0].sentences_cleaned # version for modeling
doc_display = df.iloc[0].sentences # version for display (original punctuation, capitalization etc.)

In [19]:
tfidf, feature_array = corpus_tfidf(df)

In [20]:
# TODO think about memory - run in batches and save config results in an append fashion?
# number of configurations quickly balooning -- currently 468 configurations...
  # maybe evaluate on smaller subsample and then pick top x configurations. Then train on full sample. 

In [159]:
config_results = {}
config_list = list(itertools.product(*configurations_bow)) + list(itertools.product(*configurations_embeddings)) + list(itertools.product(*configurations_baseline))
for config in config_list:
  print(config)
  local_results = {}

  doc_processed_config = text_cleaning_config(doc_processed, config, stop_words)

  if 'baseline' in config:
    bestKeys = tfidf_sum(doc_processed_config , feature_array, tfidf)
  elif 'pagerank' in config:
    vec = vector_representation(doc_processed_config, config, embeddings)
    bestKeys = pagerank(vec)

  local_results['predicted_summary'] = extract_summary(bestKeys, doc_display, config) 
  local_results['actual_summary'] = df.iloc[0].sentences_summary
  local_results['rouge'] = evaluate(local_results['predicted_summary'], local_results['actual_summary'])

  config_results[str(config)] = local_results

('pagerank', 'nostop', 'no_stemlemma', 'bow', 'counts', 'no_normalization', 'unigram', 'num_sentences')
('pagerank', 'nostop', 'no_stemlemma', 'bow', 'counts', 'no_normalization', 'unigram', 'num_words_lt')
('pagerank', 'nostop', 'no_stemlemma', 'bow', 'counts', 'no_normalization', 'unigram', 'num_words_gt')
('pagerank', 'nostop', 'no_stemlemma', 'bow', 'counts', 'no_normalization', 'bigram', 'num_sentences')
('pagerank', 'nostop', 'no_stemlemma', 'bow', 'counts', 'no_normalization', 'bigram', 'num_words_lt')
('pagerank', 'nostop', 'no_stemlemma', 'bow', 'counts', 'no_normalization', 'bigram', 'num_words_gt')
('pagerank', 'nostop', 'no_stemlemma', 'bow', 'counts', 'no_normalization', 'trigram', 'num_sentences')
('pagerank', 'nostop', 'no_stemlemma', 'bow', 'counts', 'no_normalization', 'trigram', 'num_words_lt')
('pagerank', 'nostop', 'no_stemlemma', 'bow', 'counts', 'no_normalization', 'trigram', 'num_words_gt')
('pagerank', 'nostop', 'no_stemlemma', 'bow', 'counts', 'no_normalization

ValueError: ignored

In [25]:
max_rouge1_fmeasure = 0
best_config = ''
for k,v in config_results.items():
  fmeasure = v['rouge']['rouge1'].fmeasure
  if fmeasure > max_rouge1_fmeasure:
    max_rouge1_fmeasure = fmeasure
    best_config = k
best_config

"('pagerank', 'bow', 'counts', 'no_normalization', 'unigram', 'num_words_lt')"

In [84]:
config_results["('pagerank', 'stopwords', 'no_stemlemma', 'bow', 'counts', 'no_normalization', 'trigram', 'num_sentences')"]

KeyError: ignored

In [29]:
config_results["('pagerank', 'bow', 'binary', 'no_normalization', 'bigram', 'num_words_lt')"]

{'actual_summary': ['Syrian official: Obama climbed to the top of the tree, "doesn\'t know how to get down"',
  'Obama sends a letter to the heads of the House and Senate',
  'Obama to seek congressional approval on military action against Syria',
  'Aim is to determine whether CW were used, not by whom, says U.N. spokesman'],
 'predicted_summary': ['Obama sent a letter to the heads of the House and Senate on Saturday night, hours after announcing that he believes military action against Syrian targets is the right step to take over the alleged use of chemical weapons.',
  'He noted that Ban has repeatedly said there is no alternative to a political solution to the crisis in Syria, and that "a military solution is not an option."'],
 'rouge': {'rouge1': Score(precision=0.4423076923076923, recall=0.3382352941176471, fmeasure=0.3833333333333333),
  'rouge2': Score(precision=0.21568627450980393, recall=0.16417910447761194, fmeasure=0.18644067796610167),
  'rougeL': Score(precision=0.32692