<a href="https://colab.research.google.com/github/katrina906/CS6120-Summarization-Project/blob/main/exploratory_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Data Prep & Exploratory Analysis

In [None]:
#%%capture
#!pip install -U seaborn

In [1]:
import os
import pandas as pd
import pickle
from nltk import tokenize
import string
import seaborn as sns
import matplotlib.pyplot as plt
import pyarrow
import nltk
import numpy as np
import bokeh
from bokeh.layouts import gridplot, column, row
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
from bokeh.models import Div
from bokeh.models import Span

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Load Data

In [None]:
df = pd.read_pickle("/content/drive/MyDrive/data/stories_df.pkl")

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Basic Cleaning & Text Element Extraction
- Separate summary from rest of text: marked by @highlight 
- Drop empty text (about 100 articles that only have summaries)
- Separate text and summary into sentences 
- Strip news source and location information off of first text sentence
  - Ex (CNN) New  York --- xxxx
- Create EDA features:
  - List of words from text and summary (lower case, no punctuation)
  - Number of words and sentences in each text and summary
- Drop observation when summary has more sentences or more words than text
  - If more sentences sentences: text not formatted into sentences. Ex listing soccer game scores
  - If more words: the text is a caption for video or photo. Summary includes more details than "text"
  - Both are poor examples to summarize
- Save as pickle for future use in modeling

In [4]:
def extract_summary(df):
    # find first instance of @highlight in text
    df['first_pos'] = df.text.str.find('@highlight')
    # new column for summary
    df['summary'] = df.apply(lambda row: row.text[row.first_pos:], axis = 1)
    df.summary = df.summary.str.replace('@highlight', '')
    # remove summary from text
    df.text = df.apply(lambda row: row.text[:row.first_pos], axis = 1)
    
    return df

def extract_words(df, source_var, word_var):
    df[word_var] = df[source_var].str.lower()
    df[word_var] = df[word_var].apply(lambda row: row.translate(str.maketrans('', '', string.punctuation)))
    df[word_var] = df[word_var].str.split(' ')
    
    return df

In [None]:
# extract summary from text block
df = extract_summary(df)
df.summary = df.summary.str.strip()

# drop empty text (about 100 articles that only have summaries)
df = df[df.text != '  ']

# get sentences
# summary: @highlight chunks not separated by punctuation, so split by spaces and then within split by sentences
df['sentences_summary'] = df.summary.str.split('    ')
df.sentences_summary = df.sentences_summary.apply(lambda row: [tokenize.sent_tokenize(i) for i in row])
df.sentences_summary = df.sentences_summary.apply(lambda row: [item for sublist in row for item in sublist])
# text sentence tokenization using nltk
df['sentences'] = df.text.apply(lambda row: tokenize.sent_tokenize(row))

# strip location and news source designations off of first sentence
df['first_sentence'] = df.sentences.apply(lambda row: row[0])
df.first_sentence = df.first_sentence.str.split(' -- ').str[-1] \
                                     .str.replace('\\(CNN\\)', '')
df.sentences = df.apply(lambda row: [row.first_sentence] + row.sentences[1:], axis = 1)

# get words without punctuation, downcased for EDA
df = extract_words(df, 'text', 'words')
df = extract_words(df, 'summary', 'words_summary')

# count number of sentences and words per text, summary
df['num_sentences'] = df.sentences.apply(lambda row: len(row))
df['num_words'] = df.words.apply(lambda row: len(row))

df['num_sentences_summary'] = df.sentences_summary.apply(lambda row: len(row))
df['num_words_summary'] = df.words_summary.apply(lambda row: len(row))

# drop when summary # of sentences or words > text summary/words
df = df[df.num_words > df.num_words_summary]
df = df[df.num_sentences > df.num_sentences_summary]

In [None]:
# save as pickle for future use in modeling
df.to_pickle("/content/drive/MyDrive/data/cleaned_df.pkl")

In [4]:
#df = pd.read_pickle("/content/drive/MyDrive/data/cleaned_df.pkl")

## Plotting Setup

In [None]:
#https://www.techwalla.com/articles/how-to-embed-an-excel-workbook-icon-into-powerpoint - embed html object into powerpoint

In [None]:
%cd drive/MyDrive/figures

In [6]:
output_notebook()

In [7]:
# rule for determining number of bins
def freedman_diaconis_rule(data):

  Q1 = np.quantile(data, 0.25)
  Q3 = np.quantile(data, 0.75)
  IQR = Q3 - Q1
  cube = np.cbrt(len(data))
  bin_width = 2*IQR/cube
  num_bins = data.max()/bin_width

  return num_bins

In [154]:
def histogram(df, var, title, xlabel, onebinper = False, number_of_bins = None):

  # create histogram
  # one bin per observation or Freedman-Diaconis rule 
  if onebinper:
    hist, edges = np.histogram(df[var], bins = int(df[var].max()))
  else:
    hist, edges = np.histogram(df[var], bins = int(np.floor(freedman_diaconis_rule(df[var]))))

  if number_of_bins != None: 
    hist, edges = np.histogram(df[var], bins = number_of_bins)

  # set up figure 
  p = figure(title=title, tools="hover, box_zoom, undo, crosshair", background_fill_color="#fafafa")

  # plot histogram
  p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
          fill_color="navy", line_color="white", alpha=0.5)
  p.line(np.linspace(df[var].min(), df[var].max()), line_color="#ff8888", line_width=4, alpha=0.7)

  # formatting
  p.y_range.start = 0
  p.xaxis.axis_label = xlabel
  p.yaxis.axis_label = 'Count'
  p.grid.grid_line_color="white"
  p.xaxis.axis_label_text_font_size = '12pt'
  p.yaxis.axis_label_text_font_size = '12pt'
  p.title.text_font_size = '14pt'

  return p

### Distribution of Number of Sentences, Words
- Most articles 15-20 sentences and 400 words
- Most summaries 4 sentences and 50 words

In [69]:
suptitle = Div(text = """
<html>
<head>
<style>
h2
</style>
</head>
<body>
<h2>Length Distributions</h1>
</body>
</html>
""")

In [94]:
output_file("length_distributions.html")
p1 = histogram(df, 'num_sentences', 'Article', 'Number of Sentences', True)
p2 = histogram(df, 'num_words', '', 'Number of Words')
p3 = histogram(df, 'num_sentences_summary', 'Summary', 'Number of Sentences', True)
p4 = histogram(df, 'num_words_summary', '', 'Number of Words')
show(column(suptitle, 
            gridplot([[p1, p2]], plot_height = 400, plot_width = 400),
            gridplot([[p3, p4]], plot_height = 400, plot_width = 400)))

### Do the summaries contain new words (not in article?) 
- Evidence for need for abstractive summarization - example summaries include words that are not in the main text
- Evidence that extractive summarization may be sufficient - most articles have 80% of their words appear in the article text

__Number of Words in Summary that don't appear in original text__

In [96]:
# set difference between words in summary and article
def set_diff_words(row):
    return set(row.words_summary).difference(set(row.words))

df['new_summary_words'] = df.apply(lambda row: set_diff_words(row), axis = 1)
df['num_new_summary_words'] = df.new_summary_words.apply(lambda row: len(row))

In [164]:
p1 = histogram(df, 'num_new_summary_words', 'Number of New Words', 'Number of Words', True)
show(p1)

__Percent of words that are new in summary__ 
(out of number of unique words)

In [99]:
df['num_unique_words'] = df.apply(lambda row: len(set(row.words_summary)), axis = 1)
df['perc_new_words'] = (df.num_new_summary_words / df.num_unique_words) * 100

In [165]:
p2 = histogram(df, 'perc_new_words', 'Percent of New Words', 'Percent of Words', True)
show(p2)

In [166]:
suptitle = Div(text = """
<html>
<head>
<style>
h2
</style>
</head>
<body>
<h2>Words in Summary not in Article</h1>
</body>
</html>
""")
output_file("new_words.html")
show(column(suptitle, gridplot([[p1, p2]], plot_height = 400, plot_width = 400)))

## How long should the summary be? 
- Heuristics: 
  - 8 article sentences per summary sentence
  - 12 article words per summary word
- Evidence number of sentences is not a good heuristic: article sentences are about 2 times longer than summary sentences

__Correlation between text and summary length__ 
- Positive correlation but not very strong

In [None]:
df[['num_sentences', 'num_sentences_summary']].corr()

Unnamed: 0,num_sentences,num_sentences_summary
num_sentences,1.0,0.203568
num_sentences_summary,0.203568,1.0


In [None]:
df[['num_words', 'num_words_summary']].corr()

Unnamed: 0,num_words,num_words_summary
num_words,1.0,0.269358
num_words_summary,0.269358,1.0


__Number of Article Sentences per Summary Sentences__
- About 8 article sentences per summary sentence

In [102]:
# number article sentences per article sentences 
df['sentence_ratio'] = df.num_sentences / df.num_sentences_summary
df.sentence_ratio.describe()

count    92322.000000
mean         9.133393
std          5.412633
min          1.200000
25%          5.000000
50%          8.000000
75%         11.750000
max         74.000000
Name: sentence_ratio, dtype: float64

In [161]:
p1 = histogram(df, 'sentence_ratio', 'Sentences', '# Article / # Summary Sentences', True)
vline = Span(location=8, dimension='height', line_color='black', line_width=2)
p1.renderers.extend([vline])
show(p1)

__Number of Article Words per Summary Words__
- About 12 text words per summary word


In [103]:
df['words_ratio'] = df.num_words / df.num_words_summary
df.words_ratio.describe()

count    92322.000000
mean        13.957446
std          7.551687
min          1.183673
25%          8.363997
50%         12.446291
75%         17.836642
max        121.555556
Name: words_ratio, dtype: float64

In [162]:
p2 = histogram(df, 'words_ratio', 'Words', '# Article / # Summary Words', True)
vline = Span(location=12, dimension='height', line_color='black', line_width=2)
p2.renderers.extend([vline])
show(p2)

In [163]:
suptitle = Div(text = """
<html>
<head>
<style>
h2
</style>
</head>
<body>
<h2>Relative length of Summary vs Article</h1>
</body>
</html>
""")
output_file("relative length.html")
show(column(suptitle, gridplot([[p1, p2]], plot_height = 400, plot_width = 400)))

### Number of words per summary sentence vs article sentence
- Text sentences about 2x longer than summary sentences
- Indicates that number of summary sentences per article sentences is not a good metric for determining length of generated summaries - will be overly restrictive: 8 article sentences per summary sentence, but each article sentence conveys a lot more information. 

In [136]:
df['words_per_sentence_summary'] = df.sentences_summary.apply(lambda row: np.mean([len(sentence.split(' ')) for sentence in row]))
df['words_per_sentence'] = df.sentences.apply(lambda row: np.mean([len(sentence.split(' ')) for sentence in row]))
df['words_per_sentence_ratio'] = df.words_per_sentence / df.words_per_sentence_summary

In [137]:
df.words_per_sentence_ratio.describe()

count    92322.000000
mean         1.839935
std          0.528256
min          0.451852
25%          1.527588
50%          1.787089
75%          2.083916
max         28.434066
Name: words_per_sentence_ratio, dtype: float64

In [167]:
p = histogram(df[df.words_per_sentence_ratio < 5], 'words_per_sentence_ratio', 'Article sentences 2x longer than Summary sentences', '# Article / # Summary Words Per Sentence', number_of_bins = 50)
output_file("articles_longer.html")
show(p)
# note truncated