# Colab setup

In [1]:
# Fetch the NLTK resources used later in this notebook:
#   'punkt'     - tokenizer data (presumably what sent_tokenize / word_tokenize rely on)
#   'stopwords' - the corpus read by stopwords.words('english')
# Re-running is cheap: packages that are already up to date are reported as such.
import nltk
nltk.download('punkt')
nltk.download('stopwords') 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Import packages

In [0]:
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.probability import FreqDist
from heapq import nlargest
from collections import defaultdict

# Encapsulating the parsing logic in a function

Each website will likely require a custom parser depending on its structure. After going over this code, please return to the Washington Post website and find an article that captures your attention. Substitute its URL into the call below and see if you like the summary.

In [0]:
def getTextWaPo(url, timeout=10):
    '''
    Download a Washington Post article page and return its text as one string.

    Parameters:
        url: address of the article page to fetch.
        timeout: seconds to wait for the server before giving up (default 10).

    Returns:
        The text of every <article> element on the page, joined with single
        spaces, with non-breaking spaces replaced by ordinary spaces.

    Raises:
        requests.exceptions.HTTPError: if the server answers with a 4xx/5xx status.
        requests.exceptions.Timeout: if the server does not answer within `timeout` seconds.
    '''
    # Download the page; the timeout keeps the notebook from hanging on a stalled server
    page = requests.get(url, timeout=timeout)
    # Fail loudly on an error response instead of summarizing an error page
    page.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers (lxml, html5lib) happen to be installed
    soup = BeautifulSoup(page.content, 'html.parser')
    # Extract the text of every <article> element and join into one single string
    text = ' '.join(article.text for article in soup.find_all('article'))
    # Replace non-breaking spaces with ordinary spaces
    return text.replace('\xa0', ' ')

In [0]:
# TODO: the first sentence still carries unneeded parts; find a better way to strip them.
# Alternative article to try:
# text = getTextWaPo("https://www.washingtonpost.com/news/the-switch/wp/2016/10/18/the-pentagons-massive-new-telescope-is-designed-to-track-space-junk-and-watch-out-for-killer-asteroids/?noredirect=on&utm_term=.fab4cac18632")
text = getTextWaPo(
    "https://www.washingtonpost.com/opinions/global-opinions/child-death-rates-are-down-its-cause-for-optimism/2019/09/24/2b618662-dbcd-11e9-bfb1-849887369476_story.html"
)

# Encapsulate the summary logic in a function

In [0]:
def summarize(text, n):
    '''
    Build an extractive summary made of the n highest-scoring sentences of text.

    Each sentence is scored by adding up the whole-text frequencies of its
    non-stopword tokens. The n best sentences are returned in the order in
    which they appear in the original text.

    Parameters:
        text: the document to summarize, as a single string.
        n: the desired number of sentences in the output summary.

    Returns:
        One string containing the selected sentences joined by single spaces.

    Raises:
        AssertionError: if n is larger than the number of sentences in text.
    '''
    # Get list of individual sentences in the text
    sents = sent_tokenize(text)

    # The full text must be at least as long as the desired summary
    assert n <= len(sents), "n must not exceed the number of sentences in the text"

    # Stopwords, punctuation and curly quotes carry no significance
    _stopwords = set(stopwords.words('english') + list(punctuation) + ['“', '”'])

    # Frequency distribution of the remaining (lower-cased) words of the whole text
    words = [word for word in word_tokenize(text.lower()) if word not in _stopwords]
    freq = FreqDist(words)

    # Significance score of each sentence = sum of its word frequencies.
    # Every sentence gets a score (possibly 0) so that exactly n sentences can
    # always be selected, even when some sentences contain no counted word.
    scores = [
        sum(freq[w] for w in word_tokenize(sent.lower()) if w in freq)
        for sent in sents
    ]

    # Pick the most important sentences based on significance score;
    # ties are resolved in favour of the earlier sentence
    top_idx = nlargest(n, range(len(sents)), key=scores.__getitem__)

    # Reorder sentences based on order in original text
    return " ".join([sents[j] for j in sorted(top_idx)])

# How many sentences would you like to return?

In [8]:
# Summarize the downloaded article; n is the number of sentences to keep
summarize(text, n=4)

'In fact, the Gates Foundation’s report notes, “Health and education are improving everywhere in the world.” The share of the world’s population living on $1.90 or less per day stood at 8 percent in 2018, down from 36 percent in 1990. The U.N.’s goal was to bring to zero the number of people living in extreme poverty by 2030; that seems unlikely, according to the Gates Foundation report, but the wonder is how close the world may come. The report’s title, “Examining Inequality,” is properly intended to emphasize that people’s life chances are still far too often a matter of such factors as geography and gender. And the southwest corner of one Sahel country, Chad, has a child mortality rate — about 15 percent — that is even higher than in the rest of the country.'