In [1]:
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.probability import FreqDist
from heapq import nlargest
from collections import defaultdict

In [2]:
# Encapsulating the parsing logic into a function
def getTextWaPo(url, timeout=10):
    """Download a web page and return the text of its <article> elements.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout, requests may wait indefinitely.

    Returns
    -------
    str
        The text of every <article> element found on the page, joined with
        single spaces, with non-breaking spaces (U+00A0) replaced by
        ordinary spaces. Empty string if the page has no <article> element.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or the server answers with an
        HTTP error status.
    """
    # Download the page; fail loudly instead of parsing an error page
    page = requests.get(url, timeout=timeout)
    page.raise_for_status()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid the bs4 warning)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Extract the text of every <article> element and join into one string
    text = ' '.join(article.text for article in soup.find_all('article'))

    # Replace non-breaking spaces with ordinary spaces
    return text.replace('\xa0', ' ')

In [8]:
# Fetch the article text with getTextWaPo and keep it in `text` for the cells below.
# TODO: find a better way to remove the unneeded parts of the first sentence.
# Alternative article, kept for reference:
# text = getTextWaPo("https://www.washingtonpost.com/news/the-switch/wp/2016/10/18/the-pentagons-massive-new-telescope-is-designed-to-track-space-junk-and-watch-out-for-killer-asteroids/?noredirect=on&utm_term=.fab4cac18632")
text = getTextWaPo("https://www.washingtonpost.com/health/2019/06/06/surgeons-opened-her-skull-remove-cancerous-tumor-instead-they-found-tapeworm/?utm_term=.3dff90b31157")

In [10]:
# Encapsulate summary logic into function
def summarize(text, n):
    '''
    Build an extractive summary from the n highest-scoring sentences of text.

    Each sentence is scored by adding up the whole-text frequencies of the
    words it contains, ignoring English stopwords and punctuation. The n
    best sentences are returned in the order they appear in the text.

    Parameters:
        text: string containing the full text to summarize.
        n: desired number of sentences in the output summary.

    Returns:
        A single string with the selected sentences joined by spaces.

    Raises:
        AssertionError: if n is larger than the number of sentences in text.
    '''
    # Get list of individual sentences in the text
    sents = sent_tokenize(text)

    # The full text must have at least as many sentences as the desired summary
    assert n <= len(sents), "n must not exceed the number of sentences in text"

    # Create a set of stopwords (English stopwords, punctuation, curly quotes)
    _stopwords = set(stopwords.words('english') + list(punctuation) + ['“', '”'])

    # Construct a frequency distribution of the significant words in the text
    freq = FreqDist(
        [word for word in word_tokenize(text.lower()) if word not in _stopwords]
    )

    # Compute the significance score of each sentence by adding up the word
    # frequencies. Every sentence gets a score, even 0, so that sentences
    # without any significant word can still be selected and the summary
    # always contains n sentences.
    scores = [
        sum(freq[w] for w in word_tokenize(sent.lower()) if w in freq)
        for sent in sents
    ]

    # Pick the most important sentences based on significance score
    sents_idx = nlargest(n, range(len(sents)), key=scores.__getitem__)

    # Reorder sentences based on order in original text
    return " ".join(sents[j] for j in sorted(sents_idx))

In [11]:
# Display a summary of the downloaded article, requesting 4 sentences
summarize(text, 4)

'In September, scrubbed-up surgeons in an operating room at Mount Sinai Hospital in New York City opened Palma’s cranium and steeled themselves for a malignant brain tumor, said Jonathan Rasouli, chief neurosurgery resident at the Icahn School of Medicine at Mount Sinai. But upon inspection, Rasouli said that it was “clearly not a brain tumor.” Doctors diagnosed Palma with neurocysticercosis, a parasitic infection in the brain caused by the tapeworm Taenia solium. [Restaurant chides man for ‘irresponsible reaction’ — sharing a video of a worm in his food] Bobbi Pritt, director of the Clinical Parasitology Laboratory in the Mayo Clinic’s Department of Laboratory Medicine and Pathology, said Taenia solium is not common in the United States but, when people do become infected, the parasite can present in two different forms. For example, Pritt said, if the person who has the adult tapeworm gets the eggs on his or her hands and then prepares another person’s food, that other person can unk