In [1]:
'''
Auto-Summarizing Text (Rule-based)

Abstract Extraction
    Retrieve text 
      - DL webpage
      - Parse text with BeautifulSoup
    Preprocess text
      - Tokenize into sentences
      - Tokenize sentences into words
      - Remove stopwords
    Extract sentences
      - Find most important words (frequency = importance)
      - Compute significance scores of sentences (significance = sum(word importance))
      - Rank sentences by their score
      - Pick top N sentences

Uses the following packages:
    - requests 
    - BeautifulSoup
    - nltk
'''

'\nAuto-Summarizing Text (Rule-based)\n\nAbstract Extraction\n    Retrieve text \n      - DL webpage\n      - Parse text with BeautifulSoup\n    Preprocess text\n      - Tokenize into sentences\n      - Tokenize sentences into words\n      - Remove stopwords\n    Extract sentences\n      - Find most important words (frequency = importance)\n      - Compute significance scores of sentences (significance = sum(word importance))\n      - Rank sentences by their score\n      - Pick top N sentences\n\nUses the following packages:\n    - requests \n    - BeautifulSoup\n    - nltk\n'

In [6]:
# Retrieve Text
import requests
from bs4 import BeautifulSoup
# Sample article used as the summarizer's input; a later cell passes it to getTextWaPo().
articleURL = "https://www.washingtonpost.com/news/the-switch/wp/2016/10/18/the-pentagons-massive-new-telescope-is-designed-to-track-space-junk-and-watch-out-for-killer-asteroids/?utm_term=.32c73cb18d54"

# Get content from url, then get all content from <article> elements in single string
def getTextWaPo(url):
    '''
    Download a web page and return the text of its <article> elements.

    param url: address of the page to download
    return: text of every <article> element on the page, joined with single
            spaces into one string (empty string if the page has none)
    '''
    # Fetch the page named by the caller (not the notebook-level articleURL).
    page = requests.get(url)
    # Encode to ASCII, silently dropping every character that cannot be represented.
    page_content = page.text.encode(encoding='ascii', errors='ignore')
    soup = BeautifulSoup(page_content, "lxml")
    # Join content from all article elements on the page into a single string.
    return ' '.join(article.text for article in soup.find_all('article'))

In [7]:
# Download the sample article once and keep its extracted text for the cells below.
text = getTextWaPo(articleURL)

In [8]:
# Preprocess text
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from string import punctuation
from collections import defaultdict
from heapq import nlargest

def summarize(text, n):
    '''
    Summarize text by extracting its n highest-scoring sentences.
        1. split the text into sentences
        2. tokenize the lower-cased text into words
        3. filter out English stopwords and punctuation
        4. calculate the importance of each remaining word (importance = frequency)
        5. calculate the significance of each sentence by summing the
           importance of every word it contains
        6. get the indices of the n most significant sentences
        7. return those sentences in their original order

    param text: input text to be summarized
    param n: number of sentences to condense the text into
    return: list of n sentences taken verbatim from text, in document order
    raises AssertionError: if n is larger than the number of sentences in text
    '''
    sents = sent_tokenize(text)

    assert n <= len(sents), (
        "cannot extract %s sentences from a text that only has %s" % (n, len(sents)))

    _stopwords = set(stopwords.words('english') + list(punctuation))
    words = [word for word in word_tokenize(text.lower()) if word not in _stopwords]
    freq = FreqDist(words)

    # Score EVERY sentence, so one made up only of stopwords/punctuation gets
    # 0 instead of being left out of the ranking; otherwise fewer than n
    # sentences could be returned. All non-stopword words contribute.
    scores = [
        sum(freq[w] for w in word_tokenize(sent.lower()) if w in freq)
        for sent in sents
    ]

    # Ties keep the earlier sentence, because candidates are visited in index order.
    sents_idx = nlargest(n, range(len(sents)), key=scores.__getitem__)
    return [sents[j] for j in sorted(sents_idx)]

In [9]:
# Condense the downloaded article into its 4 highest-scoring sentences.
summarize(text, 4)


['On Tuesday, the Defense Departmenttook another significant step toward monitoring all of the cosmic junk swirling around in space, by deliveringa gigantic new telescope capable of seeing small objects from very far away.',
 'DARPA says the advanced technology in the massive, 90-ton telescope wouldallow officials to go from seeing only a few large objects at a time through the equivalent of a drinking straw to a windshield view with 10,000 objects at a time.It is also being used by NASA to monitor asteroids and other near-Earth objects that could collide with the planet, officials said.',
 'The telescope is a big improvement over the legacy ground-based optical telescopes that are used by the U.S. Air Force, because it can search large areas of sky and also track very faint (small) objects in and around GEO, Brian Weeden, a Technical Advisor at the Secure World Foundation, wrote in an email.',
 'Every military operation that takes place in the world today is critically dependent on sp