In [20]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest

In [21]:
class FrequencySummarizer:
    def __init__(self, min_cut=0.1, max_cut=0.9):
        self._min_cut = min_cut
        self._max_cut = max_cut
        self._stopwords = set(stopwords.words('english') + list(punctuation))
        
    def _compute_frequencies(self, word_sent):
        freq = defaultdict(int)
        for sentence in word_sent:
#             print sentence
            for word in sentence:
#                 print word
                if word not in self._stopwords:
                    freq[word] += 1
        m = float(max(freq.values()))
        
        for w in freq.keys():
            freq[w] = freq[w]/m
            if freq[w] >= self._max_cut or freq[w] <= self._min_cut:
                del freq[w]
                
        return freq
    
    def summarize(self, text, n):
        sents = sent_tokenize(text)  # sent_tokenize 僅適用於英文
        assert n <= len(sents)
        word_sent = [word_tokenize(s.lower()) for s in sents]
        self._freq = self._compute_frequencies(word_sent)
        ranking = defaultdict(int)
        for i,sent in enumerate(word_sent):
            for w in sent:
                for w in self._freq:
#                     print self._freq[w]
                    ranking[i] += self._freq[w]
    
        sent_idx = nlargest(n, ranking, key = ranking.get)
        return [sents[j] for j in sent_idx]

##### class FrequencySummarizer1:
    # indentation changes - we are now inside the class definition
    def __init__(self, min_cut=0.1, max_cut=0.9):
        # The constructor named __init__
        # THis function will be called each time an object of this class is 
        # instantiated
        # btw, note how the special keyword 'self' is passed in as the first
        # argument to each method (member function).
        self._min_cut = min_cut
        self._max_cut = max_cut 
        # Words that have a frequency term lower than min_cut 
        # or higer than max_cut will be ignored.
        self._stopwords = set(stopwords.words('english') + list(punctuation))
        # Punctuation symbols and stopwords (common words like 'an','the' etc) are ignored
        #
        # Here self._min_cut, self._max_cut and self._stopwords are all member variables
        # i.e. each object (instance) of this class will have an independent version of these
        # variables. 
        # Note how this function is used to set up the member variables to their appropriate values
    # indentation changes - we are out of the constructor (member function, but we are still inside)
    # the class.
    # One important note: if you are used to programming in Java or C#: if you define a variable here
    # i.e. outside a member function but inside the class - it becomes a STATIC member variable
    # THis is an important difference from Java, C# (where all member variables would be defined here)
    # and is a common gotcha to be avoided.

    def _compute_frequencies(self, word_sent):
        # next method (member function) which takes in self (the special keyword for this same object)
        # as well as a list of sentences, and outputs a dictionary, where the keys are words, and
        # values are the frequencies of those words in the set of sentences
        freq = defaultdict(int)
        # defaultdict, which we referred to above - is a class that inherits from dictionary,
        # with one difference: Usually, a Python dictionary throws a KeyError if you try 
        # to get an item with a key that is not currently in the dictionary. 
        # The defaultdict in contrast will simply create any items that you try to access 
        # (provided of course they do not exist yet). THe 'int' passed in as argument tells
        # the defaultdict object to create a default value of 0
        for s in word_sent:
        # indentation changes - we are inside the for loop, for each sentence
          for word in s:
            # indentation changes again - this is an inner for loop, once per each word_sent
            # in that sentence
            if word not in self._stopwords:
                # if the word is in the member variable (dictionary) self._stopwords, then ignore it,
                # else increment the frequency. Had the dictionary freq been a regular dictionary (not a 
                # defaultdict, we would have had to first check whether this word is in the dict
                freq[word] += 1
        # Done with the frequency calculation - now go through our frequency list and do 2 things
        #   normalize the frequencies by dividing each by the highest frequency (this allows us to 
        #            always have frequencies between 0 and 1, which makes comparing them easy
        #   filter out frequencies that are too high or too low. A trick that yields better results.
        m = float(max(freq.values()))
        # get the highest frequency of any word in the list of words
        for w in freq.keys():
            # indentation changes - we are inside the for loop
            freq[w] = freq[w]/m
            # divide each frequency by that max value, so it is now between 0 and 1
            if freq[w] >= self._max_cut or freq[w] <= self._min_cut:
                # indentation changes - we are inside the if statement - if we are here the word is either
                # really common or really uncommon. In either case - delete it from our dictionary
                del freq[w]
                # remember that del can be used to remove a key-value pair from the dictionary
        return freq
        # return the frequency list
        

In [22]:
fs = FrequencySummarizer()
fs.summarize(u'Blues brothers Don Jr. and Eric Trump gamble on Mississippi tourism - The Washington Post,CLEVELAND, MISS. \u2014 Jake Brown crooned the Mississippi blues to a nearly all-black audience on the outskirts of town, his guitar filling the darkened club with pangs of heartbreak and regret.   Between numbers, the local singer paused and in a gravelly drawl, beseeched the crowd to be thankful. For God. For the Mississippi blues. And for Donald Trump\u2019s hotel, being built on the other side of Cleveland. \u201cHave you all been out west of Cleveland?\u201d he queried his audience. \u201cTo those that don\u2019t know, get ready. Get ready, \u2019cause the blues is on the way.\u201d President Trump\u2019s hotel company, the New York-based managers of luxury properties and golf courses around the globe, seems an unlikely presence in this struggling stretch of the Delta, where new businesses are hard to recruit and black residents are eight times more likely than whites to face unemployment.  But in June, the Trump Organization, now run by the president\u2019s sons Donald Trump Jr. and Eric Trump, bestowed a singular distinction upon Cleveland, population 12,000, and two nearby towns. It announced it would debut two new hotel brands here, beginning with a four-star, 100-room Scion hotel originally designed to replicate an antebellum plantation.   A sign in downtown Cleveland, Miss., shows another hotel development in the area where the Trump family plans to open properties. The hope of the buildup is to lure tourists in search of blues music. (Lee Powell/The Washington Post)  In a partnership with local owners, the company said it would reopen two Comfort Inns and a Rodeway Inn after bringing them up to Trump standards and use the properties to launch its newest.', 5)
# fs._compute_frequencies(u'Blues brothers Don Jr. and Eric Trump gamble on Mississippi tourism - The Washington Post,CLEVELAND, MISS. \u2014 Jake Brown crooned the Mississippi blues to a nearly all-black audience on the outskirts of town, his guitar filling the darkened club with pangs of heartbreak and regret.   Between numbers, the local singer paused and in a gravelly drawl, beseeched the crowd to be thankful. For God. For the Mississippi blues. And for Donald Trump\u2019s hotel, being built on the other side of Cleveland. \u201cHave you all been out west of Cleveland?\u201d he queried his audience. \u201cTo those that don\u2019t know, get ready. Get ready, \u2019cause the blues is on the way.\u201d President Trump\u2019s hotel company, the New York-based managers of luxury properties and golf courses around the globe, seems an unlikely presence in this struggling stretch of the Delta, where new businesses are hard to recruit and black residents are eight times more likely than whites to face unemployment.  But in June, the Trump Organization, now run by the president\u2019s sons Donald Trump Jr. and Eric Trump, bestowed a singular distinction upon Cleveland, population 12,000, and two nearby towns. It announced it would debut two new hotel brands here, beginning with a four-star, 100-room Scion hotel originally designed to replicate an antebellum plantation.   A sign in downtown Cleveland, Miss., shows another hotel development in the area where the Trump family plans to open properties. The hope of the buildup is to lure tourists in search of blues music. (Lee Powell/The Washington Post)  In a partnership with local owners, the company said it would reopen two Comfort Inns and a Rodeway Inn after bringing them up to Trump standards and use the properties to launch its newest.')

[u'Get ready, \u2019cause the blues is on the way.\u201d President Trump\u2019s hotel company, the New York-based managers of luxury properties and golf courses around the globe, seems an unlikely presence in this struggling stretch of the Delta, where new businesses are hard to recruit and black residents are eight times more likely than whites to face unemployment.',
 u'(Lee Powell/The Washington Post)  In a partnership with local owners, the company said it would reopen two Comfort Inns and a Rodeway Inn after bringing them up to Trump standards and use the properties to launch its newest.',
 u'But in June, the Trump Organization, now run by the president\u2019s sons Donald Trump Jr. and Eric Trump, bestowed a singular distinction upon Cleveland, population 12,000, and two nearby towns.',
 u'\u2014 Jake Brown crooned the Mississippi blues to a nearly all-black audience on the outskirts of town, his guitar filling the darkened club with pangs of heartbreak and regret.',
 u'It announc

In [23]:
fs = FrequencySummarizer()

fs._compute_frequencies(u'中文測試，中文文字寫會如何')
print fs.summarize(u'中文測試，中文文字寫會如何. 你那隻到手 一定要先會拆裝一次. 了解結構, ＬＡＭＹ寫多之後.，覺得筆尖 有開，轉折時墨水粗細分明', 2)
## 需要用 '.' 來分句子
print fs._freq.viewvalues

[]
<built-in method viewvalues of collections.defaultdict object at 0x11479ccb0>


In [24]:
# 擷取網頁

import urllib2
from bs4 import BeautifulSoup

In [25]:
def get_only_text(url):
    page = urllib2.urlopen(url).read().decode('utf8')
    soup = BeautifulSoup(page, 'lxml')
    text = ' '.join(map(lambda p: p.text, soup.find_all('article')))
    soup2 = BeautifulSoup(text, 'lxml')
    
    if soup2.find_all('p')!=[]:
        text = ' '.join(map(lambda p: p.text, soup2.find_all('p')))
    return soup.title.text, text

In [26]:
someUrl = "https://www.washingtonpost.com/news/the-switch/wp/2015/08/06/why-kids-are-meeting-more-strangers-online-than-ever-before/"
# the article we would like to summarize
textOfUrl = get_only_text(someUrl)
# get the title and text
fs = FrequencySummarizer()
# instantiate our FrequencySummarizer class and get an object of this class
summary = fs.summarize(textOfUrl[1], 3)
# get a summary of this article that is 3 sentences long
summary

[u'"The digital world has taken its place alongside school and friends\' houses and extracurriculars as a place where teens go to make and strengthen friendships," said Amanda Lenhart, author of the report "Teens, Technology & Friendships" and an associate\xa0director of research at Pew.',
 u'"Young people are very aware that people have highly curated images and that text fights can quickly go out of control and they are trying to sort it all out," said Rosalind Wiseman, author of "Queen Bees and Wannabes" and speaker on youth issues.',
 u"Nearly half of those surveyed say they've at least occasionally seen posts about events that they were invited to; and\xa085 percent said they think social media users present a carefully crafted image of themselves online that may not be authentic."]

### 講座34 : Put it to work : News Article Classification using K-Nearest Neighbors

In [27]:
# download and parse an article from its url
def getWashPostText(url, token):
    try:
        page = urllib2.urlopen(url).read().decode('utf8')
    except:
        return (None, None)
    
    soup = BeautifulSoup(page, "lxml")
    if soup is None:
        return (None, None)
    
    text = ""
    if soup.find_all(token) is not None:
        text = ''.join(map(lambda p: p.text, soup.find_all(token)))
        soup2 = BeautifulSoup(text, "lxml")
        if soup2.find_all('p') is not None:
            text = ''.join(map(lambda p: p.text, soup2.find_all('p')))
            
    return text, soup.title.text


In [28]:
getText = getWashPostText(someUrl, 'article')
getText

(u'Multitask1021  The majority of teenagers don\'t consider meeting strangers online a taboo, with six in 10 saying they have met\xa0at least\xa0one new friend on the Web. Teens are also texting and communicating through online games and social networks more frequently\xa0than they are spending time together in person. And of those who\xa0meet people online, one-third also followed up with an in-person meeting. These findings are part of a new in-depth\xa0study from the Pew Research Center aimed at understanding how online interactions are shaping the social lives and identities of American teens. Broadly speaking, the research found that the line between the virtual and real worlds has almost completely blurred -- and that kids say they have deep and meaningful relationships with people online and in person. "The digital world has taken its place alongside school and friends\' houses and extracurriculars as a place where teens go to make and strengthen friendships," said Amanda Lenhar

In [29]:
def getNYTText(url, token):
    response = request.get(url)
    soup = BeautifulSoup(response.content)
    page = str(soup)
    title = soup.find('title').text
    mydivs = soup.findAll("p",{"class":"story-body-text story-content"})
    text = ''.join(map(lambda p:p.text, mydivs))
    
    return text, title

In [30]:
# THIS FUNCTION RETURNS A DICTIONARY
# THE URLS AS KEYS AND THE CORRESPONDING ARTICLE T1TLE TEXT AS VALUES
def scrapeSource(url, magicFrag='2015', scraperFunction=getNYTText, token='None'):  #magic frag 譯為魔法碎片？
    urlBodies = {}
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    soup = BeautifulSoup(response)
    
    numErrors = 0
    for a in soup.findAll('a'):
        try:
            url = a['href']
            if((url not in urlBodies) and (magicFrag is not None and magicFrag in url) or (magicFrag is None)):
                body = scraperFunction(url, token)
        except:
            numErrors += 1
            
    return urlBodies

### set up the training dataset