In [1]:
# Importing the libraries

import re
import requests
import urllib.request
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest
from math import log
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [2]:
def getNYTText(url):
    """
    This function takes in a url from The NewYork Times and returns the text and title parts of the article. 
    It uses BeautifulSoup to find the relevant tags.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content)
    page = str(soup)     # this fetches the full html part of page as text from the soup object
    title = soup.find("h1").getText()    # tag for article title
    mydivs = soup.findAll("div", {"class":re.compile("^StoryBodyCompanionColumn")})    # tag for article text
#     print(mydivs)
    text = ''.join(map(lambda d : d.text.strip(), mydivs))    # combining the text in all the divs in mydivs
    return text, title

In [3]:
print(getNYTText("https://www.nytimes.com/2018/10/23/science/stephen-hawking-final-paper.html?action=click&contentCollection=science&region=rank&module=package&version=highlights&contentPlacement=2&pgtype=sectionfront"))

('The cosmologist and pop-science icon Stephen Hawking, who died last March on Einstein’s birthday, spoke out from the grave recently in the form of his last scientific paper. Appropriately for a man on the Other Side, the paper is about how to escape from a black hole.Cleansed of its abstract mathematics, the paper is an ode to memory, loss and the oldest of human yearnings, the desire for transcendence. As the doomed figure in Bruce Springsteen’s “Atlantic City” sings, “Everything dies, baby, that’s a fact, but maybe everything that dies someday comes back.”Dr. Hawking was the manifestation of perseverance; stricken by Lou Gehrig’s disease, he managed to conquer the universe from a wheelchair. The fate of matter or information caught in a black hole is one that defined his career, and it has become one of the deepest issues in physics.Black holes are objects so dense that, according to Einstein’s law of general relativity, not even light can escape. In 1974, Dr. Hawking turned these 

In [4]:
def getTheHinduText(url):
    """
    This function takes in a url from The Hindu and returns the text and title parts of the article. 
    It uses BeautifulSoup to find the relevant tags.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content)
    page = str(soup)    # Get the html from the response page using soup object
    title = soup.find("h1", {"class":"title"}).text.strip()    # tag for title
    mydivs = soup.findAll("div", {"id":re.compile("^content-body-")})    # tag for divs having text
    text = ' '.join(map(lambda d : d.text.strip(), mydivs))    # combine all the text in mydivs 
    return text, title

In [5]:
print(getTheHinduText("https://www.thehindu.com/sci-tech/science/iit-guwahati-uses-aloe-vera-to-remove-oil-from-water/article25345614.ece"))

('Researchers have for the first time used a naturally occurring material — aloe vera gel — which inherently has superior oil repelling (oleophobic) property to separate oil from water. So far researchers have only been mimicking the structure of naturally existing materials such as fish scales to achieve super oleophobicity. While conventionally the topography and chemistry of materials had to optimised to make them repel oil extremely, the use of aloe vera gel–based coating modified with some molecules was sufficient to make the surface of substrates extremely oil repelling.Aloe leavesThe team led by Dr. Uttam Manna from the Department of Chemistry at Indian Institute of Technology (IIT) Guwahati used the thick gel contained in the leaves of aloe vera plant to convert a commercially available porous material that is oil-loving (oleophilic) to become extremely oil-repelling by coating it with the gel.Like a drop of water that nearly retains its spherical shape when placed on a lotus l

In [6]:
def scrapeSource(url, checkFragment, scraperFunc=getTheHinduText):
    """
    This function is used like a scraper. The url of some section is given to it and it checks all the associated
    links in the page and filters the valid links of articles. It uses the individal getText functions to get 
    their text and title and stores them together.
    """
    urlBodies = {}
    response = requests.get(url)
    soup = BeautifulSoup(response.content)   # pass the response to initialise BeautifulSoup
    
    badLinks = 0    # just counting the invalid links

    for link in soup.findAll("a"):
        try:
            if ( link['href'] not in urlBodies and \
                            ((checkFragment is not None and checkFragment in link['href'] ) \
                                or (checkFragment is None )) and link['href'].endswith(".ece") ):
                bodyText = scraperFunc(link['href'])
                if len(bodyText) > 0:
                    urlBodies[link['href']] = bodyText
        except:
            # print("Error : ", link)
            badLinks += 1
    return urlBodies

In [7]:
print(len(scrapeSource("https://www.thehindu.com/sci-tech/science/", "https://www.thehindu.com/sci-tech/science/")))

47


In [8]:
class FrequencySummarizer:
    """
    This Frequency summarizer class has methods to find the most important words and sentences from the given
    (title, article text) pair. Stopwords and punctuation are excluded as part of the pre-processing
    """
    def __init__(self, min_cut = 0.1, max_cut = 0.9):
        # min and max cut are the threshold for max and min allowed frequency
        self._min_cut = min_cut
        self._max_cut = max_cut
        self._stopwords = set(stopwords.words('english') + list(punctuation) + [u"'s", '"'])
        
    def _compute_frequencies(self, word_sent, customStopWords = None):
        """
        Method to find the frequencies based scoring of the words and sentences in the text
        """
        freq = defaultdict(int) # Using defaultdict so that no special handling is needed to add new keys
        if customStopWords is None:
            stopwords = self._stopwords
        else:
            # we can pass our own set of stopwords to merge them with the default set of stopwords
            stopwords = set(customStopWords).union(self._stopwords)
        
        for sent in word_sent:
            for word in sent:
                if word not in stopwords:
                    # Add 1 to frequency score for each occurrence of the word
                    freq[word] += 1
        
        m = float(max(freq.values()))    # Get the maximum value of frequency to normalise
        for word in list(freq.keys()):
            freq[word] /= m
            if freq[word] >= self._max_cut or freq[word] <= self._min_cut:
                del freq[word]
                
        return freq
    
    def extractFeatures(self, article, nfeatures, customStopWords=None):
        """
        This method returns the selected features(words based on frequency scores) from the given article. 
        """
        # The articles passed to this method is itself a tuple(articles-text, title). The text and title are 
        # extracted and the text is tokenized into sentences and words
        text = article[0]
        title = article[1]
        sentences = sent_tokenize(text)
        words_sent = [word_tokenize(sent.lower()) for sent in sentences]
        
        # using the class defined method to get the freq scores of each words
        self._freq = self._compute_frequencies(words_sent, customStopWords) 
        
        if nfeatures < 0:
            # for negative value, we return all the words as features
            return nlargest(len(self._freq.keys()), self._freq, key=self._freq.get)
        else:
            # for positive values return the first 'nfeatures' important words
            return nlargest(nfeatures, self._freq, key=self._freq.get)
    
    def extractRawFrequencies(self, article):
        """
        This method just returns the frequency count of all the words in the given article
        """
        # The articles passed to this method is itself a tuple(articles-text, title). The text and title are 
        # extracted and the text is tokenized into sentences and words
        text = article[0]
        title = article[1]
        sentences = sent_tokenize(text)
        words_sent = [word_tokenize(sent.lower()) for sent in sentences]
        
        freq = defaultdict(int)
        for sents in words_sent:
            for word in sents:
                if word not in self._stopwords:
                    # Add 1 to frequency score for each occurrence of the word
                    freq[word] += 1
        return freq
    
    def summarize(self, article, nfeatures):
        """
        This method summarizes the given article based on the given number of features
        """
        text = article[0]
        title = article[1]
        sentences = sent_tokenize(text)
        words_sent = [word_tokenize(sent.lower()) for sent in sentences]
        
        # Get the frequency using the class defined method
        self._freq = self._compute_frequencies(words_sent)
        
        ranking = defaultdict(int)    # this is for ranking each of the sentences
        for i, sentence in enumerate(words_sent):
            for word in sentence:
                if word in self._freq:
                    # if the word is found in the frequency list, then the ranking of the sentence(which was 
                    # carrying this word) increases with the same value as the normalised frequency of the word
                    ranking[i] += self._freq[word]
        
        # Getting the top 'nfeatures' sentences
        sentences_idx = nlargest(nfeatures, ranking, key=ranking.get)
        return [sentences[i] for i in sentences_idx]        

In [9]:
# This is for setting up the urls from where the articles will be picked up

urlTheHinduNonScience = "https://www.thehindu.com/sport/"
urlNewYorkTimesNonScience = "https://www.nytimes.com/pages/sports/index.html"
urlTheHinduScience = "https://www.thehindu.com/sci-tech/science/"
urlNewYorkTimesScience = "https://www.nytimes.com/section/science"

In [10]:
# This actually uses the urls defined above and collects all the urls together

theHinduScienceArticles = scrapeSource(urlTheHinduScience, urlTheHinduScience)
theHinduNonScienceArticles = scrapeSource(urlTheHinduNonScience, urlTheHinduNonScience)
theNewYorkTimesScienceArticles = scrapeSource(urlNewYorkTimesScience, '2018', getNYTText)
theNewYorkTimesNonScienceArticles = scrapeSource(urlNewYorkTimesNonScience, '2018', getNYTText)

In [11]:
# Now we will bring the articles together in an easy to classify form

articleSummaries = {}

# this is for clubbing the Science articles 
for scienceURLDict in [theNewYorkTimesScienceArticles, theHinduScienceArticles]:
    for articleURL in scienceURLDict:
        # this checks if their is a text-article format for this url
        if scienceURLDict[articleURL][0] is not None:
            # this checks if the text has been extracted properly
            if len(scienceURLDict[articleURL][0]) > 0:
                fs = FrequencySummarizer()
                # get 25 most important words
                summary = fs.extractFeatures(scienceURLDict[articleURL], 25)
                articleSummaries[articleURL] = {'feature-vector' : summary, 'label' : 'Science'}

# this is for clubbing the Non Science articles
for nonScienceURLDict in [theNewYorkTimesNonScienceArticles, theHinduNonScienceArticles]:
    for articleURL in nonScienceURLDict:
        # this checks if their is a text-article format for this url
        if nonScienceURLDict[articleURL][0] is not None:
            # this checks if the text has been extracted properly
            if len(nonScienceURLDict[articleURL][0]) > 0:
                fs = FrequencySummarizer()
                # get 25 most important words
                summary = fs.extractFeatures(nonScienceURLDict[articleURL], 25)
                articleSummaries[articleURL] = {'feature-vector' : summary, 'label' : 'Non-Science'}

In [12]:
print(articleSummaries)



In [13]:
def getDoxyDonkeyText(testUrl, token):
    """
    This is the function to read text from the doxy donkey post. We will read posts from this blog collection
    and use the data from our The Hindu and The NewYork Times papers to classify if it is Science or Non-Science
    """
    response = requests.get(testUrl)
    soup = BeautifulSoup(response.content)
    page = str(soup)
    title = soup.find("title").text.strip()
    mydivs = soup.findAll("div", {"class":token})
    text = ''.join(map(lambda d : d.text, mydivs))
    return text, title

In [14]:
testUrl = "http://doxydonkey.blogspot.com/"
testArticle = getDoxyDonkeyText(testUrl,"post-body")
# print(testArticle)

In [15]:
fs = FrequencySummarizer()
# get the summary of the test article in the given limit of features
testArticleSummary = fs.extractFeatures(testArticle, 25)

In [16]:
similarities = {}
for articleUrl in articleSummaries:
    # pass through all the article Summaries and check each of their similarity with the test article
    oneArticleSummary = articleSummaries[articleUrl]['feature-vector']
    similarities[articleUrl] = len(set(testArticleSummary).intersection(set(oneArticleSummary)))

In [17]:
labels = defaultdict(int)
# get the top 5 train articles which are nearest to the test article. Nearest here means how many words, both the
# summaries of the test and train articles have in common
knn = nlargest(5, similarities, key = similarities.get)
for oneNeighbour in knn:
    labels[articleSummaries[oneNeighbour]['label']] += 1

# Each of the neighbours vote if the article is science or non-science. The label which is in majority is tagged
# to the article
print(nlargest(1, labels, key = labels.get))

['Science']


In [18]:
# Just validating if the article summary truly has science related words
print(testArticleSummary)

['company', 'china', 'said', 'new', 'year', 'billion', 'uber', 'million', 'percent', 'technology', 'people', 'could', 'investors', 'amazon', 'drivers', 'money', '1', 'chief', 'like', 'india', 'bitcoin', 'last', 'years', 'pinterest', 'raised']


In [19]:
# Now we will carry out the same classification problem but using a different approach - Naive bayes Classifier

cumulativeRawFrequencies = {'Science': defaultdict(int), 'Non-Science': defaultdict(int)}
# Clubbing together Sciencee and Non-Science data from The Hindu
trainingData = {'Science': theHinduScienceArticles, 'Non-Science' : theHinduNonScienceArticles}
for label in trainingData:
    # For each label in trainingData
    for articleUrl in trainingData[label]:
        # For each url in the training data for the specified label
        if (len(trainingData[label][articleUrl][0]) > 0):
            # If the text length of the article of specified url and label is greater than 0
            fs = FrequencySummarizer()
            rawFrequencies = fs.extractRawFrequencies(trainingData[label][articleUrl])
            for eachWord in rawFrequencies:
                # For each word in the raw frequencies count increase the cumulative raw frequency value by the 
                # value of word frequency. This will create a cumulative frequency count of each word in the 
                # whole corpus
                cumulativeRawFrequencies[label][eachWord] += rawFrequencies[eachWord]


In [20]:
# initialise the likelihood of this article being scientific and non-scientific to both 1.0

scientific = 1.0
non_scientific = 1.0

for eachWord in testArticleSummary:
    # for each feature of the test Article, compute the conditional probability of that word(feature) being in 
    # an article given that the article is a science article
    if eachWord in cumulativeRawFrequencies['Science']:
        scientific *= 1e3 * cumulativeRawFrequencies['Science'][eachWord] / \
                                                float(sum(cumulativeRawFrequencies['Science'].values()))
        # P(Science/Words in Article) = P(Science) * P(word1/Science) * P(word2/Science)...
    else:
        # if the word is not found at all in the cumulative Raw Frequency dictionary, then we multiply by
        # a very small number instead of following the theorectial rule of multiplying by 0. Otherwise for 
        # a single missing word in the corpus, the entire 'scietific' value of the article will become 0
        scientific /= 1e3
    if eachWord in cumulativeRawFrequencies['Non-Science']:
        non_scientific *= 1e3 * cumulativeRawFrequencies['Non-Science'][eachWord] / \
                                                float(sum(cumulativeRawFrequencies['Non-Science'].values()))
    else:
        non_scientific /= 1e3
    

In [21]:
# we need to scale the scientific and non-scientific of this article by the probabilites of overall scientific 
# and non-scientific. (OR) the ratio of number of words in the scientific and non-scientific articles respectively
# to the total number of words

scientific *= float(sum(cumulativeRawFrequencies['Science'].values())) / \
                        (float(sum(cumulativeRawFrequencies['Science'].values())) + \
                         float(sum(cumulativeRawFrequencies['Non-Science'].values())))

non_scientific *= float(sum(cumulativeRawFrequencies['Non-Science'].values())) / \
                        (float(sum(cumulativeRawFrequencies['Science'].values())) + \
                         float(sum(cumulativeRawFrequencies['Non-Science'].values())))

if scientific > non_scientific:
    label = 'Science'
else:
    label = 'Non-Science'
    
print(label, scientific, non_scientific)

Science 2.522274924938075e-27 1.3848700402224402e-29


In [68]:
count = 0
def getAllDoxyDonkeyPosts(urls, links):
    """
    This is the function to read the texts from all the links in the main url of the doxy donkey post. We will 
    read posts from this blog collection and use the data from our The Hindu and The NewYork Times papers to 
    classify if it is Science or Non-Science.
    """
    response = requests.get(urls)
    soup = BeautifulSoup(response.content)
    page = str(soup)
    for a in soup.findAll('a'):
        try :
            url = a['href']
            title = a['title']
            if title == "Older Posts":
                # for the link names Older Posts, we recursively pass the link to the getAllDoxyDonkeyPosts 
                # function
#                 print(title, url)
                links.append(url)
                global count
                count += 1
                if count >= 40: # max 86
                    break
                getAllDoxyDonkeyPosts(url, links)
        except :
            title = ""
    return

In [69]:
blogUrl = "http://doxydonkey.blogspot.in"
links = []
getAllDoxyDonkeyPosts(blogUrl, links)
doxyDonkeyPosts = {}
for link in links:
    # collet the texts from the links collected
    doxyDonkeyPosts[link] = getDoxyDonkeyText(link, 'post-body')

In [70]:
documentCorpus = []
for onePost in doxyDonkeyPosts.values():
    # creating a huge document corpus from all the doxydonkey blogs
    documentCorpus.append(onePost[0])

In [71]:
vectorizer = TfidfVectorizer(max_df = 0.8, min_df = 2, stop_words = 'english')
X = vectorizer.fit_transform(documentCorpus)
km = KMeans(n_clusters = 5, init = 'k-means++', max_iter = 100, n_init = 1, verbose = True)
km.fit(X)

Initialization complete
Iteration  0, inertia 127.407
Iteration  1, inertia 66.894
Converged at iteration 1: center shift 0.000000e+00 within tolerance 6.436236e-09


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=5, n_init=1, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=True)

In [72]:
keywords = {}
for i, cluster in enumerate(km.labels_):
    oneDocument = documentCorpus[i]
    fs = FrequencySummarizer()
    summary = fs.extractFeatures((oneDocument, ""), 100, \
                                [u"according",u"also",u"billion",u"like",u"new", u"one",u"year",u"first",u"last"])
    if cluster not in keywords:
        keywords[cluster] = set(summary)
    else:
        keywords[cluster] = keywords[cluster].intersection(set(summary))

print(keywords)

{1: {'said', 'company', 'percent'}, 2: {'“', 'said', '”'}, 3: {'said', 'percent', '“', 'would', 'company', '”', 'million'}, 4: {'said', '“', 'company', '”', 'million'}, 0: {'said', '—', 'percent', 'people', 'million'}}
