# Predicting news as economy or cricket using knn.

In [1]:
#importing library for webscrapping.for this purpose we need request and urllib.
#requests is used for requesting a data from client side.
import requests
import urllib
# BeautifulSoup is used as a replacement for regular expression for html.
from bs4 import BeautifulSoup
# importing nltk tools for tokenizing word and sentence.
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from string import punctuation

In [2]:
# loading the data from thehindu.com
# returing text and its heading.
def get_text_from_hindu(url):
    try:
        page = urllib.request.urlopen(url).read().decode('utf8')
    except:
        return (None,None)
    
    soup = BeautifulSoup(page)
    
    if soup is None:
        return (None,None)
    
    text = ''
    if soup.find_all('p') is not None:
        text = ''.join(map(lambda x:x.text,soup.find_all('p')[1:-4]))
        
    return text,soup.title.text

In [3]:
article = get_text_from_hindu('http://www.thehindu.com/sport/cricket/bizarre-decisions-proved-costly-for-tn-ranji-team/article21119357.ece')
article



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


('The underwhelming Tamil Nadu, lacking consistency and cutting edge, did not deserve to go through to the knock-out stages of the Ranji Trophy.The side’s tactics and the selection of its eleven appeared bizarre on occasions. The different arms of the team did not move in cohesion.The signs were ominous when Tamil Nadu crawled to 112 for two in 54 overs on day three after conceding the lead and needing to force the pace against Andhra in the season-opener. The defensive mind-set became pronounced as the season progressed, in several decisive moments of the league phase.Despite gaining the lead against Mumbai after an early stumble with B. Indrajith and V. Yo Mahesh notching up centuries, Tamil Nadu could not build on the momentum gained.The side, inexplicably, conceded the lead to Odisha after notching up 530 for eight. The bowling unit stood exposed. Over-dependence on left-arm spinner Rahil Shah — effective on turners and ordinary on other tracks — did not help the side’s cause. His 

In [4]:
class FrequencyCounter():
    
    def __init__(self,min_cut=0.1,max_cut=0.9):
        self.min_cut = min_cut
        self.max_cut = max_cut
        
        self._stopwords = set(stopwords.words('english')+list(punctuation)+['.',',',"'", '“', '”','’', '‘'])
        
    def _compute_frequency(self,word_sent):
        freq_dict = {}
        
        for sent in word_sent:
            for word in sent:
                if word not in self._stopwords:
                    c = freq_dict.get(word,0)
                    freq_dict[word]=c+1
    
        m = float(max(freq_dict.values()))
        
        #print(freq_dict)
        #eliminating unnecessary word using max_cut and min_cut
        for w in list(freq_dict.keys()):
            freq_dict[w] = freq_dict[w]/m
            if freq_dict[w]<self.min_cut or freq_dict[w]>self.max_cut:
                del freq_dict[w]
                
        return freq_dict
    
    def extractFeatures(self,article,n):
        text = article[0]
        sentence = sent_tokenize(text)
        word_sent = [word_tokenize(s.lower()) for s in sentence]
        self._freq = self._compute_frequency(word_sent)
        #print(self._freq)
        list1 = sorted(self._freq.keys(),key= lambda x:self._freq[x],reverse=True)
        return list1[:n]
    
    def summarize(self,text,n):
        sents = sent_tokenize(text)
        
        assert n <= len(sents)
        # assert is a way of making sure a condition holds true, else an exception is thrown. Used to do 
        # sanity checks like making sure the summary is shorter than the original article.
        
        word_sent = [word_tokenize(s.lower()) for s in sents]
        freq = self._compute_frequency(word_sent)
        #print("\n\nfrequency->\n",freq,"\n\n")
        ranking = {}
        for i,sent in enumerate(word_sent):
            ranking[i]=0
            for word in sent:
                if word in freq:
                    ranking[i]+=freq[word]
                    
        sort_index = sorted(ranking.keys(),key=lambda x:ranking[x], reverse =True)
        top_n = [sents[j] for j in sort_index[:n]]
        return top_n
    
    

In [5]:
fs = FrequencyCounter()
#fs.extractFeatures(article,2)
fs.summarize(article[0],2)
fs.extractFeatures(article,25)

['tamil',
 'bowlers',
 'eleven',
 'lead',
 'ranji',
 'tactics',
 'two',
 'three',
 'andhra',
 'moments',
 'b.',
 'indrajith',
 'yo',
 'mahesh',
 'notching',
 'bowling',
 '—',
 'turners',
 'tracks',
 'came',
 'average',
 'many',
 'all-rounders',
 'make',
 'ability']

In [6]:
## for cricket news 
def scrapeSite(url,scraperFunction,magicFrag = 'cricket',token = None):
    request = urllib.request.Request(url)
    response = urllib.request.urlopen(request)
    soup1 = BeautifulSoup(response)
    
    urldict = {}
    
    numerror = 0
    for a in soup1.findAll('a'):
        try:
            url=a.get('href')
#            if ((url not in urldict) and ((magicFrag is not None and magicFrag in url) or magicFrag is None)):
            if( (url not in urldict) and ((magicFrag is not None and magicFrag in url) or magicFrag is None)):
                body = scraperFunction(url)
#                print(body)                    
                if body and len(body)>0:
                    urldict[url] = body
#                print(url)
                
        except:
            numerror +=1
    return urldict

In [7]:
urldict = scrapeSite("http://www.thehindu.com/sport/cricket/", get_text_from_hindu,'cricket')




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [8]:
#urldict

In [9]:
# for economy news
def scrapeSite1(url,scraperFunction,magicFrag = 'Economy',token = None):
    request = urllib.request.Request(url)
    response = urllib.request.urlopen(request)
    soup1 = BeautifulSoup(response)
    
    urldict = {}
    
    numerror = 0
    for a in soup1.findAll('a'):
        try:
            url=a.get('href')
#            if ((url not in urldict) and ((magicFrag is not None and magicFrag in url) or magicFrag is None)):
            if( (url not in urldict) and ((magicFrag is not None and magicFrag in url) or magicFrag is None)):
                body = scraperFunction(url)
#                print(body)                    
                if body and len(body)>0:
                    urldict[url] = body
#                print(url)
                
        except:
            numerror +=1
    return urldict

In [10]:
ulldict2 = scrapeSite1("http://www.thehindu.com/business/Economy/", get_text_from_hindu,'Economy')



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [11]:
# contains the conclusion of every news as cricket or ecocnomy.
articleSummaries = {}


for articleUrl in urldict:
    if urldict[articleUrl][1] is not None:
        if len(urldict[articleUrl][0]) > 0:
            fs = FrequencyCounter()
            summary = fs.extractFeatures(urldict[articleUrl],25)
            articleSummaries[articleUrl] = {'feature-vector': summary,
                                           'label': 'cricket'}
for articleUrl in ulldict2:
    if ulldict2[articleUrl][1] is not None:
        if len(ulldict2[articleUrl][0]) > 0:
            fs = FrequencyCounter()
            summary = fs.extractFeatures(ulldict2[articleUrl],25)
            articleSummaries[articleUrl] = {'feature-vector': summary,
                                           'label': 'economy'}

In [12]:
for i in articleSummaries.keys():
    print('\n',i,'\n',articleSummaries[i],'\n')


 http://www.thehindu.com/sport/cricket/ 
 {'feature-vector': ['fees', 'announced', 'chennai', 'warner', 'future', 'team', 'coa', 'missed', 'revising', 'officials', 'recently', 'steep', 'hike', 'player', 'match', 'mitchell', 'marsh', 'travis', 'head', 'captain', 'four-day', 'one-day', 'teams', 'respectively', 'september'], 'label': 'cricket'} 


 http://www.thehindu.com/sport/cricket/hike-for-domestic-match-officials/article24039249.ece 
 {'feature-vector': ['match', 'umpires', '—', 'day', 'referees', 'top', 'domestic', '20', 'others', '15,000', 'bcci', 'hike', 'coa', 'hindu', 'cricket', 'raman', 'pay', 'officials', 'revision', 'last', 'fees', 'march', 'years', 'indian', 'hirwani'], 'label': 'cricket'} 


 http://www.thehindu.com/sport/cricket/star-studded-australia-a-squads-named-for-india-tour/article24031556.ece 
 {'feature-vector': ['tour', 'squad', 'travis', 'alex', 'players', 'marsh', 'four-day', 'head', 'mitch', 'india', 'one-day', 'test', 'experience', 'group', 'carey', 'ashton

In [13]:
def gethindustantimestext(url):
    page = urllib.request.urlopen(url).read().decode('utf8').encode('cp850','replace').decode('cp850')
    soup = BeautifulSoup(page)
    divs = soup.findAll('div',{'class':'story-details'})
    text = ''.join(map(lambda x:x.text,divs))
    return text,soup.find('title').text


In [14]:
#hturl = "http://www.hindustantimes.com/india-vs-sri-lanka-2017/ms-dhoni-still-pillar-of-indian-cricket-team-after-defiant-knock-in-dharamsala/story-PpwHKt9ruEEI5ghMvtun0I.html"
#hturl = "https://www.hindustantimes.com/cricket/dale-steyn-there-are-great-players-and-then-there-s-ab-de-villiers/story-kcDFnbwxtgdHPlivGqVYmL.html"
#testarticle = gethindustantimestext(hturl)
#fs1 = FrequencyCounter()
#hindusumm = fs1.extractFeatures(testarticle,25)


In [15]:

"""simmilarity = {}
for articleurl in articleSummaries:
    onearticlesumm = articleSummaries[articleurl]['feature-vector']
    simmilarity[articleurl]=len(set(hindusumm).intersection(set(onearticlesumm)))
"""

"simmilarity = {}\nfor articleurl in articleSummaries:\n    onearticlesumm = articleSummaries[articleurl]['feature-vector']\n    simmilarity[articleurl]=len(set(hindusumm).intersection(set(onearticlesumm)))\n"

In [16]:
#simmilarity

In [21]:
# predicting news from hindustantime.com
"""from collections import defaultdict
labels = defaultdict(int)  
knn = sorted(simmilarity.keys(),key = lambda x:simmilarity[x],reverse = True)[:5]
print(knn)
for i in knn:
    labels[articleSummaries[i]['label']]+=1
print(labels)"""

"from collections import defaultdict\nlabels = defaultdict(int)  \nknn = sorted(simmilarity.keys(),key = lambda x:simmilarity[x],reverse = True)[:5]\nprint(knn)\nfor i in knn:\n    labels[articleSummaries[i]['label']]+=1\nprint(labels)"

In [19]:
# Predicting the economy class
url_buss = "http://www.thehindu.com/business/Economy/need-to-clarify-development-within-wto-us/article21461527.ece"
testarticle = get_text_from_hindu(url_buss)
fs1 = FrequencyCounter()
hindusumm = fs1.extractFeatures(testarticle,25)

simmilarity = {}
for articleurl in articleSummaries:
    onearticlesumm = articleSummaries[articleurl]['feature-vector']
    simmilarity[articleurl]=len(set(hindusumm).intersection(set(onearticlesumm)))
    
labels1 = defaultdict(int)  
knn = sorted(simmilarity.keys(),key = lambda x:simmilarity[x],reverse = True)[:5]
print(knn)
for i in knn:
    labels1[articleSummaries[i]['label']]+=1
print("\n --------------------------\n Predictin is:-\n")    
print(labels1)

['http://www.thehindu.com/business/Economy/why-are-crude-oil-prices-going-up/article23937146.ece', 'http://www.thehindu.com/business/Economy/india-resists-lobbying-by-us-payment-firms-to-ease-local-data-storage-rules/article24030702.ece', 'http://www.thehindu.com/business/Economy/pakistan-seeks-economic-lifeline-with-fresh-china-loans/article24002329.ece', 'http://www.thehindu.com/business/Economy/india-sixth-wealthiest-country-with-total-wealth-of-8230-billion/article23942050.ece', 'http://www.thehindu.com/business/Economy/an-inch-forward-but-miles-to-go/article23874501.ece']

 --------------------------
 Predictin is:-

defaultdict(<class 'int'>, {'economy': 5})




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [20]:
# Lets predict the cricket class
url_cricket = "http://www.thehindu.com/sport/cricket/star-studded-australia-a-squads-named-for-india-tour/article24031556.ece"
testarticle = get_text_from_hindu(url_cricket)
fs1 = FrequencyCounter()
hindusumm = fs1.extractFeatures(testarticle,25)

simmilarity = {}
for articleurl in articleSummaries:
    onearticlesumm = articleSummaries[articleurl]['feature-vector']
    simmilarity[articleurl]=len(set(hindusumm).intersection(set(onearticlesumm)))
    
labels1 = defaultdict(int)  
knn = sorted(simmilarity.keys(),key = lambda x:simmilarity[x],reverse = True)[:5]
print(knn)
for i in knn:
    labels1[articleSummaries[i]['label']]+=1
print("\n --------------------------------\n Prediction is:-\n")
print(labels1)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


['http://www.thehindu.com/sport/cricket/star-studded-australia-a-squads-named-for-india-tour/article24031556.ece', 'http://www.thehindu.com/sport/cricket/', 'http://www.thehindu.com/sport/cricket/?page=2', 'http://www.thehindu.com/sport/cricket/?page=3', 'http://www.thehindu.com/sport/cricket/?page=4']

 --------------------------------
 Prediction is:-

defaultdict(<class 'int'>, {'cricket': 5})
