### Coletando notícias do G1

In [13]:
import time
from selenium import webdriver
import csv

In [74]:
import requests
from bs4 import BeautifulSoup

class G1PageScrapper:
    
    mainUrl = "https://g1.globo.com/"
    searchPath = "busca/?q="
    
    
    def __init__(self):
        self.data = []
    
    def access(self, link):
        ret = BeautifulSoup(requests.get(link).content, 'html.parser')
        return ret
        
    def search(self, expression, page=1):
        words = expression.split(" ")
        
        queryParamValue =  ""
        
        for word in words:
            queryParamValue += '+' + word 
        
        queryParamValue = queryParamValue.replace('+','',1)    
        
        searchUrl = self.mainUrl+self.searchPath+queryParamValue
               
        return G1SearchReader(self.access(searchUrl))
    
class G1SearchReader:
    
    def __init__(self, bsObject):
        self.content = bsObject
    
    def getContent(self):
        return self.content
    
    def listNewsCardList(self):
        self.content.find_all("ul") 
    

In [76]:
links = ['https://g1.globo.com/rs/rio-grande-do-sul/noticia/2018/08/10/entidade-pede-que-ministerio-publico-do-rs-analise-fala-do-vice-de-bolsonaro-sobre-negros.ghtml',
        'https://g1.globo.com/politica/blog/andreia-sadi/post/2018/08/10/apos-debate-morno-estrategistas-de-alckmin-defendem-campanha-na-tv-para-confrontar-bolsonaro.ghtml'
        ]

webPageScrapper = G1PageScrapper()

searchReader = webPageScrapper.search("jair bolsonaro")

print(searchReader.listNewsCardList())






None


In [None]:
news_data = []
for link in links:
    title, texto = webPageScrapper.access(link)
    news_data.append({'title': title, 'content': texto})

for news in news_data:
    print(news['title'], '\n\n', news['content'], '\n\n\n')

### Calculando estatísticas básicas de cada notícia coletada (news_data)

In [9]:
import sys
import json
import nltk



stop_words = nltk.corpus.stopwords.words('portuguese') + [
    '.',
    ',',
    '--',
    '\'s',
    '?',
    ')',
    '(',
    ':',
    '\'',
    '\'re',
    '"',
    '-',
    '}',
    '{',
    ]

for news in news_data:
    sentences = nltk.tokenize.sent_tokenize(news['content'])

    words = [w.lower() for sentence in sentences for w in
             nltk.tokenize.word_tokenize(sentence)]

    fdist = nltk.FreqDist(words)

    # Basic stats

    num_words = sum([i[1] for i in fdist.items()])
    num_unique_words = len(fdist.keys())

    # Hapaxes are words that appear only once

    num_hapaxes = len(fdist.hapaxes())

    top_10_words_sans_stop_words = [w for w in fdist.items() if w[0]
                                    not in stop_words][:10]

    print(news['title'])
    print('\tNum Sentences:'.ljust(25), len(sentences))
    print('\tNum Words:'.ljust(25), num_words)
    print('\tNum Unique Words:'.ljust(25), num_unique_words)
    print('\tNum Hapaxes:'.ljust(25), num_hapaxes)
    print('\tTop 10 Most Frequent Words (sans stop words):\n\t\t', \
            '\n\t\t'.join(['%s (%s)'
            % (w[0], w[1]) for w in top_10_words_sans_stop_words]))
    print()

Entidade pede que Ministério Público do RS analise fala do vice de Bolsonaro sobre negros 
	Num Sentences:           10
	Num Words:               400
	Num Unique Words:        216
	Num Hapaxes:             167
	Top 10 Most Frequent Words (sans stop words):
		 entidade (1)
		educafro (3)
		protocolou (1)
		ministério (1)
		público (1)
		rio (2)
		grande (2)
		sul (3)
		mp-rs (1)
		nesta (1)

Após debate morno, estrategistas de Alckmin defendem confrontar Bolsonaro na campanha da TV
	Num Sentences:           20
	Num Words:               410
	Num Unique Words:        199
	Num Hapaxes:             142
	Top 10 Most Frequent Words (sans stop words):
		 primeiro (1)
		debate (3)
		presidencial (1)
		tv (3)
		dividiu (1)
		opiniões (1)
		campanhas (2)
		principais (4)
		candidatos (1)
		servirá (1)



In [8]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Igor\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

### Sumarizando as notícias coletadas

In [10]:
import sys
import json
import nltk
import numpy

N = 100  # Number of words to consider
CLUSTER_THRESHOLD = 5  # Distance between words to consider
TOP_SENTENCES = 5  # Number of sentences to return for a "top n" summary

# Approach taken from "The Automatic Creation of Literature Abstracts" by H.P. Luhn

def _score_sentences(sentences, important_words):
    scores = []
    sentence_idx = -1

    for s in [nltk.tokenize.word_tokenize(s) for s in sentences]:

        sentence_idx += 1
        word_idx = []

        # For each word in the word list...
        for w in important_words:
            try:
                # Compute an index for where any important words occur in the sentence

                word_idx.append(s.index(w))
            except (ValueError) as e: # w not in this particular sentence
                pass

        word_idx.sort()

        # It is possible that some sentences may not contain any important words at all
        if len(word_idx)== 0: continue

        # Using the word index, compute clusters by using a max distance threshold
        # for any two consecutive words

        clusters = []
        cluster = [word_idx[0]]
        i = 1
        while i < len(word_idx):
            if word_idx[i] - word_idx[i - 1] < CLUSTER_THRESHOLD:
                cluster.append(word_idx[i])
            else:
                clusters.append(cluster[:])
                cluster = [word_idx[i]]
            i += 1
        clusters.append(cluster)

        # Score each cluster. The max score for any given cluster is the score 
        # for the sentence

        max_cluster_score = 0
        for c in clusters:
            significant_words_in_cluster = len(c)
            total_words_in_cluster = c[-1] - c[0] + 1
            score = 1.0 * significant_words_in_cluster \
                * significant_words_in_cluster / total_words_in_cluster

            if score > max_cluster_score:
                max_cluster_score = score

        scores.append((sentence_idx, score))

    return scores

def summarize(txt):
    sentences = [s for s in nltk.tokenize.sent_tokenize(txt)]
    normalized_sentences = [s.lower() for s in sentences]

    words = [w.lower() for sentence in normalized_sentences for w in
             nltk.tokenize.word_tokenize(sentence)]

    fdist = nltk.FreqDist(words)

    top_n_words = [w[0] for w in fdist.items() 
            if w[0] not in nltk.corpus.stopwords.words('portuguese')][:N]

    scored_sentences = _score_sentences(normalized_sentences, top_n_words)

    # Summaization Approach 1:
    # Filter out non-significant sentences by using the average score plus a
    # fraction of the std dev as a filter

    avg = numpy.mean([s[1] for s in scored_sentences])
    std = numpy.std([s[1] for s in scored_sentences])
    mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences
                   if score > avg + 0.5 * std]

    # Summarization Approach 2:
    # Another approach would be to return only the top N ranked sentences

    top_n_scored = sorted(scored_sentences, key=lambda s: s[1])[-TOP_SENTENCES:]
    top_n_scored = sorted(top_n_scored, key=lambda s: s[0])

    # Decorate the post object with summaries

    return dict(top_n_summary=[sentences[idx] for (idx, score) in top_n_scored],
                mean_scored_summary=[sentences[idx] for (idx, score) in mean_scored])


if __name__ == '__main__':

    # Load in output from blogs_and_nlp__get_feed.py

    #BLOG_DATA = sys.argv[1]
    #blog_data = json.loads(open(BLOG_DATA).read())
    
    for post in news_data:
       
        post.update(summarize(post['content']))

        print(post['title'])
        print('-' * len(post['title']))
        print()
        print('-------------')
        print('Top N Summary')
        print('-------------')
        print(' '.join(post['top_n_summary']))
        print()
        print('-------------------')
        print('Mean Scored Summary')
        print('-------------------')
        print(' '.join(post['mean_scored_summary']))
        print()

Entidade pede que Ministério Público do RS analise fala do vice de Bolsonaro sobre negros 
------------------------------------------------------------------------------------------

-------------
Top N Summary
-------------

 A entidade Educafro protocolou no Ministério Público do Rio Grande do Sul (MP-RS) nesta sexta-feira (10) uma solicitação para que o órgão analise a declaração do general Antonio Hamilton Mourão (PRTB), candidato a vice-presidente na chapa de Jair Bolsonaro (PSL) nas Eleições de 2018, que disse que o Brasil herdou a "malandragem" do africano. O político estava em uma reunião-almoço da Câmara de Indústria, Comércio e Serviços de Caxias do Sul, na Serra do Rio Grande do Sul, na segunda-feira (6), quando deu a declaração. No documento encaminhado ao MP, dois dos advogados da Educafro, Ary Bergher e Marcello Ramalho, citam que "ao vociferar a qualidade negativa do negro na composição cultural do Povo Brasileiro, traz à tona uma aversão aos seus valores e culturas, man

### Visualização HTML da sumarização

In [11]:
import os
import sys
import json
import nltk
import numpy

HTML_TEMPLATE = """<html>
    <head>
        <title>%s</title>
        <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
    </head>
    <body>%s</body>
</html>"""

if __name__ == '__main__':

    # Marked up version can be written out to disk

    if not os.path.isdir('out/summarize'):
        os.makedirs('out/summarize')

    for post in news_data:
       
        post.update(summarize(post['content']))

        for summary_type in ['top_n_summary', 'mean_scored_summary']:
            post[summary_type + '_marked_up'] = '<p>%s</p>' % (post['content'], )
            for s in post[summary_type]:
                post[summary_type + '_marked_up'] = \
                post[summary_type + '_marked_up'].replace(s, '<strong>%s</strong>' % (s, ))

            filename = post['title'] + '.summary.' + summary_type + '.html'
            f = open(os.path.join('out', 'summarize', filename), 'w')
            html = HTML_TEMPLATE % (post['title'] + ' Summary', post[summary_type + '_marked_up'],)
            #f.write(str(html.encode('utf-8')))
            f.write(html)
            f.close()

            print("Data written to", f.name)

Data written to out\summarize\Entidade pede que Ministério Público do RS analise fala do vice de Bolsonaro sobre negros .summary.top_n_summary.html
Data written to out\summarize\Entidade pede que Ministério Público do RS analise fala do vice de Bolsonaro sobre negros .summary.mean_scored_summary.html
Data written to out\summarize\Após debate morno, estrategistas de Alckmin defendem confrontar Bolsonaro na campanha da TV.summary.top_n_summary.html
Data written to out\summarize\Após debate morno, estrategistas de Alckmin defendem confrontar Bolsonaro na campanha da TV.summary.mean_scored_summary.html
