In [1]:
import urllib3
import re
import os
import justext
from bs4 import BeautifulSoup
from pprint import pprint

In [2]:
# Silence urllib3's InsecureRequestWarning for the rest of the session.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Default header attached to the requests issued through `http`.
user_agent = {
    'user-agent': 'Mozilla/5.0 (X11; OpenBSD amd64; rv:28.0) Gecko/20100101 Firefox/28.',
}

# Shared pool manager used by the crawler for every download.
http = urllib3.PoolManager(num_pools=10, headers=user_agent)

In [4]:
class Crawler:
    """Queue-driven blog crawler that stores the main text of every page it fetches.

    Starting from ``seed_url``, each downloaded page is cleaned with jusText and
    its non-boilerplate paragraphs are written to a ``.txt`` file under
    ``corpus_path``. Every ``<a href>`` selected by ``url_pattern`` is queued
    and fetched later, oldest first.

    Attributes:
        corpus_path: directory receiving the ``.txt`` files (created if missing).
        max_files: upper bound on the pages fetched by ``crawl``, counting the
            seed page (the seed itself is always fetched).
        seed_url: first page to fetch.
        url_pattern: regular expression used to select which hrefs are followed.
        visited_links: dict used as a set; its keys are the URLs already handed
            out for fetching.
        to_be_visited: FIFO list of URLs waiting to be fetched.
    """

    def __init__(self, corpus_path, seed_url, url_pattern, max_files=50):
        self.corpus_path = corpus_path
        self.max_files = max_files
        self.seed_url = seed_url
        self.url_pattern = url_pattern
        self.visited_links = {}
        self.to_be_visited = []
        # Set view of to_be_visited, so add_links can reject links that are
        # already waiting without scanning the list.
        self._queued = set()

        # exist_ok makes this safe to re-run and free of a check-then-create race.
        os.makedirs(self.corpus_path, exist_ok=True)

    def crawl(self):
        """Fetch the seed page, then follow queued links until the page budget
        (``max_files``) is spent or the queue is empty."""
        # Mark the seed as visited up front so links back to it are never re-queued.
        self.visited_links[self.seed_url] = None
        self.add_links(self.get_page(self.seed_url))

        file_counter = 1  # the seed page counts towards max_files
        while file_counter < self.max_files:
            # Only pop a link once we know there is budget left to fetch it;
            # otherwise it would be marked visited without ever being downloaded.
            next_link = self.get_next_link()
            if not next_link:
                break
            self.add_links(self.get_page(next_link))
            file_counter += 1

    def get_page(self, url):
        """Download ``url``, save its non-boilerplate text and return its links.

        Args:
            url: absolute URL of the page to fetch.

        Returns:
            List with the ``href`` of every ``<a>`` tag selected by
            ``self.url_pattern`` (may contain duplicates).

        Raises:
            Whatever ``http.request`` raises; download errors are not handled here.
        """
        print("getting page {}".format(url))
        response = http.request('GET', url)

        # Storing the text is best-effort: a page that cannot be cleaned, named
        # or written is skipped, but its links are still collected below.
        try:
            self._store_text(url, response.data)
        except Exception as error:  # not a bare except: Ctrl-C must still stop the crawl
            print("jumping ({!r})".format(error))

        soup = BeautifulSoup(response.data, 'html.parser')
        anchors = soup.find_all('a', attrs={'href': re.compile(self.url_pattern)})
        return [anchor.get('href') for anchor in anchors]

    def _store_text(self, url, html):
        """Write the non-boilerplate paragraphs of ``html`` to the corpus file for ``url``."""
        paragraphs = justext.justext(html, justext.get_stoplist("Portuguese"))
        target = os.path.join(self.corpus_path, self._file_name(url) + ".txt")
        # Explicit UTF-8 so accented text is written the same way on every platform.
        with open(target, "w", encoding="utf-8") as output_file:
            for paragraph in paragraphs:
                if not paragraph.is_boilerplate:
                    # One paragraph per line; without a separator the last word of a
                    # paragraph would be glued to the first word of the next one.
                    output_file.write(paragraph.text + "\n")

    @staticmethod
    def _file_name(url):
        """Build ``<part 3>_<part 4>_<post name>`` from ``url`` split on '/'.

        For ``http://host/2018/12/some-post/`` this yields ``2018_12_some-post``;
        a trailing ``.html`` on the post name is dropped.

        Raises:
            IndexError: if the URL has fewer than five '/'-separated parts.
        """
        parts = url.split('/')
        post_name = parts[-1] if parts[-1] != '' else parts[-2]
        post_name = post_name.split('.html')[0]
        return '{}_{}_{}'.format(parts[3], parts[4], post_name)

    def add_links(self, links):
        """Queue every link that is neither visited nor already waiting.

        Links are appended in the order given, so the queue order follows the
        page order instead of set iteration order.
        """
        for link in links:
            if link in self.visited_links or link in self._queued:
                continue
            self._queued.add(link)
            self.to_be_visited.append(link)

    def get_next_link(self):
        """Pop the oldest queued link, mark it visited and return it.

        Returns:
            The next URL to fetch, or ``None`` when the queue is empty.
        """
        if not self.to_be_visited:
            return None
        next_link = self.to_be_visited.pop(0)
        self._queued.discard(next_link)
        self.visited_links[next_link] = None
        return next_link

In [10]:
# crawler_momentum = Crawler(
#     '../data/corpora/blog/momentum_saga',
#     'http://www.momentumsaga.com/2019/02/resenha-skyward-de-brandon-sanderson.html',
#     '^http(s)?:\/\/www\.momentumsaga\.com\/\d{4}\/\d{2}\/.+\.html$'
# )

# crawler_momentum.crawl()

In [5]:
# Crawl at most 200 pages of the Penumbra Livros blog, starting from one post.
# The pattern is a raw string so that \/, \., \d and \D reach the regex engine
# verbatim instead of being treated as (invalid) string-literal escapes.
# NOTE(review): \D+ rejects any post slug that contains a digit — confirm this is intended.
crawler_penumbra = Crawler(
    '../data/corpora/blog/penumbra_livros',
    'http://www.penumbralivros.com.br/2018/12/as-vantagens-de-praticar-magia-do-caos/',
    r'^http(s)?:\/\/www\.penumbralivros\.com\.br\/\d{4}\/\d{2}\/\D+\/$',
    max_files=200
)

crawler_penumbra.crawl()

getting page http://www.penumbralivros.com.br/2018/12/as-vantagens-de-praticar-magia-do-caos/
getting page http://www.penumbralivros.com.br/2019/03/o-inconsciente-coletivo-como-fonte-de-materia-magicka-pt-i/
getting page http://www.penumbralivros.com.br/2017/09/servidores-magicos-e-inteligencia-artificial/
getting page http://www.penumbralivros.com.br/2019/03/entendendo-as-premonicoes-e-mudando-o-futuro/
getting page http://www.penumbralivros.com.br/2018/12/servidores-publicos-sera-que-funcionam/
getting page http://www.penumbralivros.com.br/2016/12/cores-magia-magia-verde/
getting page http://www.penumbralivros.com.br/2019/03/encontrando-a-morte-diferentes-faces-de-uma-mesma-potencia/
getting page http://www.penumbralivros.com.br/2019/04/cuidado-grupo-internacional-ocultista-se-alimenta-da-depressao-e-busca-levar-bruxas-a-morte/
getting page http://www.penumbralivros.com.br/2019/04/como-o-inconsciente-coletivo-pode-ser-fonte-de-magia/
getting page http://www.penumbralivros.com.br/2017

getting page http://www.penumbralivros.com.br/2017/08/incubacao-tecnica-esquecida-dos-sonhos/
getting page http://www.penumbralivros.com.br/2017/12/hipocrisia-natalina-brasileira/
getting page http://www.penumbralivros.com.br/2018/11/visao-holistica-e-a-questao-do-dualismo/
getting page http://www.penumbralivros.com.br/2019/03/comecou-o-envio-dos-quarenta-servidores/
getting page http://www.penumbralivros.com.br/2017/10/diferenca-invocacao-evocacao/
getting page http://www.penumbralivros.com.br/2016/12/animais-fantasticos-esfinge/
getting page http://www.penumbralivros.com.br/2017/02/missa-do-caos/
getting page http://www.penumbralivros.com.br/2017/01/besta-oito-circuitos/
getting page http://www.penumbralivros.com.br/2017/09/frater-optimus-e-os-paradigmas-magicos/
getting page http://www.penumbralivros.com.br/2017/10/usando-a-magia-para-o-mal/
getting page http://www.penumbralivros.com.br/2016/09/iluminacao-caos/
getting page http://www.penumbralivros.com.br/2016/10/os-bastidores-de-k

KeyboardInterrupt: 