In [1]:
import requests
import re
import os
import justext
import pickle
from bs4 import BeautifulSoup
from pprint import pprint

## Base Crawler

In [2]:
class BaseCrawler:
    """Resumable base class for blog crawlers.

    Subclasses implement `get_post_list` (fill `self.links_todo` with article
    URLs) and `get_article` (download and store a single article). Progress is
    kept in a pickle file in the current working directory so that a crawl can
    be resumed after an interruption.
    """

    def __init__(self, base_url, domain, file_prefix, savepoint=5):
        """Set up crawler state and restore saved progress if present.

        Args:
            base_url: Root URL handed to the subclass' listing logic.
            domain: Name of the sub-directory of '../data/corpora' that
                receives the output files.
            file_prefix: Prefix used for the progress pickle file name (and
                available to subclasses for naming output files).
            savepoint: Progress is persisted after every `savepoint` articles.
        """
        self.base_url = base_url
        self.corpus_path = '../data/corpora/{}'.format(domain)
        self.file_prefix = file_prefix
        self.links_todo = []
        self.links_done = []
        self.savepoint = savepoint
        self.pickle_dir = '.{}_links.pickle'.format(file_prefix)

        if os.path.isfile(self.pickle_dir):
            # SECURITY: pickle.load can execute arbitrary code; only resume
            # from progress files written by this crawler.
            with open(self.pickle_dir, 'rb') as fl:
                (self.links_done, self.links_todo) = pickle.load(fl)

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(self.corpus_path, exist_ok=True)

    def save_pickle(self):
        """Persist `(links_done, links_todo)` to the progress pickle file."""
        with open(self.pickle_dir, 'wb') as fl:
            pickle.dump((self.links_done, self.links_todo), fl)
        print('pickle saved')

    def run(self):
        """Collect the link list if needed, then fetch every pending article.

        Progress is saved every `savepoint` articles and once more when the
        loop ends (normally or through an exception such as
        KeyboardInterrupt), so no completed article is forgotten.
        """
        if len(self.links_todo) < 1:
            self.get_post_list(1)
            self.save_pickle()

        # Build the pending list in order, skipping links already done and
        # repeated links; the set keeps the membership test O(1).
        seen = set(self.links_done)
        pending = []
        for url in self.links_todo:
            if url not in seen:
                seen.add(url)
                pending.append(url)

        unsaved = False
        try:
            for count, url in enumerate(pending, start=1):
                self.get_article(url)
                self.links_done.append(url)
                unsaved = True

                if count % self.savepoint == 0:
                    self.save_pickle()
                    unsaved = False
        finally:
            # Flush whatever was completed since the last savepoint.
            if unsaved:
                self.save_pickle()

    def get_post_list(self, pg):
        """Append article URLs to `self.links_todo`, starting at page `pg`.

        Must be implemented by subclasses.
        """
        raise NotImplementedError

    def get_article(self, url):
        """Download and store the article at `url`.

        Must be implemented by subclasses.
        """
        raise NotImplementedError

## Crawler para o blog da Penumbra Livros

In [3]:
class PenumbraCrawler(BaseCrawler):
    """Crawler for the Penumbra Livros blog."""

    def get_post_list(self, pg):
        """Collect article links from listing page `pg` and all following pages.

        Each listing page is fetched from '<base_url>/page/<pg>'. The first
        link of every <article class="post"> element is appended to
        `self.links_todo`. Pagination follows the link inside the
        <span class="nav-previous"> element until no such link is found.
        """
        # Iterate instead of recursing so the number of listing pages is not
        # bounded by the interpreter's recursion limit.
        while True:
            print('Getting page {:02d}'.format(pg), end='\t\t')
            url = '{}/page/{}'.format(self.base_url, pg)
            res = requests.get(url)

            soup = BeautifulSoup(res.text, 'html.parser')
            articles = soup.find_all('article', {'class': 'post'})

            for article in articles:
                anchor = article.find('a')
                # Skip posts without a usable link instead of failing later.
                if anchor is not None and anchor.get('href'):
                    self.links_todo.append(anchor.get('href'))

            print('Got {} articles links'.format(len(articles)))

            span = soup.find('span', {'class': 'nav-previous'})
            nav_link = span.find('a') if span is not None else None
            if nav_link is None:
                # Last page reached: report the total collected so far.
                print('Got {} articles links in total'.format(len(self.links_todo)))
                return

            # The page number is the second-to-last '/'-separated piece of the
            # href, i.e. the href is expected to end in '/<number>/'.
            pg = int(nav_link.get('href').split('/')[-2])

    def get_article(self, url):
        """Download `url`, strip boilerplate and write the text to the corpus.

        The output file is
        '<corpus_path>/<file_prefix>__<seg3>_<seg4>__<post_name>.txt', where
        seg3/seg4 are the 4th and 5th '/'-separated pieces of the URL
        (presumably date components of the permalink; verify against the
        site) and post_name is the last non-empty piece with any '.html'
        suffix dropped and '-' replaced by '_'.
        """
        print('getting page {}'.format(url))
        res = requests.get(url)

        parts = url.split('/')
        post_name = parts[-1] if parts[-1] != '' else parts[-2]
        post_name = post_name.split('.html')[0].replace("-", "_")
        fl_name = '{}__{}_{}__{}'.format(self.file_prefix, parts[3], parts[4], post_name)

        paragraphs = justext.justext(res.text, justext.get_stoplist("Portuguese"))

        # Explicit UTF-8 so accented text is written the same way on every platform.
        with open("{}/{}.txt".format(self.corpus_path, fl_name), "w", encoding="utf-8") as fl:
            for p in paragraphs:
                if not p.is_boilerplate:
                    fl.write(p.text + '\n')

In [4]:
# Crawl the Penumbra Livros blog into the 'ocultismo' corpus directory.
crawler = PenumbraCrawler(
    base_url='http://www.penumbralivros.com.br/home/blog/',
    domain='ocultismo',
    file_prefix='penumbralivros',
)
crawler.run()

Getting page 01		Got 20 articles links
Getting page 02		Got 20 articles links
Getting page 03		Got 20 articles links
Getting page 04		Got 20 articles links
Getting page 05		

KeyboardInterrupt: 

## Crawler para o blog do Deldebbio

In [5]:
class DeldebbioCrawler(BaseCrawler):
    """Crawler for category listings of the Deldebbio blog."""

    def get_post_list(self, pg):
        """Collect article links from listing page `pg` and all following pages.

        Each listing page is fetched from '<base_url>/page/<pg>'. The first
        link of every <div class="post"> element is appended to
        `self.links_todo`. Pagination follows the <a class="nextpostslink">
        element until none is found.
        """
        # Iterate instead of recursing so the number of listing pages is not
        # bounded by the interpreter's recursion limit.
        while True:
            print('Getting page {:02d}'.format(pg), end='\t\t')

            url = '{}/page/{}'.format(self.base_url, pg)
            res = requests.get(url)
            soup = BeautifulSoup(res.text, 'html.parser')

            articles = soup.find_all('div', {'class': 'post'})
            for article in articles:
                anchor = article.find('a')
                # Skip posts without a usable link instead of failing later.
                if anchor is not None and anchor.get('href'):
                    self.links_todo.append(anchor.get('href'))

            print('Got {} articles links'.format(len(articles)))

            next_link = soup.find('a', {'class': 'nextpostslink'})
            if next_link is None:
                # Last page reached: report the total collected so far.
                print('Got {} articles links in total'.format(len(self.links_todo)))
                return

            # The page number is the second-to-last '/'-separated piece of the
            # href, i.e. the href is expected to end in '/<number>/'.
            pg = int(next_link.get('href').split('/')[-2])

    def get_article(self, url):
        """Download `url`, strip boilerplate and write the text to the corpus.

        The output file is '<corpus_path>/<file_prefix>__<post_name>.txt',
        where post_name is the last non-empty '/'-separated piece of the URL
        with any '.html' suffix dropped and '-' replaced by '_'.
        """
        print('getting page {}'.format(url))
        res = requests.get(url)

        parts = url.split('/')
        post_name = parts[-1] if parts[-1] != '' else parts[-2]
        post_name = post_name.split('.html')[0].replace("-", "_")
        fl_name = '{}__{}'.format(self.file_prefix, post_name)

        paragraphs = justext.justext(res.text, justext.get_stoplist("Portuguese"))

        # Explicit UTF-8 so accented text is written the same way on every platform.
        with open("{}/{}.txt".format(self.corpus_path, fl_name), "w", encoding="utf-8") as fl:
            for p in paragraphs:
                if not p.is_boilerplate:
                    fl.write(p.text + '\n')

In [7]:
# Crawl the "magia-do-caos" category of the Deldebbio blog.
crawler = DeldebbioCrawler(
    base_url='https://www.deldebbio.com.br/category/colunas/magia-do-caos/',
    domain='ocultismo',
    file_prefix='deldebbio_caos',
)
crawler.run()

Getting page 01		Got 16 articles links
Getting page 02		Got 16 articles links
Getting page 03		Got 13 articles links
Got 45 articles links in total
pickle saved
getting page https://www.deldebbio.com.br/o-nirvana-caoista/
getting page https://www.deldebbio.com.br/o-mago-dos-jogos/


KeyboardInterrupt: 

In [9]:
# Crawl the "umbanda" category of the Deldebbio blog.
crawler = DeldebbioCrawler(
    base_url='https://www.deldebbio.com.br/category/religioes/umbanda/',
    domain='ocultismo',
    file_prefix='deldebbio_umbanda',
)
crawler.run()

getting page https://www.deldebbio.com.br/espacos-magicos-e-corrente/
getting page https://www.deldebbio.com.br/mito-da-criacao-ioruba/
getting page https://www.deldebbio.com.br/por-que-religar/
getting page https://www.deldebbio.com.br/reflexao-sobre-a-umbanda/
getting page https://www.deldebbio.com.br/mitologia-dos-orixas-exu-e-a-encruzilhada/
pickle saved
getting page https://www.deldebbio.com.br/rubens-saraceni/
getting page https://www.deldebbio.com.br/marcelo-del-debbio-na-radio-toques-de-aruanda/
getting page https://www.deldebbio.com.br/exu-mirim/
getting page https://www.deldebbio.com.br/contos-de-ifa/
getting page https://www.deldebbio.com.br/o-caminho-de-um-medium/
pickle saved
getting page https://www.deldebbio.com.br/taoismo-e-umbanda/
getting page https://www.deldebbio.com.br/podcast-conversa-entre-adeptus-26-vertentes-de-umbanda-parte-01/
getting page https://www.deldebbio.com.br/promocao-oxalanaminhacasa/
getting page https://www.deldebbio.com.br/as-7-linhas-de-umbanda-

In [None]:
# Scratch check: count the <div class="post"> elements in `soup`.
# NOTE(review): in the visible cells `soup` is only assigned as a local
# variable inside the crawlers' get_post_list methods, so this cell presumably
# relies on leftover kernel state and would fail on a fresh kernel; confirm
# and remove, or define `soup` first.
a = soup.find_all('div', { 'class': 'post' })
print(len(a))