In [None]:
import requests
import re
import os
import justext
import pickle
from bs4 import BeautifulSoup
from pprint import pprint

## Base Crawler

In [None]:
class BaseCrawler:
    
    def __init__(self, base_url, domain, file_prefix, savepoint=5):
        self.base_url = base_url
        self.corpus_path = '../data/corpora/{}'.format(domain)
        self.file_prefix = file_prefix
        self.links_todo = []
        self.links_done = []
        self.savepoint = savepoint
        self.pickle_dir = '.{}_links.pickle'.format(file_prefix)
        
        if os.path.isfile(self.pickle_dir):
            (self.links_done, self.links_todo) = pickle.load(open(self.pickle_dir, 'rb'))
        
        if not os.path.exists(self.corpus_path):
            os.makedirs(self.corpus_path)
            
    def save_pickle(self):
        pickle.dump((self.links_done, self.links_todo), open(self.pickle_dir, 'wb'))
        print('pickle saved')
    
    def run(self):
        if len(self.links_todo) < 1:
            self.get_post_list(1)
            self.save_pickle()
        
        l = [ x for x in self.links_todo if x not in self.links_done ]
        
        for i in range(len(l)):
            url = l[i]
            self.get_article(url)
            self.links_done.append(url)
            
            if (i+1) % self.savepoint == 0:
                self.save_pickle()
    
    def get_post_list(self, pg):
        raise NotImplementedError
    
    def get_article(self, url):
        raise NotImplementedError

## Crawler para o blog da Penumbra livros

In [None]:
class PenumbraCrawler(BaseCrawler):

    def get_post_list(self, pg):
        print('Getting page {:02d}'.format(pg), end='\t\t')
        url = '{}/page/{}'.format(self.base_url, pg)
        res = requests.get(url)

        soup = BeautifulSoup(res.text, 'html.parser')
        articles = soup.findAll('article', { 'class': 'post' })
        
        for article in articles:
            self.links_todo.append(article.find('a').get('href'))
        
        print('Got {} articles links'.format(len(articles)))
        
        span = soup.find('span', { 'class': 'nav-previous' })
        if span == None:
            print('Got {} articles links in total'.format(len(self.links)))
            return
        
        next_pg = span.find('a').get('href').split('/')[-2]
        self.get_post_list(int(next_pg))
    
    def get_article(self, url):
        print('getting page {}'.format(url))
        res = requests.get(url)
        
        l = url.split('/')
        post_name = l[-1] if l[-1] != '' else l[-2]
        post_name = post_name.split('.html')[0].replace("-", "_")
        fl_name = '{}__{}_{}__{}'.format(self.file_prefix, l[3], l[4], post_name)

        paragraphs = justext.justext(res.text, justext.get_stoplist("Portuguese"))
        
        with open("{}/{}.txt".format(self.corpus_path, fl_name), "w") as fl:
            for p in paragraphs:
                if not p.is_boilerplate:
                    fl.write(p.text + '\n')

In [None]:
crawler = PenumbraCrawler('http://www.penumbralivros.com.br/home/blog/', 'ocultismo', 'penumbralivros')
crawler.run()

## Crawler para o blog do Deldebbio

In [None]:
class DeldebbioCrawler(BaseCrawler):
    
    def get_post_list(self, pg):
        print('Getting page {:02d}'.format(pg), end='\t\t')
        
        url = '{}/page/{}'.format(self.base_url, pg)
        res = requests.get(url)
        soup = BeautifulSoup(res.text, 'html.parser')
        
        articles = soup.find_all('div', { 'class': 'post' })
        for article in articles:
            self.links_todo.append(article.find('a').get('href'))
            
        print('Got {} articles links'.format(len(articles)))
        
        span = soup.find('a', { 'class': 'nextpostslink' })
        if span == None:
            print('Got {} articles links in total'.format(len(self.links_todo)))
            return
        
        next_pg = span.get('href').split('/')[-2]
        self.get_post_list(int(next_pg))
    
    def get_article(self, url):
        print('getting page {}'.format(url))
        res = requests.get(url)
        
        l = url.split('/')
        post_name = l[-1] if l[-1] != '' else l[-2]
        post_name = post_name.split('.html')[0].replace("-", "_")
        fl_name = '{}__{}'.format(self.file_prefix, post_name)

        paragraphs = justext.justext(res.text, justext.get_stoplist("Portuguese"))
        
        with open("{}/{}.txt".format(self.corpus_path, fl_name), "w") as fl:
            for p in paragraphs:
                if not p.is_boilerplate:
                    fl.write(p.text + '\n')

In [None]:
crawler = DeldebbioCrawler(
    'https://www.deldebbio.com.br/category/colunas/magia-do-caos/',
    'ocultismo',
    'deldebbio_caos'
)
crawler.run()

In [None]:
crawler = DeldebbioCrawler(
    'https://www.deldebbio.com.br/category/religioes/umbanda/',
    'ocultismo',
    'deldebbio_umbanda'
)
crawler.run()

## NerdBunker - Games

In [24]:
class NerdBunkerCrawler(BaseCrawler):
    
    def get_post_list(self, pg):
        print('Getting page {:02d}'.format(pg), end='\t\t')
        
        url = '{}/?search=&category=games&page={}'.format(self.base_url, pg)
        res = requests.get(url)
        soup = BeautifulSoup(res.text, 'html.parser')
        
        articles = soup.find_all('article', { 'class': 'entry-card' })
        for article in articles:
            self.links_todo.append(article.find('h2').find('a').get('href'))
            
        print('Got {} articles links'.format(len(articles)))
        
        if len(self.links_todo) >= 500:
            print('Got {} articles links in total'.format(len(self.links_todo)))
            return
        
        next_pg = pg + 1
        self.get_post_list(int(next_pg))
    
    def get_article(self, url):
        print('getting page {}'.format(url))
        res = requests.get(url)
        
        l = url.split('/')
        post_name = l[-1] if l[-1] != '' else l[-2]
        post_name = post_name.split('.html')[0].replace("-", "_")
        fl_name = '{}__{}'.format(self.file_prefix, post_name)

        paragraphs = justext.justext(res.text, justext.get_stoplist("Portuguese"))
        
        with open("{}/{}.txt".format(self.corpus_path, fl_name), "w") as fl:
            for p in paragraphs:
                if not p.is_boilerplate:
                    fl.write(p.text + '\n')

In [25]:
crawler = NerdBunkerCrawler(
    'https://jovemnerd.com.br/nerdbunker/categoria/games/',
    'games',
    'nerdbunker'
)
crawler.run()

Getting page 01		Got 10 articles links
Getting page 02		Got 10 articles links
Getting page 03		Got 10 articles links
Getting page 04		Got 10 articles links
Getting page 05		Got 10 articles links
Getting page 06		Got 10 articles links
Getting page 07		Got 10 articles links
Getting page 08		Got 10 articles links
Getting page 09		Got 10 articles links
Getting page 10		Got 10 articles links
Getting page 11		Got 10 articles links
Getting page 12		Got 10 articles links
Getting page 13		Got 10 articles links
Getting page 14		Got 10 articles links
Getting page 15		Got 10 articles links
Getting page 16		Got 10 articles links
Getting page 17		Got 10 articles links
Getting page 18		Got 10 articles links
Getting page 19		Got 10 articles links
Getting page 20		Got 10 articles links
Getting page 21		Got 10 articles links
Getting page 22		Got 10 articles links
Getting page 23		Got 10 articles links
Getting page 24		Got 10 articles links
Getting page 25		Got 10 articles links
Getting page 26		Got 10 a

getting page https://jovemnerd.com.br/nerdbunker/jogador-zera-sekiro-bongos-donkey-kong/
getting page https://jovemnerd.com.br/nerdbunker/dragon-ball-fighterz-proximo-dlc-goku-dragon-ball-gt/
getting page https://jovemnerd.com.br/nerdbunker/evento-crossover-fortnite-vingadores-teaser-homem-de-ferro/
pickle saved
getting page https://jovemnerd.com.br/nerdbunker/apex-legends-perdeu-popularidade/
getting page https://jovemnerd.com.br/nerdbunker/cyberpunk-2077-inspirado-deus-ex-vampire-the-masquerade/
getting page https://jovemnerd.com.br/nerdbunker/teaser-crossover-fortnite-vingadores-ultimato-martelo-thor/
getting page https://jovemnerd.com.br/nerdbunker/promocao-xbox-game-pass-assinatura-tres-meses-r-1/
getting page https://jovemnerd.com.br/nerdbunker/novo-trailer-mortal-kombat-11-frost/
pickle saved
getting page https://jovemnerd.com.br/nerdbunker/mortal-kombat-11-melhor-lancamento-historia-franquia/
getting page https://jovemnerd.com.br/nerdbunker/overwatch-ferramenta-criar-herois-mod

getting page https://jovemnerd.com.br/nerdbunker/evento-crossover-fortnite-vingadores-teaser-homem-de-ferro/
pickle saved
getting page https://jovemnerd.com.br/nerdbunker/apex-legends-perdeu-popularidade/
getting page https://jovemnerd.com.br/nerdbunker/cyberpunk-2077-inspirado-deus-ex-vampire-the-masquerade/
getting page https://jovemnerd.com.br/nerdbunker/teaser-crossover-fortnite-vingadores-ultimato-martelo-thor/
getting page https://jovemnerd.com.br/nerdbunker/promocao-xbox-game-pass-assinatura-tres-meses-r-1/
getting page https://jovemnerd.com.br/nerdbunker/novo-trailer-mortal-kombat-11-frost/
pickle saved
getting page https://jovemnerd.com.br/nerdbunker/mortal-kombat-11-melhor-lancamento-historia-franquia/
getting page https://jovemnerd.com.br/nerdbunker/overwatch-ferramenta-criar-herois-modos-de-jogo/
getting page https://jovemnerd.com.br/nerdbunker/jogador-zera-sekiro-bongos-donkey-kong/
getting page https://jovemnerd.com.br/nerdbunker/dragon-ball-fighterz-proximo-dlc-goku-drag

pickle saved
getting page https://jovemnerd.com.br/nerdbunker/apex-legends-perdeu-popularidade/
getting page https://jovemnerd.com.br/nerdbunker/cyberpunk-2077-inspirado-deus-ex-vampire-the-masquerade/
getting page https://jovemnerd.com.br/nerdbunker/teaser-crossover-fortnite-vingadores-ultimato-martelo-thor/
getting page https://jovemnerd.com.br/nerdbunker/promocao-xbox-game-pass-assinatura-tres-meses-r-1/
getting page https://jovemnerd.com.br/nerdbunker/novo-trailer-mortal-kombat-11-frost/
pickle saved
getting page https://jovemnerd.com.br/nerdbunker/mortal-kombat-11-melhor-lancamento-historia-franquia/
getting page https://jovemnerd.com.br/nerdbunker/overwatch-ferramenta-criar-herois-modos-de-jogo/
getting page https://jovemnerd.com.br/nerdbunker/jogador-zera-sekiro-bongos-donkey-kong/
getting page https://jovemnerd.com.br/nerdbunker/dragon-ball-fighterz-proximo-dlc-goku-dragon-ball-gt/
getting page https://jovemnerd.com.br/nerdbunker/evento-crossover-fortnite-vingadores-teaser-home

getting page https://jovemnerd.com.br/nerdbunker/teaser-crossover-fortnite-vingadores-ultimato-martelo-thor/
getting page https://jovemnerd.com.br/nerdbunker/promocao-xbox-game-pass-assinatura-tres-meses-r-1/
getting page https://jovemnerd.com.br/nerdbunker/novo-trailer-mortal-kombat-11-frost/
pickle saved
getting page https://jovemnerd.com.br/nerdbunker/mortal-kombat-11-melhor-lancamento-historia-franquia/
getting page https://jovemnerd.com.br/nerdbunker/overwatch-ferramenta-criar-herois-modos-de-jogo/
getting page https://jovemnerd.com.br/nerdbunker/jogador-zera-sekiro-bongos-donkey-kong/
getting page https://jovemnerd.com.br/nerdbunker/dragon-ball-fighterz-proximo-dlc-goku-dragon-ball-gt/
getting page https://jovemnerd.com.br/nerdbunker/evento-crossover-fortnite-vingadores-teaser-homem-de-ferro/
pickle saved
getting page https://jovemnerd.com.br/nerdbunker/apex-legends-perdeu-popularidade/
getting page https://jovemnerd.com.br/nerdbunker/cyberpunk-2077-inspirado-deus-ex-vampire-the-

getting page https://jovemnerd.com.br/nerdbunker/promocao-xbox-game-pass-assinatura-tres-meses-r-1/
getting page https://jovemnerd.com.br/nerdbunker/novo-trailer-mortal-kombat-11-frost/
pickle saved
getting page https://jovemnerd.com.br/nerdbunker/mortal-kombat-11-melhor-lancamento-historia-franquia/
getting page https://jovemnerd.com.br/nerdbunker/overwatch-ferramenta-criar-herois-modos-de-jogo/
getting page https://jovemnerd.com.br/nerdbunker/jogador-zera-sekiro-bongos-donkey-kong/
getting page https://jovemnerd.com.br/nerdbunker/dragon-ball-fighterz-proximo-dlc-goku-dragon-ball-gt/
getting page https://jovemnerd.com.br/nerdbunker/evento-crossover-fortnite-vingadores-teaser-homem-de-ferro/
pickle saved
getting page https://jovemnerd.com.br/nerdbunker/apex-legends-perdeu-popularidade/
getting page https://jovemnerd.com.br/nerdbunker/cyberpunk-2077-inspirado-deus-ex-vampire-the-masquerade/
getting page https://jovemnerd.com.br/nerdbunker/teaser-crossover-fortnite-vingadores-ultimato-ma

getting page https://jovemnerd.com.br/nerdbunker/novo-trailer-mortal-kombat-11-frost/
pickle saved
getting page https://jovemnerd.com.br/nerdbunker/mortal-kombat-11-melhor-lancamento-historia-franquia/
getting page https://jovemnerd.com.br/nerdbunker/overwatch-ferramenta-criar-herois-modos-de-jogo/
getting page https://jovemnerd.com.br/nerdbunker/jogador-zera-sekiro-bongos-donkey-kong/
getting page https://jovemnerd.com.br/nerdbunker/dragon-ball-fighterz-proximo-dlc-goku-dragon-ball-gt/
getting page https://jovemnerd.com.br/nerdbunker/evento-crossover-fortnite-vingadores-teaser-homem-de-ferro/
pickle saved
getting page https://jovemnerd.com.br/nerdbunker/apex-legends-perdeu-popularidade/
getting page https://jovemnerd.com.br/nerdbunker/cyberpunk-2077-inspirado-deus-ex-vampire-the-masquerade/
getting page https://jovemnerd.com.br/nerdbunker/teaser-crossover-fortnite-vingadores-ultimato-martelo-thor/
getting page https://jovemnerd.com.br/nerdbunker/promocao-xbox-game-pass-assinatura-tres

In [11]:
url = 'https://jovemnerd.com.br/nerdbunker/categoria/games/?search=&category=games&page=1'
res = requests.get(url)
soup = BeautifulSoup(res.text, 'html.parser')

In [23]:
soup.find_all('article', { 'class': 'entry-card' })[0].find('h2').find('a').get('href')

'https://jovemnerd.com.br/nerdbunker/mortal-kombat-11-melhor-lancamento-historia-franquia/'