In [1]:
import requests
import re
import os
import justext
import pickle
from bs4 import BeautifulSoup
from pprint import pprint

## Base Crawler

In [2]:
class BaseCrawler:
    """Resumable crawler skeleton.

    Subclasses implement ``get_post_list`` (fill ``self.links_todo`` with
    article URLs) and ``get_article`` (download and store one article).
    Progress is kept in a pickle file in the current working directory so a
    later run can skip the URLs that were already processed.

    Parameters:
        base_url: root URL handed to the subclass for building page URLs.
        domain: name of the sub-directory of ``../data/corpora`` that
            receives the downloaded texts.
        file_prefix: prefix used for the state file name and made available
            to subclasses as ``self.file_prefix``.
        savepoint: the state file is rewritten after every ``savepoint``
            processed articles. ``0`` or ``None`` disables the periodic
            saves (state is then only written when ``run`` finishes).
    """

    def __init__(self, base_url, domain, file_prefix, savepoint=5):
        self.base_url = base_url
        self.corpus_path = '../data/corpora/{}'.format(domain)
        self.file_prefix = file_prefix
        self.links_todo = []
        self.links_done = []
        self.savepoint = savepoint
        # Despite the name this is the path of the state *file*.
        self.pickle_dir = '.{}_links.pickle'.format(file_prefix)

        if os.path.isfile(self.pickle_dir):
            # NOTE(security): pickle.load can execute arbitrary code; only
            # resume from state files written by this crawler.
            with open(self.pickle_dir, 'rb') as fl:
                (self.links_done, self.links_todo) = pickle.load(fl)

        # exist_ok avoids the race between an exists() check and makedirs().
        os.makedirs(self.corpus_path, exist_ok=True)

    def save_pickle(self):
        """Write ``(links_done, links_todo)`` to the state file."""
        with open(self.pickle_dir, 'wb') as fl:
            pickle.dump((self.links_done, self.links_todo), fl)
        print('pickle saved')

    def run(self):
        """Collect the link list if needed, then fetch every pending article.

        The state file is saved every ``savepoint`` articles and once more
        when the loop ends (normally or through an exception), so articles
        fetched after the last savepoint are not downloaded again next time.
        """
        if len(self.links_todo) < 1:
            self.get_post_list(1)
            self.save_pickle()

        # Pending = todo minus done, in original order, each URL only once.
        seen = set(self.links_done)
        pending = []
        for url in self.links_todo:
            if url not in seen:
                seen.add(url)
                pending.append(url)

        unsaved = False
        try:
            for count, url in enumerate(pending, start=1):
                self.get_article(url)
                self.links_done.append(url)
                unsaved = True

                # Guard against savepoint == 0 / None (would divide by zero).
                if self.savepoint and self.savepoint > 0 and count % self.savepoint == 0:
                    self.save_pickle()
                    unsaved = False
        finally:
            # Persist the tail of the run, including on errors/interrupts.
            if unsaved:
                self.save_pickle()

    def get_post_list(self, pg):
        """Fill ``self.links_todo`` starting from index page ``pg`` (abstract)."""
        raise NotImplementedError

    def get_article(self, url):
        """Download and store the article at ``url`` (abstract)."""
        raise NotImplementedError

## Crawler para o blog da Penumbra Livros

In [None]:
class PenumbraCrawler(BaseCrawler):
    """Crawler for the Penumbra Livros blog.

    Collects post links from the paginated blog index and writes the
    non-boilerplate text of every post to a ``.txt`` file inside
    ``self.corpus_path``.
    """

    def get_post_list(self, pg):
        """Append post links to ``self.links_todo``, starting at index page ``pg``.

        Each index page contributes the first link of every
        ``<article class="post">``. The next page number is read from the
        link inside ``<span class="nav-previous">``; the walk ends at the
        first page that has no such element.
        """
        visited = set()
        # Iterative pagination: no recursion-depth limit on long blogs, and
        # a page that links back to an already visited page cannot loop.
        while pg not in visited:
            visited.add(pg)
            print('Getting page {:02d}'.format(pg), end='\t\t')
            url = '{}/page/{}'.format(self.base_url, pg)
            res = requests.get(url)

            soup = BeautifulSoup(res.text, 'html.parser')
            articles = soup.find_all('article', {'class': 'post'})

            for article in articles:
                self.links_todo.append(article.find('a').get('href'))

            print('Got {} articles links'.format(len(articles)))

            span = soup.find('span', {'class': 'nav-previous'})
            if span is None:
                break

            # The page number is the second-to-last '/'-separated piece of
            # the href (the href is expected to end with a slash).
            pg = int(span.find('a').get('href').split('/')[-2])

        print('Got {} articles links in total'.format(len(self.links_todo)))

    def get_article(self, url):
        """Download ``url`` and store its non-boilerplate paragraphs.

        The output file is named
        ``<file_prefix>__<seg3>_<seg4>__<post_name>.txt`` where ``seg3`` and
        ``seg4`` are items 3 and 4 of ``url.split('/')``
        (presumably date components of the permalink — TODO confirm).
        """
        print('getting page {}'.format(url))
        res = requests.get(url)

        parts = url.split('/')
        # A trailing slash leaves an empty last piece; use the one before it.
        post_name = parts[-1] if parts[-1] != '' else parts[-2]
        post_name = post_name.split('.html')[0].replace("-", "_")
        fl_name = '{}__{}_{}__{}'.format(self.file_prefix, parts[3], parts[4], post_name)

        paragraphs = justext.justext(res.text, justext.get_stoplist("Portuguese"))

        # Explicit UTF-8 so accented text is written identically on every OS.
        out_path = "{}/{}.txt".format(self.corpus_path, fl_name)
        with open(out_path, "w", encoding="utf-8") as fl:
            for p in paragraphs:
                if not p.is_boilerplate:
                    fl.write(p.text + '\n')

In [None]:
# Crawl the Penumbra Livros blog into the 'ocultismo' corpus.
crawler = PenumbraCrawler(
    base_url='http://www.penumbralivros.com.br/home/blog/',
    domain='ocultismo',
    file_prefix='penumbralivros',
)
crawler.run()

## Crawler para o blog do Deldebbio

In [None]:
class DeldebbioCrawler(BaseCrawler):
    """Crawler for category listings of the Deldebbio blog.

    Collects post links from a paginated category index and writes the
    non-boilerplate text of every post to a ``.txt`` file inside
    ``self.corpus_path``.
    """

    def get_post_list(self, pg):
        """Append post links to ``self.links_todo``, starting at index page ``pg``.

        Each index page contributes the first link of every
        ``<div class="post">``. The next page number is read from the
        ``<a class="nextpostslink">`` element; the walk ends at the first
        page that has no such element.
        """
        visited = set()
        # Iterative pagination: no recursion-depth limit on long categories,
        # and a page that links back to a visited page cannot loop forever.
        while pg not in visited:
            visited.add(pg)
            print('Getting page {:02d}'.format(pg), end='\t\t')

            url = '{}/page/{}'.format(self.base_url, pg)
            res = requests.get(url)
            soup = BeautifulSoup(res.text, 'html.parser')

            articles = soup.find_all('div', {'class': 'post'})
            for article in articles:
                self.links_todo.append(article.find('a').get('href'))

            print('Got {} articles links'.format(len(articles)))

            next_link = soup.find('a', {'class': 'nextpostslink'})
            if next_link is None:
                break

            # The page number is the second-to-last '/'-separated piece of
            # the href (the href is expected to end with a slash).
            pg = int(next_link.get('href').split('/')[-2])

        print('Got {} articles links in total'.format(len(self.links_todo)))

    def get_article(self, url):
        """Download ``url`` and store its non-boilerplate paragraphs.

        The output file is named ``<file_prefix>__<post_name>.txt`` where
        ``post_name`` is the last non-empty path piece of the URL with any
        ``.html`` suffix removed and ``-`` replaced by ``_``.
        """
        print('getting page {}'.format(url))
        res = requests.get(url)

        parts = url.split('/')
        # A trailing slash leaves an empty last piece; use the one before it.
        post_name = parts[-1] if parts[-1] != '' else parts[-2]
        post_name = post_name.split('.html')[0].replace("-", "_")
        fl_name = '{}__{}'.format(self.file_prefix, post_name)

        paragraphs = justext.justext(res.text, justext.get_stoplist("Portuguese"))

        # Explicit UTF-8 so accented text is written identically on every OS.
        out_path = "{}/{}.txt".format(self.corpus_path, fl_name)
        with open(out_path, "w", encoding="utf-8") as fl:
            for p in paragraphs:
                if not p.is_boilerplate:
                    fl.write(p.text + '\n')

In [None]:
# Crawl the "magia do caos" category of the Deldebbio blog into 'ocultismo'.
crawler = DeldebbioCrawler(
    base_url='https://www.deldebbio.com.br/category/colunas/magia-do-caos/',
    domain='ocultismo',
    file_prefix='deldebbio_caos',
)
crawler.run()

In [None]:
# Crawl the "umbanda" category of the Deldebbio blog into 'ocultismo'.
crawler = DeldebbioCrawler(
    base_url='https://www.deldebbio.com.br/category/religioes/umbanda/',
    domain='ocultismo',
    file_prefix='deldebbio_umbanda',
)
crawler.run()

## Crawler para o Tecnoblog

In [5]:
class TecnoblogCrawler(BaseCrawler):
    """Crawler for category listings of Tecnoblog.

    Walks the category index page by page until ``max_links`` links have
    been collected (or the listing runs out), then writes the
    non-boilerplate text of every post to a ``.txt`` file inside
    ``self.corpus_path``.
    """

    # Stop collecting once this many links are queued. Can be overridden on
    # a subclass or an instance before calling ``run``.
    max_links = 500

    def get_post_list(self, pg):
        """Append post links to ``self.links_todo``, starting at index page ``pg``.

        Each index page contributes the first link of every ``<article>``.
        Pages are visited in increasing order until ``max_links`` links are
        queued or a page yields no articles.
        """
        while True:
            print('Getting page {:02d}'.format(pg), end='\t\t')

            url = '{}/page/{}'.format(self.base_url, pg)
            res = requests.get(url)
            soup = BeautifulSoup(res.text, 'html.parser')

            articles = soup.find_all('article')
            for article in articles:
                self.links_todo.append(article.find('a').get('href'))

            print('Got {} articles links'.format(len(articles)))

            # An empty page means the listing is exhausted; without this
            # check a category with fewer than max_links posts would be
            # requested page after page without ever stopping.
            if not articles or len(self.links_todo) >= self.max_links:
                print('Got {} articles links in total'.format(len(self.links_todo)))
                return

            pg += 1

    def get_article(self, url):
        """Download ``url`` and store its non-boilerplate paragraphs.

        The output file is named ``<file_prefix>__<post_name>.txt`` where
        ``post_name`` is the last non-empty path piece of the URL with any
        ``.html`` suffix removed and ``-`` replaced by ``_``.
        """
        print('getting page {}'.format(url))
        res = requests.get(url)

        parts = url.split('/')
        # A trailing slash leaves an empty last piece; use the one before it.
        post_name = parts[-1] if parts[-1] != '' else parts[-2]
        post_name = post_name.split('.html')[0].replace("-", "_")
        fl_name = '{}__{}'.format(self.file_prefix, post_name)

        paragraphs = justext.justext(res.text, justext.get_stoplist("Portuguese"))

        # Explicit UTF-8 so accented text is written identically on every OS.
        out_path = "{}/{}.txt".format(self.corpus_path, fl_name)
        with open(out_path, "w", encoding="utf-8") as fl:
            for p in paragraphs:
                if not p.is_boilerplate:
                    fl.write(p.text + '\n')

In [8]:
# Crawl the "computador e PC" category of Tecnoblog into 'tecnologia'.
crawler = TecnoblogCrawler(
    base_url='https://tecnoblog.net/cat/computador-e-pc/',
    domain='tecnologia',
    file_prefix='tecnoblog',
)
crawler.run()

Getting page 01		Got 12 articles links
Getting page 02		Got 12 articles links
Getting page 03		Got 12 articles links
Getting page 04		Got 12 articles links
Getting page 05		Got 12 articles links
Getting page 06		Got 12 articles links
Getting page 07		Got 12 articles links
Getting page 08		Got 12 articles links
Getting page 09		Got 12 articles links
Getting page 10		Got 12 articles links
Getting page 11		Got 12 articles links
Getting page 12		Got 12 articles links
Getting page 13		Got 12 articles links
Getting page 14		Got 12 articles links
Getting page 15		Got 12 articles links
Getting page 16		Got 12 articles links
Getting page 17		Got 12 articles links
Getting page 18		Got 12 articles links
Getting page 19		Got 12 articles links
Getting page 20		Got 12 articles links
Getting page 21		Got 12 articles links
Getting page 22		Got 12 articles links
Getting page 23		Got 12 articles links
Getting page 24		Got 12 articles links
Getting page 25		Got 12 articles links
Getting page 26		Got 12 a

getting page https://tecnoblog.net/273899/acer-swift-7-notebook-ces19/
getting page https://tecnoblog.net/273889/asus-zenbook-s13-notebook-ces19/
getting page https://tecnoblog.net/273861/alienware-m17-m15-ces19/
pickle saved
getting page https://tecnoblog.net/272836/microsoft-webcam-windows-hello/
getting page https://tecnoblog.net/272681/gadgets-retrospectiva-2018/
getting page https://tecnoblog.net/272612/amd-athlon-220ge-240ge/
getting page https://tecnoblog.net/272427/como-usar-dois-monitores/
getting page https://tecnoblog.net/272044/microsoft-retrospectiva-2018/
pickle saved
getting page https://tecnoblog.net/272233/google-fotos-limite-20-mil/
getting page https://tecnoblog.net/272153/google-chrome-botao-voltar/
getting page https://tecnoblog.net/271907/malware-twitter-meme/
getting page https://tecnoblog.net/271871/apple-retrospectiva-2018/
getting page https://tecnoblog.net/271529/lg-gram-2018/
pickle saved
getting page https://tecnoblog.net/271518/notebook-9-pen-ultrabook-sam

pickle saved
getting page https://tecnoblog.net/257557/microsoft-atualizacao-kb4100347-spectre-bug-boot-intel-amd/
getting page https://tecnoblog.net/257220/intel-microsoft-arm-surface-go/
getting page https://tecnoblog.net/257030/nvidia-rtx-2080-versus-gtx-1080/
getting page https://tecnoblog.net/256519/macbook-air-retina-mac-mini/
getting page https://tecnoblog.net/256411/nvidia-geforce-rtx-2080-ti-placa-video/
pickle saved
getting page https://tecnoblog.net/256282/nvidia-queda-acoes-mineradores-criptomoeda/
getting page https://tecnoblog.net/256010/arm-processadores-desempenho-intel/
getting page https://tecnoblog.net/255795/microsoft-onedrive-backup-nuvem-area-trabalho/
getting page https://tecnoblog.net/255760/intel-core-xeon-falha-foreshadow-f1tf/
getting page https://tecnoblog.net/255607/nvidia-turing-quadro-rtx-8000/
pickle saved
getting page https://tecnoblog.net/255504/google-chromebooks-windows-10/
getting page https://tecnoblog.net/255497/thinkpad-p1-workstation-fina/
getti

getting page https://tecnoblog.net/234400/vaio-s11-s13-notebook-lancamento-brasil/
getting page https://tecnoblog.net/234358/windows-10-mais-rapido-ultimate-performance/
pickle saved
getting page https://tecnoblog.net/234349/intel-programa-cacadores-bugs/
getting page https://tecnoblog.net/234175/amd-ryzen-gpu-desempenho/
getting page https://tecnoblog.net/233762/lenovo-recall-thinkpad-x1-carbon/
getting page https://tecnoblog.net/232985/ssd-samsung-860-pro-evo/
getting page https://tecnoblog.net/232793/intel-interrompe-correcao-spectre/
pickle saved
getting page https://tecnoblog.net/232733/microsoft-laptops-educacionais-2018/
getting page https://tecnoblog.net/232667/cameras-intel-realsense/
getting page https://tecnoblog.net/232590/samsung-memorias-gddr6/
getting page https://tecnoblog.net/232551/intel-meltdown-spectre-reboot/
getting page https://tecnoblog.net/232275/raspberry-pi-zero-wh-gpio/
pickle saved
getting page https://tecnoblog.net/231368/acer-aspire-vx-15-notebook-review/

getting page https://tecnoblog.net/217853/asus-gpu-criptomoeda/
getting page https://tecnoblog.net/217814/amd-ryzen-3-especificacoes/
getting page https://tecnoblog.net/217580/falha-skylake-kaby-lake-ht/
getting page https://tecnoblog.net/216674/samsung-style-2-em-1-notebook-review/
pickle saved
getting page https://tecnoblog.net/217420/formatar-pen-drive-pc/
getting page https://tecnoblog.net/217240/amd-epyc-32-nucleos/
getting page https://tecnoblog.net/217241/intel-joule-edison-galileu-raspberry-pi-rip/
getting page https://tecnoblog.net/217036/surface-laptop-ifixit-reparabilidade/
getting page https://tecnoblog.net/216963/mudar-wallpaper-windows-mac/
pickle saved
getting page https://tecnoblog.net/216966/microsoft-modern-keyboard-teclado-leitor-impressoes-digitais/
getting page https://tecnoblog.net/216466/overclock-helio-liquido-intel/
getting page https://tecnoblog.net/216481/intel-qualcomm-windows-10-arm-briga/
getting page https://tecnoblog.net/216330/detalhes-intel-compute-car

In [None]:
# Scratch cell: fetch and parse the first listing page to inspect its markup.
url = 'https://tecnoblog.net/cat/computador-e-pc/page/1/'
res = requests.get(url)
soup = BeautifulSoup(markup=res.text, features='html.parser')

In [None]:
# Scratch cell: print the first link found inside every <article> element.
for article in soup.find_all(name='article'):
    first_anchor = article.find(name='a')
    a = first_anchor.get('href')
    print(a)
    