In [1]:
import pandas as pd
import httpx
import pymupdf
from bs4 import BeautifulSoup, SoupStrainer
from trafilatura import extract

In [2]:
def get_page_links(response, actual_links):
    page_links = [link['href'] for link in BeautifulSoup(response.text, 'html.parser', parse_only=SoupStrainer('a')) if link.get('href') and 'olavodecarvalho.org' in link.get('href') and link.get('href') not in actual_links]
    page_links = pd.Series(page_links)
    return page_links

In [3]:
def get_page_title(response):
    soup = BeautifulSoup(response.text, 'html.parser')
    title = soup.find('h1', {'class': 'single-pagetitle'})
    if title:
        return title.text.strip()
    else:
        return None

In [4]:
def get_date(response):
    soup = BeautifulSoup(response.text, 'html.parser')
    metadata = soup.find('section', {'class': 'postmetadata'})
    if metadata:
        return metadata.find_all('span')[0].text.strip()
    else:
        return None

In [5]:
def get_category(response):
    soup = BeautifulSoup(response.text, 'html.parser')
    metadata = soup.find('section', {'class': 'postmetadata'})
    if metadata:
        return metadata.find_all('span')[2].text.strip()
    else:
        return None

In [6]:
client = httpx.Client(follow_redirects=True)

In [7]:
df_content = pd.DataFrame(columns=['link', 'content_type', 'html', 'title', 'text', 'date', 'category'])
df_content

Unnamed: 0,link,content_type,html,title,text,date,category


In [8]:
links = ['https://olavodecarvalho.org']
links

['https://olavodecarvalho.org']

In [9]:
contador = 1
for link in links:
    print(f'Extraindo link: {link} / Link extraído: {contador} / Links contabilizados: {len(links)}')

    if link not in df_content.link:

        try:
            response = client.get(link)
        except:
            response = httpx.Response(status_code=999, headers={'content-type': 'error'})
    
        new_links = get_page_links(response=response, actual_links=links)
        links.extend(new_links)
    
        if 'text/html' in response.headers.get('content-type'):
            
            title = get_page_title(response)
            text = extract(response.text)
            date = get_date(response)
            category = get_category(response)
    
            df_in_context = pd.DataFrame([[link, response.headers.get('content-type'), response.text, title, text, date, category]], columns=['link', 'content_type', 'html', 'title', 'text', 'date', 'category'])
            df_content = pd.concat([df_content, df_in_context], ignore_index=True).reset_index(drop=True)

        elif 'application/pdf' in response.headers.get('content-type'):
            
            title = None
            
            # Text
            text = ''
            pdf = pymupdf.open(stream=response.content, filetype='pdf')
            for page in pdf:
                text += page.get_text()
                
            date = None
            category = None

            df_in_context = pd.DataFrame([[link, response.headers.get('content-type'), None, None, text, None, None]], columns=['link', 'content_type', 'html', 'title', 'text', 'date', 'category'])
            df_content = pd.concat([df_content, df_in_context], ignore_index=True).reset_index(drop=True)
            
        else:
            
            df_in_context = pd.DataFrame([[link, response.headers.get('content-type'), None, None, None, None, None]], columns=['link', 'content_type', 'html', 'title', 'text', 'date', 'category'])
            df_content = pd.concat([df_content, df_in_context], ignore_index=True).reset_index(drop=True)

        contador += 1

Extraindo link: https://olavodecarvalho.org / Link extraído: 1 / Links contabilizados: 1
Extraindo link: https://olavodecarvalho.org/ / Link extraído: 2 / Links contabilizados: 47
Extraindo link: https://olavodecarvalho.org/category/diario/ / Link extraído: 3 / Links contabilizados: 47
Extraindo link: https://olavodecarvalho.org/category/artigos/ / Link extraído: 4 / Links contabilizados: 58
Extraindo link: https://olavodecarvalho.org/category/apostilas/ / Link extraído: 5 / Links contabilizados: 84
Extraindo link: https://olavodecarvalho.org/category/leituras/ / Link extraído: 6 / Links contabilizados: 94
Extraindo link: https://olavodecarvalho.org/category/trechos-de-livros/ / Link extraído: 7 / Links contabilizados: 119
Extraindo link: https://olavodecarvalho.org/canal-de-video/ / Link extraído: 8 / Links contabilizados: 130
Extraindo link: https://olavodecarvalho.org/category/english/ / Link extraído: 9 / Links contabilizados: 130
Extraindo link: https://olavodecarvalho.org/categor


Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  page_links = [link['href'] for link in BeautifulSoup(response.text, 'html.parser', parse_only=SoupStrainer('a')) if link.get('href') and 'olavodecarvalho.org' in link.get('href') and link.get('href') not in actual_links]


Extraindo link: https://olavodecarvalho.org/comments/feed/ / Link extraído: 47 / Links contabilizados: 297
Extraindo link: https://olavodecarvalho.org/negociando-o-inegociavel/ / Link extraído: 48 / Links contabilizados: 297
Extraindo link: https://olavodecarvalho.org/cristianismo/ / Link extraído: 49 / Links contabilizados: 297
Extraindo link: https://olavodecarvalho.org/category/diario/page/2/ / Link extraído: 50 / Links contabilizados: 297
Extraindo link: https://olavodecarvalho.org/category/diario/page/3/ / Link extraído: 51 / Links contabilizados: 297
Extraindo link: https://olavodecarvalho.org/category/diario/page/4/ / Link extraído: 52 / Links contabilizados: 297
Extraindo link: https://olavodecarvalho.org/category/diario/page/5/ / Link extraído: 53 / Links contabilizados: 299
Extraindo link: https://olavodecarvalho.org/category/diario/page/10/ / Link extraído: 54 / Links contabilizados: 303
Extraindo link: https://olavodecarvalho.org/category/diario/page/20/ / Link extraído: 55

In [10]:
df_content

Unnamed: 0,link,content_type,html,title,text,date,category
0,https://olavodecarvalho.org,text/html; charset=UTF-8,"<!DOCTYPE html>\r\n<html lang=""pt-BR"" xmlns:fb...",,Prof. Valdemar Munaro\nEste nobre e generoso e...,Em 13 de março de 2022,Artigos de terceiros
1,https://olavodecarvalho.org/,text/html; charset=UTF-8,"<!DOCTYPE html>\r\n<html lang=""pt-BR"" xmlns:fb...",,Prof. Valdemar Munaro\nEste nobre e generoso e...,Em 13 de março de 2022,Artigos de terceiros
2,https://olavodecarvalho.org/category/diario/,text/html; charset=UTF-8,"<!DOCTYPE html>\r\n<html lang=""pt-BR"" xmlns:fb...",,Em 24 de dezembro de 2021 / Diário\nQuando dig...,Em 25 de janeiro de 2022,Diário
3,https://olavodecarvalho.org/category/artigos/,text/html; charset=UTF-8,"<!DOCTYPE html>\r\n<html lang=""pt-BR"" xmlns:fb...",,O Oriente contra Ocidente\nPor Olavo de Carval...,Em 7 de março de 2021,Artigos
4,https://olavodecarvalho.org/category/apostilas/,text/html; charset=UTF-8,"<!DOCTYPE html>\r\n<html lang=""pt-BR"" xmlns:fb...",,Tema para desenvolvimento em classe no Seminár...,Em 21 de dezembro de 2003,Apostilas
...,...,...,...,...,...,...,...
11302,https://olavodecarvalho.org/page/764/,text/html; charset=UTF-8,"<!DOCTYPE html>\r\n<html lang=""pt-BR"" xmlns:fb...",,Olavo de Carvalho\n5 de outubro de 1999\nNum r...,Em 5 de outubro de 1999,Leituras
11303,https://olavodecarvalho.org/page/745/,text/html; charset=UTF-8,"<!DOCTYPE html>\r\n<html lang=""pt-BR"" xmlns:fb...",,Olavo de Carvalho\nNota para uma das próximas ...,Em 10 de maio de 2000,Apostilas
11304,https://olavodecarvalho.org/page/766/,text/html; charset=UTF-8,"<!DOCTYPE html>\r\n<html lang=""pt-BR"" xmlns:fb...",,OLAVO DE CARVALHO\nTranslated by Marcelo De Po...,Em 16 de setembro de 1999,English
11305,https://olavodecarvalho.org/page/755/,text/html; charset=UTF-8,"<!DOCTYPE html>\r\n<html lang=""pt-BR"" xmlns:fb...",,"Olavo de Carvalho\nBravo!, fevereiro de 2000\n...",Em 10 de fevereiro de 2000,Artigos


In [12]:
df_content.to_parquet('olavodecarvalho_org.parquet')