In [1]:
import json
import requests
from bs4 import BeautifulSoup
import re

In [2]:
# Translation table from the field labels read off an item page (left) to the
# camelCase keys used in the parsed records (right). get_pdf() looks labels up
# here; insertion order follows the listing below.
_LABEL_KEY_PAIRS = [
    ("Type", "type"),
    ("Title", "title"),
    ("Other Titles", "otherTitles"),
    ("Author(s)/Inventor(s)", "author"),
    ("Advisor", "advisor"),
    ("Co-advisor", "coAdvisor"),
    ("Abstract", "abstract"),
    ("AbstractEnglish", "abstractEnglish"),
    ("Keywords", "keywords"),
    ("Subject CNPq", "subjectCnpq"),
    ("Program", "program"),
    ("Production unit", "productionUnit"),
    ("Publisher", "publisher"),
    ("Issue Date", "issueDate"),
    ("Publisher country", "publisherCountry"),
    ("Language", "language"),
    ("Right access", "rightAccess"),
    ("Appears in Collections", "appearsInCollections"),
]
var_dict = dict(_LABEL_KEY_PAIRS)

def get_pdf(url, timeout=30):
    """Scrape one Pantheon (UFRJ) item page and return its metadata.

    The handle URL is rewritten to the ``pantheon.ufrj.br`` host, the page is
    downloaded, and every label/value row of the ``itemDisplayTable`` table is
    copied into a dictionary keyed by the names in ``var_dict``.

    Args:
        url: Item URL; an ``http://hdl.handle.net/`` prefix is replaced by
            ``https://pantheon.ufrj.br/handle/``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        dict: Metadata values keyed by the ``var_dict`` names. The first
        "Abstract" row is stored as ``abstract`` and any later one as
        ``abstractEnglish``. Rows whose label has no entry in ``var_dict``
        are skipped. ``pdfUrl`` holds the absolute link built from the first
        ``/bitstream/...pdf`` href on the page, or ``None`` if there is none.

    Raises:
        requests.RequestException: On connection problems, timeouts or an
            HTTP error status.
        ValueError: If the page has no ``itemDisplayTable`` table.
    """
    url = url.replace('http://hdl.handle.net/', 'https://pantheon.ufrj.br/handle/')
    # Without a timeout a single stalled connection would block the caller
    # indefinitely.
    response = requests.get(url, timeout=timeout)
    # Fail here on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    # Match on the parsed href attribute rather than on re-serialized HTML:
    # the pattern then cannot run across attribute boundaries and the value
    # comes back with HTML entities already decoded.
    pdf_link = soup.find(href=re.compile(r'^/bitstream/.*\.pdf$'))

    pdf_url = None
    if pdf_link is not None:
        pdf_url = 'https://pantheon.ufrj.br' + pdf_link['href']

    # Report whether a PDF link was found
    if pdf_url:
        print(f"PDF URL: {pdf_url}")
    else:
        print("PDF URL not found.")

    # Locate the metadata table; give a clear error when the page layout is
    # not the expected one.
    table = soup.find('table', class_='itemDisplayTable')
    if table is None:
        raise ValueError(f"itemDisplayTable not found in {url}")

    data_dict = {}
    # The page may list "Abstract" more than once; the first occurrence goes
    # to "abstract", later ones to "abstractEnglish".
    seen_abstract = False

    for row in table.find_all('tr'):
        label = row.find('td', class_='metadataFieldLabel')
        value = row.find('td', class_='metadataFieldValue')
        if label is None or value is None:
            continue

        label_text = label.get_text(strip=True).rstrip(':')
        value_text = value.get_text(strip=True)

        if label_text == "Abstract":
            key = "abstractEnglish" if seen_abstract else "abstract"
            seen_abstract = True
        else:
            # An unexpected label must not discard the whole record.
            key = var_dict.get(label_text)
            if key is None:
                continue

        data_dict[key] = value_text

    data_dict["pdfUrl"] = pdf_url
    return data_dict

In [6]:
data = []
# Maximum number of documents to parse.
num = 5000

# Load the thesis metadata index (thesis_meta.json). The file is closed as
# soon as it has been read so it is not held open during the requests below.
# NOTE(review): assumes the file is UTF-8 encoded - confirm.
with open('thesis_meta.json', encoding='utf-8') as meta_file:
    theses = json.load(meta_file)

for tese in theses:
    if len(data) >= num:
        break
    url = None
    try:
        url = tese['urls'][0]
        info = get_pdf(url)
        print(tese['subjectsPOR'])
        # Each subject is taken as the first element of its entry.
        info['subjectsPOR'] = [subjectPOR[0] for subjectPOR in tese['subjectsPOR']]
        # Replace the scraped keywords with the comma-joined subject list.
        info['keywords'] = ','.join(info['subjectsPOR'])
        info['subjectCnpq'] = info['subjectCnpq'].replace('::', ',')
        data.append(info)
    except Exception as exc:
        # Catch Exception rather than everything so that Ctrl+C / kernel
        # interrupt still stops the loop, and show what failed and why.
        print(f"Erro ao obter dados do pdf ({url}): {exc!r}")
        continue

# Write the parsed records as JSON. ensure_ascii=True keeps the output pure
# ASCII (non-ASCII characters are written as \uXXXX escapes).
with open('parsed_thesis.json', 'w', encoding='utf-8') as out_file:
    json.dump(data, out_file, indent=2, ensure_ascii=True)


PDF URL: https://pantheon.ufrj.br/bitstream/11422/5160/1/613494.pdf
[['Arte brasileira']]
PDF URL: https://pantheon.ufrj.br/bitstream/11422/5161/1/612585.pdf
[['Natureza morta (Pintura)'], ['Pintura brasileira'], ['Séc. XIX']]
PDF URL: https://pantheon.ufrj.br/bitstream/11422/5169/1/455986.pdf
[['História do Brasil'], ['Salão'], ['Estado histórico']]
PDF URL: https://pantheon.ufrj.br/bitstream/11422/5187/3/712842.pdf
[['Estrelas'], ['Gêmeas solares'], ['Composição química']]
PDF URL: https://pantheon.ufrj.br/bitstream/11422/5353/1/186440.pdf
[['Energia elétrica'], ['Demanda energética']]
PDF URL: https://pantheon.ufrj.br/bitstream/11422/5354/1/535285.pdf
[['Escolas'], ['Arquitetura'], ['Rio de Janeiro (RJ)']]
PDF URL: https://pantheon.ufrj.br/bitstream/11422/11462/1/LuizaDeMesquitaOrtiz.pdf
[['Posicionamento de Manifold Submarino'], ['Maximização da Produção'], ['Variação da Produção no Tempo']]
PDF URL: https://pantheon.ufrj.br/bitstream/11422/11465/3/AlineHelenaDelfinoSteffens.pdf
[[