## Obtenção de dados

Scraping do repositório da UFPE

### Requirements

In [14]:
import json
import os
from pathlib import Path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup, Tag

### Funções Auxiliares

In [9]:
def read_page_get_links(keywords: str) -> list[str]:
    """
    Search the UFPE repository and return the item links found on the first results page.

    Args:
        keywords: Free-text query, e.g. "machine learning". Spaces and special
            characters are URL-encoded by ``requests``.

    Returns:
        The ``href`` values of the selected result anchors, in page order.
        An empty list is returned when the results page contains no table.

    Raises:
        requests.HTTPError: if the repository answers with a 4xx/5xx status.
        requests.RequestException: on connection problems or timeout.
    """
    # Passing the query through ``params`` lets requests percent-encode it, so
    # characters such as "&" or "#" no longer corrupt the query string
    # (spaces are still sent as "+", "/" as "%2F").
    response = requests.get(
        "https://repositorio.ufpe.br/simple-search",
        params={"location": "123456789/50", "query": keywords},
        timeout=30,
    )
    response.raise_for_status()

    soup = BeautifulSoup(response.text.strip(), 'html.parser')
    tables = soup.find_all(name='table')
    if not tables:
        # No results table on the page: nothing to collect.
        return []

    anchors = tables[0].find_all(name='a')

    # Only every other anchor is kept, starting with the first one.
    # NOTE(review): presumably each result row contributes two anchors and the
    # first is the item link -- confirm against the page markup.
    # TODO: the original author flagged the single-result case
    # ("If query return 1 break"); behaviour there is unverified.
    return [anchor["href"] for anchor in anchors[::2]]

In [10]:
def _last_cell_content(info_table: "Tag", css_class: str):
    """Return the first child of the last ``<td class=css_class>`` cell, or None if there is none."""
    cells = info_table.find_all(name='td', attrs={'class': css_class})
    if not cells or not cells[-1].contents:
        return None
    return cells[-1].contents[0]


def read_table_get_title_and_abstract(info_table: "Tag") -> dict:
    """
    Extract the title and abstract from an item's HTML metadata table.

    Args:
        info_table: The parsed table element (anything exposing bs4's
            ``find_all``), not a list of strings.

    Returns:
        A dict with the keys ``'title'`` and ``'abstract'``. Each value is the
        first child of the last matching ``dc_title`` /
        ``dc_description_abstract`` cell, or ``None`` when the table has no
        such cell (or the cell is empty), so one incomplete record no longer
        raises ``IndexError``.
    """
    return {
        'title': _last_cell_content(info_table, 'dc_title'),
        'abstract': _last_cell_content(info_table, 'dc_description_abstract'),
    }

### Scraping

In [12]:
# Free-text search terms; passed to read_page_get_links in the next cell.
query = "aquecimento global"

In [None]:
# Collect the item links from the first results page for `query`.
urls = read_page_get_links(query)

In [None]:
# Notebook display: show the collected links.
urls

In [65]:
# Download every item page and keep its metadata table (class "itemDisplayTable").
info_table = []

for url in urls:
    # urljoin avoids a doubled slash when the href is an absolute path and
    # leaves fully-qualified URLs untouched.
    item_url = urljoin('https://repositorio.ufpe.br/', url)
    response = requests.get(item_url, allow_redirects=True, timeout=30)
    if not response.ok:
        # Best-effort scraping: report the failure and move on to the next item.
        print(f"Skipping {item_url}: HTTP {response.status_code}")
        continue

    soup = BeautifulSoup(response.text.strip(), 'html.parser')
    tags_table = soup.find_all(name='table', attrs={'class': 'itemDisplayTable'})
    if not tags_table:
        # Page without the expected metadata table; indexing [0] would raise IndexError.
        print(f"Skipping {item_url}: no itemDisplayTable found")
        continue

    info_table.append(tags_table[0])

In [66]:
# Notebook display: number of metadata tables collected by the loop above.
len(info_table)

10

### Saving files

In [67]:
# Write one JSON file per scraped item to ./output/<query>/thesis<N>.json (N starts at 1).
path = Path("./output")

# NOTE: this rebinds the global `query` to its underscore form; re-run the cell
# that defines `query` before searching again.
query = query.replace(" ", "_")

# Single source of truth for the destination folder, so directory creation and
# file writing can never point at different places.
query_dir = path / query
query_dir.mkdir(parents=True, exist_ok=True)  # creates output/ and output/<query>/ if missing

for counter, table in enumerate(info_table, start=1):
    output = read_table_get_title_and_abstract(table)

    with open(query_dir / f"thesis{counter}.json", 'w', encoding="utf-8") as json_file:
        json.dump(output, json_file)  # title and abstract stored as a JSON object