# BASE (UNISINOS)

## Bibliotecas importadas

In [1]:
import requests
from bs4 import BeautifulSoup

import pandas as pd

## Funções básicas

In [2]:
def criar_query(key_word):
    """Turn a free-text search phrase into a ``+``-separated query string.

    The phrase is split on any run of whitespace, so leading, trailing and
    repeated blanks never produce empty terms.
    """
    termos = key_word.split()
    query = "+".join(termos)
    return query

In [3]:
def listar_links_pagina(url_search):
    """Return the article links found on one search-results page.

    Downloads ``url_search``, looks at every ``<h2 class="title">`` element
    and collects the ``href`` of its first ``<a>``. Entries whose link text
    contains one of the front-matter markers below are left out, as are
    headings without an ``<a>`` or without an ``href`` attribute.

    Args:
        url_search: Fully-formed URL of a search-results page.

    Returns:
        A list of ``href`` strings, in page order; empty when nothing matches.
    """
    # Markers are compared against the lower-cased link text. The trailing
    # space in the last one is kept as in the original code.
    # NOTE(review): confirm that trailing space is intentional.
    termos_ignorados = ('editorial', 'folha de rosto', 'dados da edição ')

    page = requests.get(url_search)
    soup = BeautifulSoup(page.content, 'html.parser')

    a_links = []
    for item in soup.find_all('h2', {'class': 'title'}):
        anchor = item.find('a')
        if anchor is None:
            # Heading without a link: nothing to collect.
            continue
        titulo = anchor.text.lower()
        if any(termo in titulo for termo in termos_ignorados):
            continue
        href = anchor.get('href')
        if href is not None:
            a_links.append(href)
    return a_links

In [4]:
def listar_links(url_search_text, query, pg_num=1):
    """Collect article links from consecutive result pages of one search.

    ``url_search_text`` is a URL template in which ``__query__`` is replaced
    by ``query`` and ``__pg_num__`` by the current page number. Pages are
    fetched with ``listar_links_pagina`` starting at ``pg_num`` and advancing
    by one until a page yields no links; the links of all visited pages are
    returned in order.
    """
    # The query part is the same for every page, so fill it in once.
    url_com_query = url_search_text.replace('__query__', query)

    todos_links = []
    links_da_pagina = listar_links_pagina(
        url_com_query.replace('__pg_num__', str(pg_num))
    )
    while len(links_da_pagina) != 0:
        todos_links += links_da_pagina
        pg_num += 1
        links_da_pagina = listar_links_pagina(
            url_com_query.replace('__pg_num__', str(pg_num))
        )
    return todos_links

In [5]:
# Search URL template; '__query__' and '__pg_num__' are placeholders that
# listar_links replaces for each request.
url_search_text = 'http://revistas.unisinos.br/index.php/base/search/search?query=__query__&searchJournal=3&authors=&title=&abstract=&galleyFullText=&discipline=&subject=&type=&coverage=&indexTerms=&dateFromMonth=01&dateFromDay=1&dateFromYear=2012&dateToMonth=12&dateToDay=31&dateToYear=2021&orderBy=score&orderDir=desc&searchPage=__pg_num__#results'

# Collect the article links for one sample query, starting at page 1.
links = listar_links(url_search_text, 'contabilidade+ambiental', pg_num=1)

# Keep the first hit as a sample article.
# NOTE(review): raises IndexError when the search returns no links.
link = links[0]

# Show the sample link.
link


'http://revistas.unisinos.br/index.php/base/article/view/19555'

In [6]:
# Show every link collected for the sample query.
links

['http://revistas.unisinos.br/index.php/base/article/view/19555']

In [7]:
# Download the sample article page and parse it; `soup` is the document that
# the captura_* calls in the following cells are tried against.
page = requests.get(link)
soup = BeautifulSoup(page.content, 'html.parser')

In [8]:
def completar_lista(lista, tamanho=6):
    """Pad ``lista`` with empty strings until it has ``tamanho`` items.

    The list is extended in place and the same object is returned. Lists
    that already hold ``tamanho`` or more items are returned unchanged (they
    are never truncated).

    Args:
        lista: List to pad; modified in place.
        tamanho: Minimum length of the result. Defaults to 6.

    Returns:
        ``lista`` itself, now with at least ``tamanho`` items.
    """
    faltam = tamanho - len(lista)
    if faltam > 0:
        lista += faltam * ['']
    return lista

In [9]:
def limpar_afiliacao(afiliacao):
    """Reduce an affiliation string to its leading part.

    Whitespace runs are collapsed to single spaces, then everything from the
    first ``,``, the first `` - ``, the first ``(`` and the first ``/`` onward
    is discarded, in that order. The result is stripped of surrounding blanks.
    """
    texto = ' '.join(afiliacao.split())
    # Order matters: each cut operates on what the previous one left behind.
    for separador in (',', ' - ', '(', '/'):
        texto, _, _ = texto.partition(separador)
    return texto.strip()

In [10]:
def captura_titulo(soup):
    """Extract the article title from a parsed article page.

    Reads the text of the first ``<h1 class="page_title">`` element and
    collapses all whitespace runs to single spaces.

    Args:
        soup: Parsed article page exposing a BeautifulSoup-style ``find``.

    Returns:
        ``{'Título': <title>}``; the title is ``''`` when the element is
        missing.
    """
    try:
        title = soup.find('h1', {'class': 'page_title'}).text
    except AttributeError:
        # find() returned None: the page has no title element.
        return {'Título': ''}
    return {'Título': ' '.join(title.split())}

In [11]:
# Try the title extractor on the sample article page parsed above.
captura_titulo(soup)

{'Título': 'The Impact of Product Diversity on the Relationship Between Internationalization and Performance of Brazilian Multinational Companies'}

In [12]:
def captura_data(soup):
    """Extract the publication date from a parsed article page.

    Looks for ``<div class="item published">`` and, inside it, for
    ``<div class="value">``; the text found there is whitespace-normalized
    and cut to its first 10 characters.

    Args:
        soup: Parsed article page exposing a BeautifulSoup-style ``find``.

    Returns:
        ``{'Data de Publicação': <date>}``; the date is ``''`` when either
        element is missing.
    """
    try:
        date_tag = soup.find('div', {'class': 'item published'})
        date = date_tag.find('div', {'class': 'value'}).text
    except AttributeError:
        # One of the find() calls returned None: no date on this page.
        date = ''
    else:
        date = ' '.join(date.split())
    # Only the first 10 characters are kept (e.g. '2021-08-17').
    return {'Data de Publicação': date[:10]}

In [13]:
# Try the publication-date extractor on the sample article page.
captura_data(soup)

{'Data de Publicação': '2021-08-17'}

In [14]:
def captura_resumo(soup):
    """Extract the abstract text from a parsed article page.

    Reads the text of ``<section class="item abstract">``, removes the
    ``Abstract:`` / ``Abstract`` label and strips surrounding whitespace.
    Inner whitespace (line breaks included) is left as it is.

    Args:
        soup: Parsed article page exposing a BeautifulSoup-style ``find``.

    Returns:
        ``{'Resumo': <abstract>}``; the abstract is ``''`` when the section
        is missing.
    """
    try:
        abstract = soup.find('section', {'class': 'item abstract'}).text
    except AttributeError:
        # find() returned None: the page has no abstract section.
        abstract = ''
    else:
        # NOTE: str.replace drops every occurrence of the label, not only
        # the leading heading.
        abstract = abstract.replace('Abstract:', '')
        abstract = abstract.replace('Abstract', '').strip()
    return {'Resumo': abstract}

In [15]:
# Try the abstract extractor on the sample article page.
captura_resumo(soup)

{'Resumo': 'The aim of this study was to investigate the moderating role of product diversification in the relationship between internationalization and the performance of Brazilian multinational companies. This study used internationalization data collected in reports from Fundação Dom Cabral, financial data collected in the Economática database, in addition to secondary data from 33 Brazilian companies collected on the website of the Brazilian Securities and Exchange Commission (CVM). The work refers to an eight-year period (2010 to 2017), and used multiple linear regression with fixed effects and panel data. It was identified that the relationship between internationalization and performance of multinational Brazilian companies occurs in a U shape. Furthermore, a negative moderating role was identified in the relationship between product diversification and low-level internationalization, and a positive moderating effect on the performance of Brazilian companies internationalized wi

In [16]:
def _normalizar_palavras_chave(texto):
    """Collapse whitespace and turn ``,`` / ``;`` separators into ``.``."""
    texto = ' '.join(texto.split())
    return texto.replace(',', '.').replace(';', '.')


def captura_palavras_chave(soup):
    """Extract the keywords from a parsed article page.

    First choice is the text of ``<section class="item keywords">`` with the
    ``Keywords:`` label removed. When that yields nothing, the ``<p>``
    elements of ``<section class="item abstract">`` are scanned for one
    containing ``Palavras-chave``; if several match, the last one wins.

    Args:
        soup: Parsed article page exposing a BeautifulSoup-style ``find``.

    Returns:
        ``{'Palavras-chave': <keywords>}`` with the keywords separated by
        ``.``; ``''`` when none are found.
    """
    try:
        kw_section = soup.find('section', {'class': 'item keywords'})
        kw = _normalizar_palavras_chave(kw_section.text.replace('Keywords:', ''))
    except AttributeError:
        # find() returned None: there is no dedicated keywords section.
        kw = ''

    if kw == '':
        try:
            abstract_section = soup.find('section', {'class': 'item abstract'})
            for p in abstract_section.find_all('p'):
                if 'Palavras-chave' in p.text:
                    sem_rotulo = p.text.replace('Palavras-chave', '').replace(':', '')
                    kw = _normalizar_palavras_chave(sem_rotulo)
        except AttributeError:
            # No abstract section either; keep whatever was found so far.
            pass
    return {'Palavras-chave': kw}

In [17]:
# Try the keyword extractor on the sample article page.
captura_palavras_chave(soup)

{'Palavras-chave': 'Internacionalization. Performance. Product diversification. Role moderator'}

In [18]:
def captura_autores(soup):
    """Extract the author names from a parsed article page.

    Reads every ``<span class="name">`` inside ``<ul class="authors">`` and
    normalizes the whitespace of each name.

    Args:
        soup: Parsed article page exposing a BeautifulSoup-style ``find``.

    Returns:
        A dict with ``'Qtd. de Autores'`` (number of names found) followed by
        ``'Autor 1'`` .. ``'Autor 6'``. Missing positions hold ``''``; names
        beyond the sixth are counted but not included.
    """
    try:
        authors_ul = soup.find('ul', {'class': 'authors'})
        name_tags = authors_ul.find_all('span', {'class': 'name'})
    except AttributeError:
        # find() returned None: the page has no authors list.
        name_tags = []

    autores = [' '.join(tag.text.split()) for tag in name_tags]
    # Count before padding so the figure reflects the real number of authors.
    qtd_autores = len(autores)
    valores = [qtd_autores] + completar_lista(autores)

    chaves = ['Qtd. de Autores'] + [f"Autor {i}" for i in range(1, 7)]

    # zip() stops at the shorter sequence, which drops any name past the sixth.
    return dict(zip(chaves, valores))

In [19]:
# Try the author extractor on the sample article page.
captura_autores(soup)

{'Qtd. de Autores': 3,
 'Autor 1': 'Vitor Fonseca Machado Beling Dias',
 'Autor 2': 'Antônio Sergio Torres Penedo',
 'Autor 3': 'Vinicius Silva Pereira',
 'Autor 4': '',
 'Autor 5': '',
 'Autor 6': ''}

In [20]:
def captura_afiliacao(soup):
    """Extract the author affiliations from a parsed article page.

    Reads every ``<span class="affiliation">`` inside ``<ul class="authors">``
    and shortens each one with ``limpar_afiliacao``.

    Args:
        soup: Parsed article page exposing a BeautifulSoup-style ``find``.

    Returns:
        A dict with keys ``'Afiliação 1'`` .. ``'Afiliação 6'``. Missing
        positions hold ``''``; affiliations beyond the sixth are dropped.
    """
    try:
        authors_ul = soup.find('ul', {'class': 'authors'})
        affiliation_tags = authors_ul.find_all('span', {'class': 'affiliation'})
    except AttributeError:
        # find() returned None: the page has no authors list.
        affiliation_tags = []

    # limpar_afiliacao collapses whitespace itself, so no pre-normalization.
    # NOTE(review): positions follow the order of the affiliation spans; if an
    # author has no affiliation span, later entries presumably shift left and
    # no longer line up with 'Autor N' — confirm against the page markup.
    afiliacao = [limpar_afiliacao(tag.text) for tag in affiliation_tags]
    afiliacao = completar_lista(afiliacao)

    key_affiliation = [f"Afiliação {i}" for i in range(1, 7)]

    # zip() stops at the shorter sequence, which drops entries past the sixth.
    return dict(zip(key_affiliation, afiliacao))

In [21]:
# Try the affiliation extractor on the sample article page.
captura_afiliacao(soup)

{'Afiliação 1': 'Universidade Federal de Uberlândia',
 'Afiliação 2': 'Universidade Federal de Uberlândia',
 'Afiliação 3': 'Universidade Federal de Uberlândia',
 'Afiliação 4': '',
 'Afiliação 5': '',
 'Afiliação 6': ''}

In [22]:
def realizar_consulta(key_words_list):
    """Search the journal for each phrase and scrape every article found.

    Each phrase is turned into a query with ``criar_query`` and searched with
    ``listar_links``; every resulting article page is downloaded and passed
    through the ``captura_*`` extractors.

    Args:
        key_words_list: Iterable of free-text search phrases.

    Returns:
        A ``pandas.DataFrame`` with one row per collected link, holding the
        extracted fields plus ``'Url Artigo'`` and ``'Revista'``. The
        ``'Data de Publicação'`` column is converted to datetime. When no
        link is found, an empty DataFrame (without columns) is returned.
    """
    periodico_nome_dict = {'Revista': 'BASE (UNISINOS)'}
    url_search_text = 'http://revistas.unisinos.br/index.php/base/search/search?query=__query__&searchJournal=3&authors=&title=&abstract=&galleyFullText=&discipline=&subject=&type=&coverage=&indexTerms=&dateFromMonth=01&dateFromDay=1&dateFromYear=2012&dateToMonth=12&dateToDay=31&dateToYear=2021&orderBy=score&orderDir=desc&searchPage=__pg_num__#results'

    # NOTE(review): an article matched by more than one phrase is collected
    # (and scraped) once per phrase — confirm whether duplicates are wanted.
    a_links = []
    for key_word in key_words_list:
        query = criar_query(key_word)
        a_links += listar_links(url_search_text, query)

    resultados = []
    for link in a_links:
        page = requests.get(link)
        soup = BeautifulSoup(page.content, 'html.parser')
        resultados.append({
            **captura_data(soup),
            **captura_titulo(soup),
            **captura_resumo(soup),
            **captura_palavras_chave(soup),
            **captura_autores(soup),
            **captura_afiliacao(soup),
            'Url Artigo': link,
            **periodico_nome_dict,
        })

    df = pd.DataFrame(resultados)
    if not resultados:
        # A frame built from no rows has no columns, so the date conversion
        # below would raise KeyError; hand back the empty frame instead.
        return df
    df['Data de Publicação'] = pd.to_datetime(df['Data de Publicação'], format='%Y-%m-%d')
    return df

## Pesquisa por artigos

In [23]:
# Search phrases; realizar_consulta runs one search per phrase.
key_words_list = ['contabilidade ambiental', 'balanço social', 'relato integrado', 'nbct 15']
# Scrape every article found into a single DataFrame.
df = realizar_consulta(key_words_list)
# Preview the first rows.
df.head(3)

Unnamed: 0,Data de Publicação,Título,Resumo,Palavras-chave,Qtd. de Autores,Autor 1,Autor 2,Autor 3,Autor 4,Autor 5,Autor 6,Afiliação 1,Afiliação 2,Afiliação 3,Afiliação 4,Afiliação 5,Afiliação 6,Url Artigo,Revista
0,2021-08-17,The Impact of Product Diversity on the Relatio...,The aim of this study was to investigate the m...,Internacionalization. Performance. Product div...,3,Vitor Fonseca Machado Beling Dias,Antônio Sergio Torres Penedo,Vinicius Silva Pereira,,,,Universidade Federal de Uberlândia,Universidade Federal de Uberlândia,Universidade Federal de Uberlândia,,,,http://revistas.unisinos.br/index.php/base/art...,BASE (UNISINOS)
1,2021-05-25,A model of social audit for university foundat...,The Social Audit is the report that informs ab...,,3,Sady Mazzioni,João Eduardo Prudêncio Tinoco,Antonio Benedito Silva Oliveira,,,,,,,,,,http://revistas.unisinos.br/index.php/base/art...,BASE (UNISINOS)


In [24]:
# Save the results as CSV without the index column.
# NOTE(review): assumes the '../data' directory already exists — confirm.
df.to_csv('../data/BASE_UNISINOS.csv', index=False)

In [25]:
# Show the full result table.
df

Unnamed: 0,Data de Publicação,Título,Resumo,Palavras-chave,Qtd. de Autores,Autor 1,Autor 2,Autor 3,Autor 4,Autor 5,Autor 6,Afiliação 1,Afiliação 2,Afiliação 3,Afiliação 4,Afiliação 5,Afiliação 6,Url Artigo,Revista
0,2021-08-17,The Impact of Product Diversity on the Relatio...,The aim of this study was to investigate the m...,Internacionalization. Performance. Product div...,3,Vitor Fonseca Machado Beling Dias,Antônio Sergio Torres Penedo,Vinicius Silva Pereira,,,,Universidade Federal de Uberlândia,Universidade Federal de Uberlândia,Universidade Federal de Uberlândia,,,,http://revistas.unisinos.br/index.php/base/art...,BASE (UNISINOS)
1,2021-05-25,A model of social audit for university foundat...,The Social Audit is the report that informs ab...,,3,Sady Mazzioni,João Eduardo Prudêncio Tinoco,Antonio Benedito Silva Oliveira,,,,,,,,,,http://revistas.unisinos.br/index.php/base/art...,BASE (UNISINOS)
