# Revista de Contabilidade do Mestrado em Ciências Contábeis

## Bibliotecas importadas

In [1]:
import requests
from bs4 import BeautifulSoup

import pandas as pd

## Funções básicas

In [2]:
def criar_query(key_word):
    """Build the search query string for a keyword phrase.

    The phrase is split on any run of whitespace and the resulting terms
    are joined back together with ``+`` so they can be substituted into
    the search URL.
    """
    termos = key_word.split()
    query = "+".join(termos)
    return query

In [41]:
def listar_links_pagina(url_search):
    """Return the article links found on one page of search results.

    Downloads ``url_search``, looks for the ``<table class="listing">``
    element and collects the ``href`` of every anchor inside it whose text
    contains ``'Resumo'``.

    Args:
        url_search: Full URL of a single search-results page.

    Returns:
        A list of ``href`` strings in page order. The list is empty when
        the page has no results table or no matching anchors.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # A timeout keeps one stalled response from hanging the whole scrape.
    page = requests.get(url_search, timeout=30)
    soup = BeautifulSoup(page.content, 'html.parser')
    tbl_results = soup.find('table', {'class': 'listing'})
    if tbl_results is None:
        # No results table on this page: report "no links" instead of
        # failing with AttributeError on None.
        return []
    # href=True skips anchors without an href attribute, which would
    # otherwise raise KeyError on a['href'].
    return [
        a['href']
        for a in tbl_results.find_all('a', href=True)
        if 'Resumo' in a.text
    ]



In [42]:
def listar_links(url_search_text, query, pg_num=1):
    """Collect article links from every results page of a search.

    Args:
        url_search_text: URL template containing the placeholders
            ``__query__`` and ``__pg_num__``.
        query: Query string substituted for ``__query__``.
        pg_num: Number of the first results page to request.

    Returns:
        A list with the links of all pages, in the order they were
        returned by ``listar_links_pagina``.
    """
    a_links = []
    vistos = set()
    while True:
        url_search = (
            url_search_text
            .replace('__query__', query)
            .replace('__pg_num__', str(pg_num))
        )
        tmp = listar_links_pagina(url_search)
        # Stop on an empty page. Also stop when a page contributes nothing
        # new: if the server answers out-of-range page numbers with results
        # that were already seen, the loop would otherwise never end.
        if not tmp or vistos.issuperset(tmp):
            break
        vistos.update(tmp)
        a_links += tmp
        pg_num += 1
    return a_links



In [43]:
# Search-URL template; `__query__` and `__pg_num__` are placeholders that
# listar_links() replaces with the query string and the page number.
url_search_text = 'http://www.atena.org.br/revista/ojs-2.2.3-08/index.php/UERJ/search/advancedResults?query=__query__&searchJournal=&author=&title=&fullText=&supplementaryFiles=&discipline=&subject=&type=&coverage=&dateFromMonth=&dateFromDay=&dateFromYear=2012&dateToMonth=&dateToDay=&dateToYear=2021&searchPage=__pg_num__#results'
# Exploratory run: collect every result link for one sample query.
links = listar_links(url_search_text, 'contabilidade+ambiental', pg_num=1)

# Keep the first result as the sample article used by the cells below.
link = links[0]

# Bare expression so the notebook displays the selected URL.
link

'http://www.atena.org.br/revista/ojs-2.2.3-08/index.php/UERJ/article/view/1398'

In [6]:
links

['http://www.atena.org.br/revista/ojs-2.2.3-08/index.php/UERJ/article/view/1398',
 'http://www.atena.org.br/revista/ojs-2.2.3-08/index.php/UERJ/article/view/3360',
 'http://www.atena.org.br/revista/ojs-2.2.3-08/index.php/UERJ/article/view/1402',
 'http://www.atena.org.br/revista/ojs-2.2.3-08/index.php/UERJ/article/view/2952',
 'http://www.atena.org.br/revista/ojs-2.2.3-08/index.php/UERJ/article/view/2953',
 'http://www.atena.org.br/revista/ojs-2.2.3-08/index.php/UERJ/article/view/3549',
 'http://www.atena.org.br/revista/ojs-2.2.3-08/index.php/UERJ/article/view/3415',
 'http://www.atena.org.br/revista/ojs-2.2.3-08/index.php/UERJ/article/view/3063',
 'http://www.atena.org.br/revista/ojs-2.2.3-08/index.php/UERJ/article/view/3623',
 'http://www.atena.org.br/revista/ojs-2.2.3-08/index.php/UERJ/article/view/3568',
 'http://www.atena.org.br/revista/ojs-2.2.3-08/index.php/UERJ/article/view/3610']

In [44]:
# Download and parse the sample article page.
page = requests.get(link)
soup = BeautifulSoup(page.content, 'html.parser')

# Read the `src` of the first <frame> element, then download and parse that
# document instead; `link`, `page` and `soup` are rebound, so the capture
# calls in the following cells operate on this second document.
link = soup.find('frame')['src']
page = requests.get(link)
soup = BeautifulSoup(page.content, 'html.parser')

In [45]:
def completar_lista(lista, tamanho=6):
    """Pad ``lista`` in place with empty strings up to ``tamanho`` items.

    Args:
        lista: List to pad. It is modified in place.
        tamanho: Minimum number of items the list must have. Defaults to
            6, the number of author/affiliation columns used in this file.

    Returns:
        The same list object. Lists that already have ``tamanho`` or more
        items are returned unchanged (they are never truncated).
    """
    faltam = tamanho - len(lista)
    if faltam > 0:
        lista += [''] * faltam
    return lista

In [46]:
def captura_titulo(soup):
    """Extract the article title from a parsed article page.

    The title is the text of the first ``<h3>`` inside
    ``<div id="content">``.

    Args:
        soup: Parsed page exposing BeautifulSoup's ``find`` API.

    Returns:
        ``{'Título': title}``; the title is ``''`` when either element is
        missing.
    """
    try:
        title_tag = soup.find('div', {'id': 'content'})
        title = title_tag.find('h3').text
    except AttributeError:
        # find() returned None for the div or the h3. Only this case is
        # handled, so KeyboardInterrupt and real bugs are not swallowed.
        title = ''
    return {'Título': title}

In [48]:
captura_titulo(soup)

{'Título': 'UMA REFLEXÃO SOBRE AS OPORTUNIDADES PARA A CONTABILIDADE AMBIENTAL'}

In [49]:
def captura_data(soup):
    """Extract the publication year from the page breadcrumb.

    Reads the text of the second ``<a>`` inside ``<div id="breadcrumb">``
    and takes the four characters that precede its last character (the
    code expects the text to end in something like ``(2012)``).

    Args:
        soup: Parsed page exposing BeautifulSoup's ``find`` API.

    Returns:
        ``{'Data de Publicação': year}`` where ``year`` is a four-digit
        string, or ``''`` when the breadcrumb is missing, has fewer than
        two links, or does not end in a four-digit group.
    """
    try:
        year_tags = soup.find('div', {'id': 'breadcrumb'})
        year_tag = year_tags.find_all('a')
        # Strip first so trailing whitespace does not shift the slice.
        year_text = year_tag[1].text.strip()
    except (AttributeError, IndexError):
        # Breadcrumb div missing, or fewer than two links inside it.
        return {'Data de Publicação': ''}

    year = year_text[-5:-1]
    if not (len(year) == 4 and year.isdigit()):
        # Do not hand an arbitrary text fragment to the caller as a year.
        year = ''
    return {'Data de Publicação': year}

In [50]:
captura_data(soup)

{'Data de Publicação': '2012'}

In [51]:
def captura_resumo(soup):
    """Extract the article abstract from a parsed article page.

    The abstract is the text of the first ``<div>`` nested inside
    ``<div id="articleAbstract">``.

    Args:
        soup: Parsed page exposing BeautifulSoup's ``find`` API.

    Returns:
        ``{'Resumo': abstract}``; the abstract is ``''`` when either
        element is missing.
    """
    try:
        abstract_tag = soup.find('div', {'id': 'articleAbstract'})
        abstract = abstract_tag.find('div').text
    except AttributeError:
        # find() returned None for the outer or the inner div.
        abstract = ''
    return {'Resumo': abstract}

In [52]:
captura_resumo(soup)

{'Resumo': ''}

In [53]:
def captura_palavras_chave(soup):
    """Extract the article keywords from a parsed article page.

    Reads the text of the first ``<div>`` nested inside
    ``<div id="articleSubject">`` and replaces every ``,`` and ``;`` with
    ``.`` so all keywords use the same separator.

    Args:
        soup: Parsed page exposing BeautifulSoup's ``find`` API.

    Returns:
        ``{'Palavras-chave': keywords}``; the value is ``''`` when either
        element is missing.
    """
    try:
        kw_tag = soup.find('div', {'id': 'articleSubject'})
        kw = kw_tag.find('div').text.replace(',', '.').replace(';', '.')
    except AttributeError:
        # find() returned None for the outer or the inner div.
        kw = ''
    return {'Palavras-chave': kw}

In [17]:
captura_palavras_chave(soup)

{'Palavras-chave': ''}

In [18]:
def captura_autores(soup):
    """Extract the author names and their count from an article page.

    The names are read from the ``<em>`` inside
    ``<div id="authorString">`` and split on ``', '``.

    Args:
        soup: Parsed page exposing BeautifulSoup's ``find`` API.

    Returns:
        A dict with the key ``'Qtd. de Autores'`` (number of names found)
        followed by ``'Autor 1'`` .. ``'Autor 6'``. Missing positions hold
        ``''``. Names beyond the sixth are not included in the dict,
        although they are still counted.
    """
    try:
        authors_tag = soup.find('div', {'id': 'authorString'})
        texto = authors_tag.find('em').text
    except AttributeError:
        # No author block (or no <em>) on this page.
        texto = ''

    # Discard blank pieces: ''.split(', ') yields [''], which would be
    # counted as one (empty) author.
    autores = [nome.strip() for nome in texto.split(', ') if nome.strip()]

    qtd_autores = len(autores)
    autores = completar_lista(autores)
    autores = [qtd_autores] + autores

    key_autores = [f"Autor {i}" for i in range(1, 7)]
    key_autores = ['Qtd. de Autores'] + key_autores

    # zip() stops at the shorter sequence, i.e. at the seven keys.
    return dict(zip(key_autores, autores))

In [19]:
# Exploratory copy of the lookup done inside captura_autores(), without the
# try/except. For the sample page this cell ended in the AttributeError
# shown in its output: one of the two find() calls returned None.
authors_tag = soup.find('div', {'id': 'authorString'})
authors_full = authors_tag.find('em')
autores = authors_full.text.split(', ')

# Bare expression so the notebook displays the parsed names.
autores

AttributeError: 'NoneType' object has no attribute 'find'

In [None]:
captura_autores(soup)

In [None]:
def captura_afiliacao(soup):
    """Extract up to six author affiliations from an article page.

    For every ``<div class="authorBio">`` the affiliation is taken as the
    node that follows the second-to-last ``<br>``, with tab characters
    removed.

    Args:
        soup: Parsed page exposing BeautifulSoup's ``find_all`` API.

    Returns:
        A dict with the keys ``'Afilição 1'`` .. ``'Afilição 6'``. A
        position holds ``''`` when there is no such author or when the
        affiliation could not be read from that author's block.
    """
    try:
        bios = soup.find_all('div', {'class': 'authorBio'})
    except AttributeError:
        bios = []

    affiliation_list = []
    for bio in bios:
        try:
            affiliation = bio.find_all('br')[-2].next_sibling
            affiliation = affiliation.replace('\t', '')
        except (IndexError, AttributeError, TypeError):
            # Fewer than two <br> tags, no following node, or the
            # following node is not text.
            affiliation = ''
        affiliation_list.append(affiliation)

    affiliation_list = completar_lista(affiliation_list)
    # NOTE(review): 'Afilição' looks like a typo for 'Afiliação'. It is kept
    # as is because it becomes a column name in the exported data.
    key_affiliation = [f"Afilição {i}" for i in range(1, 7)]

    return dict(zip(key_affiliation, affiliation_list))

In [None]:
captura_afiliacao(soup)

In [None]:
def realizar_consulta(key_words_list):
    """Search the journal for each keyword and scrape the matching articles.

    Args:
        key_words_list: Iterable of keyword phrases, e.g.
            ``['contabilidade ambiental', 'balanço social']``.

    Returns:
        A ``pandas.DataFrame`` with one row per article (publication date,
        title, abstract, keywords, authors, affiliations, article URL and
        journal name). ``'Data de Publicação'`` is converted to datetime.
        When nothing was scraped, an empty DataFrame is returned.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    periodico_nome_dict = {'Revista': 'Revista de Contabilidade do Mestrado em Ciências Contábeis'}
    url_search_text = 'http://www.atena.org.br/revista/ojs-2.2.3-08/index.php/UERJ/search/advancedResults?query=__query__&searchJournal=&author=&title=&fullText=&supplementaryFiles=&discipline=&subject=&type=&coverage=&dateFromMonth=&dateFromDay=&dateFromYear=2012&dateToMonth=&dateToDay=&dateToYear=2021&searchPage=__pg_num__#results'
    # Entries whose title contains one of these fragments are skipped.
    titulos_ignorados = ('Pré-textual', 'CAP Accounting and Management')

    a_links = []
    for key_word in key_words_list:
        query = criar_query(key_word)
        a_links += listar_links(url_search_text, query)
    # An article can match several keywords; keep only its first occurrence
    # so it is downloaded and reported once.
    a_links = list(dict.fromkeys(a_links))

    resultados = []
    for link in a_links:
        page = requests.get(link, timeout=30)
        soup = BeautifulSoup(page.content, 'html.parser')

        # Parse the title once and reuse it for the filter and the row.
        titulo_dict = captura_titulo(soup)
        titulo = titulo_dict['Título']
        if any(trecho in titulo for trecho in titulos_ignorados):
            continue

        resultados.append({
            **captura_data(soup),
            **titulo_dict,
            **captura_resumo(soup),
            **captura_palavras_chave(soup),
            **captura_autores(soup),
            **captura_afiliacao(soup),
            'Url Artigo': link,
            **periodico_nome_dict,
        })

    df = pd.DataFrame(resultados)
    if not resultados:
        # An empty frame has no 'Data de Publicação' column to convert;
        # indexing it would raise KeyError.
        return df
    df['Data de Publicação'] = pd.to_datetime(df['Data de Publicação'], format='%Y')
    return df

## Pesquisa por artigos

In [None]:
# Keyword phrases to search for; realizar_consulta() turns each one into a
# query string and scrapes every article returned for it.
key_words_list = ['contabilidade ambiental', 'balanço social', 'relato integrado', 'nbct 15']
df = realizar_consulta(key_words_list)
# Preview the first rows of the scraped data.
df.head()

In [None]:
# Save the results as CSV, without the DataFrame index column, at a path
# relative to the notebook's working directory.
df.to_csv('../data/RCMCC_UERJ.csv', index=False)

In [None]:
df