## Camjol

Mar. 2019

Scraper for the [Central American Journals Online](https://camjol.info/) repository.

In [2]:
import requests
from lxml import html
import json

## List of Journals

In [26]:
url_base = 'https://camjol.info/index.php/index'
# Fetch the index page. The timeout keeps a stalled connection from hanging
# the notebook, and raise_for_status() stops an HTTP error page from being
# saved and parsed as if it were the journal index.
page = requests.get(url_base, timeout=30)
page.raise_for_status()

In [27]:
# Keep a local copy of the index page, then parse the bytes read back from it
with open('index.html', 'wb') as out_file:
    out_file.write(page.content)
with open('index.html', 'rb') as in_file:
    content = in_file.read()
data = html.fromstring(content)

In [28]:
# Select every <div class="body"> element of the index page; each one is
# treated as a single journal entry by the cells below
journals = data.xpath('//div[@class="body"]')

In [29]:
# Report how many journal entries were matched on the index page
print("Number of journals: {:d}".format(len(journals)))

Number of journals: 49


In [30]:
def journal(idx):
    """Build the summary record of the journal at position ``idx``.

    Reads the ``idx``-th element of the notebook-level ``journals`` list and
    returns a dict with three keys: ``url`` (the ``href`` of its ``h3/a``
    link), ``name`` (the link's first text node, stripped) and ``desc`` (the
    first text node of the description paragraph, or ``''`` if there is none).
    """
    entry = journals[idx]
    link = entry.xpath('h3/a')[0]
    paragraphs = entry.xpath('div[@class="description"]/p/text()')
    return {
        'url': link.xpath('@href')[0],
        'name': link.xpath('text()')[0].strip(),
        # The description is optional; fall back to an empty string
        'desc': paragraphs[0] if paragraphs else '',
    }

In [31]:
# Turn every matched journal element into a plain dict record
jrnls = list(map(journal, range(len(journals))))

In [32]:
# Spot check: display one parsed journal record
jrnls[30]

{'url': 'https://camjol.info/index.php/FAREM',
 'name': 'Revista Científica de FAREM-Estelí',
 'desc': 'Revista Científica de la FAREM Estelí es una revista científica multidisciplinaria de publicación electrónicas a partir de 2011. Revista Científica de la FAREM Estelí está indizada en: Google scholar y Latindex'}

## List of issues for each journal

In [33]:
url_base = 'https://camjol.info/index.php/CEIBA/issue/archive'
# Fetch the archive page of one journal as a sample. The timeout prevents an
# indefinite hang, and raise_for_status() keeps an HTTP error page from being
# stored and parsed as if it were the archive.
page = requests.get(url_base, timeout=30)
page.raise_for_status()

In [34]:
# Keep a local copy of the archive page, then parse the bytes read back from it
with open('archive.html', 'wb') as out_file:
    out_file.write(page.content)
with open('archive.html', 'rb') as in_file:
    content = in_file.read()
data = html.fromstring(content)

In [35]:
# Select every <div class="obj_issue_summary"> element of the archive page
issues = data.xpath('//div[@class="obj_issue_summary"]')

In [36]:
def issue(idx, issues):
    """Convert issue-summary elements into plain dictionaries.

    ``idx`` is accepted but not used by the function. For every element of
    ``issues`` the result contains a dict with ``url`` and ``title`` (taken
    from the element's ``a[@class="title"]`` link, title stripped) and ``cod``
    (the stripped first text node of ``div[@class="series"]``, or ``''`` when
    the element has none).
    """
    def summarize(node):
        link = node.xpath('a[@class="title"]')[0]
        series = node.xpath('div[@class="series"]/text()')
        return {
            'url': link.xpath('@href')[0],
            'title': link.xpath('text()')[0].strip(),
            # The series code is optional
            'cod': series[0].strip() if series else '',
        }

    return [summarize(node) for node in issues]

In [37]:
def journal_issues(timeout=30):
    """Fetch and parse the issue archive of every journal in ``jrnls``.

    For each record of the notebook-level ``jrnls`` list, the page at
    ``<record url>/issue/archive`` is downloaded, its
    ``div[@class="obj_issue_summary"]`` elements are handed to ``issue()``
    and the result is stored under the record's ``'issues'`` key, so
    ``jrnls`` is mutated in place.

    Args:
        timeout: Seconds passed to ``requests.get`` as its timeout, so one
            stalled connection cannot hang the whole crawl.

    Raises:
        Whatever ``requests.get`` or ``Response.raise_for_status`` raise on a
        network failure or an HTTP error status.
    """
    for position, record in enumerate(jrnls):
        archive_url = record['url'] + '/issue/archive'
        page = requests.get(archive_url, timeout=timeout)
        # An error page has no issue summaries; without this check the
        # journal would silently be recorded as having no issues.
        page.raise_for_status()
        data = html.fromstring(page.content)
        summaries = data.xpath('//div[@class="obj_issue_summary"]')
        record['issues'] = issue(position, summaries)

In [38]:
# Download the archive page of every journal and attach its issues to jrnls
journal_issues()

In [40]:
# Persist the journal/issue index collected so far
with open('index.json', 'w') as out_file:
    json.dump(obj=jrnls, fp=out_file)

## List of articles

In [42]:
# Reload the journal/issue index saved in index.json
with open('index.json', 'r') as in_file:
    jrnls = json.load(fp=in_file)

In [43]:
# Spot check: display one journal record together with its list of issues
jrnls[47]

{'url': 'https://camjol.info/index.php/UYC',
 'name': 'Universidad y Ciencia',
 'desc': 'La Revista Universidad y Ciencia de la Universidad Nacional Autónoma de Nicaragua-Managua.',
 'issues': [{'url': 'https://camjol.info/index.php/UYC/issue/view/852',
   'title': 'Vol. 10 Núm. 16 (2017)',
   'cod': ''},
  {'url': 'https://camjol.info/index.php/UYC/issue/view/622',
   'title': 'Vol. 9 Núm. 15 (2017)',
   'cod': ''},
  {'url': 'https://camjol.info/index.php/UYC/issue/view/568',
   'title': 'Vol. 9 Núm. 14 (2016)',
   'cod': ''},
  {'url': 'https://camjol.info/index.php/UYC/issue/view/566',
   'title': 'Vol. 8 Núm. 13 (2015)',
   'cod': ''},
  {'url': 'https://camjol.info/index.php/UYC/issue/view/565',
   'title': 'Vol. 8 Núm. 12 (2015)',
   'cod': ''},
  {'url': 'https://camjol.info/index.php/UYC/issue/view/284',
   'title': 'Vol. 7 Núm. 11 (2013)',
   'cod': ''},
  {'url': 'https://camjol.info/index.php/UYC/issue/view/276',
   'title': 'Vol. 7 Núm. 10 (2013)',
   'cod': ''},
  {'url':

In [45]:
url_base = 'https://camjol.info/index.php/WANI/issue/view/394'
# Fetch one issue page as a sample. The timeout prevents an indefinite hang,
# and raise_for_status() keeps an HTTP error page from being stored and
# parsed as if it were the issue's table of contents.
page = requests.get(url_base, timeout=30)
page.raise_for_status()
# Keep a local copy of the page, then parse the bytes read back from it
with open('content.html', 'wb') as fd:
    fd.write(page.content)
with open('content.html', 'rb') as fd:
    content = fd.read()
data = html.fromstring(content)

In [46]:
# Collect the href of every link found inside a <div class="title"> element
articles = data.xpath('//div[@class="title"]/a/@href')

In [47]:
# Display the article links extracted from the sample issue page
articles

['https://camjol.info/index.php/WANI/article/view/2939',
 'https://camjol.info/index.php/WANI/article/view/2940',
 'https://camjol.info/index.php/WANI/article/view/2941',
 'https://camjol.info/index.php/WANI/article/view/2942',
 'https://camjol.info/index.php/WANI/article/view/2944',
 'https://camjol.info/index.php/WANI/article/view/2945',
 'https://camjol.info/index.php/WANI/article/view/2946',
 'https://camjol.info/index.php/WANI/article/view/2947',
 'https://camjol.info/index.php/WANI/article/view/2943']

In [48]:
def article(url, timeout=30):
    """Download an article page and collect its citation metadata.

    The page at ``url`` is fetched with ``requests`` and the ``content`` of
    its ``citation_*`` and ``DC.Description`` ``<meta>`` tags is gathered.

    Args:
        url: Address of the article page.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot block the crawl indefinitely.

    Returns:
        A dict with the keys ``title``, ``authors``, ``abstracts``,
        ``journal``, ``volume``, ``date``, ``first_page``, ``last_page``,
        ``keywords``, ``doi`` and ``url``. ``authors`` is a list of
        ``{'author', 'institution'}`` dicts, ``abstracts`` and ``keywords``
        are lists, and every other value is a string that is ``''`` when the
        corresponding tag is absent. ``url`` holds the ``citation_pdf_url``
        value, not the address that was requested.

    Raises:
        Whatever ``requests.get`` or ``Response.raise_for_status`` raise on a
        network failure or an HTTP error status.
    """
    page = requests.get(url, timeout=timeout)
    # An HTTP error page carries no citation tags; fail loudly instead of
    # returning a card made only of empty values.
    page.raise_for_status()
    data = html.fromstring(page.content)

    def meta(name):
        """Return the content of every <meta> tag whose name is ``name``."""
        return data.xpath('//meta[@name="%s"]/@content' % name)

    def first(values):
        """Return the first item of ``values``, or '' when it is empty."""
        return values[0] if values else ''

    authors = meta('citation_author')
    institutions = meta('citation_author_institution')
    if len(institutions) != len(authors):
        # Authors and institutions can only be paired when the counts agree;
        # otherwise every author gets an empty institution.
        institutions = [''] * len(authors)

    first_pages = meta('citation_firstpage')
    if first_pages:
        first_page = first_pages[0]
        # A missing last-page tag used to raise IndexError here.
        last_page = first(meta('citation_lastpage'))
    else:
        # The last page is only recorded alongside a first page.
        first_page = ''
        last_page = ''

    return {
        'title': first(meta('citation_title')),
        'authors': [
            {'author': author, 'institution': institution}
            for author, institution in zip(authors, institutions)
        ],
        'abstracts': meta('DC.Description'),
        'journal': first(meta('citation_journal_title')),
        'volume': first(meta('citation_volume')),
        'date': first(meta('citation_date')),
        'first_page': first_page,
        'last_page': last_page,
        'keywords': meta('citation_keywords'),
        'doi': first(meta('citation_doi')),
        'url': first(meta('citation_pdf_url')),
    }

In [49]:
url = 'https://camjol.info/index.php/WANI/article/view/2945'
# Fetch one article page as a sample. The timeout prevents an indefinite
# hang, and raise_for_status() keeps an HTTP error page from being stored
# and parsed as if it were the article.
page = requests.get(url, timeout=30)
page.raise_for_status()
# Keep a local copy of the page, then parse the bytes read back from it
with open('article.html', 'wb') as fd:
    fd.write(page.content)
with open('article.html', 'rb') as fd:
    content = fd.read()
data = html.fromstring(content)

In [50]:
# Try the metadata extractor on a single article page
article('https://camjol.info/index.php/WANI/article/view/2945')

{'title': 'Selección y presión de caza sobre la fauna silvestre en el área de amortiguamiento de BOSAWAS',
 'authors': [{'author': 'José Thomás Quijano Maradiaga',
   'institution': 'Bluefields Indian & Caribbean University'},
  {'author': 'Rene Cassells Martínez',
   'institution': 'Bluefields Indian & Caribbean University'},
  {'author': 'Jordi Bartolomé Filella',
   'institution': 'Universitat Autónoma de Barcelona'}],
 'abstracts': ['This article is a contribution to the protection and sustainable exploitation of the natural fauna through the characterization of the hunt pressure over the species present in the Bosawas natural reserve. The investigation will facilitate future environmental studies oriented to formulate project of conservation. It contains information, for each of the communities included in the research, about the preferred locations for hunting activities, the use and destination of the products obtained, the techniques and the selection and pressure exerted on sp

In [51]:
def update_articles(idx, timeout=30):
    """Fill in the articles of every issue of journal ``jrnls[idx]``.

    For each issue record the issue page is downloaded and its article links
    (``//div[@class="title"]/a/@href``) are collected. If the record already
    has an ``'articles'`` list of the same length as the number of links, the
    issue is skipped. Otherwise every link is passed to ``article()`` and the
    record's ``'articles'`` and ``'num_articles'`` keys are replaced. The
    issue title and each fetched link are printed as progress output.

    Args:
        idx: Position of the journal inside the notebook-level ``jrnls`` list.
        timeout: Seconds passed to ``requests.get`` for the issue pages.

    Raises:
        Whatever ``requests.get``, ``Response.raise_for_status`` or
        ``article()`` raise. The issue being processed is left untouched
        when that happens.
    """
    for issue_rec in jrnls[idx]['issues']:
        print(issue_rec['title'])
        page = requests.get(issue_rec['url'], timeout=timeout)
        # An error page contains no article links; without this check it
        # would look like an issue with zero articles and erase the cache.
        page.raise_for_status()
        data = html.fromstring(page.content)
        links = data.xpath('//div[@class="title"]/a/@href')
        cached = issue_rec.get('articles')
        if cached is not None and len(cached) == len(links):
            continue
        # Build the list first so that a failure part-way through never
        # leaves a partial 'articles' list without a matching count.
        cards = []
        for link in links:
            print(link)
            cards.append(article(link))
        issue_rec['articles'] = cards
        issue_rec['num_articles'] = len(links)

In [27]:
# Crawl every journal in order, printing its position before its issues.
# update_articles() mutates the records of jrnls in place, so whatever was
# collected before an exception is still held in jrnls afterwards.
for idx in range(len(jrnls)):
    print(idx)
    update_articles(idx)
    print()

0
Vol. 8 Núm. 1 (2017)
Vol. 7 Núm. 2 (2016-2017)
Vol. 7 Núm. 1 (2016)
Vol. 6, Núm. 2 (2015-2016)
Vol. 6 Núm. 1 (2015)
Vol. 5 Núm. 1-2 (2015)
Vol. 4 Núm. 2 (2014)

1
Año 11, Vol. 2, No. 29, Julio-Diciembre 2017
Año 11, Vol. 1, No. 28, Enero-Junio 2017.
Año 10, Vol. 2, No 27, Julio-Diciembre 2016
Año 10, Vol. 1, No 26, Enero-Junio 2016
Año 9, Vol. 2, No 25, Julio-Diciembre 2015

2
Vol. 2 Núm. 1 (2019)
Vol. 1 Núm. 1 (2018)
Vol. 1 Núm. 2 (2018)

3
Honduran Smilax L. (zarzaparrilla) (Liliales, Smilacaceae): an estimate of the amount of plants required for the Boston 1855 exports
Producción de tilapia  (Oreochromis niloticus L.) en la etapa de engorde con dos estrategias de alimentación
Regeneración natural en sitios impactados por incendios en la Reserva Biológica Uyuca, Honduras
First records for fifteen species of Lepidoptera for Honduras
Vol. 55 Núm. 1 (2018)
Vol. 54 Núm. 2 (2016)
Conferencia Internacional Ensminger para la Ganadería
Vol. 53 Núm. 2 (2012)
Vol. 53 Núm. 1 (2012)
Vol. 52 Nú

ConnectionError: HTTPSConnectionPool(host='camjol.info', port=443): Max retries exceeded with url: /index.php/KOOT/article/view/2982 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7f5f76fce7f0>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution'))

In [52]:
# Write the current state of jrnls to index.json and read it straight back
with open('index.json', 'w') as out_file:
    json.dump(obj=jrnls, fp=out_file)
with open('index.json', 'r') as in_file:
    jrnls = json.load(fp=in_file)

In [107]:
# Spot check: display one journal record after the article crawl
jrnls[30]

{'url': 'https://camjol.info/index.php/FAREM',
 'name': 'Revista Científica de FAREM-Estelí',
 'desc': 'Revista Científica de la FAREM Estelí es una revista científica multidisciplinaria de publicación electrónicas a partir de 2011. Revista Científica de la FAREM Estelí está indizada en: Google scholar y Latindex',
 'issues': [{'url': 'https://camjol.info/index.php/FAREM/issue/view/993',
   'title': 'Núm. 27 (2018)',
   'cod': '',
   'num_articles': 9,
   'articles': [{'title': 'El 25 aniversario de una universidad comunitaria',
     'authors': [{'author': 'Alvaro Rivas', 'institution': 'Editor'}],
     'abstracts': ['Wani Vol.71 2016 pp. 1', 'Wani Vol.71 2016 pp.1'],
     'journal': 'Wani',
     'volume': '71',
     'date': '2016/10/24',
     'first_page': '1',
     'last_page': '1',
     'keywords': [],
     'doi': '10.5377/wani.v71i0.2939',
     'url': 'https://camjol.info/index.php/WANI/article/download/2939/2679'},
    {'title': 'El veinticinco aniversario de la Bluefields Indian 