# Sciencedirect (Elsevier) commands for publications
This notebook only covers the steps that are necessary to parse a publication page, so that they can be included in the end.

In [2]:
# Test URLs of freely accessible articles
url_free1 = 'https://www.sciencedirect.com/science/article/pii/S0140988315002571?via%3Dihub'
url_free2 = 'https://www.sciencedirect.com/science/article/pii/S2451929420300851?via%3Dihub'  # URL of an open access article

# Test URLs of articles that are not covered by a subscription (note the '/abs/' segment in the path)
url_pay1 = 'https://www.sciencedirect.com/science/article/abs/pii/S104732031830230X?via%3Dihub'  # URL of a non-subscribed article
url_pay2 = 'https://www.sciencedirect.com/science/article/abs/pii/S0306437918300838?via%3Dihub'  # URL of a non-subscribed article

In [367]:
import requests
import time
from bs4 import BeautifulSoup
import bs4
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from helium import *
import cloudscraper
from requests_html import HTMLSession
import import_ipynb
import re

# Accessing HTML (multiple ways)

In [133]:
def get_HTML_selenium(url, os):
    """
    Get HTML from a website using Selenium and ChromeDriver. The browser runs headless by default and has JS activated.
    Be aware that this method is quite slow and should only be used if the classic requests methods cannot access
    the information, i.e. only use it for dynamically loaded content.
    :param url: URL of a website
    :param os: Operating system of the user. Only 'mac' has a driver configured at the moment (case-insensitive).
    :return: HTML with all loaded content
    :raises ValueError: If no ChromeDriver binary is configured for the given operating system
    """
    # Resolve the driver first so that an unsupported OS fails with a clear message
    # instead of an unbound variable further down.
    driver_paths = {'mac': '../driver/chromedriverMAC'}
    os_key = str(os).lower()
    if os_key not in driver_paths:
        raise ValueError(
            f'No ChromeDriver configured for operating system {os!r}; supported: {sorted(driver_paths)}')
    driver_path = driver_paths[os_key]

    options = Options()
    options.add_argument("--headless=chrome")
    options.add_argument("--enable-javascript")
    options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])

    break_time = 5  # seconds to wait so that dynamic content can finish loading
    start = time.time()

    driver = webdriver.Chrome(driver_path, options=options)
    try:
        driver.get(url)
        time.sleep(break_time)

        # todo if content owned by WWU add sleep to load more content
        html = driver.page_source
    finally:
        # quit() ends the whole browser session (close() only closes the window),
        # and the finally block guarantees this also happens if loading the page fails.
        driver.quit()

    end = time.time()

    print(
        f'Browser closed in {end - start} seconds, including {break_time} seconds of waiting, thus {end - start - break_time} seconds of loading.')
    return html


def get_page_with_requests(url, timeout=30):
    """
    Fetch a page with the plain requests library, sending a desktop Safari user agent.
    :param url: URL of a website
    :param timeout: Seconds to wait for the server before giving up (without it the request could hang forever)
    :return: Response object of the request
    :raises requests.HTTPError: If the server answers with a status code other than 200
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15'}
    r = requests.get(url, headers=headers, timeout=timeout)
    print(r.status_code)
    # Explicit check instead of assert: assertions are removed when Python runs with -O
    if r.status_code != 200:
        raise requests.HTTPError(f'Unexpected status code {r.status_code} for {url}', response=r)
    return r


def get_page_with_cloudscraper(url, timeout=30):
    """
    Fetch a page through a cloudscraper session that identifies itself with a custom user agent.
    :param url: URL of a website
    :param timeout: Seconds to wait for the server before giving up (without it the request could hang forever)
    :return: Response object of the request
    :raises requests.HTTPError: If the server answers with a status code other than 200
    """
    scraper = cloudscraper.create_scraper(
        browser={
            'custom': 'ScraperBot/1.0',
        }
    )
    r = scraper.get(url, timeout=timeout)
    print(r.status_code)
    # Explicit check instead of assert: assertions are removed when Python runs with -O
    if r.status_code != 200:
        raise requests.HTTPError(f'Unexpected status code {r.status_code} for {url}', response=r)
    return r


def get_page_with_requsts_html(url, timeout=30):
    """
    Fetch a page with a requests_html session, sending a desktop Safari user agent.
    :param url: URL of a website
    :param timeout: Seconds to wait for the server before giving up (without it the request could hang forever)
    :return: Response object of the request
    :raises requests.HTTPError: If the server answers with a status code other than 200
    """
    s = HTMLSession()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15'}
    r = s.get(url, headers=headers, timeout=timeout)
    print(r.status_code)
    # Explicit check instead of assert: assertions are removed when Python runs with -O
    if r.status_code != 200:
        raise requests.HTTPError(f'Unexpected status code {r.status_code} for {url}', response=r)
    return r


In [155]:
def get_bs(url, method='requests'):
    """
    Download a page with the selected access method and parse it with BeautifulSoup.
    Any error (including an unknown method) is printed together with the URL and results in None.
    :param url: URL of a website
    :param method: One of 'requests', 'cloud', 'requests_html' or 'selenium'
    :return: BeautifulSoup object of the page, or None if something went wrong
    """
    try:
        if method == 'selenium':
            # selenium already returns the HTML itself instead of a response object
            markup = get_HTML_selenium(url, os='mac')
        elif method == 'requests':
            markup = get_page_with_requests(url).content
        elif method == 'cloud':
            markup = get_page_with_cloudscraper(url).content
        elif method == 'requests_html':
            markup = get_page_with_requsts_html(url).content
        else:
            raise ValueError('Method not known')
        soup = BeautifulSoup(markup, 'html.parser')
    except Exception as e:
        print(f'Error: {e}', url)
        return None
    return soup



Create soups of test objects

In [248]:
# Download all four test publications via cloudscraper, in the same order as the URLs were defined
elsevier_soups = [
    get_bs(test_url, method='cloud')
    for test_url in (url_free1, url_free2, url_pay1, url_pay2)
]
# Keep the individual names available for cells that work on a single publication
free1_soup, free2_soup, pay1_soup, pay2_soup = elsevier_soups

200
200
200
200


## Main Fields
### Title


In [179]:
def get_title(bs):
    """
    Get title of a publication
    :param bs: Bs4 object
    :return: Title as string, or None if no soup was passed or the title element cannot be found
    """
    # Return None instead of raising, like the other field getters do
    if bs is None:
        return None
    title_tag = bs.find('span', class_='title-text')
    if title_tag is None:
        return None
    return title_tag.text.strip()

In [180]:
# Manually verified titles, in the same order as elsevier_soups
expected_titles = ['Peeling the onion: Analyzing aggregate, national and sectoral energy intensity in the European Union',
                   'A Structure-Based Platform for Predicting Chemical Reactivity',
                   'Artistic movement recognition by consensus of boosted SVM based experts',
                   'Semi-automatic inductive construction of reference process models that represent best practices in public administrations: A method']

# zip() would silently ignore surplus entries, so make sure both lists line up
assert len(elsevier_soups) == len(expected_titles)

# The loop variable has its own name so that the expected list is not overwritten and the cell can be re-run
for soup, expected_title in zip(elsevier_soups, expected_titles):
    title = get_title(soup)
    print(title)
    assert title == expected_title

Peeling the onion: Analyzing aggregate, national and sectoral energy intensity in the European Union
A Structure-Based Platform for Predicting Chemical Reactivity
Artistic movement recognition by consensus of boosted SVM based experts
Semi-automatic inductive construction of reference process models that represent best practices in public administrations: A method


### Doi

In [219]:
def get_doi(bs, type='doi_number'):
    """
    Gets the DOI of a publication
    :param bs: Bs4 object
    :param type: 'doi_number' returns the bare DOI (e.g. '10.1016/j.eneco.2015.09.004'),
                 'doi_link' returns the complete https://doi.org/... link
    :return: DOI as string; None if no soup was passed, the DOI element is missing, its text is not a
             doi.org link, or the requested type is unknown
    """
    if bs is None:
        return None
    doi_tag = bs.find('a', class_='doi')
    if doi_tag is None:
        return None
    doi_link = doi_tag.text.strip()

    # control doi: the text has to start with the doi.org prefix (dots escaped so that they only match literal dots)
    prefix = re.match(r'https?://doi\.org/', doi_link)
    if prefix is None:
        return None

    if type == 'doi_number':
        # clean doi: everything after the prefix is the DOI itself
        return doi_link[prefix.end():]
    if type == 'doi_link':
        return doi_link
    return None

In [225]:
# Quick check: extract the DOI number of the first test publication
get_doi(free1_soup)

'10.1016/j.eneco.2015.09.004'

### Authors

In [244]:
def get_authors(bs):
    """
    Get authors of a publication
    :param bs: Bs4 object of the publication
    :return: Author names ('<given name> <surname>') as list, or None if no soup was passed or the
             author group cannot be found
    """
    if bs is None:
        return None
    author_group = bs.find('div', {'class': 'author-group', 'id': 'author-group'})
    if author_group is None:
        return None

    authors = []
    for box in author_group.find_all('a'):
        # Collect the name parts that are present; a link without any name span is skipped,
        # so that a single unexpected link does not discard the authors that were found.
        name_parts = []
        for css_class in ('text given-name', 'text surname'):
            name_span = box.find('span', class_=css_class)
            if name_span is not None:
                name_parts.append(name_span.text.strip())
        if name_parts:
            authors.append(' '.join(name_parts))
    return authors

In [253]:
# Manually verified number of authors, in the same order as elsevier_soups
expected_author_counts = [3, 5, 2, 5]

# zip() would silently ignore surplus entries, so make sure both lists line up
assert len(elsevier_soups) == len(expected_author_counts)

# The loop variable has its own name so that the expected list is not overwritten and the cell can be re-run
for soup, expected_count in zip(elsevier_soups, expected_author_counts):
    authors = get_authors(soup)
    print(authors)
    assert authors is not None and len(authors) == expected_count


['Andreas Löschel', 'Frank Pothen', 'Michael Schymura']
['Frederik Sandfort', 'Felix Strieth-Kalthoff', 'Marius Kühnemund', 'Christian Beecks', 'Frank Glorius']
['Corneliu Florea', 'Fabian Gieseke']
['Hendrik Scholta', 'Marco Niemann', 'Patrick Delfmann', 'Michael Räckers', 'Jörg Becker']


### Keywords

In [255]:
def get_keywords(bs):
    """
    Get list of keywords
    :param bs: Received bs of the publication
    :return: List of strings, or None if no soup was passed or the keyword section cannot be found
    """
    # Explicit checks instead of a bare except, so that unrelated errors are not silently swallowed
    if bs is None:
        return None
    keyword_section = bs.find('div', class_='keywords-section')
    if keyword_section is None:
        return None
    return [kwd.text.strip() for kwd in keyword_section.find_all('div', class_='keyword')]

In [257]:
# Show the extracted keywords of every test publication
for publication_soup in elsevier_soups:
    keywords = get_keywords(publication_soup)
    print(keywords)

['Environmental and climate economics', 'Energy intensity', 'Index decomposition']
['reactivity prediction', 'machine learning', 'molecular structures', 'organic chemistry', 'yield prediction', 'enantioselectivity prediction']
['Randomized boosted SVMs', 'Multi-scale topography', 'Painting style recognition', 'Consensus of experts', 'Ensembles']
['Process management', 'Process modeling', 'Reference modeling', 'Process model merge', 'E-government', 'Public administration', 'Benchmarking', 'Model querying']


### Abstract

In [262]:
def get_abstract(bs):
    """
    Get abstract of a publication
    :param bs: Received bs of the publication
    :return: Abstract as string, or None if no soup was passed or the abstract cannot be found
    """
    # Explicit checks instead of a bare except, so that unrelated errors are not silently swallowed
    if bs is None:
        return None
    abstract_box = bs.find('div', class_='abstract author')
    if abstract_box is None:
        return None
    # The text is taken from the first div inside the abstract box
    text_div = abstract_box.div
    if text_div is None:
        return None
    return text_div.text.strip()

In [269]:
# Print title, authors and abstract of every test publication
for publication_soup in elsevier_soups:
    publication_title = get_title(publication_soup)
    print(f'Publication: {publication_title}')
    # Every author name is followed by '; ' (including the last one)
    author_line = ''.join(f'{author_name}; ' for author_name in get_authors(publication_soup))
    print(f'Authors: {author_line}')
    print('Abstract: ')
    abstract_text = get_abstract(publication_soup)
    print(abstract_text)
    print('-------- \n')

Publication: Peeling the onion: Analyzing aggregate, national and sectoral energy intensity in the European Union
Authors: Andreas Löschel; Frank Pothen; Michael Schymura; 
Abstract: 
One of the most promising ways of meeting climate policy targets is improving energy efficiency, i.e., reducing the amount of scarce and polluting resources needed to produce a given quantity of output. Relying upon the World Input-Output Database (WIOD), we investigate the decline in energy intensity in the EU27 countries between 1995 and 2009. Changes in energy intensity can be attributed to two different drivers: changes in the industrial composition of an economy and changes in its sectoral energy intensities. We conduct a series of index decomposition analyses (IDA) to isolate the effects exerted by these drivers. We then take the findings from the index decomposition analysis and subject them to panel estimations. The objective is to control for factors that may have shaped the evolution of energy i

## Journal fields
### Journal name

In [310]:
def get_journal_name(bs):
    """
    Get the journal name where the paper has been published
    :param bs: Received bs of the publication
    :return: Journal name as string, or None if no soup was passed or the name cannot be found
    """
    if bs is None:
        return None
    journal_bar = bs.find('div', {'id': 'publication'})
    if journal_bar is None:
        return None

    # Some journals have their name as text and logo, others have their name as text only and a dedicated logo
    # We have to differentiate between both cases
    if journal_bar.get('class') == ['Publication', 'wordmark-layout']:
        # bs4 calls the filter with None for tags without a class attribute, so guard against it
        title_tag = journal_bar.find('h2', class_=lambda c: c is not None and 'publication-title-link' in c)
    else:
        title_tag = journal_bar.find('a', class_='publication-title-link')

    if title_tag is None:
        return None
    return title_tag.text.strip()

In [317]:
# Manually verified journal names, in the same order as elsevier_soups
expected_journal_names = ['Energy Economics', 'Chem', 'Journal of Visual Communication and Image Representation', 'Information Systems']

# zip() would silently ignore surplus entries, so make sure both lists line up
assert len(elsevier_soups) == len(expected_journal_names)

# The loop variable has its own name so that the expected list is not overwritten and the cell can be re-run
for soup, expected_name in zip(elsevier_soups, expected_journal_names):
    journal_name = get_journal_name(soup)
    print(journal_name)
    assert journal_name == expected_name

Energy Economics
Chem
Journal of Visual Communication and Image Representation
Information Systems


### Journal information (Volume, Publication Month/Year, Pages)

In [441]:
# All journal details are parsed from one unstructured div element and are therefore extracted in the same function
def get_journal_information(bs):
    """
    Returns the journal information of a publication.

    The details live in one text box whose parts are separated by HTML comments: the text before the first
    comment is used as volume, the text between the first and second comment as release and the text after
    the second comment as page range.
    :param bs: bs4 object
    :return: Dictionary with the keys 'volume', 'release', 'start_page' and 'end_page'. Parts that cannot be
             found are None (e.g. 'end_page' if only a single page is given). None is returned if the text
             box is missing, is empty or contains text after more than two comments.
    """
    try:
        journal_info = bs.find('div', {'class': 'Publication', 'id': 'publication'}).find('div',
                                                                                          class_='text-xs')
        contents = journal_info.contents
    except AttributeError:
        # no soup, no publication bar or no text box
        return None

    fields = [None, None, None]  # volume, release, page range
    position = 0  # index of the field that is currently filled (= number of comments seen so far)
    for node in contents:
        # a comment marks the start of the next field
        if isinstance(node, bs4.Comment):
            position += 1
            continue
        text = node.text.strip()
        # skip separators: whitespace-only nodes would overwrite an already found value
        if text in ('', ','):
            continue
        if position >= len(fields):
            return None  # more parts than expected, the layout is unknown
        fields[position] = text

    volume, release, page_range = fields
    if volume is None and release is None and page_range is None:
        return None

    start_page = end_page = None
    if page_range is not None:
        # clean page range: drop the leading comma and the 'Pages ' (or 'Page ') label
        page_range = re.sub(r'^,\s*(?:Pages?\s+)?', '', page_range)
        # Split by hyphen (or en dash) to get start and end page; a single page has no end page
        pages = re.split(r'[-–]', page_range)
        start_page = pages[0]
        if len(pages) > 1:
            end_page = pages[1]

    return {
        'volume': volume, 'release': release, 'start_page': start_page, 'end_page': end_page
    }


def get_volume(bs):
    """
    Returns the information about the volume of a journal in which the publication has been published
    :param bs: bs4 object
    :return: String, or None if the journal information is not available
    """
    # Explicit check instead of a bare except, so that unrelated errors are not silently swallowed
    journal_information = get_journal_information(bs)
    if not journal_information:
        return None
    return journal_information.get('volume')


def get_release(bs):
    """
    Returns the information about the release of a journal in which the publication has been published
    :param bs: bs4 object
    :return: String, or None if the journal information is not available
    """
    # Explicit check instead of a bare except, so that unrelated errors are not silently swallowed
    journal_information = get_journal_information(bs)
    if not journal_information:
        return None
    return journal_information.get('release')


def get_start_page(bs):
    """
    Returns the start page of a publication in a journal
    :param bs: bs4 object
    :return: String, or None if the journal information is not available
    """
    # Explicit check instead of a bare except, so that unrelated errors are not silently swallowed
    journal_information = get_journal_information(bs)
    if not journal_information:
        return None
    return journal_information.get('start_page')


def get_end_page(bs):
    """
    Returns the end page of a publication in a journal
    :param bs: bs4 object
    :return: String, or None if the journal information is not available
    """
    # Explicit check instead of a bare except, so that unrelated errors are not silently swallowed
    journal_information = get_journal_information(bs)
    if not journal_information:
        return None
    return journal_information.get('end_page')

In [446]:
# Small test
# Small test: print the combined journal information and every single field for each test publication
field_getters = (
    ('Volume', get_volume),
    ('Release', get_release),
    ('Start page', get_start_page),
    ('End page', get_end_page),
)
for publication_soup in elsevier_soups:
    print(get_title(publication_soup))
    print(get_journal_information(publication_soup))
    for label, getter in field_getters:
        print(f'{label}: {getter(publication_soup)}')
    print('-------- \n')

Peeling the onion: Analyzing aggregate, national and sectoral energy intensity in the European Union
{'volume': 'Volume 52, Supplement 1', 'release': 'December 2015', 'start_page': 'S63', 'end_page': 'S75'}
Volume: Volume 52, Supplement 1
Release: December 2015
Start page: S63
End page: S75
-------- 

A Structure-Based Platform for Predicting Chemical Reactivity
{'volume': 'Volume 6, Issue 6', 'release': '11 June 2020', 'start_page': '1379', 'end_page': '1390'}
Volume: Volume 6, Issue 6
Release: 11 June 2020
Start page: 1379
End page: 1390
-------- 

Artistic movement recognition by consensus of boosted SVM based experts
{'volume': 'Volume 56', 'release': 'October 2018', 'start_page': '220', 'end_page': '233'}
Volume: Volume 56
Release: October 2018
Start page: 220
End page: 233
-------- 

Semi-automatic inductive construction of reference process models that represent best practices in public administrations: A method
{'volume': 'Volume 84', 'release': 'September 2019', 'start_page': 