# Sciencedirect (Elsevier) commands for publications
This notebook only covers the steps that are necessary for parsing, so that they can be integrated at the end.

In [1]:


# Test URLs used throughout this notebook
url_free1 = 'https://www.sciencedirect.com/science/article/pii/S0140988315002571?via%3Dihub'
url_free2 = 'https://www.sciencedirect.com/science/article/pii/S2451929420300851?via%3Dihub'  # URL of an open-access article

url_pay1 = 'https://www.sciencedirect.com/science/article/abs/pii/S104732031830230X?via%3Dihub'  # URL of a non-subscribed article
url_pay2 = 'https://www.sciencedirect.com/science/article/abs/pii/S0306437918300838?via%3Dihub'  # URL of a non-subscribed article

In [2]:
import re
import time
import json

import cloudscraper
import requests
from bs4 import BeautifulSoup
from helium import *
# from requests_html import HTMLSession
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

## Accessing HTML (multiple ways)

In [19]:
def get_HTML_selenium(url, os):
    """
    Get HTML from a website using Selenium and ChromeDriver. The method runs headless by default and has JS activated.
    Be aware that this method is quite slow and should only be used if the classic requests method cannot access
    the information, i.e. only use it for dynamic data.
    :param url: URL of a website
    :param os: Operating system of the user. Only 'mac' has a configured ChromeDriver path at the moment.
               (The name shadows the stdlib module inside this function; it is kept because callers pass os=...)
    :return: HTML with all loaded content
    :raises ValueError: if no ChromeDriver path is configured for the given operating system
    """
    # Fail early with a clear message instead of an UnboundLocalError further down.
    if os != 'mac':
        raise ValueError(f"Unsupported operating system {os!r}: only 'mac' has a configured ChromeDriver path")
    driver_path = '../driver/chromedriverMAC'

    options = Options()
    options.add_argument("--headless=chrome")
    options.add_argument("--enable-javascript")
    options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])

    start = time.time()

    driver = webdriver.Chrome(driver_path, options=options)
    try:
        driver.get(url)
        break_time = 5
        time.sleep(break_time)  # fixed wait so that dynamic content can load

        # todo if content owned by WWU add sleep to load more content
        html = driver.page_source
    finally:
        # quit() ends the whole driver session, even when loading the page failed
        driver.quit()

    end = time.time()

    print(
        f'Browser closed in {end - start} seconds, including {break_time} seconds of waiting (hard value), thus {end - start - break_time} seconds of loading.')
    return html


def get_page_with_requests(url, timeout=30):
    """
    Fetch a page with plain requests while sending a desktop Safari user agent.
    :param url: URL of a website
    :param timeout: Seconds to wait for the server before giving up (without it the call could block forever)
    :return: requests response object (status code 200)
    :raises AssertionError: if the server does not answer with status code 200
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15'}
    r = requests.get(url, headers=headers, timeout=timeout)
    print(r.status_code)
    # Raised explicitly (same exception type as before) so the check also runs under `python -O`.
    if r.status_code != 200:
        raise AssertionError(f'Unexpected HTTP status {r.status_code} for {url}')
    return r


def get_page_with_cloudscraper(url, timeout=30):
    """
    Fetch a page with a cloudscraper session that identifies itself with a custom user agent.
    :param url: URL of a website
    :param timeout: Seconds to wait for the server before giving up (without it the call could block forever)
    :return: response object (status code 200)
    :raises AssertionError: if the server does not answer with status code 200
    """
    scraper = cloudscraper.create_scraper(
        browser={
            'custom': 'ScraperBot/1.0',
        }
    )
    r = scraper.get(url, timeout=timeout)
    print(r.status_code)
    # Raised explicitly (same exception type as before) so the check also runs under `python -O`.
    if r.status_code != 200:
        raise AssertionError(f'Unexpected HTTP status {r.status_code} for {url}')
    return r


# def get_page_with_requsts_html(url):
#     s = HTMLSession()
#     headers = {
#         'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15'}
#     r = s.get(url, headers=headers)
#     print(r.status_code)
#     assert r.status_code == 200
#     return r


def get_HTML_helium(url):
    """
    Get HTML from a website using helium and ChromeDriver. Helium is a lightweight Selenium adapter. It comes with simple wait and click functions.
    The method runs headless by default and has JS activated. By adding arguments the method mimics a user so that Elsevier returns the full HTML and allows loading.
    Be aware that this method is quite slow and should only be used if the classic requests method cannot access
    the information, i.e. only use it for dynamic data.
    :param url: URL of a website
    :return: HTML of the page after waiting for the 'Loading...' text to disappear and scrolling down
    """
    # NOTE: helium is started with Selenium Options here; per the original author's note this combination
    # works for ScienceDirect.
    start = time.time()

    # Tricking Elsevier
    options = Options()
    options.add_argument("--headless=chrome")
    options.add_argument("--enable-javascript")
    #options.add_argument('--no-sandbox')
    options.add_argument("--disable-extensions")
    options.add_argument("--start-maximized")
    options.add_argument("window-size=1920,1080")
    browser = start_chrome(url, options=options)

    try:
        wait_until_start = time.time()
        wait_until(lambda: not Text("Loading...").exists(), timeout_secs=10,
                   interval_secs=0.5)  # Experimental: wait until no 'Loading...' text is visible
        wait_until_end = time.time()

        scroll_start = time.time()
        scroll_down(20000)  # scroll down a lot
        print(f'scrolling took {time.time() - scroll_start} seconds')
        html = browser.page_source
    finally:
        # Always shut the browser down, also when the wait above times out
        kill_browser()

    end = time.time()

    print(
        f'Browser closed in {end - start} seconds, including {wait_until_end - wait_until_start} seconds of waiting, thus {end - start - wait_until_end + wait_until_start} seconds of loading.')

    return html



In [20]:
def get_bs(url, method='requests'):
    """
    Fetch a page with the chosen access method and parse it with BeautifulSoup.
    :param url: URL of a website
    :param method: 'requests', 'cloud', 'selenium' or 'helium'
    :return: Bs4 object, or None if anything went wrong (the error is printed)
    """
    try:
        if method in ('requests', 'cloud'):
            # both return a response object whose raw content is parsed
            fetch = get_page_with_requests if method == 'requests' else get_page_with_cloudscraper
            markup = fetch(url).content
        elif method == 'selenium':
            markup = get_HTML_selenium(url, os='mac')  # selenium already returns html
        elif method == 'helium':
            markup = get_HTML_helium(url)  # helium already returns html
        else:
            raise ValueError('Method not supported')
        soup = BeautifulSoup(markup, 'html.parser')
    except Exception as e:
        print(f'Error: {e}', url)
        return None
    return soup



Create soups of test objects with cloudscraper

In [42]:
# Fetch the four test articles with cloudscraper, in the order free1, free2, pay1, pay2
elsevier_soups = [get_bs(article_url, method='cloud')
                  for article_url in (url_free1, url_free2, url_pay1, url_pay2)]
free1_soup, free2_soup, pay1_soup, pay2_soup = elsevier_soups

200
200
200
200


Create soups of test objects with Helium/Selenium with full content

In [21]:
# Fetch the four test articles with helium, in the order free1, free2, pay1, pay2
elsevier_soups_full = [get_bs(article_url, method='helium')
                       for article_url in (url_free1, url_free2, url_pay1, url_pay2)]
free1_soup_full, free2_soup_full, pay1_soup_full, pay2_soup_full = elsevier_soups_full

scrolling took 0.07204794883728027 seconds
Browser closed in 26.13229489326477 seconds, including 19.128371000289917 seconds of waiting, thus 7.0039238929748535 seconds of loading.
scrolling took 0.07414674758911133 seconds
Browser closed in 23.928987979888916 seconds, including 15.43142294883728 seconds of waiting, thus 8.497565031051636 seconds of loading.
scrolling took 0.07308125495910645 seconds
Browser closed in 17.034998893737793 seconds, including 11.876393795013428 seconds of waiting, thus 5.158605098724365 seconds of loading.
scrolling took 0.22336888313293457 seconds
Browser closed in 16.168402910232544 seconds, including 11.266273021697998 seconds of waiting, thus 4.902129888534546 seconds of loading.


## Main Fields
### Title


In [22]:
def get_title(bs):
    """
    Get title of a publication
    :param bs: Bs4 object (None is accepted, e.g. when fetching the page failed)
    :return: Title as stripped string, or None if no title element is found
    """
    if bs is None:
        return None
    title_tag = bs.find('span', class_='title-text')
    # Same convention as the other getters: a missing element yields None instead of an AttributeError
    if title_tag is None:
        return None
    return title_tag.text.strip()

In [23]:
check = ['Peeling the onion: Analyzing aggregate, national and sectoral energy intensity in the European Union',
         'A Structure-Based Platform for Predicting Chemical Reactivity',
         'Artistic movement recognition by consensus of boosted SVM based experts',
         'Semi-automatic inductive construction of reference process models that represent best practices in public administrations: A method']

# A separate loop name keeps the list `check` from being rebound to a single string
for soup, expected_title in zip(elsevier_soups, check):
    found_title = get_title(soup)  # extract once, then print and compare
    print(found_title)
    assert found_title == expected_title

Peeling the onion: Analyzing aggregate, national and sectoral energy intensity in the European Union
A Structure-Based Platform for Predicting Chemical Reactivity
Artistic movement recognition by consensus of boosted SVM based experts
Semi-automatic inductive construction of reference process models that represent best practices in public administrations: A method


### Doi

In [24]:
def get_doi(bs, type='doi_number'):
    """
    Gets the DOI of a publication
    :param bs: Bs4 object
    :param type: 'doi_number' returns the bare DOI, 'doi_link' returns the full doi.org link.
                 (The name shadows the builtin; it is kept because it is part of the call signature.)
    :return: DOI number or DOI link as string; None if no valid doi.org link is found or `type` is unknown
    """
    try:
        doi_link = bs.find('a', class_='doi').text.strip()
    except AttributeError:
        # no soup or no DOI anchor on the page
        return None

    # Dots are escaped so that only a real doi.org address is accepted
    prefix = re.match(r'https?://doi\.org/', doi_link)
    if prefix is None:
        return None

    if type == 'doi_number':
        return doi_link[prefix.end():]
    if type == 'doi_link':
        return doi_link
    return None

In [25]:
# Quick check: DOI number (default type) of the first test article
get_doi(free1_soup)

'10.1016/j.eneco.2015.09.004'

### Authors

In [26]:
def get_authors(bs):
    """
    Get authors of a publication
    :param bs: Bs4 object
    :return: Author names ("<given name> <surname>") as list; None if the author box or a name element is missing
    """
    try:
        author_boxes = bs.find('div', {'class': 'author-group', 'id': 'author-group'}).find_all('a')
        return [
            f"{box.find('span', class_='text given-name').text.strip()} "
            f"{box.find('span', class_='text surname').text.strip()}"
            for box in author_boxes
        ]
    except AttributeError:
        # a lookup returned None (or bs itself is None)
        return None

In [27]:
check = [3, 5, 2, 5]  # expected number of authors per test article
# A separate loop name keeps the list `check` from being rebound to a single number
for soup, expected_count in zip(elsevier_soups, check):
    found_authors = get_authors(soup)  # extract once, then print and compare
    print(found_authors)
    assert len(found_authors) == expected_count


['Andreas Löschel', 'Frank Pothen', 'Michael Schymura']
['Frederik Sandfort', 'Felix Strieth-Kalthoff', 'Marius Kühnemund', 'Christian Beecks', 'Frank Glorius']
['Corneliu Florea', 'Fabian Gieseke']
['Hendrik Scholta', 'Marco Niemann', 'Patrick Delfmann', 'Michael Räckers', 'Jörg Becker']


### Keywords

In [28]:
def get_keywords(bs):
    """
    Get list of keywords
    :param bs: Received bs of the publication
    :return: List of strings; None if the page has no keyword section
    """
    try:
        keyword_section = bs.find('div', class_='keywords-section')
        return [kwd.text.strip() for kwd in keyword_section.find_all('div', class_='keyword')]
    except AttributeError:
        # keyword section missing (or bs is None)
        return None

In [29]:
# Print the extracted keywords of every test article
for soup in elsevier_soups:
    print(get_keywords(soup))

['Environmental and climate economics', 'Energy intensity', 'Index decomposition']
['reactivity prediction', 'machine learning', 'molecular structures', 'organic chemistry', 'yield prediction', 'enantioselectivity prediction']
['Randomized boosted SVMs', 'Multi-scale topography', 'Painting style recognition', 'Consensus of experts', 'Ensembles']
['Process management', 'Process modeling', 'Reference modeling', 'Process model merge', 'E-government', 'Public administration', 'Benchmarking', 'Model querying']


### Abstract

In [30]:
def get_abstract(bs):
    """
    Get abstract of a publication
    :param bs: Received bs of the publication
    :return: Abstract : String; None if the abstract box is not found
    """
    try:
        return bs.find('div', class_='abstract author').div.text.strip()
    except AttributeError:
        # abstract box or its inner div missing (or bs is None)
        return None

In [31]:
# Overview per test article: title, authors and abstract
for soup in elsevier_soups:
    print(f'Publication: {get_title(soup)}')
    # get_authors returns None if the author box cannot be parsed; treat that as "no authors"
    a = ''.join(f'{x}; ' for x in get_authors(soup) or [])
    print(f'Authors: {a}')
    print('Abstract: ')
    print(get_abstract(soup))
    print('-------- \n')

Publication: Peeling the onion: Analyzing aggregate, national and sectoral energy intensity in the European Union
Authors: Andreas Löschel; Frank Pothen; Michael Schymura; 
Abstract: 
One of the most promising ways of meeting climate policy targets is improving energy efficiency, i.e., reducing the amount of scarce and polluting resources needed to produce a given quantity of output. Relying upon the World Input-Output Database (WIOD), we investigate the decline in energy intensity in the EU27 countries between 1995 and 2009. Changes in energy intensity can be attributed to two different drivers: changes in the industrial composition of an economy and changes in its sectoral energy intensities. We conduct a series of index decomposition analyses (IDA) to isolate the effects exerted by these drivers. We then take the findings from the index decomposition analysis and subject them to panel estimations. The objective is to control for factors that may have shaped the evolution of energy i

### Full text

In [34]:
def __extract_text_from_p_tags(p_tags):
    """
    Helper that merges the text of several p-tags into one string
    :param p_tags: p-tags found by a bs search : bs4.element.ResultSet
    :return: Combined text, paragraphs are separated by a blank line
    """
    combined = ''
    for paragraph in p_tags:
        # As long as nothing has been collected the text is taken as is, afterwards a blank line is put in front
        combined += paragraph.text if combined == '' else f'\n\n{paragraph.text}'
    return combined


def __process_text_recursive(section):
    """
    Takes an HTML section from ScienceDirect and extracts the text from it while regarding (sub-)headings.\n
    You can use that function to apply it to the found h1-sections so the function can
    take care of the potential subsections in the passed section. \n
    Headings (h3/h4) are recognized by the function and written on their own line. \n
    The function is recursive and calls itself for sub-sections.
    :params section: HTML section as bs4.element.Tag
    :return: Text as string with new lines for headings
    """
    # todo question: Filter out mathematical formulas?
    t = ''
    # only direct children; nested sections are handled by the recursive call
    for child in section.find_all(recursive=False):
        p_tags = []
        if child.name in ('h3', 'h4'):
            t += f'{child.text}\n'
        elif child.name == 'div':
            p_tags.extend(child.find_all('p'))  # all paragraphs inside the div, at any depth
        elif child.name == 'section':
            t += __process_text_recursive(child)
        elif child.name == 'p':
            p_tags.append(child)
        # every direct child closes with a line break, also when it contributed no paragraph text
        t += __extract_text_from_p_tags(p_tags) + '\n'
    return t


def __check_text_available(bs):
    """
    Check if the full text is available for a publication
    :param bs: Received bs of the publication (HTML must be accessed with Selenium or Helium, does not work otherwise)
    :return: Boolean
    """
    try:
        body = bs.find('div', {'class': 'Body u-font-serif', 'id': 'body'})
    except AttributeError:
        # bs is None or not searchable -> no text; avoids using an unbound `body` below
        return False
    return body is not None


def get_full_text(bs):
    # todo can be easy rewritten to produce markdown files
    """
    Get full text of a publication if online available
    :param bs: Received bs of the publication (HTML must be accessed with Selenium or Helium, does not work otherwise)
    :return: List with one dict per h2-level section: {'chapter_name': heading, 'chapter_text': text with
             new-lines for sub-headings}; None if no full text is available or the body cannot be parsed
    """
    if not __check_text_available(bs):
        return None

    try:
        body = bs.find('div', {'class': 'Body u-font-serif', 'id': 'body'})
        # direct child sections of the body correspond to the h2 level
        return [
            {
                'chapter_name': section.h2.text,
                'chapter_text': __process_text_recursive(section)
            }
            for section in body.div.find_all('section', recursive=False)
        ]
    except AttributeError:
        # unexpected structure, e.g. a section without h2 heading
        return None

In [36]:
# Quick demonstration of the extracted full texts
for soup in elsevier_soups_full:
    print(f'Publication:\n> {get_title(soup)} \n')
    text = get_full_text(soup)
    print('\nFull text: \n')
    if text is not None:
        # one entry per chapter: heading first, then the chapter text
        for chapter in text:
            chapter_name, chapter_text = chapter['chapter_name'], chapter['chapter_text']
            print(f'Kapitel: {chapter_name} \n')
            print(chapter_text)
            print('\n')
    else:
        print(f'Publication:\n{get_title(soup)}')
        print('>> No full text available, the WWU probably does not own a license for this publication')
    print('-------- \n')

Publication:
> Peeling the onion: Analyzing aggregate, national and sectoral energy intensity in the European Union 


Full text: 

Kapitel: 1. Introduction 


Improving energy efficiency is one of the most promising ways of meeting the emission targets set by climate policy. What is more, it may also help reduce dependency on fossil fuels and foster industrial competitiveness (Ang et al., 2010). Fig. 1a to c tells a story about the evolution of energy intensity in Europe between 1995 and 2009. The gross aggregate output of the EU 27 increased by 37.3% (Fig. 1a) and total energy use decreased by 0.4% (Fig. 1b) in this period. As a result of both, energy intensity declined by 27.4% (Fig. 1c). Note that the gross output declined by 6.1% and energy use by 6.7% between 2008 and 2009 due to the financial crises. The energy intensity did not experience a visible shock in 2009.

Fig. 1. Gross output, energy use and energy intensity in the EU27 1995–2009.
These three figures appear to tell us 

## Journal fields
### Journal name

In [310]:
def get_journal_name(bs):
    """
    Get the journal name where the paper has been published
    :param bs: Received bs of the publication
    :return: String; None if the publication bar or the journal title element is not found
    """
    if bs is None:
        return None
    # Some journals have their name as text and logo, others have their name as text only and a dedicated logo
    # We have to differentiate between both cases
    journal_bar = bs.find('div', {'id': 'publication'})
    if journal_bar is None:
        return None

    if journal_bar.attrs.get('class') == ['Publication', 'wordmark-layout']:
        # The filter function is also called with None for tags without a class attribute
        title_tag = journal_bar.find('h2', class_=lambda c: c is not None and 'publication-title-link' in c)
    else:
        title_tag = journal_bar.find('a', class_='publication-title-link')

    if title_tag is None:
        return None
    return title_tag.text.strip()

In [317]:
check = ['Energy Economics', 'Chem', 'Journal of Visual Communication and Image Representation', 'Information Systems']

# A separate loop name keeps the list `check` from being rebound to a single string
for soup, expected_journal in zip(elsevier_soups, check):
    found_journal = get_journal_name(soup)  # extract once, then print and compare
    print(found_journal)
    assert found_journal == expected_journal

Energy Economics
Chem
Journal of Visual Communication and Image Representation
Information Systems


### Journal information (Volume, Publication Month/Year, Pages)
#### volume
#### Publication date
#### start end page

In [441]:
# The information is parsed from an unstructured div-element and is therefore extracted in one method
def get_journal_information(bs):
    """
    Returns the journal information of a publication
    :param bs: bs4 object
    :return: Dictionary with the keys 'volume', 'release', 'start_page' and 'end_page';
             None if the information box is missing or the page range has no hyphen
    """
    # HTML comments act as separators between the fields, so their node type is needed
    from bs4.element import Comment

    try:
        journal_info = bs.find('div', {'class': 'Publication', 'id': 'publication'}).find('div',
                                                                                          class_='text-xs')

        counter = 0  # number of HTML comments seen so far = index of the current field
        result = ['Volume', 'Year', 'PageRange']  # placeholders, replaced by the parsed values
        # iterate over sub-content of textbox
        for x in journal_info.contents:
            if x.text.strip() == ',':  # skip when comma is found
                continue
            # a comment marks the start of the next field
            if isinstance(x, Comment):
                counter += 1
                continue
            # add text to result list if no special case (comment, comma)
            result[counter] = x.text.strip()

        volume, release, page_range = result
        # clean page range
        page_range = page_range.removeprefix(', Pages ')
        # split by hyphen to get start and end page
        pages = page_range.split('-')
        start_page, end_page = pages[0], pages[1]

        return {
            'volume': volume, 'release': release, 'start_page': start_page, 'end_page': end_page
        }
    except (AttributeError, IndexError):
        # box not found, more fields than expected, or no hyphen in the page range
        return None


def get_volume(bs):
    """
    Returns the information about the volume of a journal in which the publication has been published
    :param bs: bs4 object
    :return: String; None if the journal information cannot be parsed
    """
    journal_info = get_journal_information(bs)
    return None if journal_info is None else journal_info['volume']


def get_release(bs):
    """
    Returns the information about the release of a journal in which the publication has been published
    :param bs: bs4 object
    :return: String; None if the journal information cannot be parsed
    """
    journal_info = get_journal_information(bs)
    return None if journal_info is None else journal_info['release']


def get_start_page(bs):
    """
    Returns the start page of a publication in a journal
    :param bs: bs4 object
    :return: String; None if the journal information cannot be parsed
    """
    journal_info = get_journal_information(bs)
    return None if journal_info is None else journal_info['start_page']


def get_end_page(bs):
    """
    Returns the end page of a publication in a journal
    :param bs: bs4 object
    :return: String; None if the journal information cannot be parsed
    """
    journal_info = get_journal_information(bs)
    return None if journal_info is None else journal_info['end_page']

In [446]:
# Small test: print the combined journal information and every single field for each test article
for soup in elsevier_soups:
    print(get_title(soup))
    print(get_journal_information(soup))
    print(f'Volume: {get_volume(soup)}')
    print(f'Release: {get_release(soup)}')
    print(f'Start page: {get_start_page(soup)}')
    print(f'End page: {get_end_page(soup)}')
    print('-------- \n')

Peeling the onion: Analyzing aggregate, national and sectoral energy intensity in the European Union
{'volume': 'Volume 52, Supplement 1', 'release': 'December 2015', 'start_page': 'S63', 'end_page': 'S75'}
Volume: Volume 52, Supplement 1
Release: December 2015
Start page: S63
End page: S75
-------- 

A Structure-Based Platform for Predicting Chemical Reactivity
{'volume': 'Volume 6, Issue 6', 'release': '11 June 2020', 'start_page': '1379', 'end_page': '1390'}
Volume: Volume 6, Issue 6
Release: 11 June 2020
Start page: 1379
End page: 1390
-------- 

Artistic movement recognition by consensus of boosted SVM based experts
{'volume': 'Volume 56', 'release': 'October 2018', 'start_page': '220', 'end_page': '233'}
Volume: Volume 56
Release: October 2018
Start page: 220
End page: 233
-------- 

Semi-automatic inductive construction of reference process models that represent best practices in public administrations: A method
{'volume': 'Volume 84', 'release': 'September 2019', 'start_page': 

## Special ScienceDirect fields
### Author highlights

In [43]:
def get_author_highlights(bs):
    """
    Returns the author highlights of a publication in bullet points
    :param bs: bs4 object
    :return: [String] containing the bullet points; None if the page has no author highlights
    """
    try:
        highlight_box = bs.find('div', class_='abstract author-highlights')
        return [x.text.strip() for x in highlight_box.find_all('dd', class_='list-description')]
    except AttributeError:
        # highlight box missing (or bs is None)
        return None

In [44]:
# Print the author highlights of every test article
for soup in elsevier_soups:
    print(get_title(soup))
    highlights = get_author_highlights(soup)  # extract once instead of twice
    if highlights is not None:
        for x in highlights:
            print(f' > {x}')
    # separator after every publication, also when it has no highlights
    print('-------- \n')

Peeling the onion: Analyzing aggregate, national and sectoral energy intensity in the European Union
A Structure-Based Platform for Predicting Chemical Reactivity
 > Quantitative modeling of reaction outcomes via machine learning
 > Prediction of properties, yields, stereoselectivities, and relative conversion
 > Multiple fingerprint features as a versatile and robust molecular representation
 > Readily applicable machine learning tool, directly starting from molecular structures
-------- 

Artistic movement recognition by consensus of boosted SVM based experts
 > We introduce a system for automatic art movement recognition of a digitized painting.
 > To describe images we combine Color Structure Descriptor and Multi-Scale Topography.
 > Boosted ensemble of SVMs regularized with random feature category does classification.
 > We introduce a new collection of digitized paintings to ease evaluation.
 > The accuracy compares favorably with classical and deep learning.
-------- 

Semi-auto

### Editor highlights

In [478]:
def get_editor_highlights(bs):
    """
    Returns the editor highlights of a publication as one text
    :param bs: bs4 object
    :return: String "<title>: <text>", or only the text if the box has no heading;
             None if the page has no editor highlights
    """
    try:
        highlight_box = bs.find('div', class_='abstract editor-highlights')
        text = highlight_box.find('p').text.strip()
    except AttributeError:
        # no highlight box or no paragraph inside it (or bs is None)
        return None

    try:
        title = highlight_box.h2.text.strip()
    except AttributeError:
        title = ''  # the heading is optional

    return f'{title}: {text}' if title != '' else text

In [495]:
# Print the editor highlights (or None) of every test article
for soup in elsevier_soups:
    print(get_title(soup))
    print(f'> {get_editor_highlights(soup)}')
    print('-------- \n')

Peeling the onion: Analyzing aggregate, national and sectoral energy intensity in the European Union
> None
-------- 

A Structure-Based Platform for Predicting Chemical Reactivity
> The Bigger Picture: Statistical data-based prediction models have found widespread application in nearly all areas of science, including chemistry. In this context, the prediction of molecular properties or biological activities for a target molecule (quantitative structure-property relationships [QSPRs]) has been widely investigated, with great focus on developing new and general molecular representations. However, the underlying fundamentals have not been transferred to the prediction of chemical reactivity. In contrast, although recent progress in high-throughput data generation has enabled the generation of uniform reaction-based datasets, current prediction models suggest that complex parameterization is required for each individual case to achieve good results. Applying universal (structure-based) mo

### References

In [773]:
def __check_references_available(bs):
    """
    Check if a reference list is available for a publication
    :param bs: Received bs of the publication (HTML must be accessed with Selenium or Helium, does not work otherwise)
    :return: Boolean
    """
    try:
        body = bs.find('dl', class_='references')
    except AttributeError:
        # bs is None or not searchable -> no references; avoids using an unbound `body` below
        return False
    return body is not None


def _reference_link(link_box, label):
    """Return the href of the link whose text is `label` inside a reference's link box, or None if absent."""
    try:
        return link_box.find('a', text=label).get('href')
    except AttributeError:
        return None


def get_references(bs):
    """
    Returns a list of references and their links, if available. \n
    Basic extraction of meta data, though not the focus since we can scrape in detail with doi numbers.
    :param bs: bs4 object
    :return: List with one dict per reference (keys: 'authors', 'title', 'source', 'article_link', 'doi_link',
             'google_scholar_link'; every value that cannot be found is None); None if the page has no reference list
    """

    if not __check_references_available(bs):
        return None
    try:
        references_result = []
        ref_list = bs.find('dl', {'class': 'references', 'id': re.compile(r'reference-links-.*')})
        for ref in ref_list.find_all('dd', class_='reference'):
            authors = None
            title = None
            source = None

            contribution = ref.find('div', class_='contribution')
            try:
                # first child is expected to be the plain author string; names are stripped individually
                authors = [name.strip() for name in contribution.contents[0].strip().split(',')]
            except (AttributeError, IndexError, TypeError):
                pass
            try:
                title = contribution.contents[1].text.strip()
            except (AttributeError, IndexError):
                pass
            try:
                source = ref.find('div', class_='host').text.strip()
            except AttributeError:
                pass

            link_box = ref.find('div', class_='ReferenceLinks u-font-sans')

            article_link = _reference_link(link_box, 'Article')
            # relative ScienceDirect links are turned into absolute ones
            if article_link is not None and 'http' not in article_link:
                article_link = 'https://www.sciencedirect.com' + article_link

            references_result.append({
                'authors': authors,
                'title': title,
                'source': source,
                'article_link': article_link,
                'doi_link': _reference_link(link_box, 'CrossRef'),
                'google_scholar_link': _reference_link(link_box, 'Google Scholar')
            })
        return references_result

    except AttributeError:
        # e.g. a reference list without the expected id
        print("Error in get_references")
        return None


In [817]:
# Quick Test -> works
assert len(get_references(free1_soup_full)) == 46  # correct values
assert len(get_references(free2_soup_full)) == 64  # correct values

for soup in elsevier_soups_full:
    print('###########################################################################')
    print(f'Publikation: {get_title(soup)}')
    print('Referenzen: \n')
    references = get_references(soup)  # parse the reference list once instead of twice per publication
    if references is not None:
        for idx, ref in enumerate(references):
            print(f'Quelle {idx} : {ref.get("title")}, {ref.get("authors")}, {ref.get("source")}')
            print(f' > Article link: {ref.get("article_link")}')
            print(f' > DOI link: {ref.get("doi_link")}')
            # print(f' > Google Scholar link: {ref.get("google_scholar_link")}') # skipped for readability
    print('\n \n \n')


###########################################################################
Publikation: Peeling the onion: Analyzing aggregate, national and sectoral energy intensity in the European Union
Referenzen: 

Quelle 0 : Trade and productivity, ['F. Alcalá', ' A. Ciccone'], Q. J. Econ., 119 (2) (2004), pp. 613-646
 > Article link: None
 > DOI link: None
Quelle 1 : Comparison of energy intensities in European Union countries. Results of a structural decomposition analysis, ['V. Alcantara', ' R. Duarte'], Energy Policy, 32 (2004), pp. 177-189
 > Article link: https://www.sciencedirect.com/science/article/pii/S030142150200263X
 > DOI link: None
Quelle 2 : Decomposition of industrial energy consumption, ['B.W. Ang'], Energy Econ., 16 (1994), pp. 163-174
 > Article link: https://www.sciencedirect.com/science/article/pii/0140988394900302
 > DOI link: None
Quelle 3 : Decomposition of aggregate energy and gas emission intensities for industry: a refined Divisia index method, ['B.W. Ang', ' K.H. Choi

### #Citations
According to the Elsevier PlumX metric

In [820]:
def get_amount_citations(bs):
    """
    Returns the amount of citations according to the ScienceDirect metric
    :param bs: bs4 object
    :return: Number of citations as shown on the page (String); None if the citation counter is not found
    """
    try:
        return bs.find('li', class_='plx-citation').find('span', class_='pps-count').text.strip()
    except AttributeError:
        # citation entry or counter missing (or bs is None)
        return None


In [822]:
# Print the citation count of every test article (`soups` holds one soup per iteration)
for soups in elsevier_soups_full:
    print(get_title(soups))
    print(f' > Amount of citations: {get_amount_citations(soups)}')
    print('-------- \n')

Peeling the onion: Analyzing aggregate, national and sectoral energy intensity in the European Union
 > Amount of citations: 34
-------- 

A Structure-Based Platform for Predicting Chemical Reactivity
 > Amount of citations: 77
-------- 

Artistic movement recognition by consensus of boosted SVM based experts
 > Amount of citations: 7
-------- 

Semi-automatic inductive construction of reference process models that represent best practices in public administrations: A method
 > Amount of citations: 9
-------- 

