In [279]:
from bs4 import BeautifulSoup
from helium import *
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

In [280]:
# Sample ScienceDirect article pages used as inputs for the scraping experiments in this notebook.
url = 'https://www.sciencedirect.com/science/article/pii/S2213133715000657?via%3Dihub'  # open-access paper with a lot of information
url2 = 'https://www.sciencedirect.com/science/article/abs/pii/S0306437918300838?via%3Dihub'  # pay-per-view paper with less information (but at least available)

# A: Selenium

In [282]:
def get_HTML_selenium(url, os):
    """
    Get the rendered HTML of a website using Selenium and ChromeDriver.

    The browser runs headless by default and has JavaScript enabled.
    Be aware that this method is quite slow and should only be used when a classic
    requests call cannot access the information, i.e. only for dynamic data.

    :param url: URL of a website
    :param os: Operating system of the user, matched case-insensitively.
               Currently only 'mac' has a known ChromeDriver path.
    :return: HTML with all content loaded after the fixed waiting time
    :raises ValueError: if no ChromeDriver path is known for ``os``
    """
    # NOTE: the parameter name `os` shadows the stdlib module inside this function;
    # it is kept unchanged so existing callers keep working.
    driver_paths = {'mac': '../driver/chromedriverMAC'}
    os_key = str(os).strip().lower()
    if os_key not in driver_paths:
        # Fail early with a clear message instead of crashing later on an unbound path variable.
        raise ValueError(
            f'No ChromeDriver path configured for os={os!r}; supported values: {sorted(driver_paths)}')
    driver_path = driver_paths[os_key]

    options = Options()
    options.add_argument("--headless=chrome")
    options.add_argument("--enable-javascript")

    break_time = 5  # fixed number of seconds granted to the page to load dynamic content

    start = time.time()

    # NOTE(review): passing the driver path positionally depends on the installed Selenium
    # version; newer releases expect a Service object instead - confirm against the environment.
    driver = webdriver.Chrome(driver_path, options=options)
    try:
        driver.get(url)
        time.sleep(break_time)

        # todo if content owned by WWU add sleep to load more content
        html = driver.page_source
    finally:
        # quit() ends the whole browser session, even when loading the page failed.
        driver.quit()

    end = time.time()

    print(
        f'Browser closed in {end - start} seconds, including {break_time} seconds of waiting, thus {end - start - break_time} seconds of loading.')
    return html

In [281]:
# Fetch the open-access article through Selenium; this starts a headless Chrome and blocks while the page loads.
html1 = get_HTML_selenium(url, 'mac')

KeyboardInterrupt: 

In [None]:
# Parse the Selenium-rendered page and print its plain text for a manual look at what was loaded.
soup1 = BeautifulSoup(markup=html1, features='html.parser')
print(soup1.get_text())

In [283]:
def quick_Selenium_Test():
    """
    Smoke test for get_HTML_selenium.

    Loads a known open-access ScienceDirect article, extracts the text of the
    ``<span class="title-text">`` element and compares it with the expected title.

    :return: None
    :raises AssertionError: if the title element is missing or its text differs from the expected title
    """
    expected_title = 'Nearest neighbor density ratio estimation for large-scale applications in astronomy'

    html = get_HTML_selenium('https://www.sciencedirect.com/science/article/pii/S2213133715000657?via%3Dihub', 'mac')
    soup = BeautifulSoup(html, 'html.parser')

    # find() returns None when the element is absent; report that explicitly
    # instead of failing with an AttributeError on `.text`.
    title_element = soup.find('span', class_='title-text')
    assert title_element is not None, 'Title element (span.title-text) not found in the fetched HTML'

    title_test = title_element.text.strip()
    print('Testing accessing title:')
    print(title_test)

    assert title_test == expected_title, f'Expected title not matched, got: {title_test!r}'


# Run the Selenium smoke test; it prints the timing line from get_HTML_selenium followed by the extracted title.
quick_Selenium_Test()

Browser closed in 22.284823179244995 seconds, including 5 seconds of waiting, thus 17.284823179244995 seconds of loading.
Testing accessing title:
Nearest neighbor density ratio estimation for large-scale applications in astronomy


# B: Trying helium

In [284]:
def get_HTML_helium(url):
    """
    Get the rendered HTML of a website using helium and ChromeDriver.

    Helium is a lightweight Selenium adapter that comes with simple wait and click functions.
    The browser runs headless by default and has JavaScript enabled. The additional Chrome
    arguments are meant to mimic a regular user so that Elsevier returns the full HTML and
    allows loading.
    Be aware that this method is quite slow and should only be used when a classic
    requests call cannot access the information, i.e. only for dynamic data.

    :param url: URL of the website to load
    :return: HTML of the page once the text "Loading..." is no longer present
    """
    start = time.time()

    # Tricking Elsevier
    options = Options()
    options.add_argument("--headless=chrome")
    options.add_argument("--enable-javascript")
    options.add_argument("--disable-extensions")
    options.add_argument("--start-maximized")
    options.add_argument("window-size=1920,1080")
    browser = start_chrome(url, options=options)

    try:
        wait_until_start = time.time()
        # Poll every 0.5 s until the "Loading..." placeholder has disappeared.
        wait_until(lambda: not Text("Loading...").exists(), timeout_secs=10, interval_secs=0.5)
        wait_until_end = time.time()

        html = browser.page_source
    finally:
        # Always shut the browser down, even if waiting fails; any error still propagates to the caller.
        kill_browser()

    end = time.time()

    print(f'Waited {wait_until_end - wait_until_start} seconds for the wait_until method')
    print(f'and Total waiting time: {end - start} seconds')
    return html


In [None]:
# Fetch the open-access article through helium; the result is kept in the notebook-level `html` variable.
html = get_HTML_helium(url)

In [285]:
def quick_Helium_Test():
    """
    Smoke test for get_HTML_helium.

    Loads a known open-access ScienceDirect article, extracts the text of the
    ``<span class="title-text">`` element and compares it with the expected title.

    :return: None
    :raises AssertionError: if the title element is missing or its text differs from the expected title
    """
    expected_title = 'Nearest neighbor density ratio estimation for large-scale applications in astronomy'

    html = get_HTML_helium('https://www.sciencedirect.com/science/article/pii/S2213133715000657?via%3Dihub')
    soup = BeautifulSoup(html, 'html.parser')

    # find() returns None when the element is absent; report that explicitly
    # instead of failing with an AttributeError on `.text`.
    title_element = soup.find('span', class_='title-text')
    assert title_element is not None, 'Title element (span.title-text) not found in the fetched HTML'

    title_test = title_element.text.strip()
    print('Testing accessing title:')
    print(title_test)

    assert title_test == expected_title, f'Expected title not matched, got: {title_test!r}'


# Run the helium smoke test; it prints the timing lines from get_HTML_helium followed by the extracted title.
quick_Helium_Test()

Waited 21.20144271850586 seconds for the wait_until method
and Total waiting time: 28.12459969520569 seconds
Testing accessing title:
Nearest neighbor density ratio estimation for large-scale applications in astronomy


In [None]:
# Small test whether the last reference of the article is recognized (element id is hard-copied!).
# `soup` only exists as a local variable inside the quick_*_Test functions, so it is built here
# from the notebook-level `html` returned by get_HTML_helium.
soup = BeautifulSoup(html, 'html.parser')

print("ref test")
print(soup.find('div', {'id': 'ref-id-sbref29'}))

# misc
Kill Helium

In [None]:
# Manual cleanup cell for a helium browser that was left open.
# NOTE(review): get_HTML_helium calls kill_browser() itself, so this is presumably only needed
# after an interrupted run - confirm how helium behaves when no browser is open.
kill_browser()  # kills Helium browser

Kill Selenium

In [None]:
# Manual cleanup cell for a Selenium browser that was left open.
# `driver` is a local variable of get_HTML_selenium, so it normally does not exist at notebook
# level; only clean up when a driver was created manually in this kernel.
if 'driver' in globals():
    driver.quit()  # Kills Selenium browser (ends the whole session, not just the current window)