In [1]:
from bs4 import BeautifulSoup
from helium import *
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

In [2]:
# Open-access paper with a lot of information
url = 'https://www.sciencedirect.com/science/article/pii/S2213133715000657?via%3Dihub'
# Pay-per-view paper with less information (but at least available)
url2 = 'https://www.sciencedirect.com/science/article/abs/pii/S0306437918300838?via%3Dihub'

# A: Selenium

In [15]:
def get_HTML_selenium(url, os):
    """
    Get the HTML of a website using Selenium and ChromeDriver.

    The browser runs headless and with JavaScript enabled. This method is quite slow and should
    only be used for dynamic pages whose content a classic requests call cannot access.

    :param url: URL of a website
    :param os: Operating system of the user. Only 'mac' (case-insensitive) is supported so far,
        because no driver path is configured for Windows or Linux. Note that this parameter
        name shadows the stdlib ``os`` module inside the function.
    :return: HTML with all loaded content
    :raises ValueError: if no ChromeDriver path is configured for ``os``
    """
    # Resolve the driver path first so an unsupported OS fails with a clear message instead of
    # an UnboundLocalError when the path is used further down.
    if str(os).strip().lower() == 'mac':
        driver_path = 'Research_Scraper_Code/driver/chromedriverMAC'
    else:
        raise ValueError(
            f"No ChromeDriver path configured for operating system {os!r}; only 'mac' is supported.")

    options = Options()
    options.add_argument("--headless=chrome")
    options.add_argument("--enable-javascript")
    options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])

    # Fixed pause (seconds) after navigation so that dynamically loaded content can appear.
    break_time = 5

    start = time.time()

    driver = webdriver.Chrome(driver_path, options=options)
    try:
        driver.get(url)
        time.sleep(break_time)

        # todo if content owned by WWU add sleep to load more content
        html = driver.page_source
    finally:
        # quit() ends the whole WebDriver session (browser and driver process), whereas close()
        # only closes the current window. The finally block also releases the browser when
        # navigation fails.
        driver.quit()

    end = time.time()

    print(
        f'Browser closed in {end - start} seconds, including {break_time} seconds of waiting, thus {end - start - break_time} seconds of loading.')
    return html

In [16]:
# Fetch the open-access article through the Selenium helper, using the macOS driver path.
html1 = get_HTML_selenium(url=url, os='mac')

WebDriverException: Message: 'chromedriverMAC' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home


In [288]:
# Parse the Selenium result and print the page text to inspect what was actually loaded.
soup1 = BeautifulSoup(markup=html1, features='html.parser')
print(soup1.text)




















Nearest neighbor density ratio estimation for large-scale applications in astronomy - ScienceDirect













Processing math: 12%

      JavaScript is disabled on your browser.
      Please enable JavaScript to use all the features on this page.
      

Skip to main content
Skip to article

ScienceDirectJournals & BooksHelpSearchInstitutionRegisterSign in PDFView PDFDownload Full IssueView Open ManuscriptOther access optionsNavigate DownSearchOutlineAbstractKeywords1. Introduction2. Kernel-based density ratio estimation3. Nearest neighbor density ratio estimation revisited4. Experiments5. ConclusionAcknowledgmentsReferencesShow full outlineNavigate DownCited By (20)Figures (5)Astronomy and ComputingVolume 12, September 2015, Pages 67-72Full length articleNearest neighbor density ratio estimation for large-scale applications in astronomyAuthor links open overlay panelJ.KremeraPersonEnvelopeF.GiesekebK.Steenstrup PedersenacC.IgelacShow moreNavigate DownListOutlineP

In [283]:
def quick_Selenium_Test():
    """
    Smoke test for get_HTML_selenium: load a known ScienceDirect article and check its title.

    :raises AssertionError: if the title element is missing or its text differs from the expected title
    """
    expected_title = 'Nearest neighbor density ratio estimation for large-scale applications in astronomy'

    html = get_HTML_selenium('https://www.sciencedirect.com/science/article/pii/S2213133715000657?via%3Dihub', 'mac')
    soup = BeautifulSoup(html, 'html.parser')

    # find() returns None when the element is absent (e.g. the page was not fully loaded);
    # fail with a readable message instead of an AttributeError on `.text`.
    title_element = soup.find('span', class_='title-text')
    assert title_element is not None, "No <span class='title-text'> element found in the returned HTML"

    title_test = title_element.text.strip()
    print('Testing accessing title:')
    print(title_test)

    assert title_test == expected_title, f'Expected title not matched, got: {title_test!r}'


quick_Selenium_Test()

Browser closed in 22.284823179244995 seconds, including 5 seconds of waiting, thus 17.284823179244995 seconds of loading.
Testing accessing title:
Nearest neighbor density ratio estimation for large-scale applications in astronomy


# B: Trying helium

In [19]:
def get_HTML_helium(url):
    """
    Get the HTML of a website using helium and ChromeDriver.

    Helium is a lightweight Selenium adapter that comes with simple wait and click functions.
    The browser runs headless and with JavaScript enabled. The extra browser arguments mimic a
    regular user so that Elsevier returns the full HTML and allows loading.
    This method is quite slow and should only be used for dynamic pages whose content a classic
    requests call cannot access.

    If the "Loading..." text does not disappear in time, the exception raised by helium's
    wait_until propagates to the caller; the browser is shut down in either case.

    :param url: URL of the website to load
    :return: HTML of the page as reported by the browser after the wait
    """
    start = time.time()

    # Tricking Elsevier
    options = Options()
    options.add_argument("--headless=chrome")
    options.add_argument("--enable-javascript")
    #options.add_argument('--no-sandbox')
    options.add_argument("--disable-extensions")
    options.add_argument("--start-maximized")
    options.add_argument("window-size=1920,1080")
    # Use the caller's URL; it was previously overwritten by a hard-coded article URL.
    browser = start_chrome(url, options=options)

    try:
        wait_until_start = time.time()
        wait_until(lambda: not Text("Loading...").exists(), timeout_secs=10, interval_secs=0.5)
        wait_until_end = time.time()

        html = browser.page_source
    finally:
        # Always shut the browser down, also when the wait fails, so no browser process is
        # left running in the background.
        kill_browser()

    end = time.time()

    print(f'Waited {wait_until_end - wait_until_start} seconds for the wait_until method')
    print(f'and Total waiting time: {end - start} seconds')
    return html


In [20]:
# Fetch the open-access article through the helium helper.
html = get_HTML_helium(url=url)

WebDriverException: Message: Service /Users/leoncena/opt/anaconda3/envs/Research_Scraper/lib/python3.9/site-packages/helium/_impl/webdrivers/mac/chromedriver unexpectedly exited. Status code was: -9


In [21]:
def quick_Helium_Test():
    """
    Smoke test for get_HTML_helium: load a known ScienceDirect article and check its title.

    :raises AssertionError: if the title element is missing or its text differs from the expected title
    """
    expected_title = 'Nearest neighbor density ratio estimation for large-scale applications in astronomy'

    html = get_HTML_helium('https://www.sciencedirect.com/science/article/pii/S2213133715000657?via%3Dihub')
    soup = BeautifulSoup(html, 'html.parser')

    # find() returns None when the element is absent (e.g. the page was not fully loaded);
    # fail with a readable message instead of an AttributeError on `.text`.
    title_element = soup.find('span', class_='title-text')
    assert title_element is not None, "No <span class='title-text'> element found in the returned HTML"

    title_test = title_element.text.strip()
    print('Testing accessing title:')
    print(title_test)

    assert title_test == expected_title, f'Expected title not matched, got: {title_test!r}'


quick_Helium_Test()

Waited 15.016202926635742 seconds for the wait_until method
and Total waiting time: 26.3247127532959 seconds
Testing accessing title:
Nearest neighbor density ratio estimation for large-scale applications in astronomy


In [None]:
# small test if the last reference is recognized (element id hard-copied from the page!)

# `soup` is otherwise only a local variable inside the quick test functions, so parse the
# helium result `html` here; without this the cell raises a NameError on a fresh kernel.
soup = BeautifulSoup(html, 'html.parser')

print("ref test")
print(soup.find('div', {'id': 'ref-id-sbref29'}))

# misc
Kill Helium

In [None]:
# Manual cleanup: shut down the browser that was started through helium.
kill_browser()  # kills Helium browser

Kill Selenium