# Web Scraping from PubMed

This notebook showcases a data extraction process from **PubMed**, focusing on **neuroscience** articles published within the years 2013 and 2023. Leveraging the powerful `selenium` framework, the script navigates through the PubMed platform, systematically retrieving pertinent data from a variety of articles.

![pubmed.svg](attachment:pubmed.svg)

In [1]:
# Libraries Importation

import time
import pandas as pd

from selenium import webdriver
from tqdm.notebook import tqdm
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys  
from selenium.webdriver.support.ui import Select
from joblib import Parallel, delayed, parallel_backend, effective_n_jobs

import warnings
warnings.filterwarnings('ignore')

### Definition of the searching terms

This search query in PubMed focuses on retrieving Journal Articles published in the years specified, written in English. The Terms to filter neuroscience discipline are: 'neuron', 'nervous' OR 'CNS'
    
    (neuron[Other Term]) 
    AND ("2013/01/01"[Date - Publication] : "3000"[Date - Publication])
    AND (english[Language])) 
    AND (Journal Article[Publication Type])

### Define the # of articles following those parametters.

In [2]:
url = 'https://pubmed.ncbi.nlm.nih.gov/?term=%28neuron%5BOther+Term%5D%29+AND+%28%222013%2F01%2F01%22%5BDate+-+Publication%5D+%3A+%223000%22%5BDate+-+Publication%5D%29+AND+%28english%5BLanguage%5D%29+AND+%28Journal+Article%5BPublication+Type%5D%29&filter=simsearch1.fha&filter=dates.2013%2F1%2F1-3000%2F12%2F12&filter=lang.english&filter=other.excludepreprints'

In [4]:
errors = [NoSuchElementException, ElementNotInteractableException]
wait = WebDriverWait(driver, timeout = 2, poll_frequency = .2, ignored_exceptions = errors)

NameError: name 'NoSuchElementException' is not defined

In [3]:
num_pages = ''

PATH = webdriver.FirefoxOptions()
driver = webdriver.Firefox(options = PATH)

driver.get(url)
wait.until(lambda d : revealed.send_keys("Displayed") or True)

# Number of articles
num_art_selector = '.results-amount-container > div:nth-child(1)'
num_art = driver.find_elements(By.CSS_SELECTOR, num_art_selector)[0].text.split(' ')[0]

# # Number of pages
num_pag_selector = '.top-pagination > div:nth-child(3) > label:nth-child(3)'
num_pag = driver.find_elements(By.CSS_SELECTOR, num_pag_selector)[0].text.split(' ')[1]
num_pages += num_pag

driver.quit()

print(f'This search presents {num_art} articles in {num_pag} pages')

AttributeError: 'str' object has no attribute 'capabilities'

In [4]:
# Transform the numbers to integers

num_pag = num_pag.replace(',', '')
num_pag = int(num_pag)

num_art = num_art.replace(',', '')
num_art = int(num_art)

In [5]:
# Create a list of URLs

nums = [i for i in range(2, num_pag)]
url_list = []
url_list.append('https://pubmed.ncbi.nlm.nih.gov/?term=%28neuron%5BOther+Term%5D%29+AND+%28%222013%2F01%2F01%22%5BDate+-+Publication%5D+%3A+%223000%22%5BDate+-+Publication%5D%29+AND+%28english%5BLanguage%5D%29+AND+%28Journal+Article%5BPublication+Type%5D%29&filter=simsearch1.fha&filter=dates.2013%2F1%2F1-3000%2F12%2F12&filter=lang.english&filter=other.excludepreprints')

for num in nums:
    url = f'https://pubmed.ncbi.nlm.nih.gov/?term=(neuron%5BOther%20Term%5D)%20AND%20(%222013%2F01%2F01%22%5BDate%20-%20Publication%5D%20%3A%20%223000%22%5BDate%20-%20Publication%5D)%20AND%20(english%5BLanguage%5D)%20AND%20(Journal%20Article%5BPublication%20Type%5D)&filter=simsearch1.fha&filter=dates.2013%2F1%2F1-3000%2F12%2F12&filter=lang.english&filter=other.excludepreprints&page={num}'
    url_list.append(url)

In [6]:
# Create a list to save the results

pubmed = {'article': []}

In [13]:
def extract_article(url):
    """
    Extracts information from a series of articles on PubMed.

    Args: 
        url (str): The URL of the webpage containing the articles.

    Returns:
        list: A list containing information extracted from each article.
    """
    PATH = webdriver.FirefoxOptions()
    driver = webdriver.Firefox(options = PATH)
    
    driver.get(url)
    time.sleep(1)
    
    # open the first article
    selector_1 = 'a.docsum-title'
    driver.find_element(By.CSS_SELECTOR, selector_1).click()
    time.sleep(3)
    
    for i in range(10):
        # Display Options
        selector_2 = 'div.display-options:nth-child(3) > button:nth-child(1)'    
        driver.find_element(By.CSS_SELECTOR, selector_2).click()
        
        # Display Options 2
        selector_3 = '#article-display-format'    
        driver.find_element(By.CSS_SELECTOR, selector_3).click()

        # Select PubMed format
        selector_4 = '#article-display-format > option:nth-child(2)'
        driver.find_element(By.CSS_SELECTOR, selector_4).click()

        # Save information
        selector_5 = '#article-details'
        pubmed['article'].append(driver.find_element(By.CSS_SELECTOR, selector_5).text)

        driver.back()
        time.sleep(3)

        # Go to the next article:
        next_button = driver.find_element(By.CLASS_NAME, 'next.side-link.visible')
        next_button.find_element(By.CLASS_NAME, 'arrow-link.adj-nav-link').click()
        time.sleep(2)
    
    driver.quit()
    return pubmed

In [14]:
print(f"Number of cores: {effective_n_jobs()}")

Number of cores: 16


In [15]:
paralelo = Parallel(n_jobs = 10, 
                    verbose = True)

In [16]:
pages = paralelo(delayed(extract_article)(url) for url in tqdm(url_list[:-1]))

  0%|          | 0/1161 [00:00<?, ?it/s]

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:  4.6min
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed: 27.3min
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed: 65.4min


NoSuchElementException: Message: Unable to locate element: div.display-options:nth-child(3) > button:nth-child(1); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:189:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:507:5
dom.find/</<@chrome://remote/content/shared/DOM.sys.mjs:132:16


In [23]:
len(pages)

NameError: name 'pages' is not defined

In [29]:
articles_raw = pd.DataFrame(pages)

In [30]:
articles_raw.head()

Unnamed: 0,article
0,[PMID- 29723499\nOWN - NLM\nSTAT- MEDLINE\nDCO...
1,[PMID- 31134902\nOWN - NLM\nSTAT- MEDLINE\nDCO...
2,[PMID- 27878473\nOWN - NLM\nSTAT- MEDLINE\nDCO...
3,[PMID- 23572569\nOWN - NLM\nSTAT- MEDLINE\nDCO...
4,[PMID- 31011228\nOWN - NLM\nSTAT- MEDLINE\nDCO...


In [31]:
articles_raw.to_csv('../data/articles_raw_df.csv', 
                   index = False)