# 01. Web Scraping from PubMed

This notebook showcases a data extraction process from **PubMed**, focusing on **neuroscience** articles published within the years 2022 and 2023. Leveraging the powerful `selenium` framework, the script navigates through the PubMed platform, systematically retrieving pertinent data from a variety of articles.

![pubmed.svg](attachment:pubmed.svg)

In [1]:
# Libraries Importation

import time
import pandas as pd

from selenium import webdriver
from tqdm.notebook import tqdm
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys  
from selenium.webdriver.support.ui import Select

import warnings
# Suppress every Python warning for the rest of the session so library
# deprecation notices do not clutter the cell outputs.
# NOTE(review): this also hides warnings that may signal real problems;
# consider narrowing the filter to a specific category.
warnings.filterwarnings('ignore')

In [22]:
# Launch a Firefox session controlled by Selenium.
# NOTE: despite its name, `PATH` holds the FirefoxOptions object that is
# handed to the driver, not a filesystem path.
PATH = webdriver.FirefoxOptions()
driver = webdriver.Firefox(options=PATH)

### Definition of the search terms

This search query in PubMed focuses on retrieving journal articles published in the years specified (2022 and 2023), written in English, containing the term `neuron` in their content, while excluding any articles categorized as reviews.

    (("{year}"[Date - Publication]) 
    AND (english[Language]) 
    AND (neuron[term]) 
    AND (journal article[Publication Type]) 
    NOT (review[Publication Type]))

### Determine the number of articles (and result pages) matching those parameters for each year

In [23]:
years = [2022, 2023]
num_pages = []
url_list = []

# CSS selectors of the two counters shown on a PubMed result page.
# They do not depend on the year, so they are defined once.
total_articles_css = '.results-amount-container > div:nth-child(1)'
total_pages_css = 'html body main#search-page.search-page div.inner-wrap div#search-results.search-results div.top-wrapper div.top-pagination div.page-number-wrapper label.of-total-pages'

for year in tqdm(years):
    # URL-encoded search term, split per clause for readability; the adjacent
    # literals concatenate into a single string.
    # NOTE(review): the encoded term ends with a literal "2" right after the
    # closing parentheses ("%29%292"), which is not part of the query
    # documented in the markdown above. It is kept unchanged here because
    # removing it could change the result set - confirm whether it is intended.
    url = (
        'https://pubmed.ncbi.nlm.nih.gov/?term='
        f'%28%28%22{year}%22%5BDate+-+Publication%5D%29'
        '+AND+%28english%5BLanguage%5D%29'
        '+AND+%28neuron%5Bterm%5D%29'
        '+AND+%28journal+article%5BPublication+Type%5D%29'
        '+NOT+%28review%5BPublication+Type%5D%29%292'
        '&sort=pubdate'
    )
    url_list.append(url)
    driver.get(url)
    time.sleep(10)  # fixed pause so the result page can finish rendering

    # Number of articles: first space-separated token of the counter text
    articles_label = driver.find_elements(By.CSS_SELECTOR, total_articles_css)[0]
    num_art = articles_label.text.split(' ')[0]

    # Number of pages: second space-separated token of the pagination label
    pages_label = driver.find_elements(By.CSS_SELECTOR, total_pages_css)[0]
    num_pag = pages_label.text.split(' ')[1]
    num_pages.append(num_pag)

    print(f'The search {year} presents {num_art} articles in {num_pag} pages')

  0%|          | 0/2 [00:00<?, ?it/s]

The search 2022 presents 443 articles in 45 pages
The search 2023 presents 336 articles in 34 pages


In [26]:
# Column-oriented store for the scraped records: one (initially empty) list
# per field, each list receiving one entry per article.

articles = {
    field: []
    for field in (
        'DOI',
        'title',
        'authors',
        'affiliations',
        'journal',
        'year',
        'month',
        'volume',
        'first_page',
        'last_page',
        'PMID',
        'PMCID',
        'abstract',
    )
}

In [27]:
%%time

# Labels that precede each identifier in the `#full-view-identifiers` text.
# NOTE(review): assumes the page renders them as "PMID:", "PMCID:" and "DOI:"
# - confirm against the live page. A positional fallback is kept below.
_ID_LABELS = {'PMID:': 'PMID', 'PMCID:': 'PMCID', 'DOI:': 'DOI'}


def _parse_source(source_text):
    """Split the text of `div.article-source` into citation fields.

    Parameters
    ----------
    source_text : str
        Full text of the article-source element.

    Returns
    -------
    dict
        Keys ``journal``, ``year``, ``month``, ``volume``, ``first_page`` and
        ``last_page``. A field that cannot be parsed is ``''``, except
        ``first_page`` which falls back to the whole ``source_text``.
    """
    fields = {
        'journal': source_text.split('\n')[0],
        'year': '',
        'month': '',
        'volume': '',
        # NOTE(review): when no page range can be parsed the raw source text
        # is stored here (behaviour kept from the original cell so downstream
        # cleaning still sees the citation) - confirm this is intended.
        'first_page': source_text,
        'last_page': '',
    }

    # The publication date is read from the segment after the first period.
    dot_parts = source_text.split('.')
    if len(dot_parts) > 1:
        date_text = dot_parts[1].strip()
        fields['year'] = date_text[:4]
        fields['month'] = date_text[5:8]

    # Volume and pages are read from the segment after the first semicolon,
    # in the form "<volume>:<first>-<last>.".
    semicolon_parts = source_text.split(';')
    if len(semicolon_parts) > 1:
        locator = semicolon_parts[1].strip().split(':')
        fields['volume'] = locator[0]
        if len(locator) > 1:
            page_range = locator[1].split('.')[0].split('-')
            fields['first_page'] = page_range[0]
            if len(page_range) > 1:
                fields['last_page'] = page_range[1]

    return fields


def _parse_identifiers(ids_text):
    """Extract PMID, PMCID and DOI from the `#full-view-identifiers` text.

    Identifiers are looked up by their label, so an article without a PMCID
    no longer gets its DOI stored in the PMCID column. If no known label is
    found, the original positional scheme (token 1 = PMID, token 3 = PMCID,
    last token = DOI) is used instead.

    Parameters
    ----------
    ids_text : str
        Text of the identifiers element (``''`` when the element is absent).

    Returns
    -------
    dict
        Keys ``PMID``, ``PMCID`` and ``DOI``; missing values are ``''``.
    """
    tokens = ids_text.split(' ')

    labelled = {}
    for label, value in zip(tokens, tokens[1:]):
        if label in _ID_LABELS:
            labelled.setdefault(_ID_LABELS[label], value)
    if labelled:
        return {key: labelled.get(key, '') for key in ('PMID', 'PMCID', 'DOI')}

    return {
        'PMID': tokens[1] if len(tokens) > 1 else '',
        'PMCID': tokens[3] if len(tokens) > 3 else '',
        'DOI': tokens[-1],
    }


def _optional_text(driver, selector):
    """Return the text of the first element matching `selector`, or None."""
    try:
        return driver.find_element(By.CSS_SELECTOR, selector).text
    except Exception:  # element missing or stale: treat as "not available"
        return None


def _expand_affiliations(driver, attempts=5, pause=1.0):
    """Click the `#toggle-authors` button so the affiliations become visible.

    The original cell retried forever (`except: continue` inside `while True`),
    which never terminated on an article without that button. Here the click
    is retried at most `attempts` times, and the function gives up at once
    when the article title is present but the button is not.

    Returns
    -------
    bool
        True if the button was clicked, False otherwise.
    """
    for _ in range(attempts):
        toggles = driver.find_elements(By.CSS_SELECTOR, '#toggle-authors')
        if toggles:
            try:
                toggles[0].click()
                return True
            except Exception:
                pass  # present but not clickable yet: wait and retry
        elif driver.find_elements(By.CSS_SELECTOR, 'h1.heading-title'):
            # The article is rendered and has no toggle button.
            return False
        time.sleep(pause)
    return False


def _scrape_article(driver):
    """Read every field of the article currently shown in the browser.

    Returns
    -------
    dict
        One value per key of the `articles` dictionary.

    Raises
    ------
    Exception
        Whatever Selenium raises when the title or the article-source element
        is missing (these two lookups were unguarded in the original cell).
    """
    # Required elements, each read once.
    title = driver.find_element(By.CSS_SELECTOR, 'h1.heading-title').text
    source_text = driver.find_element(By.CSS_SELECTOR, 'div.article-source').text

    # Optional elements: None when the element is not on the page.
    authors_text = _optional_text(driver, 'div.authors')
    affiliations_text = _optional_text(driver, 'div.affiliations')
    abstract_text = _optional_text(driver, '#abstract')
    ids_text = _optional_text(driver, '#full-view-identifiers')

    record = {'title': title}
    record.update(_parse_source(source_text))
    record.update(_parse_identifiers(ids_text or ''))
    record['authors'] = authors_text.split(',') if authors_text is not None else ''
    # Keep every second line starting at the third one, as in the original cell.
    record['affiliations'] = (
        affiliations_text.split('\n')[2::2] if affiliations_text is not None else ''
    )
    # Drop the first line of the abstract block and keep the remaining lines.
    record['abstract'] = (
        abstract_text.split('\n')[1:] if abstract_text is not None else ''
    )
    return record


def _go_to_next_article(driver):
    """Click the arrow leading to the next article.

    Returns
    -------
    bool
        True after a successful click, False when no arrow could be clicked
        (which ends the scrape of the current search).
    """
    try:
        arrows = driver.find_elements(By.CSS_SELECTOR, 'a.arrow-link:nth-child(1)')
        # With a single arrow that one is used; otherwise the second one.
        target = arrows[0] if len(arrows) == 1 else arrows[1]
        target.click()
    except Exception:
        return False

    # Wait 1 second for the next article to load
    time.sleep(1)
    return True


for url in tqdm(url_list):
    driver.get(url)

    # Open the first article of the result list
    driver.find_elements(By.CSS_SELECTOR, 'a.docsum-title')[0].click()
    time.sleep(1)

    while True:
        # Toggle the affiliations (best effort; never blocks indefinitely)
        _expand_affiliations(driver)

        # Build the whole record first and append it in one go, so every
        # column of `articles` always has the same length.
        record = _scrape_article(driver)
        for field, column in articles.items():
            column.append(record[field])

        if not _go_to_next_article(driver):
            break

  0%|          | 0/2 [00:00<?, ?it/s]

CPU times: total: 4min 59s
Wall time: 46min 53s


In [28]:
# Close the browser and end the WebDriver session once scraping is finished.
driver.quit()

In [None]:
# Turn the column-oriented dictionary into a DataFrame (one row per article).
# pandas requires every list in `articles` to have the same length.
articles_df = pd.DataFrame(articles)

# Only the last expression of a cell is rendered, so the row count is printed
# explicitly and a short preview is left as the cell output.
print(f'{len(articles_df)} articles scraped')
articles_df.head()

In [78]:
# Persist the raw scrape as a CSV file, leaving out the DataFrame index

raw_csv_path = '../data/pubmed_articles_raw.csv'
articles_df.to_csv(raw_csv_path, index=False)