In [28]:
# Libraries Importation

import time
import asyncio
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys  
from selenium.webdriver.support.ui import Select

import warnings
warnings.filterwarnings('ignore')

# Data Extraction

![pubmed.svg](attachment:pubmed.svg)

In [2]:
PATH = webdriver.FirefoxOptions()
driver = webdriver.Firefox(options = PATH)

### Definition of the searching terms

    (
    ("2022"[Date - Publication]) 
    AND 
    (english[Language]) 
    AND 
    (neurosciences[MeSH Major Topic]) 
    AND 
    (journal article[Publication Type]) 
    NOT 
    (review[Publication Type])
    )

### First page search: extract parameters

In [23]:
years = ['2022', '2023']
page = ''

for year in years:
    url = f'https://pubmed.ncbi.nlm.nih.gov/?term=((%22{year}%22%5BDate%20-%20Publication%5D)%20AND%20(english%5BLanguage%5D)%20AND%20(neurosciences%5BMeSH%20Major%20Topic%5D)%20AND%20(journal%20article%5BPublication%20Type%5D)%20NOT%20(review%5BPublication%20Type%5D)){page}'
    driver.get(url)
    
    # Number of articles
    num_art_selector = '.results-amount-container > div:nth-child(1)'
    num_art_{year} = int(driver.find_elements(By.CSS_SELECTOR, num_art_selector)[0].text.split(' ')[0])

    # Number of pages
    num_pag_selector = '.top-pagination > div:nth-child(3) > label:nth-child(3)'
    num_pag_{year} = int(driver.find_elements(By.CSS_SELECTOR, num_pag_selector)[0].text.split(' ')[1])

    print(f'The search {year} presents {num_art_{year}} articles in {num_pag_{year}} pages')

In [14]:
# Create a Dictionary to save the results

articles = {'DOI': [],
            'title': [],
            'authors': [],
            'affiliations': [],
            'journal': [],
            'year': [],
            'month': [],
            'volume': [],
            'first_page': [],
            'last_page': [],
            'PMID': [],
            'PMCID': [],
            'abstract': []}

In [None]:
# Iterate over the pages of each search

years = ['2022', '2023']

for year in years:
    pages = [f'&page={i}' for i in range(2, num_pag_{year})]
    for page in pages:
        url = f'https://pubmed.ncbi.nlm.nih.gov/?term=((%22{year}%22%5BDate%20-%20Publication%5D)%20AND%20(english%5BLanguage%5D)%20AND%20(neurosciences%5BMeSH%20Major%20Topic%5D)%20AND%20(journal%20article%5BPublication%20Type%5D)%20NOT%20(review%5BPublication%20Type%5D)){page}'
        driver.get(url)
        
        # open the first article website
        driver.find_elements(By.CSS_SELECTOR, 'a.docsum-title')[0].click()
        time.sleep(3)

        for i in range(9):

        # toogle the affiliations
        driver.find_elements(By.CSS_SELECTOR, '#toggle-authors')[0].click()
        time.sleep(2)

        # Feed the dictionary    
        articles['title'].append(driver.find_element(By.CSS_SELECTOR, 'h1.heading-title').text)
        articles['authors'].append(driver.find_element(By.CSS_SELECTOR, 'div.authors').text.split(','))
        articles['affiliations'].append(driver.find_element(By.CSS_SELECTOR, 'div.affiliations').text.split('\n')[2::2])
        articles['journal'].append(driver.find_element(By.CSS_SELECTOR, 'div.article-source').text.split('\n')[0])
        articles['year'].append(driver.find_element(By.CSS_SELECTOR, 'div.article-source').text.split('.')[1].strip()[:4])
        articles['month'].append(driver.find_element(By.CSS_SELECTOR, 'div.article-source').text.split('.')[1].strip()[5:8])  
        articles['volume'].append(driver.find_element(By.CSS_SELECTOR, 'div.article-source').text.split(';')[1].strip().split(':')[0])
        articles['first_page'].append(driver.find_element(By.CSS_SELECTOR, 'div.article-source').text.split(';')[1].strip().split(':')[1].split('.')[0].split('-')[0])
        articles['last_page'].append(driver.find_element(By.CSS_SELECTOR, 'div.article-source').text.split(';')[1].strip().split(':')[1].split('.')[0].split('-')[1])
        articles['DOI'].append(driver.find_element(By.CSS_SELECTOR, '#full-view-identifiers').text.split(' ')[-1])
        articles['PMID'].append(driver.find_element(By.CSS_SELECTOR, '#full-view-identifiers').text.split(' ')[1])
        articles['PMCID'].append(driver.find_element(By.CSS_SELECTOR, '#full-view-identifiers').text.split(' ')[3])

        try:
            articles['abstract'].append(driver.find_element(By.CSS_SELECTOR, '#abstract').text.split('\n')[1:])
        except:
            articles['abstract'].append('No abstract available')

        # Go to the next article
        if len(driver.find_elements(By.CSS_SELECTOR, 'a.arrow-link:nth-child(1)')) == 1:
            driver.find_elements(By.CSS_SELECTOR, 'a.arrow-link:nth-child(1)')[0].click()
        else:
            driver.find_elements(By.CSS_SELECTOR, 'a.arrow-link:nth-child(1)')[1].click()

        # Wait 3 seconds
        time.sleep(3)
    
driver.quit()

In [17]:
articles_df = pd.DataFrame(articles)
articles_df

Unnamed: 0,title,authors,affiliations,journal,year,month,volume,first_page,last_page,DOI,PMID,PMCID,abstract
0,The Boston criteria version 2.0 for cerebral a...,"[Andreas Charidimou 1 , Gregoire Boulouis 2...","[Hemorrhagic Stroke Research Program, J Philip...",Lancet Neurol,2022,Aug,21(8),714,725,10.1016/S1474-4422(22)00208-3,35841910,PMC9389452,[Background: Cerebral amyloid angiopathy (CAA)...
1,Psychedelics and Neural Plasticity: Therapeuti...,"[Steven F Grieco 1 , Eero Castrén 2 , Gitt...","[Department of Anatomy and Neurobiology, Schoo...",J Neurosci,2022,Nov,42(45),8439,8449,10.1523/JNEUROSCI.1121-22.2022,36351821,PMC9665925,[Psychedelic drugs have reemerged as tools to ...
2,"""I Am Not I"": The Neuroscience of Dissociative...","[Lauren A M Lebois 1 , David A Ross 2 , Mi...","[McLean Hospital, Belmont, Massachusetts; Depa...",Biol Psychiatry,2022,Feb,91(3),e11,e13,10.1016/j.biopsych.2021.11.004,34961597,PMC9045405,No abstract available
3,Molecular basis for selective activation of DR...,"[Shicheng Zhang # 1 , Ryan H Gumpper # 1 , ...","[Department of Pharmacology, School of Medicin...",Nature,2022,Dec,612(7939),354,362,10.1038/s41586-022-05489-0,36450989,10.1038/s41586-022-05489-0,[Designer receptors exclusively activated by d...
4,Prevalence of cerebral amyloid angiopathy: A s...,"[Lieke Jäkel 1 , Anna M De Kort 1 , Cathar...","[Department of Neurology, Donders Institute fo...",Alzheimers Dement,2022,Jan,18(1),10,28,10.1002/alz.12366,34057813,PMC9290643,[Reported prevalence estimates of sporadic cer...
