In [129]:
from IPython.core.display import display, HTML
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import pandas as pd
import requests
import re
import random
import time
from collections import namedtuple
import numpy as np

## Web scraping of paper metadata and abstracts

Helper functions for web scraping. 

In [139]:
def get_soup_from_url(url, timeout=30):
    """
    Download a page and parse it into a BeautifulSoup object.

    A randomly chosen User-Agent string (from ``fake_useragent``) is sent
    with the request.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``; how long to wait for the server
        before giving up. Defaults to 30 seconds.

    Returns
    -------
    bs4.BeautifulSoup
        The response text parsed with the "lxml" parser. The HTTP status
        code is not inspected here, so error pages are parsed as well.

    Raises
    ------
    requests.exceptions.Timeout
        If the server does not respond within ``timeout``.
    """
    headers = {'User-agent': UserAgent().random}
    # Without a timeout, requests waits indefinitely on an unresponsive
    # server, which would stall a long scraping run.
    response = requests.get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(response.text, "lxml")

In [140]:
  def scrape_data_from_link_soup(link_soup, link):
      """
      Collect the metadata of one paper from its parsed article page.

      Values are read from ``<meta>`` tags, the first ``<time>`` tag, and
      the article's "/metrics" page, which is downloaded here via
      ``get_soup_from_url``. Any piece of information whose tag cannot be
      found is reported as the string 'NaN' instead of raising, so a single
      incomplete page does not abort a whole scraping run.

      Parameters
      ----------
      link_soup : bs4.BeautifulSoup
          Parsed HTML of the article page.
      link : str
          URL of the article page; "/metrics" is appended to reach the
          metrics page.

      Returns
      -------
      list
          [title, accesses, citations, date, journal, first_author, PI,
          uni, authors, abstract], where ``authors`` is a list of strings
          and ``PI`` is the last listed author.
      """
      missing = 'NaN'

      def meta_content(name):
          # Return the content attribute of <meta name=...>, or the
          # placeholder when the tag or the attribute is absent.
          tag = link_soup.find('meta', attrs={'name': name})
          if tag is None:
              return missing
          return tag.get('content', missing)

      title = meta_content('dc.title')
      time_tag = link_soup.find('time')
      date = time_tag.text if time_tag is not None else missing
      # Strip digits and colons from the dc.source value, then trailing
      # whitespace.
      journal = re.sub(r'[0-9:]+', '', meta_content('dc.source')).rstrip()
      abstract = meta_content('description')

      authors_data = link_soup.findAll('meta', {'name': 'citation_author'})
      authors = [author['content'] for author in authors_data]

      if len(authors) == 0:
          first_author = missing
          uni = missing
          PI = missing
      else:
          first_author = authors[0]
          PI = authors[-1]
          # The <meta> sibling right after the last author tag is used as the
          # affiliation (presumably the institution tag; verify on the site).
          uni_tag = authors_data[-1].find_next_sibling('meta')
          uni = uni_tag.get('content', missing) if uni_tag is not None else missing

      metrics_soup = get_soup_from_url(link + "/metrics")
      metrics_block = metrics_soup.find('dl', {"class": "c-article-metrics__access-citation"})
      accesses = missing
      citations = missing
      if metrics_block is not None:
          if metrics_block.dt is not None:
              accesses = metrics_block.dt.text.strip()
          # Citations are read from the second <dt> found from the block onwards.
          first_dt = metrics_block.find_next('dt')
          second_dt = first_dt.find_next('dt') if first_dt is not None else None
          if second_dt is not None:
              citations = second_dt.text.strip()

      return [title, accesses, citations, date, journal, first_author, PI, uni, authors, abstract]

In [141]:
def scrape_links_from_page_soup(page_soup):
    """
    Build absolute article URLs from one parsed search-results page.

    Every ``<a>`` tag whose ``data-track-action`` attribute equals
    "search result" contributes one entry: its ``href`` prefixed with
    "https://www.nature.com".

    Returns a list of URL strings in document order.
    """
    result_anchors = page_soup.findAll('a', {"data-track-action": "search result"})
    return ["https://www.nature.com" + str(anchor['href']) for anchor in result_anchors]

In [142]:
def scrape_page(page_soup):
    """
    Scrape every article linked from one parsed search-results page.

    Each link is downloaded, scraped, and printed as it completes, with a
    random pause of 0.5 to 2.5 seconds after each article.

    Returns a pandas DataFrame with one row per article, in link order.
    """
    rows = []
    for article_url in scrape_links_from_page_soup(page_soup):
        article_soup = get_soup_from_url(article_url)
        rows.append(scrape_data_from_link_soup(article_soup, article_url))
        print(article_url)
        # Randomised delay between consecutive article requests.
        pause = .5 + 2 * random.random()
        time.sleep(pause)
    return pd.DataFrame(rows)

In [134]:
def scrape_all_pages(start_page, end_page):
    """
    Scrape a range of search-result pages and combine the results.

    Pages ``start_page`` up to, but not including, ``end_page`` are
    processed in order. Before a page is scraped its URL is probed; the
    loop stops at the first page that does not answer with HTTP 200, and
    whatever was collected up to that point is returned.

    Parameters
    ----------
    start_page : int
        First search-result page number to scrape.
    end_page : int
        Page number at which to stop (exclusive).

    Returns
    -------
    pandas.DataFrame
        Rows from all scraped pages, concatenated in page order. Empty if
        no page was scraped.
    """
    base_url = "https://www.nature.com/search?article_type=protocols%2Cresearch%2Creviews&subject=genetics&page="
    page_frames = []
    for page_number in range(start_page, end_page):
        url = base_url + str(page_number)
        # Probe the page that is about to be scraped, not the previous one.
        if requests.get(url, timeout=30).status_code != 200:
            break
        print(page_number)
        print(url)
        page_soup = get_soup_from_url(url)
        page_frames.append(scrape_page(page_soup))
        # Randomised delay between consecutive result pages.
        time.sleep(.5 + 2 * random.random())
    if not page_frames:
        return pd.DataFrame()
    # Concatenate once; DataFrame.append was removed in pandas 2.0 and
    # re-copies the accumulated frame on every iteration.
    return pd.concat(page_frames)

Scrape data for the specified page numbers and save it to a flat CSV file.

In [None]:
# Run the scraper with page arguments 250 and 261, then write the resulting
# DataFrame to a CSV file without the index column.
all_data = scrape_all_pages(250,261)
all_data.to_csv("paper_data_250_260.csv",index=False)