# 01 - Web Scraping

We will be using a combination of BeautifulSoup and Selenium to fetch data. <br>
We then save the data to .csv files for later use.

## 1. Import Packages <a name="import"></a>

In [1]:
import re
import pandas as pd
import numpy as np
import random

In [2]:
import selenium
from selenium import webdriver
import chromedriver_binary
from selenium.webdriver.common.keys import Keys

import requests
from bs4 import BeautifulSoup
import time

In [27]:
%matplotlib inline
import csv
import seaborn as sns
import matplotlib.pyplot as plt

import pprint
import pandas as pd
import numpy as np

import os

## Table of Contents <a name="table"></a>
1. [Import Packages](#import)
2. [Scraping Data from...](#scrape)
    1. [U.S. News and World Report](#usnews)
    2. [Nature](#nature)
    3. [Google Scholar](#google)
3. [Scraping Methods Assemble](#assemble)

## 2. Scraping Data from... <a name="scrape"></a>

### 2A. U.S. News & World Report <a name="usnews"></a>

In [5]:
def scroller(driver, num_scrolls, scroll_pause_time=1):
    """
        Helper function to scroll down pages that need
        time to render due to infinite pagination

        Stops early once the page height no longer grows, even after
        one retry; otherwise performs at most num_scrolls scroll rounds.

        :param driver: Selenium driver for scrolling down
        :param num_scrolls: the maximum number of scroll rounds to perform
        :param scroll_pause_time: seconds to wait after each scroll so the
                                  page can load more content
    """

    height_script = "return document.body.scrollHeight"
    scroll_script = "window.scrollTo(0, document.body.scrollHeight);"

    for _ in range(num_scrolls):

        # Read the height inside the loop so every round compares against
        # the height right before this round's scroll
        last_height = driver.execute_script(height_script)

        # Scroll down to bottom and wait for the page to load
        driver.execute_script(scroll_script)
        time.sleep(scroll_pause_time)

        new_height = driver.execute_script(height_script)
        if new_height != last_height:
            # the page grew, move on to the next round
            continue

        # The height did not change: try once more in case loading was slow
        driver.execute_script(scroll_script)
        time.sleep(scroll_pause_time)

        new_height = driver.execute_script(height_script)
        if new_height == last_height:
            # still the same height after the retry, so we are done
            break

In [115]:
def get_rankings():
    """
        Function to get best biological school ranking info
        from U.S. News & World Report

        Opens the rankings page in Chrome via Selenium, scrolls down to
        load more entries, and parses at most the first 100 listed schools.

        :returns: uni_df, pandas Dataframe with columns 'name' and 'rank',
                  sorted by rank
    """

    # Adjacent string literals are joined without any whitespace; a
    # backslash continuation inside the quotes would embed the indentation
    # spaces in the URL.
    rankings_url = ('https://www.usnews.com/best-graduate-schools/'
                    'top-science-schools/biological-sciences-rankings')

    driver = webdriver.Chrome()
    try:
        driver.get(rankings_url)
        scroller(driver, 10)
        page_source = driver.page_source
    finally:
        # always close the browser, even when loading or scrolling fails
        driver.quit()

    rankings_soup = BeautifulSoup(page_source, 'lxml')
    uni_list = rankings_soup.find_all('div', {'class':'Box-s85n6m5-0 kKdFhD'})

    uni_dict = {}
    for uni in uni_list[:100]: #only want 100 schools
        #change '--' in name like 'University of California -- Berkeley' to
        #'University of California, Berkeley'
        name = uni.get('name').replace('--', ', ')

        #get tag with rank and join its contents into one string
        rank_tag = uni.find('strong', {'class':'s144f3me-0-Strong-kDSDFS eULIZs'})
        rank_text = ''.join(rank_tag.contents)

        #the first run of digits in that string is the rank
        uni_dict[name] = int(re.findall(r'\d+', rank_text)[0])

    uni_df = pd.DataFrame({'name': list(uni_dict.keys()),
                           'rank': list(uni_dict.values())})
    uni_df = uni_df.sort_values(by=['rank']).reset_index(drop=True)

    return uni_df

In [121]:
def check_uni_ranking(uni_list, uni_df):
    """
        Function that checks whether any of the given universities are in
        the top 20, top 50, or top 100 of the list
        from U.S. News and World Report

        A university matches a ranking row when its name occurs as a
        literal substring of the row's 'name' value.

        :param uni_list: list of university names to check
        :param uni_df: pandas Dataframe of rankings with 'name' and
                       'rank' columns

        :returns top20: 1 if any matched university is ranked <= 20, else 0
        :returns top50: 1 if any matched university is ranked <= 50, else 0
        :returns top100: 1 if any matched university is ranked <= 100, else 0
        :returns other: 1 if any given university matched no ranking, else 0
    """
    seen_uni = set()
    top20, top50, top100, other = 0, 0, 0, 0
    for uni in uni_list:
        if uni in seen_uni:
            continue

        # regex=False: institution names are plain text; characters such as
        # '(' or '.' must not be interpreted as a regular expression
        is_match = uni_df['name'].str.contains(uni, regex=False)
        matched_ranks = uni_df.loc[is_match, 'rank']

        if matched_ranks.sum() != 0:
            for ranking in matched_ranks:
                ranking = int(ranking)
                if ranking <= 100:
                    top100 = 1
                if ranking <= 50:
                    top50 = 1
                if ranking <= 20:
                    top20 = 1
            seen_uni.add(uni)
        else:
            other = 1

    return top20, top50, top100, other

Return to [Table of Contents](#table)

### 2B. Nature <a name="nature"></a>

In [117]:
def generate_nature_links(year):
    """
        Function to fetch links to all the articles written
        for Nature for a specified year

        :param year: year during which Nature has articles for e.g. 2019

        :returns articles: a list of Nature article hrefs for that year
    """

    # Adjacent string literals are joined without any whitespace; a
    # backslash continuation inside the quotes would embed the indentation
    # spaces in the URL.
    nature_url = ('http://www.nature.com/nature/articles?searchType'
                  '=journalSearch&sort=PubDate&type=article&year=')
    nature_url = nature_url + str(year)

    nature_response = requests.get(nature_url)
    nature_soup = BeautifulSoup(nature_response.text, 'lxml')

    #find the number of pages there are to go through for this specific year in Nature:
    #count the distinct pagination links and add one
    page_links = {page.get('href') for page in
                  nature_soup.select('a[href^="/nature/articles?"]')}
    num_pages = len(page_links) + 1

    page_url = nature_url + '&page='

    articles = []
    #result pages are requested starting from page 1
    for page_cnt in range(1, num_pages + 1):
        nature_response = requests.get(page_url + str(page_cnt))
        nature_soup = BeautifulSoup(nature_response.text, 'lxml')
        entries = nature_soup.find_all('a', {'data-track-action':'view article'})
        articles.extend(entry.get('href') for entry in entries)

    return articles

In [10]:
def article_info(href):
    """
        Function to scrape information from a Nature article
        from a specific href

        :param href: a href link to a specific Nature article

        :returns title_length: number of words in the Nature article title
        :returns altmetric: altmetric score for Nature article, as scraped
                            text (not converted to a number)
        :returns num_times_cited: number of times Nature article was cited,
                                  as scraped text (not converted to a number)
        :returns abstract_length: numbers of words in article abstract
        :returns page_length: last page number minus first page number
        :returns fig_count: number of figures in Nature article
        :returns ref_cnt: number of cited references in Nature article
        :returns authors: list of authors who wrote Nature article
        :returns uni_list: list of institutions authors came from
        :returns num_institutions: number of institutions authors came from
    """

    nature_url = 'https://www.nature.com' + href
    user_agent = {'User-agent': 'Mozilla/5.0'}

    try:
        nature_response = requests.get(nature_url, headers = user_agent)
        #short randomized pause between requests
        time.sleep(0.5+2*random.random())
    except requests.exceptions.RequestException:
        #the request failed: wait a minute and try exactly once more
        time.sleep(60)
        nature_response = requests.get(nature_url, headers = user_agent)

    nature_soup = BeautifulSoup(nature_response.text, 'lxml')

    #find the title and the number of words in the title
    title = nature_soup.find('meta', {'name':'citation_title'}).get('content')
    title_length = len(title.split(' '))

    #find the abstract and the length of the abstract
    abstract = nature_soup.find('meta', {'name':'dc.description'}).get('content')
    abstract_length = len(abstract.split(' '))

    #find the page length
    #NOTE: this is end minus start, so a one-page article yields 0
    page_start = int(nature_soup.find('span', {'itemprop':'pageStart'}).text)
    page_end = int(nature_soup.find('span', {'itemprop':'pageEnd'}).text)
    page_length = page_end - page_start

    #find the number of figures in an article
    fig_count = len(nature_soup.find_all('div', class_="c-article-section__figure-item"))
    if fig_count == 0:
        #fall back to a different figure markup and count those instead
        fig_count = len(nature_soup.find_all('img', {'data-component':'rc-content-image'}))

    #find the number of references in an article
    ref_cnt = len(nature_soup.find_all('p', {'class':'c-article-references__text'}))
    if ref_cnt == 0:
        #sometimes the references use a different tag
        ref_cnt = len(nature_soup.find_all('cite'))

    #find the citation count and the altmetric score
    #(altmetric: metric used to help measure impact of paper)
    citation_tag = nature_soup.find('p', {'data-test':'citation-count'})
    altmetric_tag = nature_soup.find('p', {'data-test':'altmetric-score'})
    if citation_tag is not None and altmetric_tag is not None:
        num_times_cited = citation_tag.text.split(' ')[0]
        altmetric = altmetric_tag.text.split(' ')[0]
    else:
        #fall back to the metrics bar: first entry is citations, second is altmetric
        metrics = nature_soup.find_all('p', {'class':'c-article-metrics-bar__count'})
        num_times_cited = metrics[0].contents[0]
        altmetric = metrics[1].contents[0]

    authors = [author.get('content')
               for author in nature_soup.find_all('meta', {'name':'dc.creator'})]

    uni_addr_list = nature_soup.find_all('h4', {'class': 'c-article-author-affiliation__address'})
    if len(uni_addr_list) == 0:
        uni_addr_list = nature_soup.find_all('h3', {'class':'emphasis'})

    uni_list = []
    for uni_addr in uni_addr_list:
        #some addresses are not clear, but the name of the institution
        #is usually included in either of the first two values,
        #so we add both values (or just the one when there is no comma)
        uni_list.extend(uni_addr.text.split(', ')[:2])
    num_institutions = len(uni_addr_list)

    return (title_length, altmetric, num_times_cited,
            abstract_length, page_length, fig_count,
            ref_cnt, authors, uni_list, num_institutions)

Return to [Table of Contents](#table)

### 2C. Google Scholar <a name="google"></a>

In [119]:
def author_info(name):
    """
        Function to search up an author on Google Scholar
        and find info on them from their profile

        The first profile in the search results is used. An exception is
        raised when no profile is found or the metrics cannot be parsed.

        :param name: name of the author to search for

        :returns citations: numbers of citations
        :returns h_index: impact factor h_index of author
        :returns i10_index: i10-index of author
        :returns num_keywords: number of keyword tags author uses
    """

    google_url = 'https://scholar.google.com'
    user_agent = {'User-agent': 'Mozilla/5.0'}

    #pass the query as params so the author name (spaces, accents, ...)
    #is URL-encoded and no stray whitespace ends up in the URL
    search_params = {'view_op': 'search_authors',
                     'hl': 'en',
                     'mauthors': str(name)}
    scholar_response = requests.get(google_url + '/citations',
                                    params = search_params,
                                    headers = user_agent)

    #randomized pause between requests
    time.sleep(4+2*random.random())

    scholar_soup = BeautifulSoup(scholar_response.text, 'lxml')

    href = (scholar_soup.find('h3', {'class':'gs_ai_name'}) #find the parent tag
                        .findChild('a') #find the child tag that has the href
                        .get('href')) #get the href

    profile_url = google_url + href
    profile_response = requests.get(profile_url, headers = user_agent)
    profile_soup = BeautifulSoup(profile_response.text, 'lxml')

    #take every other cell of the metrics table
    #presumably the "All" column (vs. the recent-years column) - verify against the page
    metrics = profile_soup.find_all('td', {'class':"gsc_rsb_std"})[::2]
    citations = int(metrics[0].text)
    h_index = int(metrics[1].text)
    i10_index = int(metrics[2].text)

    num_keywords = len(profile_soup.find_all('a', {'class': 'gsc_prf_inta gs_ibl'}))

    return citations, h_index, i10_index, num_keywords

Return to [Table of Contents](#table)

## 3. Scraping Methods Assemble <a name="assemble"></a>

Now that all the scraping functions are written, we can run them to collect the data for our linear regression model.

In [17]:
#the scrape only needs to run once: after the .csv has been written,
#the get_rankings()/to_csv lines can be commented out
os.makedirs('data', exist_ok=True)

file_dir = os.path.abspath('.')
csv_folder = 'data'
#the rankings are not tied to a year, and no `year` variable exists yet
#when this cell runs on a fresh kernel
path = os.path.join(file_dir, csv_folder, 'university_rankings.csv')

rankings = get_rankings()
rankings.to_csv(path)

#read the rankings back and drop the index column that to_csv wrote
rankings = pd.read_csv(path)
rankings = rankings.drop(columns = 'Unnamed: 0')
uni_df = rankings
uni_df.head()

Unnamed: 0,name,rank
0,Massachusetts Institute of Technology,1
1,Stanford University,1
2,"University of California, Berkeley",1
3,California Institute of Technology,4
4,Harvard University,4


In [25]:
def collect_info(href):
    """
        Function to scrape info from Nature and Google Scholar

        Reads two module-level variables: `uni_df` (the rankings
        Dataframe) and `year` (stored in the 'year' column).

        :param href: article link to scrape data from

        :returns df: one-row pandas dataframe containing scraped results
    """

    (title_length, altmetric, num_times_cited, abstract_length,
     page_length, fig_count, ref_cnt, authors, uni_list,
     num_institutions) = article_info(href)

    num_authors = len(authors)

    total_citations = []
    total_h_index = []
    total_i10_index = []

    #only the first five authors are looked up on Google Scholar
    for author in authors[:5]:
        try:
            citations, h_index, i10_index = author_info(author)[:3]
        except Exception:
            #best effort: skip authors whose profile cannot be found or parsed
            #(Exception rather than a bare except so Ctrl-C still stops the scrape)
            continue
        total_citations.append(citations)
        total_h_index.append(h_index)
        total_i10_index.append(i10_index)

    if total_citations:
        mean_author_citations = round(sum(total_citations)/len(total_citations), 2)
        mean_h_index = round(sum(total_h_index)/len(total_h_index), 2)
        mean_i10_index = round(sum(total_i10_index)/len(total_i10_index), 2)
    else:
        #no author could be looked up
        mean_author_citations = np.nan
        mean_h_index = np.nan
        mean_i10_index = np.nan

    try:
        top20, top50, top100, other = check_uni_ranking(uni_list, uni_df)
    except Exception:
        #the ranking lookup failed: record the article as unranked
        top20, top50, top100, other = np.nan, np.nan, np.nan, 1

    #the key order below is the column order of the resulting dataframe
    row = {
        'title_length': title_length,
        'altmetric': altmetric,
        'num_times_cited': num_times_cited,
        'abstract_length': abstract_length,
        'page_length': page_length,
        'fig_count': fig_count,
        'ref_cnt': ref_cnt,
        'mean_author_citations': mean_author_citations,
        'mean_h_index': mean_h_index,
        'mean_i10_index': mean_i10_index,
        'num_authors': num_authors,
        'top20': top20,
        'top50': top50,
        'top100': top100,
        'other': other,
        #column name (including its spelling) is kept unchanged, since
        #readers of the saved .csv files may depend on it
        'num_insitutions': num_institutions,
        'year': year,
    }

    return pd.DataFrame([row])

For each year given below, we want to fetch data and save it to a .csv file.

In [None]:
years = [str(2009+i) for i in range(0, 10)]

os.makedirs('data', exist_ok=True)

file_dir = os.path.abspath('.')
csv_folder = 'data'

#NOTE: the loop variable has to be named `year` because collect_info
#reads the module-level `year` to fill its 'year' column
for year in years:
    hrefs = generate_nature_links(year)

    #scrape every article of the year, then concatenate once
    article_frames = [collect_info(href) for href in hrefs]
    if article_frames:
        article_df = pd.concat(article_frames)
    else:
        #no articles found for this year: still write an (empty) file
        article_df = pd.DataFrame()

    path = os.path.join(file_dir, csv_folder, year + '_article_metrics.csv')
    article_df.to_csv(path)

Return to [Table of Contents](#table)