In [1]:
from IPython.core.display import display, HTML
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import pandas as pd
import requests
import re
import random
import time
from collections import namedtuple
import numpy as np
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

## Data cleaning and normalization between data sets

In [2]:
def get_soup_from_url(url, timeout=30):
    
    """
    Fetch a page and return it parsed as a BeautifulSoup object.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up (passed straight to
        ``requests.get``). Defaults to 30.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the "lxml" parser.

    Raises
    ------
    requests.exceptions.RequestException
        If the request cannot be completed (connection error, timeout, ...).
    """
    
    # A random User-agent string is sent with every request.
    ua = UserAgent()
    request_headers = {'User-agent': ua.random}

    # Without an explicit timeout requests waits indefinitely on a stalled
    # server, which would hang the whole notebook run.
    response = requests.get(url, headers=request_headers, timeout=timeout)

    return BeautifulSoup(response.text, "lxml")

In [3]:
## Tune white/blacklists to filter out departmental labs and keep data at university-level.
inst_whitelist = ["University", "Institute", "College", "School", "Medicine", "Hospital", "Academy", "Medical", "School of Medicine"]
inst_blacklist = ["Department", "Laboratory", "Faculty", "Public Health", "Genome"]

## University systems whose campus name is written as the next comma-separated field.
_MULTI_CAMPUS_SYSTEMS = ("University of California", "University of North Carolina")

def normalize_institution_names(inst):
    
    """
    Normalize institution names between scraped data set and Nature Institution stats.

    The affiliation string is split on commas. A field that is exactly one of
    the multi-campus systems is joined with the field that follows it (the
    campus name). Fields are kept when they contain at least one whitelist
    term and no blacklist term.

    Parameters
    ----------
    inst : str
        Comma-separated affiliation string.

    Returns
    -------
    str
        The kept fields joined with " , ", duplicates removed, in the order
        they first appear in ``inst``. Empty string when nothing is kept.
    """
    
    parts = [part.strip() for part in inst.split(',')]

    ## Join campus name with the system name, then proceed.
    ## A while loop is used because merging consumes the following field;
    ## iterating over a precomputed range while popping ran past the end of
    ## the list when two merges happened in one string.
    merged = []
    idx = 0
    while idx < len(parts):
        part = parts[idx]
        if part in _MULTI_CAMPUS_SYSTEMS and idx + 1 < len(parts):
            part = (part + " " + parts[idx + 1]).strip()
            idx += 1
        merged.append(part)
        idx += 1

    kept = [
        part for part in merged
        if any(white in part for white in inst_whitelist)
        and not any(black in part for black in inst_blacklist)
    ]

    ## dict.fromkeys removes duplicates while preserving first-seen order, so
    ## the result is the same on every run (a set gave an arbitrary order).
    return " , ".join(dict.fromkeys(kept))

Read in scraped data from pages 230-270.

In [4]:
# Each CSV holds one batch of scraped result pages (page range is in the file name).
data_set_1 = pd.read_csv("paper_data_230_240.csv")
data_set_2 = pd.read_csv("paper_data_241_251.csv")
data_set_3 = pd.read_csv("paper_data_252_260.csv")
data_set_4 = pd.read_csv("paper_data_261_270.csv")

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way. ignore_index=True gives every row a unique label instead of
# repeating each file's own 0..n-1 index.
all_data = pd.concat([data_set_1, data_set_2, data_set_3, data_set_4], ignore_index=True)

# Discard any row that has a missing value in any column.
all_data = all_data.dropna()

Read in working data set.

In [None]:
# Disabled alternative to the row-wise combine above. Note that axis=1 would
# place the four frames side by side (column-wise) instead of stacking rows.
#all_data = pd.concat([data_set_1,data_set_2,data_set_3,data_set_4],axis=1)

Clean up scraped data set.

In [None]:
# Give the scraped columns readable names.
all_data.columns = ['Title','Accesses','Citations','Date','Journal','Author','PI', 'Institution','AllAuthors','Abstract']

# Split the date string on whitespace into three fields.
# NOTE(review): assumes every Date is "<day> <full month name> <year>" - confirm against the scraper.
dates = all_data['Date'].str.split(expand=True)
dates.columns = ['Day','Month','Year']
all_data = pd.concat([all_data, dates], axis=1)

# Convert the month name (e.g. "January") to its number.
all_data['Month'] = pd.to_datetime(all_data['Month'], format='%B').dt.month

# Simple size features: number of comma-separated authors and space-separated title words.
all_data['NumAuthors'] = all_data['AllAuthors'].str.split(',').str.len()
all_data['lenTitle'] = all_data['Title'].str.split(' ').str.len()

# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
all_data = all_data.drop(columns=['Date', 'AllAuthors', 'Day'])

Clean up and normalize Impact Factor data.

In [68]:
impact = pd.read_csv('tabula-Journal-Citation-Report-2019.csv')
impact.columns = ['Journal', 'ImpactFactor', 'toDelete']

# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
impact = impact.drop(columns='toDelete')

# Drop journals without a reported impact factor. na=False keeps the mask
# strictly boolean, so a missing ImpactFactor cell no longer breaks the `~`
# negation; such rows are kept and filled with 0 below, as before.
impact = impact[~impact['ImpactFactor'].str.contains("Not Available", na=False)]
impact = impact.fillna(0)

# Upper-case both journal columns so the merge key matches regardless of case.
impact['Journal'] = impact['Journal'].str.upper()
all_data['Journal'] = all_data['Journal'].str.upper()

# Inner join (the pandas default, made explicit): papers whose journal is not
# in the impact table are dropped. The merged frame gets a fresh 0..n-1 index.
all_data = pd.merge(all_data, impact, on=['Journal'], how='inner', sort=False)

Normalize institution names in Nature stats data set.

In [69]:
nature_stats = pd.read_csv("nature_stats.csv")

# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
nature_stats = nature_stats.drop(columns=['Country'])
nature_stats['Institution'] = nature_stats['Institution'].str.upper()

# Build the matching key: remove parenthesised text, trim, then remove commas.
# The comma is a literal, so regex=False is used. The source column is
# already upper-case, so no second upper() is needed.
nature_stats['norm_Institution'] = (
    nature_stats['Institution']
    .str.replace(r'\([^)]*\)', '', regex=True)
    .str.strip()
    .str.replace(',', '', regex=False)
)

Normalizing institution names in working data set.

In [None]:
# Upper-cased institution strings, used as a fallback when normalization keeps nothing.
raw_upper = all_data['Institution'].str.upper()

# Normalize each affiliation, then treat empty/whitespace-only results as missing.
normalized = all_data['Institution'].apply(normalize_institution_names).str.upper()
normalized = normalized.replace(r'^\s*$', np.nan, regex=True)

# Fill the missing entries with the raw upper-cased institution string.
all_data['norm_Institution'] = normalized.fillna(raw_upper)

Use fuzzy string matching to combine the Nature stats with the working data set.

In [70]:
# Minimum partial_ratio score (exclusive) for a fuzzy match to be accepted.
MATCH_THRESHOLD = 90

# Many papers share an institution; remember each name's result so the
# fuzzy search over the whole Nature table runs once per distinct name.
match_cache = {}
scores = []
for inst_name in all_data['norm_Institution']:
    if inst_name not in match_cache:
        fuzzy = process.extractOne(inst_name, nature_stats['norm_Institution'], scorer=fuzz.partial_ratio)

        # extractOne returns None when there is nothing to match against.
        if fuzzy is not None and fuzzy[1] > MATCH_THRESHOLD:
            # First Nature row whose normalized name equals the best match.
            match_index = nature_stats[nature_stats['norm_Institution'] == fuzzy[0]].index.values[0]
            match_cache[inst_name] = (nature_stats.loc[match_index, 'AC'], nature_stats.loc[match_index, 'FC'])
        else:
            # No sufficiently close institution: both Nature scores default to 0.
            match_cache[inst_name] = (0, 0)
    scores.append(match_cache[inst_name])

# Reuse all_data's index so each score pair lines up with its own row even if
# the index is not a plain 0..n-1 range. axis is passed by keyword because
# pandas 2.0 no longer accepts it positionally.
scores_df = pd.DataFrame(scores, columns=['NatureAC','NatureFC'], index=all_data.index)
all_data = pd.concat([all_data, scores_df], axis=1)

Clean up final working data set and output to flat file.

In [72]:
# Keep only the columns written to the final data set, in this order.
all_data = all_data.loc[:, [
    'Citations',
    'Accesses',
    'Month',
    'Year',
    'NumAuthors',
    'lenTitle',
    'ImpactFactor',
    'NatureAC',
    'NatureFC',
    'Title',
    'Abstract',
]]

In [74]:
# Write the working data set to a flat file, leaving out the row index.
all_data.to_csv(
    "working_data.csv",
    index=False,
)